In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder

# Load the census extract into the working DataFrame. The relative path means the CSV
# is looked up in the directory the notebook kernel is running from.
CENSUS_CSV = 'Census.csv'
df = pd.read_csv(CENSUS_CSV)

In [2]:
# Replace every missing value (NaN) in df with 0.
# The scalar is applied to all columns, so text columns receive the integer 0 as well.
df = df.fillna(value=0)

In [3]:
# Columns that are converted to integer codes below.
categorical_columns = ["Region", "Residence Type", "Family Composition", "Population Base", "Age", "Sex", 
                     "Marital Status", "Country of Birth", "Student", "Health", "Ethnic Group", "Religion", 
                     "Economic Activity", "Occupation", "Industry", "Hours worked per week", "Approximated Social Grade"]

# Fail before touching df if any expected column is absent, so the frame is never left
# half-encoded (the column-wise loop below would otherwise stop part-way through).
missing_columns = [column for column in categorical_columns if column not in df.columns]
if missing_columns:
    raise KeyError(f"Columns not found in df: {missing_columns}")

# One fitted LabelEncoder per column, keyed by column name.
# Refitting a single shared encoder on every column keeps only the last column's
# classes_, so the codes of all other columns could not be mapped back to their
# original labels (inverse_transform) or re-applied to new data (transform).
label_encoders = {}

for column in categorical_columns:
    # LE is rebound on each pass, so after the loop it refers to an encoder fitted on
    # the last column -- the same final state a single shared encoder would end in.
    LE = LabelEncoder()
    df[column] = LE.fit_transform(df[column])
    label_encoders[column] = LE

# NOTE(review): scikit-learn documents LabelEncoder as an encoder for target values;
# OrdinalEncoder / OneHotEncoder are the feature-side tools. LabelEncoder is kept here
# so the integer codes stay exactly as they were -- revisit if the models are reworked.

In [4]:
# Remove the 'Hours worked per week' column from df.
# NOTE(review): presumably dropped because it is a banded version of the "No of hours"
# target and would leak it into the features -- confirm against the data dictionary.
# df is rebound (no inplace=True) and errors='ignore' is passed so the cell can be run
# again: an in-place drop raises KeyError on a second run because the column is gone.
df = df.drop(columns=['Hours worked per week'], errors='ignore')

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error


def _regression_scores(actual, predicted):
    """Return ``(rmse, r2, mae)`` for ``predicted`` measured against ``actual``."""
    rmse = np.sqrt(mean_squared_error(actual, predicted))
    r2 = r2_score(actual, predicted)
    mae = mean_absolute_error(actual, predicted)
    return rmse, r2, mae


def _print_scores(heading, rmse, r2, mae):
    """Print one model's metrics to two decimals, followed by a blank line."""
    print(heading)
    print(f"RMSE: {rmse:.2f}")
    print(f"R-squared (R2): {r2:.2f}")
    print(f"Mean Absolute Error: {mae:.2f}")
    print()


# The target is the "No of hours" column; every other column of df is used as a feature.
y = df["No of hours"]
X = df.drop(columns=["No of hours"])

# Hold out 30% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Model 1: linear regression (fit() returns the fitted estimator itself).
linear_regression = LinearRegression().fit(X_train, y_train)

# Model 2: random forest of 100 trees, seeded so repeated runs build the same forest.
random_forest_regressor = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)

# Predictions on the held-out rows.
linear_regression_predictions = linear_regression.predict(X_test)
random_forest_predictions = random_forest_regressor.predict(X_test)

# Score each model on the same held-out targets.
linear_regression_rmse, linear_regression_r2, linear_regression_mae = _regression_scores(
    y_test, linear_regression_predictions
)
random_forest_rmse, random_forest_r2, random_forest_mae = _regression_scores(
    y_test, random_forest_predictions
)

# Report both sets of metrics.
_print_scores(
    "Linear Regression Results:",
    linear_regression_rmse,
    linear_regression_r2,
    linear_regression_mae,
)
_print_scores(
    "Random Forest Regression Results:",
    random_forest_rmse,
    random_forest_r2,
    random_forest_mae,
)

# The winner is chosen on R-squared alone; a tie falls through to the random forest.
linear_is_better = linear_regression_r2 > random_forest_r2
print(
    "Linear Regression is best when compared to other."
    if linear_is_better
    else "Random Forest Regression is best when compared to other."
)

    

Linear Regression Results:
RMSE: 13.67
R-squared (R2): 0.53
Mean Absolute Error: 11.01

Random Forest Regression Results:
RMSE: 8.18
R-squared (R2): 0.83
Mean Absolute Error: 4.39

Random Forest Regression is best when compared to other.
