#### Importing Necessary Libraries

In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.compose import ColumnTransformer

import warnings
warnings.filterwarnings('ignore')

#### Loading Cleaned Dataset

In [1052]:
#Loading cleaned dataset
df=pd.read_csv("clean_data.csv")
df.head(5)

Unnamed: 0,Age,Gender,Country,family_history,treatment,work_interfere,no_employees,remote_work,tech_company,benefits,...,anonymity,leave,mental_health_consequence,phys_health_consequence,coworkers,supervisor,mental_health_interview,phys_health_interview,mental_vs_physical,obs_consequence
0,37,female,United States,No,Yes,Often,6-25,No,Yes,Yes,...,Yes,Somewhat easy,No,No,Some of them,Yes,No,Maybe,Yes,No
1,44,male,United States,No,No,Rarely,More than 1000,No,No,Don't know,...,Don't know,Don't know,Maybe,No,No,No,No,No,Don't know,No
2,32,male,Canada,No,No,Rarely,6-25,No,Yes,No,...,Don't know,Somewhat difficult,No,No,Yes,Yes,Yes,Yes,No,No
3,31,male,United Kingdom,Yes,Yes,Often,26-100,No,Yes,No,...,No,Somewhat difficult,Yes,Yes,Some of them,No,Maybe,Maybe,No,Yes
4,31,male,United States,No,No,Never,100-500,Yes,Yes,Yes,...,Don't know,Don't know,No,No,Some of them,Yes,Yes,Yes,Don't know,No


#### Encoding Categorial Features

In [1053]:
features=['Gender','remote_work','tech_company','wellness_program','treatment']
target=['Age']

data= df[features + target].dropna().copy()

X=data[features]
y=data[target].values

ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
preprocessor = ColumnTransformer([('category', ohe, features)])
X_encoded = preprocessor.fit_transform(X)

#### Preprocessing

In [1054]:
#standardization
scaler=StandardScaler()
X_scaled=scaler.fit_transform(X_encoded)

#splitting 
X_train, X_test, y_train ,y_test = train_test_split( X_scaled, y, test_size=0.2, random_state=42)

scaler=StandardScaler()
X_train=scaler.fit_transform(X_train)
X_test=scaler.transform(X_test)

#### Linear Regression

In [1055]:
lr=LinearRegression()
lr.fit(X_train,y_train)
y_pred=lr.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Linear Regression")
print("RMSE:", round(rmse,3))
print("MAE :", round(mae,3))
print("R²  :", round(r2,3))

Linear Regression
RMSE: 7.152
MAE : 5.484
R²  : 0.022


#### Random Forest Regressor

In [1056]:
rfr=RandomForestRegressor(n_estimators=100,random_state=42)
rfr.fit(X_train,y_train)
y_pred=rfr.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Random Forest Regressor")
print("RMSE:",round (rmse,3))
print("MAE :", round(mae,3))
print("R²  :", round(r2,3))

Random Forest Regressor
RMSE: 7.214
MAE : 5.579
R²  : 0.005


### Interpretation
##### RMSE (Root Mean Squared Error):
  Measures average prediction error in years, so Linear Regression errs by 7.139 years, while Random Forest does better at 7.22 years 
##### MAE (Mean Absolute Error):
  Linear model off by 5.504 years on average; Random Forest improves to 5.572 years.
##### R² Score: 
  This is giving poor value which indicates that the features we are considering are not related to the age hence, they're lacking in predicting age.

#### Let's check R² score for each feature.

In [1057]:
feat_check = ['Gender','remote_work', 'tech_company', 'no_employees','work_interfere','family_history',
              'treatment','benefits','care_options','seek_help','wellness_program','anonymity',
              'leave','mental_health_consequence','phys_health_consequence','coworkers','supervisor',
              'mental_health_interview','phys_health_interview','mental_vs_physical','obs_consequence']

X_check=df[feat_check]

ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
preprocessor = ColumnTransformer([('category', ohe, feat_check)])
X_check_encoded = preprocessor.fit_transform(X_check)

X_check_scaled=scaler.fit_transform(X_check_encoded)

In [1058]:
from sklearn.model_selection import cross_val_score

for feat in feat_check:
    Xf = X_check_scaled[:, [feat_check.index(feat)]]
    cv = cross_val_score(LinearRegression(), Xf, y, cv=5, scoring='r2')
    print(feat, cv.mean())

Gender -0.04048769742582572
remote_work -0.04005107715624619
tech_company -0.03755480986503157
no_employees -0.02186254534908383
work_interfere -0.02186254534908383
family_history -0.034884771552950486
treatment -0.034884771552950444
benefits -0.039708299252199805
care_options -0.03931969595819407
seek_help -0.03219401031133138
wellness_program -0.036350857984311744
anonymity -0.026201497383810102
leave -0.020375022365065255
mental_health_consequence -0.04122272921330761
phys_health_consequence -0.03776912129957459
coworkers -0.037860899983575405
supervisor -0.03877394985087255
mental_health_interview -0.0328833970409105
phys_health_interview -0.039335328037824316
mental_vs_physical -0.039335328037824316
obs_consequence -0.032656403949145575


This indicates that none of the features are good for predicting the age of the person.

#### Let's try some other models
 1. Lasso CV
 2. Ridge Cv
 3. Elastic Net

In [1059]:
from sklearn.linear_model import ElasticNet, LassoCV, RidgeCV

#LassoCV
model=LassoCV()
model.fit(X_train,y_train)
y_pred = model.predict(X_test)
r2=r2_score(y_test,y_pred)
mae=mean_absolute_error(y_test,y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

print("Lasso CV")
print("RMSE:",round (rmse,3))
print("MAE :", round(mae,3))
print("R²  :", round(r2,3))
print("="*12)

#RisgeCV
model=RidgeCV()
model.fit(X_train,y_train)
y_pred = model.predict(X_test)
r2=r2_score(y_test,y_pred)
mae=mean_absolute_error(y_test,y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

print("Ridge CV")
print("RMSE:",round (rmse,3))
print("MAE :", round(mae,3))
print("R²  :", round(r2,3))
print("="*12)

#ElasticNet
model=ElasticNet()
model.fit(X_train,y_train)
y_pred = model.predict(X_test)
r2=r2_score(y_test,y_pred)
mae=mean_absolute_error(y_test,y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

print("Elastic Net")
print("RMSE:",round (rmse,3))
print("MAE :", round(mae,3))
print("R²  :", round(r2,3))
print("="*12)

Lasso CV
RMSE: 7.149
MAE : 5.484
R²  : 0.023
Ridge CV
RMSE: 7.151
MAE : 5.484
R²  : 0.022
Elastic Net
RMSE: 7.188
MAE : 5.553
R²  : 0.012


##### Even LassoCV, RidgeCv, ElasticNet didn't perform well. Hence, we will move on the Unsupervised learning and use PCA for better results