In [78]:
import pandas as pd

from sklearn.preprocessing import OneHotEncoder,StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LinearRegression,Ridge,Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from catboost import CatBoostRegressor
from xgboost import XGBRegressor

from sklearn.metrics import mean_squared_error,mean_absolute_error,r2_score

import warnings
warnings.filterwarnings('ignore')

In [79]:
df=pd.read_csv("data/data_cleaned.csv")
df.head()

Unnamed: 0,Age,Gender,Region,Family_Income,Family_History_Diabetes,Parent_Diabetes_Type,Genetic_Risk_Score,BMI,Physical_Activity_Level,Dietary_Habits,...,Smoking,Alcohol_Consumption,Fasting_Blood_Sugar,HbA1c,Cholesterol_Level,Prediabetes,Diabetes_Type,Sleep_Hours,Stress_Level,Screen_Time
0,21,Male,North,2209393,No,,6,31.4,Sedentary,Moderate,...,Yes,No,95.6,9.5,163.3,Yes,,7.7,7,6.8
1,18,Female,Central,387650,No,,5,24.4,Active,Unhealthy,...,No,No,164.9,5.0,169.1,Yes,,7.9,8,6.0
2,25,Male,North,383333,No,,6,20.0,Moderate,Moderate,...,No,No,110.5,8.3,296.3,Yes,Type 1,7.6,8,4.6
3,22,Male,Northeast,2443733,No,,4,39.8,Moderate,Unhealthy,...,No,Yes,160.7,4.6,252.8,No,,9.5,2,10.9
4,19,Male,Central,1449463,No,,4,19.2,Moderate,Moderate,...,No,Yes,73.7,5.3,252.3,No,,6.4,2,1.3


### Divide Independant and Dependant Features

In [80]:
X=df.drop(columns=['Family_Income'],axis=1)
X.head()

Unnamed: 0,Age,Gender,Region,Family_History_Diabetes,Parent_Diabetes_Type,Genetic_Risk_Score,BMI,Physical_Activity_Level,Dietary_Habits,Fast_Food_Intake,Smoking,Alcohol_Consumption,Fasting_Blood_Sugar,HbA1c,Cholesterol_Level,Prediabetes,Diabetes_Type,Sleep_Hours,Stress_Level,Screen_Time
0,21,Male,North,No,,6,31.4,Sedentary,Moderate,1,Yes,No,95.6,9.5,163.3,Yes,,7.7,7,6.8
1,18,Female,Central,No,,5,24.4,Active,Unhealthy,5,No,No,164.9,5.0,169.1,Yes,,7.9,8,6.0
2,25,Male,North,No,,6,20.0,Moderate,Moderate,2,No,No,110.5,8.3,296.3,Yes,Type 1,7.6,8,4.6
3,22,Male,Northeast,No,,4,39.8,Moderate,Unhealthy,4,No,Yes,160.7,4.6,252.8,No,,9.5,2,10.9
4,19,Male,Central,No,,4,19.2,Moderate,Moderate,0,No,Yes,73.7,5.3,252.3,No,,6.4,2,1.3


In [81]:
y=df['Family_Income']

In [82]:
num_features=X.select_dtypes(exclude='object').columns
cat_features=X.select_dtypes(include='object').columns

numeric_transformer=StandardScaler()
oh_transformer=OneHotEncoder()

preprocessor=ColumnTransformer(
    [
        ("OneHotEncoder",oh_transformer,cat_features),
        ("StandaredScaler",numeric_transformer,num_features)
    ]
)

In [83]:
X=preprocessor.fit_transform(X)

### Train Test Split

In [84]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.25,random_state=27)
X_train.shape,X_test.shape,y_train.shape,y_test.shape

((75000, 39), (25000, 39), (75000,), (25000,))

### Model Evaluation Function

In [85]:
def evaluate_model(true,predicted):
    mse=mean_squared_error(true,predicted)
    mae=mean_absolute_error(true,predicted)
    #rmse=np.sqrt(mse)
    rsquared=r2_score(true,predicted)
    return mse,mae,rsquared

In [86]:
print(len(y_train), len(y_train_pred))  # Should be equal
print(len(y_test), len(y_test_predict))  # Should also be equal

75000 75000
25000 25000


In [87]:
models={
    "Linear Regression":LinearRegression(),
    "Lasso":Lasso(),
    "Ridge":Ridge(),
    "K-Nearest Neighbour":KNeighborsRegressor(),
    "Decision Tree":DecisionTreeRegressor(),
    "Random Forest Regressor":RandomForestRegressor(),
    "XGBoost Regressor":XGBRegressor(),
    "Cat Boost regressor":CatBoostRegressor(),
    "Ada Boost Regressor":AdaBoostRegressor()
}

model_list=[]
r2_list=[]

for i in range(len(list(models))):
    model=list(models.values())[i]
    model.fit(X_train,y_train)
    
    # Make Predictions
    y_train_pred=model.predict(X_train)
    y_test_pred=model.predict(X_test)
    
    # Evaluate Train and Test dataset
    model_train_mae , model_train_rmse, model_train_r2 = evaluate_model(y_train, y_train_pred)
    model_test_mae , model_test_rmse, model_test_r2 = evaluate_model(y_test, y_test_pred)

    
    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])
    
    print('Model performance for Training set')
    print("- Root Mean Squared Error: {:.4f}".format(model_train_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_train_mae))
    print("- R2 Score: {:.4f}".format(model_train_r2))

    print('----------------------------------')
    
    print('Model performance for Test set')
    print("- Root Mean Squared Error: {:.4f}".format(model_test_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_test_mae))
    print("- R2 Score: {:.4f}".format(model_test_r2))
    r2_list.append(model_test_r2)
    
    print('='*35)
    print('\n')

Linear Regression
Model performance for Training set
- Root Mean Squared Error: 598767.0211
- Mean Absolute Error: 478392382391.0748
- R2 Score: 0.0007
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 598894.9607
- Mean Absolute Error: 479306910233.9035
- R2 Score: -0.0009


Lasso
Model performance for Training set
- Root Mean Squared Error: 598767.0936
- Mean Absolute Error: 478392382522.3835
- R2 Score: 0.0007
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 598894.8166
- Mean Absolute Error: 479306497872.9915
- R2 Score: -0.0009


Ridge
Model performance for Training set
- Root Mean Squared Error: 598767.0288
- Mean Absolute Error: 478392382393.4647
- R2 Score: 0.0007
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 598894.9328
- Mean Absolute Error: 479306849782.6308
- R2 Score: -0.0009


K-Nearest Neighbour
Model performance for Training set
- Root Me

In [88]:
pd.DataFrame(list(zip(model_list, r2_list)), columns=['Model Name', 'rsquared']).sort_values(by=["rsquared"],ascending=False)

Unnamed: 0,Model Name,rsquared
8,Ada Boost Regressor,-9.7e-05
1,Lasso,-0.000879
2,Ridge,-0.00088
0,Linear Regression,-0.00088
7,Cat Boost regressor,-0.013827
5,Random Forest Regressor,-0.015466
6,XGBoost Regressor,-0.036733
3,K-Nearest Neighbour,-0.20442
4,Decision Tree,-1.034142
