## Model Training

#### 1.1 Import Data and Required Packages
##### Importing Pandas, Numpy, Matplotlib, Seaborn and Warings Library.

In [187]:
# Basic Import
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns
# Modelling
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression, Ridge,Lasso
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import RandomizedSearchCV
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
from sklearn.neural_network import MLPRegressor
import warnings

#### Import the CSV Data as Pandas DataFrame

In [188]:
df = pd.read_csv('../../data/preproccedData/Augmented_PreProccedCommonParameters.csv')

In [189]:
df.head()

Unnamed: 0,Age,Gender,Height,Weight,Waist_Circumference,Family_History,Blood_Pressure,Cholesterol_Lipid_Levels,Thirst,Fatigue,Urination,Vision Changes,BMI,DiabetesRisk,RiskLevel
0,24,1,163.641009,49.662909,29.456176,0,0,0.0,0.0,0.001007,0.015439,0.003209,17.716927,30.774236,0.996751
1,24,1,161.668433,54.650443,29.447969,0,0,0.0,0.0,0.0,0.0,0.900428,19.968942,32.703477,0.993202
2,28,0,149.916813,47.691071,27.526779,0,0,0.0,0.0,0.0,0.002154,0.0,20.254781,28.756909,0.0
3,24,0,159.721617,55.654345,30.44506,0,1,0.0,0.015828,0.008413,0.0,0.89923,20.815496,37.04964,0.987855
4,22,0,150.911548,45.721399,27.489003,0,0,0.0,0.0,0.0,0.0,0.013653,19.167236,24.567752,0.0


#### Preparing X and Y variables

In [190]:
X = df.drop(columns=['DiabetesRisk'],axis=1)

In [191]:
X.head()

Unnamed: 0,Age,Gender,Height,Weight,Waist_Circumference,Family_History,Blood_Pressure,Cholesterol_Lipid_Levels,Thirst,Fatigue,Urination,Vision Changes,BMI,RiskLevel
0,24,1,163.641009,49.662909,29.456176,0,0,0.0,0.0,0.001007,0.015439,0.003209,17.716927,0.996751
1,24,1,161.668433,54.650443,29.447969,0,0,0.0,0.0,0.0,0.0,0.900428,19.968942,0.993202
2,28,0,149.916813,47.691071,27.526779,0,0,0.0,0.0,0.0,0.002154,0.0,20.254781,0.0
3,24,0,159.721617,55.654345,30.44506,0,1,0.0,0.015828,0.008413,0.0,0.89923,20.815496,0.987855
4,22,0,150.911548,45.721399,27.489003,0,0,0.0,0.0,0.0,0.0,0.013653,19.167236,0.0


In [194]:
y = df['DiabetesRisk']

In [195]:
y

0       30.774236
1       32.703477
2       28.756909
3       37.049640
4       24.567752
          ...    
2062    66.542625
2063    73.623166
2064    58.735971
2065    52.574598
2066    63.154794
Name: DiabetesRisk, Length: 2067, dtype: float64

In [196]:
num_features = X.select_dtypes(exclude="object").columns
cat_features = X.select_dtypes(include="object").columns

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

numeric_transformer = StandardScaler()
oh_transformer = OneHotEncoder()

preprocessor = ColumnTransformer(
    [
        ("OneHotEncoder", oh_transformer, cat_features),
         ("StandardScaler", numeric_transformer, num_features),        
    ]
)

In [197]:
X = preprocessor.fit_transform(X)

In [198]:
df

Unnamed: 0,Age,Gender,Height,Weight,Waist_Circumference,Family_History,Blood_Pressure,Cholesterol_Lipid_Levels,Thirst,Fatigue,Urination,Vision Changes,BMI,DiabetesRisk,RiskLevel
0,24,1,163.641009,49.662909,29.456176,0,0,0.000000,0.000000,0.001007,0.015439,0.003209,17.716927,30.774236,0.996751
1,24,1,161.668433,54.650443,29.447969,0,0,0.000000,0.000000,0.000000,0.000000,0.900428,19.968942,32.703477,0.993202
2,28,0,149.916813,47.691071,27.526779,0,0,0.000000,0.000000,0.000000,0.002154,0.000000,20.254781,28.756909,0.000000
3,24,0,159.721617,55.654345,30.445060,0,1,0.000000,0.015828,0.008413,0.000000,0.899230,20.815496,37.049640,0.987855
4,22,0,150.911548,45.721399,27.489003,0,0,0.000000,0.000000,0.000000,0.000000,0.013653,19.167236,24.567752,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2062,48,0,171.329665,99.517244,31.532740,0,1,0.948361,0.000000,0.014228,0.000000,1.039971,40.236245,66.542625,0.918635
2063,41,0,198.111903,105.273142,37.417691,0,0,0.931158,0.008401,0.000674,0.010095,0.005622,31.818514,73.623166,1.852258
2064,44,1,188.476713,87.079244,32.513203,0,0,0.005306,0.000000,0.000000,0.011730,1.038978,29.096864,58.735971,0.929787
2065,42,1,190.630613,81.613144,31.517074,0,0,0.000000,0.006990,0.000000,0.012997,0.000000,26.660086,52.574598,0.918645


In [199]:
X.shape

(2067, 14)

In [200]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=42)
X_train.shape, X_test.shape

((1653, 14), (414, 14))

In [201]:
df.head

<bound method NDFrame.head of       Age  Gender      Height      Weight  Waist_Circumference  \
0      24       1  163.641009   49.662909            29.456176   
1      24       1  161.668433   54.650443            29.447969   
2      28       0  149.916813   47.691071            27.526779   
3      24       0  159.721617   55.654345            30.445060   
4      22       0  150.911548   45.721399            27.489003   
...   ...     ...         ...         ...                  ...   
2062   48       0  171.329665   99.517244            31.532740   
2063   41       0  198.111903  105.273142            37.417691   
2064   44       1  188.476713   87.079244            32.513203   
2065   42       1  190.630613   81.613144            31.517074   
2066   45       1  189.551247   80.574808            34.475794   

      Family_History  Blood_Pressure  Cholesterol_Lipid_Levels    Thirst  \
0                  0               0                  0.000000  0.000000   
1                  0     

In [202]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer

# Define the preprocessing for numerical and categorical features
numerical_features = ['Age', 'Height', 'Weight', 'Waist_Circumference', 'Blood_Pressure', 'Cholesterol_Lipid_Levels']
categorical_features = ['Gender', 'Family_History', 'Thirst']

# Create a numerical transformer (standardize and impute missing values)
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),  # Impute missing values with the median
    ('scaler', StandardScaler())  # Scale numerical features
])

# Create a categorical transformer (encode and impute missing values)
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),  # Impute missing values with the most frequent category
    ('encoder', OneHotEncoder(handle_unknown='ignore'))  # Encode categorical variables
])

# Combine both transformations in a ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ]
)

# Apply the transformations
X_transformed = preprocessor.fit_transform(df)


In [203]:
def evaluate_model(true, predicted):
    mae = mean_absolute_error(true, predicted)
    mse = mean_squared_error(true, predicted)
    rmse = np.sqrt(mean_squared_error(true, predicted))
    r2_square = r2_score(true, predicted)
    return mae, rmse, r2_square

In [204]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Assuming evaluate_model() function is defined as:
def evaluate_model(y_true, y_pred):
    mae = mean_absolute_error(y_true, y_pred)
    rmse = mean_squared_error(y_true, y_pred, squared=False)
    r2 = r2_score(y_true, y_pred)
    return mae, rmse, r2

models = {
    "MLPRegressor": MLPRegressor(max_iter=500),  # Increasing max_iter for convergence
    "Linear Regression": LinearRegression(),
    "Lasso": Lasso(),
    "Ridge": Ridge(),
    "K-Neighbors Regressor": KNeighborsRegressor(),
    "Decision Tree": DecisionTreeRegressor(),
    "Random Forest Regressor": RandomForestRegressor(),
    "XGBRegressor": XGBRegressor(),
    "CatBoosting Regressor": CatBoostRegressor(verbose=False),
    "AdaBoost Regressor": AdaBoostRegressor(),
}

model_list = []
r2_list_train = []
r2_list_test = []

# Split the data (assuming X and y are your features and target variables)
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

for model_name, model in models.items():
    print(f"Training {model_name}...")

    # Train the model
    model.fit(X_train, y_train)
    
    # Make predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
    
    # Evaluate Train and Test dataset
    model_train_mae, model_train_rmse, model_train_r2 = evaluate_model(y_train, y_train_pred)
    model_test_mae, model_test_rmse, model_test_r2 = evaluate_model(y_test, y_test_pred)
    
    # Print model performance
    print(f"Model performance for {model_name} (Training set):")
    print(f"- RMSE: {model_train_rmse:.4f}")
    print(f"- MAE: {model_train_mae:.4f}")
    print(f"- R2: {model_train_r2:.4f}")
    print("-" * 40)
    
    print(f"Model performance for {model_name} (Test set):")
    print(f"- RMSE: {model_test_rmse:.4f}")
    print(f"- MAE: {model_test_mae:.4f}")
    print(f"- R2: {model_test_r2:.4f}")
    print("=" * 40)
    
    # Store model performance
    model_list.append(model_name)
    r2_list_train.append(model_train_r2)
    r2_list_test.append(model_test_r2)

# Create a DataFrame for comparison (optional)
import pandas as pd
model_performance_df = pd.DataFrame({
    'Model': model_list,
    'Train R2': r2_list_train,
    'Test R2': r2_list_test
})

print(model_performance_df)


Training MLPRegressor...




Model performance for MLPRegressor (Training set):
- RMSE: 2.4600
- MAE: 1.9576
- R2: 0.9748
----------------------------------------
Model performance for MLPRegressor (Test set):
- RMSE: 2.9021
- MAE: 2.3356
- R2: 0.9668
Training Linear Regression...
Model performance for Linear Regression (Training set):
- RMSE: 5.1628
- MAE: 3.7291
- R2: 0.8890
----------------------------------------
Model performance for Linear Regression (Test set):
- RMSE: 5.2697
- MAE: 3.8827
- R2: 0.8907
Training Lasso...
Model performance for Lasso (Training set):
- RMSE: 5.4392
- MAE: 3.9608
- R2: 0.8768
----------------------------------------
Model performance for Lasso (Test set):
- RMSE: 5.4063
- MAE: 4.0025
- R2: 0.8850
Training Ridge...




Model performance for Ridge (Training set):
- RMSE: 5.1628
- MAE: 3.7294
- R2: 0.8890
----------------------------------------
Model performance for Ridge (Test set):
- RMSE: 5.2693
- MAE: 3.8830
- R2: 0.8907
Training K-Neighbors Regressor...
Model performance for K-Neighbors Regressor (Training set):
- RMSE: 3.7734
- MAE: 2.7181
- R2: 0.9407
----------------------------------------
Model performance for K-Neighbors Regressor (Test set):
- RMSE: 4.5973
- MAE: 3.3486
- R2: 0.9168
Training Decision Tree...




Model performance for Decision Tree (Training set):
- RMSE: 0.0000
- MAE: 0.0000
- R2: 1.0000
----------------------------------------
Model performance for Decision Tree (Test set):
- RMSE: 2.9118
- MAE: 1.5750
- R2: 0.9666
Training Random Forest Regressor...




Model performance for Random Forest Regressor (Training set):
- RMSE: 0.7036
- MAE: 0.4320
- R2: 0.9979
----------------------------------------
Model performance for Random Forest Regressor (Test set):
- RMSE: 1.6435
- MAE: 1.0628
- R2: 0.9894
Training XGBRegressor...




Model performance for XGBRegressor (Training set):
- RMSE: 0.1121
- MAE: 0.0791
- R2: 0.9999
----------------------------------------
Model performance for XGBRegressor (Test set):
- RMSE: 1.5304
- MAE: 0.9770
- R2: 0.9908
Training CatBoosting Regressor...




Model performance for CatBoosting Regressor (Training set):
- RMSE: 0.4708
- MAE: 0.3516
- R2: 0.9991
----------------------------------------
Model performance for CatBoosting Regressor (Test set):
- RMSE: 1.1252
- MAE: 0.7865
- R2: 0.9950
Training AdaBoost Regressor...
Model performance for AdaBoost Regressor (Training set):
- RMSE: 3.8172
- MAE: 3.1041
- R2: 0.9393
----------------------------------------
Model performance for AdaBoost Regressor (Test set):
- RMSE: 3.8189
- MAE: 3.1148
- R2: 0.9426
                     Model  Train R2   Test R2
0             MLPRegressor  0.974804  0.966847
1        Linear Regression  0.889028  0.890689
2                    Lasso  0.876827  0.884951
3                    Ridge  0.889028  0.890709
4    K-Neighbors Regressor  0.940719  0.916805
5            Decision Tree  1.000000  0.966626
6  Random Forest Regressor  0.997939  0.989368
7             XGBRegressor  0.999948  0.990780
8    CatBoosting Regressor  0.999077  0.995016
9       AdaBoost Regres



In [205]:
lin_model = LinearRegression(fit_intercept=True)
lin_model = lin_model.fit(X_train, y_train)
y_pred = lin_model.predict(X_test)
score = r2_score(y_test, y_pred)*100
print(" Accuracy of the model is %.2f" %score)

 Accuracy of the model is 89.07
