<p style="font-weight:600; font-size:30px; color:yellow; padding-left:8px;">
    <b>Model Training and Evaluation</b>
</p>

<h2 style="color:cyan; font-weight:bold;">📑 Table of Contents</h2>

<ol style="font-size:18px; line-height:1.8; color:white;">
  <li><a href="#1-libraries" style="text-decoration:none; color:yellow;">Libraries</a></li>
  <li><a href="#2--data-ingestion" style="text-decoration:none; color:yellow;">Data Ingestion</a></li>
  <li><a href="#3-train-test-split" style="text-decoration:none; color:yellow;">Train Test Split</a></li>
  <li><a href="#4-model-training-and-evaluation" style="text-decoration:none; color:yellow;">Model Training and Evaluation</a></li>   
</ol>


<a id="1-libraries"></a>
<h2 style="color:#FFD700; font-weight:700; font-size:22px;">1. Libraries</h2>


In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, r2_score
import warnings

warnings.filterwarnings('ignore')

<a id="2--data-ingestion"></a>
<h2 style="color:#FFD500; font-weight:700; font-size:22px;">2. Data Ingestion</h2>



In [2]:
# Read the dataset from the notebook's working directory (the file name
# suggests it has already been cleaned and scaled upstream).
df = pd.read_csv(filepath_or_buffer='Scaled_Cleaned_Dataset.csv')
df  # bare last expression -> rich preview of the loaded frame

Unnamed: 0,Gender,Age,Sleep Duration,Physical Activity Level,Stress Level,BMI Category,Heart Rate,Daily Steps,Sleep Disorder,Systolic_BP,Diastolic_BP,Quality_of_Sleep
0,1,-1.753096,-1.298887,-0.825418,0.347021,3,1.654719,-1.619584,1,-0.330002,-0.330002,6
1,1,-1.637643,-1.173036,0.039844,1.475592,0,1.170474,1.970077,1,-0.459239,-0.459239,6
2,1,-1.637643,-1.173036,0.039844,1.475592,0,1.170474,1.970077,1,-0.459239,-0.459239,6
3,1,-1.637643,-1.550588,-1.402260,1.475592,2,3.591698,-2.362273,2,1.479309,1.479309,4
4,1,-1.637643,-1.550588,-1.402260,1.475592,2,3.591698,-2.362273,2,1.479309,1.479309,4
...,...,...,...,...,...,...,...,...,...,...,...,...
369,0,1.941401,1.218127,0.760896,-1.345836,3,-0.524383,0.113356,2,1.479309,1.479309,9
370,0,1.941401,1.092276,0.760896,-1.345836,3,-0.524383,0.113356,2,1.479309,1.479309,9
371,0,1.941401,1.218127,0.760896,-1.345836,3,-0.524383,0.113356,2,1.479309,1.479309,9
372,0,1.941401,1.218127,0.760896,-1.345836,3,-0.524383,0.113356,2,1.479309,1.479309,9


In [3]:
# Work on an independent (deep) copy so the loaded frame `df` stays untouched.
data = df.copy(deep=True)
data  # preview the working copy

Unnamed: 0,Gender,Age,Sleep Duration,Physical Activity Level,Stress Level,BMI Category,Heart Rate,Daily Steps,Sleep Disorder,Systolic_BP,Diastolic_BP,Quality_of_Sleep
0,1,-1.753096,-1.298887,-0.825418,0.347021,3,1.654719,-1.619584,1,-0.330002,-0.330002,6
1,1,-1.637643,-1.173036,0.039844,1.475592,0,1.170474,1.970077,1,-0.459239,-0.459239,6
2,1,-1.637643,-1.173036,0.039844,1.475592,0,1.170474,1.970077,1,-0.459239,-0.459239,6
3,1,-1.637643,-1.550588,-1.402260,1.475592,2,3.591698,-2.362273,2,1.479309,1.479309,4
4,1,-1.637643,-1.550588,-1.402260,1.475592,2,3.591698,-2.362273,2,1.479309,1.479309,4
...,...,...,...,...,...,...,...,...,...,...,...,...
369,0,1.941401,1.218127,0.760896,-1.345836,3,-0.524383,0.113356,2,1.479309,1.479309,9
370,0,1.941401,1.092276,0.760896,-1.345836,3,-0.524383,0.113356,2,1.479309,1.479309,9
371,0,1.941401,1.218127,0.760896,-1.345836,3,-0.524383,0.113356,2,1.479309,1.479309,9
372,0,1.941401,1.218127,0.760896,-1.345836,3,-0.524383,0.113356,2,1.479309,1.479309,9


In [4]:
# Separate the features from the target by column *name* rather than by
# position, so that reordering or appending columns upstream cannot silently
# change which column is being predicted. A missing target column now fails
# loudly with a KeyError instead of training on the wrong variable.
TARGET_COLUMN = "Quality_of_Sleep"

X = data.drop(columns=TARGET_COLUMN)  # every remaining column is a feature
y = data[TARGET_COLUMN]               # target Series

<a id="3-train-test-split"></a>
<h2 style="color:#FFD500; font-weight:700; font-size:22px;">3. Train Test Split</h2>


In [5]:
from sklearn.model_selection import train_test_split

In [6]:
# Hold out 25% of the rows for evaluation; the fixed seed keeps the split
# identical across re-runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=1,
)

In [7]:
# Sanity check: (rows, columns) of the training and test feature matrices.
print(f"{X_train.shape} {X_test.shape}")

(280, 11) (94, 11)


<a id="4-model-training-and-evaluation"></a>
<h2 style="color:#FFD500; font-weight:700; font-size:22px;">4. Model Training and Evaluation</h2>


In [8]:
import joblib
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Candidate regressors, keyed by the label that is used both in the score
# report and in the file name of the persisted model.
models = dict(
    LinearRegression=LinearRegression(),
    DecisionTree=DecisionTreeRegressor(random_state=42),
    RandomForest=RandomForestRegressor(random_state=42),
    GradientBoosting=GradientBoostingRegressor(random_state=42),
    SVR=SVR(),
    KNN=KNeighborsRegressor(),
)

predictions = []  # one test-set prediction vector per model, in `models` order
scores = {}       # label -> {"MSE": ..., "R2": ...}

for name, model in models.items():
    # Fit on the training split, then persist the fitted estimator to disk.
    model.fit(X_train, y_train)
    joblib.dump(model, f"{name}_model.pkl")

    # Predict on the held-out split and keep the vector for the ensemble.
    y_pred = model.predict(X_test)
    predictions.append(y_pred)

    # Per-model test metrics.
    mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)
    scores[name] = dict(MSE=mse, R2=r2)

# Stack the per-model vectors into one 2-D array (one row per model).
predictions = np.asarray(predictions)

# Simple averaging ensemble: unweighted mean across models for each sample.
final_pred = predictions.mean(axis=0)

ensemble_mse = mean_squared_error(y_test, final_pred)
ensemble_r2 = r2_score(y_test, final_pred)

# ---- Report ----------------------------------------------------------------
divider = "---------------------------------------------------------------------"

print(divider)

print("Individual Model Scores:\n")
for name, score in scores.items():
    print("{}: MSE={:.4f}, R2={:.4f}".format(name, score["MSE"], score["R2"]))

print(f"\n{divider}")

print("\nEnsemble Model:")
print("MSE={:.4f}, R2={:.4f}".format(ensemble_mse, ensemble_r2))

print(f"\n{divider}")


---------------------------------------------------------------------
Individual Model Scores:

LinearRegression: MSE=0.0784, R2=0.9541
DecisionTree: MSE=0.0426, R2=0.9751
RandomForest: MSE=0.0254, R2=0.9851
GradientBoosting: MSE=0.0356, R2=0.9791
SVR: MSE=0.0578, R2=0.9662
KNN: MSE=0.0817, R2=0.9521

---------------------------------------------------------------------

Ensemble Model:
MSE=0.0323, R2=0.9811

---------------------------------------------------------------------


In [9]:
# Show the per-model MSE / R2 dictionary at full (unrounded) precision.
scores

{'LinearRegression': {'MSE': 0.0784168377066017, 'R2': 0.9540553558798798},
 'DecisionTree': {'MSE': 0.0425531914893617, 'R2': 0.9750679663152311},
 'RandomForest': {'MSE': 0.025442553191489355, 'R2': 0.9850931370598767},
 'GradientBoosting': {'MSE': 0.035633098029879164, 'R2': 0.97912246839122},
 'SVR': {'MSE': 0.05775476863604765, 'R2': 0.9661613198283856},
 'KNN': {'MSE': 0.08170212765957446, 'R2': 0.9521304953252437}}