In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from joblib import dump

# Load the new dataset
file_path = 'C:/Users/Sumit Singh Chauhan/Desktop/AIML Study Material/student_health_data.csv'  # Uploaded dataset path
df = pd.read_csv(file_path)

# Inspect the data
print("Dataset Preview:")
print(df.head())

# Drop rows with missing values
df = df.dropna()

# Identify the target variable and features
# Update these columns based on your dataset structure
target_column = 'Stress_Level_Biosensor'
drop_columns = ['Sleep_Quality', 'Blood_Pressure_Diastolic'] 

X = df.drop(columns=drop_columns + [target_column])
y = df[target_column]

# Standardize numerical features
scaler = StandardScaler()
X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Define regressors
regressors = {
    "Random Forest": RandomForestRegressor(random_state=42),
    "Gradient Boosting": GradientBoostingRegressor(random_state=42),
    "SVM": SVR()
}
kf = KFold(n_splits=5, shuffle=True, random_state=42)

results = {}  # Store results for each regressor

for name, model in regressors.items():
    # Perform K-Fold cross-validation
    cv_scores = cross_val_score(model, X_train, y_train, cv=kf, scoring='neg_mean_squared_error')
    
    # Train the model on the full training set
    model.fit(X_train, y_train)
    
    # Predict on the test set
    y_pred = model.predict(X_test)
    
    # Calculate evaluation metrics
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    
    results[name] = r2  # Store R² score for later
    
    print(f"{name} Regressor:")
    print(f"  Cross-Validation MSE: {-cv_scores.mean():.4f}")
    print(f"  Test MSE: {mse:.4f}")
    print(f"  Test MAE: {mae:.4f}")
    print(f"  Test R^2: {r2:.4f}\n")

    # Plot true vs predicted values
    plt.figure(figsize=(8, 6))
    plt.scatter(y_test, y_pred, alpha=0.7)
    plt.plot([min(y_test), max(y_test)], [min(y_test), max(y_test)], 'r--')
    plt.title(f'{name}: True vs Predicted Values')
    plt.xlabel('True Values')
    plt.ylabel('Predicted Values')
    plt.show()

# Save the best model and scaler for deployment
best_model_name = max(results, key=results.get)
best_model = regressors[best_model_name]
dump(best_model, f'{best_model_name.lower().replace(" ", "_")}_model.pkl')
dump(scaler, 'scaler.pkl')
print("Models and scaler saved.")


Dataset Preview:
   Student_ID  Age Gender  Heart_Rate  Blood_Pressure_Systolic  \
0           1   24      M   50.663217               122.173015   
1           2   21      F   57.926042               110.778407   
2           3   22      M   59.294219               109.375673   
3           4   24      M   76.826232               125.142227   
4           5   20      M   68.342769               107.515592   

   Blood_Pressure_Diastolic  Stress_Level_Biosensor  Stress_Level_Self_Report  \
0                 84.419860                3.137350                  9.028669   
1                 75.696145                3.699078                  5.819697   
2                 83.803814                6.785156                  5.892360   
3                 78.091587                6.408509                  6.884001   
4                 80.674937                7.264719                  4.483450   

  Physical_Activity Sleep_Quality      Mood  Study_Hours  Project_Hours  \
0              High     

ValueError: could not convert string to float: 'M'