## Model Training

#### 1.1 Import Data and Required Packages
##### Importing the Pandas, NumPy, Matplotlib, Seaborn and Warnings libraries.

In [1]:
# Basic Import
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns
# Modelling
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression, Ridge,Lasso
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import RandomizedSearchCV
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
import warnings

#### Import the CSV Data as Pandas DataFrame

In [2]:
# Load the student dataset from the notebook's working directory.
df = pd.read_csv(filepath_or_buffer='stud.csv')

#### Show Top 5 Records

In [3]:
# Preview the first five rows of the raw frame.
df.head(n=5)

Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,math_score,reading_score,writing_score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


#### Preparing X and Y variables

In [4]:
# Feature matrix: every column except the regression target.
X = df.drop(columns=['math_score'])

In [5]:
# Preview the first five rows of the feature matrix.
X.head(n=5)

Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,reading_score,writing_score
0,female,group B,bachelor's degree,standard,none,72,74
1,female,group C,some college,standard,completed,90,88
2,female,group B,master's degree,standard,none,95,93
3,male,group A,associate's degree,free/reduced,none,57,44
4,male,group C,some college,standard,none,78,75


In [6]:
# Show the distinct values of each categorical column, one line per column.
# Each entry pairs the label to print with the column it describes.
category_labels = [
    ("Categories in 'gender' variable:     ", 'gender'),
    ("Categories in 'race_ethnicity' variable:  ", 'race_ethnicity'),
    ("Categories in'parental level of education' variable:", 'parental_level_of_education'),
    ("Categories in 'lunch' variable:     ", 'lunch'),
    ("Categories in 'test preparation course' variable:     ", 'test_preparation_course'),
]

for label, column in category_labels:
    # print() joins its arguments with a single space, matching the previous end=" " form.
    print(label, df[column].unique())

Categories in 'gender' variable:      ['female' 'male']
Categories in 'race_ethnicity' variable:   ['group B' 'group C' 'group A' 'group D' 'group E']
Categories in'parental level of education' variable: ["bachelor's degree" 'some college' "master's degree" "associate's degree"
 'high school' 'some high school']
Categories in 'lunch' variable:      ['standard' 'free/reduced']
Categories in 'test preparation course' variable:      ['none' 'completed']


In [7]:
# Target vector: the math score column.
y = df.loc[:, 'math_score']

In [8]:
# Display the target series (pandas truncates the middle rows in the rendered output).
y

0      72
1      69
2      90
3      47
4      76
       ..
995    88
996    62
997    59
998    68
999    77
Name: math_score, Length: 1000, dtype: int64

In [9]:
# Build a ColumnTransformer with two transformers: one-hot encoding for the
# categorical (object-dtype) columns and standard scaling for the remaining ones.
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

num_features = X.select_dtypes(exclude="object").columns
cat_features = X.select_dtypes(include="object").columns

numeric_transformer = StandardScaler()
# handle_unknown="ignore": a category that was not present when the encoder was
# fitted (e.g. missing from one cross-validation training fold, or supplied at
# prediction time) is encoded as all zeros instead of raising during transform.
# Encodings of known categories are unchanged.
oh_transformer = OneHotEncoder(handle_unknown="ignore")

preprocessor = ColumnTransformer(
    [
        ("OneHotEncoder", oh_transformer, cat_features),
        ("StandardScaler", numeric_transformer, num_features),
    ]
)

In [10]:
# X = preprocessor.fit_transform(X)

In [11]:
# Split features and target into training and test partitions.
from sklearn.model_selection import train_test_split

# 20% of the rows are held out for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
(X_train.shape, X_test.shape)

((800, 7), (200, 7))

#### Create an evaluation function that returns all metrics after model training

In [12]:
def evaluate_model(true, predicted):
    """Compute regression metrics for a set of predictions.

    Parameters
    ----------
    true : array-like
        Ground-truth target values.
    predicted : array-like
        Predicted values, aligned element-wise with ``true``.

    Returns
    -------
    tuple
        ``(mae, rmse, r2_square)``: mean absolute error, root mean squared
        error and the R^2 score, in that order.
    """
    mae = mean_absolute_error(true, predicted)
    # RMSE is the square root of the MSE; the MSE itself is not returned.
    rmse = np.sqrt(mean_squared_error(true, predicted))
    r2_square = r2_score(true, predicted)
    return mae, rmse, r2_square

In [13]:
# from sklearn.pipeline import Pipeline
# from sklearn.model_selection import RandomizedSearchCV
# from sklearn.base import clone
# from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
# from sklearn.preprocessing import StandardScaler
# from sklearn.compose import ColumnTransformer
# from sklearn.preprocessing import OneHotEncoder
# import pandas as pd
# import numpy as np

# # Import regressors
# from sklearn.linear_model import LinearRegression, Ridge, Lasso
# from sklearn.neighbors import KNeighborsRegressor
# from sklearn.tree import DecisionTreeRegressor
# from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
# from xgboost import XGBRegressor
# from catboost import CatBoostRegressor

# def evaluate_model(y_true, y_pred):
#     mae = mean_absolute_error(y_true, y_pred)
#     rmse = np.sqrt(mean_squared_error(y_true, y_pred))
#     r2 = r2_score(y_true, y_pred)
#     return mae, rmse, r2

# def train_models_with_pipeline(X_train, y_train, X_test, y_test, preprocessor=None):
#     """
#     Train multiple regression models with optional hyperparameter tuning
    
#     Parameters:
#     - X_train, y_train: Training data
#     - X_test, y_test: Test data
#     - preprocessor: Optional preprocessor for pipeline (e.g., StandardScaler, ColumnTransformer)
#     """
    
#     # If no preprocessor is provided, create a simple StandardScaler
#     if preprocessor is None:
#         preprocessor = StandardScaler()
    
#     # Define models
#     models = {
#         "Linear Regression": LinearRegression(),
#         "Lasso": Lasso(),
#         "Ridge": Ridge(),
#         "K-Neighbors Regressor": KNeighborsRegressor(),
#         "Decision Tree": DecisionTreeRegressor(),
#         "Random Forest Regressor": RandomForestRegressor(),
#         "XGBRegressor": XGBRegressor(verbosity=0),
#         "CatBoosting Regressor": CatBoostRegressor(verbose=False),
#         "AdaBoost Regressor": AdaBoostRegressor()
#     }
    
#     # Define hyperparameter grids for RandomizedSearchCV
#     param_grids = {
#         "Lasso": {
#             'regressor__alpha': [0.001, 0.01, 0.1, 1.0, 10.0, 100.0]
#         },
#         "Ridge": {
#             'regressor__alpha': [0.001, 0.01, 0.1, 1.0, 10.0, 100.0]
#         },
#         "K-Neighbors Regressor": {
#             'regressor__n_neighbors': [3, 5, 7, 9, 11, 15],
#             'regressor__weights': ['uniform', 'distance'],
#             'regressor__p': [1, 2]
#         },
#         "Decision Tree": {
#             'regressor__max_depth': [None, 5, 10, 15, 20],
#             'regressor__min_samples_split': [2, 5, 10],
#             'regressor__min_samples_leaf': [1, 2, 4]
#         },
#         "Random Forest Regressor": {
#             'regressor__n_estimators': [50, 100, 200],
#             'regressor__max_depth': [None, 10, 20, 30],
#             'regressor__min_samples_split': [2, 5, 10],
#             'regressor__min_samples_leaf': [1, 2, 4]
#         },
#         "XGBRegressor": {
#             'regressor__n_estimators': [100, 200, 300],
#             'regressor__learning_rate': [0.01, 0.1, 0.3],
#             'regressor__max_depth': [3, 5, 7],
#             'regressor__subsample': [0.7, 0.8, 1.0]
#         },
#         "CatBoosting Regressor": {
#             'regressor__iterations': [100, 200, 300],
#             'regressor__learning_rate': [0.01, 0.1, 0.3],
#             'regressor__depth': [4, 6, 8]
#         },
#         "AdaBoost Regressor": {
#             'regressor__n_estimators': [50, 100, 200],
#             'regressor__learning_rate': [0.01, 0.1, 1.0]
#         }
#     }
    
#     # Store results
#     model_list = []
#     r2_list = []
#     results = {
#         'model': [],
#         'train_mae': [],
#         'train_rmse': [],
#         'train_r2': [],
#         'test_mae': [],
#         'test_rmse': [],
#         'test_r2': [],
#         'best_params': []
#     }
    
#     best_models = {}
    
#     for name, model in models.items():
#         print(f"\n{'='*50}")
#         print(f"Training: {name}")
#         print('='*50)
        
#         # Create pipeline
#         pipeline = Pipeline([
#             ('preprocessor', preprocessor),
#             ('regressor', model)
#         ])
        
#         # Check if hyperparameter tuning is available for this model
#         if name in param_grids:
#             print(f"→ Performing RandomizedSearchCV for {name}...")
            
#             # Perform RandomizedSearchCV
#             random_search = RandomizedSearchCV(
#                 estimator=pipeline,
#                 param_distributions=param_grids[name],
#                 n_iter=20,  # Number of parameter settings sampled
#                 cv=5,       # 5-fold cross-validation
#                 scoring='r2',
#                 n_jobs=-1,
#                 random_state=42,
#                 verbose=0
#             )
            
#             # Fit the random search
#             random_search.fit(X_train, y_train)
            
#             # Get the best model
#             best_pipeline = random_search.best_estimator_
#             best_params = random_search.best_params_
            
#             print(f"✅ Best parameters: {best_params}")
#         else:
#             # For models without hyperparameter tuning, just fit the pipeline
#             pipeline.fit(X_train, y_train)
#             best_pipeline = pipeline
#             best_params = {}
        
#         # Make predictions
#         y_train_pred = best_pipeline.predict(X_train)
#         y_test_pred = best_pipeline.predict(X_test)
        
#         # Evaluate Train and Test dataset
#         model_train_mae, model_train_rmse, model_train_r2 = evaluate_model(y_train, y_train_pred)
#         model_test_mae, model_test_rmse, model_test_r2 = evaluate_model(y_test, y_test_pred)
        
#         # Store results
#         model_list.append(name)
#         r2_list.append(model_test_r2)
#         best_models[name] = best_pipeline
        
#         results['model'].append(name)
#         results['train_mae'].append(model_train_mae)
#         results['train_rmse'].append(model_train_rmse)
#         results['train_r2'].append(model_train_r2)
#         results['test_mae'].append(model_test_mae)
#         results['test_rmse'].append(model_test_rmse)
#         results['test_r2'].append(model_test_r2)
#         results['best_params'].append(best_params)
        
#         # Print results
#         print('\nModel performance for Training set')
#         print(f"- Root Mean Squared Error: {model_train_rmse:.4f}")
#         print(f"- Mean Absolute Error: {model_train_mae:.4f}")
#         print(f"- R2 Score: {model_train_r2:.4f}")
        
#         print('----------------------------------')
        
#         print('Model performance for Test set')
#         print(f"- Root Mean Squared Error: {model_test_rmse:.4f}")
#         print(f"- Mean Absolute Error: {model_test_mae:.4f}")
#         print(f"- R2 Score: {model_test_r2:.4f}")
    
#     # Create results DataFrame
#     results_df = pd.DataFrame(results)
#     results_df = results_df.sort_values('test_r2', ascending=False)
    
#     print("\n" + "="*50)
#     print("SUMMARY - Model Performance Ranking (by Test R2)")
#     print("="*50)
#     print(results_df[['model', 'test_r2', 'test_rmse', 'test_mae']].to_string(index=False))
    
#     # Find best model
#     best_model_name = results_df.iloc[0]['model']
#     print(f"\n🏆 Best Model: {best_model_name} with R2 Score: {results_df.iloc[0]['test_r2']:.4f}")
    
#     return results_df, best_models

# # Example usage:
# # If you have numerical and categorical features, you can create a preprocessor like this:
# def create_preprocessor(numerical_features, categorical_features):
#     """
#     Create a preprocessor for numerical and categorical features
#     """
#     numerical_transformer = StandardScaler()
#     categorical_transformer = OneHotEncoder(drop='first', sparse_output=False)
    
#     preprocessor = ColumnTransformer(
#         transformers=[
#             ('num', numerical_transformer, numerical_features),
#             ('cat', categorical_transformer, categorical_features)
#         ])
    
#     return preprocessor

# results_df, best_models = train_models_with_pipeline(X_train, y_train, X_test, y_test)


### Results

In [14]:
# This new block replaces the code in cells [14] and [15]
#
# It trains every candidate regressor inside a preprocessing pipeline, ranks the
# fitted pipelines by test-set R2, and writes three artifacts to saved_models/:
# the ranking CSV, the pickled best pipeline, and a JSON description of the features.

# Import necessary libraries
import json
import os

import joblib
import numpy as np
import pandas as pd
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import ParameterGrid, RandomizedSearchCV
from sklearn.pipeline import Pipeline

# Create the output directory before anything is written into it: DataFrame.to_csv
# does not create missing parent directories, so the first write would otherwise
# fail on a fresh checkout.
os.makedirs('saved_models', exist_ok=True)

# --- 1. Define Models and Hyperparameter Grids ---
# The scikit-learn tree/ensemble estimators are seeded so repeated runs give the same ranking.
models = {
    "Linear Regression": LinearRegression(),
    "Lasso": Lasso(),
    "Ridge": Ridge(),
    "K-Neighbors Regressor": KNeighborsRegressor(),
    "Decision Tree": DecisionTreeRegressor(random_state=42),
    "Random Forest Regressor": RandomForestRegressor(random_state=42),
    "XGBRegressor": XGBRegressor(verbosity=0),
    "CatBoosting Regressor": CatBoostRegressor(verbose=False),
    "AdaBoost Regressor": AdaBoostRegressor(random_state=42)
}

# Keys are prefixed with 'regressor__' because each model is the 'regressor' step of the pipeline.
param_grids = {
    "Lasso": {'regressor__alpha': [0.001, 0.01, 0.1, 1.0, 10.0]},
    "Ridge": {'regressor__alpha': [0.001, 0.01, 0.1, 1.0, 10.0]},
    "K-Neighbors Regressor": {'regressor__n_neighbors': [3, 5, 7, 9, 15], 'regressor__weights': ['uniform', 'distance']},
    "Decision Tree": {'regressor__max_depth': [None, 5, 10, 20], 'regressor__min_samples_split': [2, 5, 10]},
    "Random Forest Regressor": {'regressor__n_estimators': [50, 100, 200], 'regressor__max_depth': [None, 10, 20]},
    "XGBRegressor": {'regressor__n_estimators': [100, 200], 'regressor__learning_rate': [0.01, 0.1, 0.3], 'regressor__max_depth': [3, 5, 7]},
    "CatBoosting Regressor": {'regressor__iterations': [100, 200], 'regressor__learning_rate': [0.01, 0.1], 'regressor__depth': [4, 6, 8]},
    "AdaBoost Regressor": {'regressor__n_estimators': [50, 100], 'regressor__learning_rate': [0.01, 0.1, 1.0]}
}

# --- 2. Correct Training Loop ---
# This loop builds a full pipeline with the *correct preprocessor* from cell [9]
results = []
best_models = {}

for name, model in models.items():
    print("==================================================")
    print(f"Training: {name}")

    # Create the full, correct pipeline
    full_pipeline = Pipeline([
        ('preprocessor', preprocessor), # The ColumnTransformer from cell [9]
        ('regressor', model)
    ])

    # Use RandomizedSearchCV if a parameter grid is defined
    if name in param_grids:
        print(f"→ Performing RandomizedSearchCV for {name}...")
        search_space = param_grids[name]
        # Never ask for more candidates than the grid contains; scikit-learn
        # would otherwise warn and fall back to the grid size on its own.
        n_candidates = min(20, len(ParameterGrid(search_space)))
        random_search = RandomizedSearchCV(
            estimator=full_pipeline,
            param_distributions=search_space,
            n_iter=n_candidates, cv=5, scoring='r2', n_jobs=-1, random_state=42, error_score='raise'
        )
        # Fit on the raw training data (X_train is a DataFrame here)
        random_search.fit(X_train, y_train)
        best_pipeline = random_search.best_estimator_
        print(f"✅ Best parameters: {random_search.best_params_}")
    else:
        # Fit the pipeline directly for models without hyperparameter tuning
        # (Pipeline.fit returns the fitted pipeline itself).
        best_pipeline = full_pipeline.fit(X_train, y_train)

    # Evaluate the best pipeline on the test set
    y_test_pred = best_pipeline.predict(X_test)

    # Calculate metrics
    r2 = r2_score(y_test, y_test_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
    mae = mean_absolute_error(y_test, y_test_pred)

    print('Model performance for Test set')
    print(f"- R2 Score: {r2:.4f}")
    print(f"- RMSE: {rmse:.4f}")
    print(f"- MAE: {mae:.4f}")

    # Store results and the final, correct pipeline object
    results.append({'model': name, 'test_r2': r2, 'test_rmse': rmse, 'test_mae': mae})
    best_models[name] = best_pipeline

# --- 3. Rank Models and Save the Best One ---
# NOTE: the ranking uses the held-out test set, so the winner's reported score is
# an optimistic estimate of its performance on new data.
results_df = pd.DataFrame(results).sort_values('test_r2', ascending=False)

print("\n" + "="*50)
print("SUMMARY - Model Performance Ranking (by Test R2)")
print("="*50)
print(results_df.to_string(index=False))

# Persist the ranking table (the directory was created at the top of this cell).
results_df.to_csv('saved_models/model_results.csv', index=False)

# Get the best model: the first row after sorting by descending test R2.
best_model_name = results_df.iloc[0]['model']
best_model_pipeline = best_models[best_model_name]

print(f"\n🏆 Best Model: {best_model_name} with R2 Score: {results_df.iloc[0]['test_r2']:.4f}")

# Save the single, complete, and correct best model pipeline
model_filename = 'saved_models/best_model_pipeline.pkl'
joblib.dump(best_model_pipeline, model_filename)

print(f"\n✅ Best model pipeline saved as: {model_filename}")
print("This single file now contains everything needed for prediction.")

# --- 4. Save Feature Information ---
print("\n" + "="*50)
print("Saving feature information for the Streamlit app...")

# Dynamically get feature names and their unique values from the original X dataframe
categorical_features = X.select_dtypes(include='object').columns.tolist()
numerical_features = X.select_dtypes(include=np.number).columns.tolist()

# Sorted so the saved option lists have a stable order.
categorical_values = {col: sorted(X[col].unique().tolist()) for col in categorical_features}

# Create the feature_info dictionary
feature_info = {
    'numerical_features': numerical_features,
    'categorical_features': categorical_features,
    'categorical_values': categorical_values,
    'target_variable': 'math_score' # The actual target variable
}

# Save the feature_info dictionary as a JSON file
feature_info_filename = 'saved_models/feature_info.json'
with open(feature_info_filename, 'w') as f:
    json.dump(feature_info, f, indent=4)

print(f"✅ Feature information saved as: {feature_info_filename}")

Training: Linear Regression
Model performance for Test set
- R2 Score: 0.8804
- RMSE: 5.3940
- MAE: 4.2148
Training: Lasso
→ Performing RandomizedSearchCV for Lasso...




✅ Best parameters: {'regressor__alpha': 0.001}
Model performance for Test set
- R2 Score: 0.8805
- RMSE: 5.3930
- MAE: 4.2137
Training: Ridge
→ Performing RandomizedSearchCV for Ridge...
✅ Best parameters: {'regressor__alpha': 1.0}
Model performance for Test set
- R2 Score: 0.8806
- RMSE: 5.3904
- MAE: 4.2111
Training: K-Neighbors Regressor
→ Performing RandomizedSearchCV for K-Neighbors Regressor...




✅ Best parameters: {'regressor__weights': 'distance', 'regressor__n_neighbors': 15}
Model performance for Test set
- R2 Score: 0.7907
- RMSE: 7.1362
- MAE: 5.5406
Training: Decision Tree
→ Performing RandomizedSearchCV for Decision Tree...




✅ Best parameters: {'regressor__min_samples_split': 2, 'regressor__max_depth': 5}
Model performance for Test set
- R2 Score: 0.8242
- RMSE: 6.5400
- MAE: 4.9315
Training: Random Forest Regressor
→ Performing RandomizedSearchCV for Random Forest Regressor...




✅ Best parameters: {'regressor__n_estimators': 200, 'regressor__max_depth': 10}
Model performance for Test set
- R2 Score: 0.8528
- RMSE: 5.9844
- MAE: 4.6256
Training: XGBRegressor
→ Performing RandomizedSearchCV for XGBRegressor...




✅ Best parameters: {'regressor__n_estimators': 100, 'regressor__max_depth': 3, 'regressor__learning_rate': 0.1}
Model performance for Test set
- R2 Score: 0.8683
- RMSE: 5.6611
- MAE: 4.3732
Training: CatBoosting Regressor
→ Performing RandomizedSearchCV for CatBoosting Regressor...




✅ Best parameters: {'regressor__learning_rate': 0.1, 'regressor__iterations': 200, 'regressor__depth': 4}
Model performance for Test set
- R2 Score: 0.8729
- RMSE: 5.5620
- MAE: 4.2930
Training: AdaBoost Regressor
→ Performing RandomizedSearchCV for AdaBoost Regressor...




✅ Best parameters: {'regressor__n_estimators': 100, 'regressor__learning_rate': 1.0}
Model performance for Test set
- R2 Score: 0.8539
- RMSE: 5.9632
- MAE: 4.6352

SUMMARY - Model Performance Ranking (by Test R2)
                  model  test_r2  test_rmse  test_mae
                  Ridge 0.880592   5.390418  4.211113
                  Lasso 0.880477   5.393004  4.213680
      Linear Regression 0.880433   5.393994  4.214763
  CatBoosting Regressor 0.872868   5.562026  4.293007
           XGBRegressor 0.868297   5.661136  4.373195
     AdaBoost Regressor 0.853867   5.963196  4.635230
Random Forest Regressor 0.852829   5.984350  4.625578
          Decision Tree 0.824230   6.540001  4.931523
  K-Neighbors Regressor 0.790719   7.136248  5.540594

🏆 Best Model: Ridge with R2 Score: 0.8806

✅ Best model pipeline saved as: saved_models/best_model_pipeline.pkl
This single file now contains everything needed for prediction.

Saving feature information for the Streamlit app...
✅ Feature inform