imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline

import os
import glob
import pickle

inspecting and splitting the dataset for training, validation, and testing

In [2]:
# Gather every CSV under data/. glob returns paths in arbitrary (filesystem) order,
# so sort them: the concatenated row order feeds the seeded train/val/test split,
# and an unstable order would make the splits differ between machines.
data = sorted(glob.glob(os.path.join('data/', '*.csv')))
if not data:
    # pd.concat([]) would fail with a far less helpful "No objects to concatenate".
    raise FileNotFoundError("No CSV files found in 'data/'")

# Stack all files into one frame with a fresh 0..n-1 index.
df = pd.concat([pd.read_csv(f) for f in data], ignore_index=True)

In [3]:
# Column that is predicted, and the columns used to predict it.
target_col = 'marks'
feature_cols = ['campus', 'branch', 'year', 'difficulty']

# Split the loaded frame into the target vector and the feature matrix.
y = df[target_col]
X = df[feature_cols]

In [4]:
# First cut: hold out 30 % of the rows; the other 70 % become the training split.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y,
    test_size=0.3,
    random_state=42,
)

# Second cut: halve the held-out rows into validation and test (about 15 % each overall).
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp,
    test_size=0.5,
    random_state=42,
)

encoding string values

In [5]:
# Columns grouped by the treatment they receive.
categorical_cols = ['campus', 'branch']
numeric_cols = ['year', 'difficulty']

# handle_unknown='ignore': a category unseen during fit is encoded as all zeros
# at transform time instead of raising.
one_hot = OneHotEncoder(handle_unknown='ignore')

# One transformer that one-hot encodes the categorical columns and
# standardises the numeric ones.
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', one_hot, categorical_cols),
        ('num', StandardScaler(), numeric_cols),
    ]
)

sanity check

In [6]:
# Report the row count of each split and its share of the full dataset.
print("Dataset Split:")
total_rows = len(X)
for label, part in (("Train", X_train), ("Val", X_val), ("Test", X_test)):
    share = len(part) / total_rows * 100
    print(f"{label}: {len(part)} ({share:.1f}%)")

Dataset Split:
Train: 333 (69.8%)
Val: 72 (15.1%)
Test: 72 (15.1%)


feature scaling and encoding (fitted on the training split only), followed by polynomial feature expansion

In [7]:
# Fit the preprocessor on the training rows only, then apply that same fitted
# transformer to the held-out splits so no validation/test statistics leak in.
X_train_scaled = preprocessor.fit_transform(X_train)
X_val_scaled, X_test_scaled = (
    preprocessor.transform(held_out) for held_out in (X_val, X_test)
)

In [8]:
# Degree of the polynomial expansion applied to the preprocessed features.
poly_degree = 2

# include_bias=False: no constant column is generated by the expansion.
poly = PolynomialFeatures(degree=poly_degree, include_bias=False)

# Fit the expansion on the training matrix, then reuse it for the other splits.
X_train_poly = poly.fit_transform(X_train_scaled)
X_val_poly, X_test_poly = (
    poly.transform(matrix) for matrix in (X_val_scaled, X_test_scaled)
)

evaluation

In [9]:
def helper(model, X_train, y_train, X_val, y_val, X_test, y_test, model_name):
    """Fit ``model`` on the training split and score it on all three splits.

    Parameters
    ----------
    model
        Estimator exposing ``fit(X, y)`` and ``predict(X)``. It is fitted in
        place, so the caller's object is the trained model afterwards.
    X_train, y_train
        Features and targets used for fitting (and for the "Train" metrics).
    X_val, y_val
        Validation features and targets.
    X_test, y_test
        Test features and targets.
    model_name
        Label stored under the ``'Model'`` key of the returned metrics.

    Returns
    -------
    tuple
        ``(metrics, y_test_pred)`` where ``metrics`` is a dict holding the
        model name plus RMSE, MAE and R² for the Train, Val and Test splits
        (in that key order), and ``y_test_pred`` is the prediction for
        ``X_test``. The fitted model itself is *not* returned.
    """
    model.fit(X_train, y_train)

    metrics = {'Model': model_name}
    splits = (
        ('Train', X_train, y_train),
        ('Val', X_val, y_val),
        ('Test', X_test, y_test),
    )
    for split_name, features, target in splits:
        predicted = model.predict(features)
        metrics[f'{split_name} RMSE'] = np.sqrt(mean_squared_error(target, predicted))
        metrics[f'{split_name} MAE'] = mean_absolute_error(target, predicted)
        metrics[f'{split_name} R²'] = r2_score(target, predicted)

    # The test split is processed last, so `predicted` now holds its predictions.
    y_test_pred = predicted
    return metrics, y_test_pred

In [10]:
# Accumulators for the experiments below:
#   results        - one metrics dict per evaluated model
#   trained_models - lookup keyed by model name
results, trained_models = [], {}

experimenting with multiple approaches until I find the best fit

In [11]:
print("1. Linear Regression")
model = LinearRegression()
# helper() fits `model` in place and returns (metrics, test-set predictions);
# the second value is NOT a model, so name it for what it is.
metrics, y_test_pred = helper(
    model, X_train_scaled, y_train, X_val_scaled, y_val, X_test_scaled, y_test,
    'Linear Regression'
)
results.append(metrics)
# Keep the fitted estimator so it can be retrieved by name later.
trained_models['Linear Regression'] = model

1. Linear Regression


In [12]:
print("2. Polynomial Regression")
# Ordinary least squares on the polynomially expanded feature matrices.
model = LinearRegression()
# helper() fits `model` in place and returns (metrics, test-set predictions);
# the second value is NOT a model, so name it for what it is.
metrics, y_test_pred = helper(
    model, X_train_poly, y_train, X_val_poly, y_val, X_test_poly, y_test,
    'Polynomial Regression'
)
results.append(metrics)
# Keep the fitted estimator so it can be retrieved by name later.
trained_models['Polynomial Regression'] = model

2. Polynomial Regression


In [13]:
print("3. Ridge Regression")
model = Ridge(alpha=1.0)
# helper() fits `model` in place and returns (metrics, test-set predictions);
# the second value is NOT a model, so name it for what it is.
metrics, y_test_pred = helper(
    model, X_train_scaled, y_train, X_val_scaled, y_val, X_test_scaled, y_test,
    'Ridge Regression'
)
results.append(metrics)
# Keep the fitted estimator so it can be retrieved by name later.
trained_models['Ridge Regression'] = model

3. Ridge Regression


In [14]:
print("4. Random Forest")
# random_state is fixed so the forest is reproducible between runs.
model = RandomForestRegressor(n_estimators=100, random_state=42, max_depth=10)
# helper() fits `model` in place and returns (metrics, test-set predictions);
# the second value is NOT a model, so name it for what it is.
metrics, y_test_pred = helper(
    model, X_train_scaled, y_train, X_val_scaled, y_val, X_test_scaled, y_test,
    'Random Forest'
)
results.append(metrics)
# Keep the fitted estimator so it can be retrieved by name later.
trained_models['Random Forest'] = model

4. Random Forest


In [15]:
print("5. Gradient Boosting")
# random_state is fixed so the boosted ensemble is reproducible between runs.
model = GradientBoostingRegressor(n_estimators=100, random_state=42, max_depth=5)
# helper() fits `model` in place and returns (metrics, test-set predictions);
# the second value is NOT a model, so name it for what it is.
metrics, y_test_pred = helper(
    model, X_train_scaled, y_train, X_val_scaled, y_val, X_test_scaled, y_test,
    'Gradient Boosting'
)
results.append(metrics)
# Keep the fitted estimator so it can be retrieved by name later.
trained_models['Gradient Boosting'] = model

5. Gradient Boosting


In [16]:
# One row per evaluated model, one column per metric.
df_results = pd.DataFrame(data=results)

# Plain-text rendering without the index column, preceded by a blank line.
results_table = df_results.to_string(index=False)
print("\n", results_table)


                 Model  Train RMSE  Train MAE  Train R²  Val RMSE   Val MAE   Val R²  Test RMSE  Test MAE  Test R²
    Linear Regression   12.535890   9.779213  0.906524 11.494217  8.211321 0.940016  14.385330 10.926739 0.815649
Polynomial Regression    7.147673   4.415263  0.969611  8.592816  5.600017 0.966477   8.372327  5.626499 0.937555
     Ridge Regression   12.615799   9.758744  0.905329 11.467311  8.181189 0.940297  14.367475 10.796203 0.816106
        Random Forest    7.400827   5.818015  0.967420 15.869891 12.255066 0.885653  12.514723  9.610524 0.860476
    Gradient Boosting    3.363969   2.211537  0.993269 10.970191  7.641533 0.945361   9.794576  7.218929 0.914537


visualization time

In [17]:
# Four-panel comparison of the evaluated models, written to disk.
fig, axes = plt.subplots(2, 2, figsize=(15, 12))

# Top row: test-set metrics only, one horizontal bar per model.
for ax, column, xlabel, color in (
    (axes[0, 0], 'Test RMSE', 'Test RMSE', 'steelblue'),
    (axes[0, 1], 'Test R²', 'Test R² Score', 'forestgreen'),
):
    ax.barh(df_results['Model'], df_results[column], color=color, alpha=0.8)
    ax.set_xlabel(xlabel)
    ax.set_title(f'{column} by Model')
    ax.invert_yaxis()  # first model in the table ends up at the top

# Bottom row: grouped bars comparing Train / Val / Test for each model.
x = np.arange(len(df_results))
width = 0.25
for ax, metric, ylabel in (
    (axes[1, 0], 'RMSE', 'RMSE'),
    (axes[1, 1], 'R²', 'R² Score'),
):
    # Shift each split's bars by one bar-width so the three sit side by side.
    for offset, split in zip((-width, 0, width), ('Train', 'Val', 'Test')):
        ax.bar(x + offset, df_results[f'{split} {metric}'], width, label=split, alpha=0.8)
    ax.set_xlabel('Model')
    ax.set_ylabel(ylabel)
    ax.set_title(f'{metric} Comparison: Train vs Val vs Test')
    ax.set_xticks(x)
    ax.set_xticklabels(df_results['Model'], rotation=45, ha='right')
    ax.legend()

fig.tight_layout()
# savefig does not create missing directories; make sure the target exists.
os.makedirs('plots', exist_ok=True)
fig.savefig('plots/model_results.png', dpi=150)
# Close the figure explicitly: it is only wanted on disk, not rendered inline.
plt.close(fig)

saving the best approach from all the experimentation

In [18]:
# Refit the chosen approach (polynomial regression) as a single Pipeline so the
# feature expansion and the regressor are persisted together.
# Reuse `poly_degree` from the evaluation step instead of a second hard-coded 2,
# so the saved model cannot silently drift from the configuration that was scored.
model = Pipeline([
    ('poly', PolynomialFeatures(degree=poly_degree, include_bias=False)),
    ('regressor', LinearRegression())
])
# The pipeline is trained on the already-preprocessed matrix, so at inference
# time the fitted `preprocessor` must be applied before calling model.predict.
model.fit(X_train_scaled, y_train)

0,1,2
,steps,"[('poly', ...), ('regressor', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,degree,2
,interaction_only,False
,include_bias,False
,order,'C'

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [19]:
# Persist the fitted regression pipeline and the fitted preprocessor. Both are
# needed at inference time: raw rows -> preprocessor.transform -> model.predict.
# open(..., 'wb') does not create missing directories, so ensure the target exists.
os.makedirs('models', exist_ok=True)

with open('models/model.pkl', 'wb') as f:
    pickle.dump(model, f)

# Despite the file name, this is the whole ColumnTransformer
# (one-hot encoder + standard scaler), not just a scaler.
with open('models/scaler.pkl', 'wb') as f:
    pickle.dump(preprocessor, f)