In [1]:
# Download the data
# !wget https://github.com/alexeygrigorev/datasets/raw/refs/heads/master/jamb_exam_results.csv

In [2]:
#  Import the data
import pandas as pd

df  = pd.read_csv('jamb_exam_results.csv')
df.columns = df.columns.str.lower().str.replace(' ', '_')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 17 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   jamb_score                    5000 non-null   int64  
 1   study_hours_per_week          5000 non-null   int64  
 2   attendance_rate               5000 non-null   int64  
 3   teacher_quality               5000 non-null   int64  
 4   distance_to_school            5000 non-null   float64
 5   school_type                   5000 non-null   object 
 6   school_location               5000 non-null   object 
 7   extra_tutorials               5000 non-null   object 
 8   access_to_learning_materials  5000 non-null   object 
 9   parent_involvement            5000 non-null   object 
 10  it_knowledge                  5000 non-null   object 
 11  student_id                    5000 non-null   int64  
 12  age                           5000 non-null   int64  
 13  gen

In [3]:
# Data preprocessing
df = df.drop('student_id', axis=1)
df['parent_education_level'] = df['parent_education_level'].fillna(0)

X = df.drop('jamb_score', axis=1)
y = df.jamb_score

In [4]:
# Preprocess the data

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import export_text
from sklearn.metrics import mean_squared_error, root_mean_squared_error

# Step 3: Split the data
train_full_df, test_df = train_test_split(df, test_size=0.2, random_state=1)
train_df, val_df = train_test_split(train_full_df, test_size=0.25, random_state=1)

# Separate the target (jamb_score) and features
y_train = train_df['jamb_score'].values
y_val = val_df['jamb_score'].values
y_test = test_df['jamb_score'].values

X_train = train_df.drop(columns=['jamb_score'])
X_val = val_df.drop(columns=['jamb_score'])
X_test = test_df.drop(columns=['jamb_score'])

# Step 4: Convert data into a matrix format using DictVectorizer
dv = DictVectorizer(sparse=False)

train_dict = X_train.to_dict(orient='records')
val_dict = X_val.to_dict(orient='records')
test_dict = X_test.to_dict(orient='records')

x_train = dv.fit_transform(train_dict)
x_val = dv.transform(val_dict)
x_test = dv.transform(test_dict)

In [5]:
question = """
Question 1
Let's train a decision tree regressor to predict the jamb_score variable.

Train a model with max_depth=1.
Which feature is used for splitting the data?

study_hours_per_week | attendance_rate | teacher_quality | distance_to_school
"""

from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, root_mean_squared_error

dt = DecisionTreeRegressor(max_depth=1)
dt.fit(x_train, y_train)

# Identify which feature is used for splitting
from sklearn.tree import export_text

tree_rules = export_text(dt, feature_names=dv.get_feature_names_out())
print(tree_rules)

|--- study_hours_per_week <= 18.50
|   |--- value: [155.24]
|--- study_hours_per_week >  18.50
|   |--- value: [188.59]



In [6]:
question = """
Question 2
Train a random forest model with these parameters:
n_estimators=10
random_state=1
n_jobs=-1 (optional - to make training faster)

What's the RMSE of this model on validation?

22.13  /  42.13  /  62.13  /  82.12
"""

from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor(n_estimators=10, random_state=1, n_jobs=-1)
rf.fit(x_train, y_train)

y_pred = rf.predict(x_val)
rmse = root_mean_squared_error(y_val, y_pred)

print(f"RMSE: {rmse}")

RMSE: 42.13724207871227


In [7]:
question = """
Question 3
Now let's experiment with the n_estimators parameter

Try different values of this parameter from 10 to 200 with step 10.
Set random_state to 1.
Evaluate the model on the validation dataset.
After which value of n_estimators does RMSE stop improving? Consider 3 decimal places for calculating the answer.

10  /  25  /  80  /  200
"""
import numpy as np

n_range = range(10, 201, 10)
rmse_scores = []

for n in n_range:
    rf = RandomForestRegressor(n_estimators=n, random_state=1)
    rf.fit(x_train, y_train)

    # Predict on the validation set
    y_pred_val = rf.predict(x_val)

    # Calculate RMSE
    rmse = np.sqrt(mean_squared_error(y_val, y_pred_val))
    rmse_scores.append((n, rmse))

# Find after which value of n_estimators RMSE stops improving
for n, rmse in rmse_scores:
    print(f"n_estimators={n} --> RMSE={rmse:.3f}")

n_estimators=10 --> RMSE=42.137
n_estimators=20 --> RMSE=41.461
n_estimators=30 --> RMSE=41.106
n_estimators=40 --> RMSE=40.917
n_estimators=50 --> RMSE=40.852
n_estimators=60 --> RMSE=40.784
n_estimators=70 --> RMSE=40.677
n_estimators=80 --> RMSE=40.539
n_estimators=90 --> RMSE=40.504
n_estimators=100 --> RMSE=40.517
n_estimators=110 --> RMSE=40.593
n_estimators=120 --> RMSE=40.625
n_estimators=130 --> RMSE=40.651
n_estimators=140 --> RMSE=40.595
n_estimators=150 --> RMSE=40.597
n_estimators=160 --> RMSE=40.604
n_estimators=170 --> RMSE=40.628
n_estimators=180 --> RMSE=40.641
n_estimators=190 --> RMSE=40.631
n_estimators=200 --> RMSE=40.601


In [16]:
question = """
Question 4
Let's select the best max_depth:

Try different values of max_depth: [10, 15, 20, 25]
For each of these values,
try different values of n_estimators from 10 till 200 (with step 10)
calculate the mean RMSE
Fix the random seed: random_state=1
What's the best max_depth, using the mean RMSE?

10  /  15  /  20  /  25
"""

all_scores = {}

for depth in [10, 15, 20, 25]:
    rmse = []

    for i in range(10, 201, 10):
        rf = RandomForestRegressor(n_estimators=i, max_depth=depth, random_state=1)
        rf.fit(x_train, y_train)
        y_pred = rf.predict(x_val)
        rmse_score = root_mean_squared_error(y_val, y_pred)
        rmse.append(rmse_score)

    all_scores[depth] = rmse

# After the loop, find the best max_depth with the lowest mean RMSE
best_max_depth = min(all_scores, key=all_scores.get)
best_rmse_scores = all_scores[best_max_depth]

print(f"Best max_depth: {best_max_depth}")





Best max_depth: 10


In [11]:
question = """
Question 5
We can extract feature importance information from tree-based models.
In Scikit-Learn, tree-based models contain this information in the feature_importances_ field.
For this homework question, we'll find the most important feature:

Train the model with these parameters:
n_estimators=10,
max_depth=20,
random_state=1,
n_jobs=-1 (optional)
Get the feature importance information from this model
What's the most important feature (among these 4)?

study_hours_per_week / attendance_rate  /  distance_to_school /  teacher_quality
"""

rf = RandomForestRegressor(n_estimators=10, max_depth=20, random_state=1, n_jobs=-1)
rf.fit(x_train, y_train)
importances = rf.feature_importances_    # Get the feature importances

feature_names = X.columns     # Get feature names (assuming your feature names are stored in X_train.columns)

feature_importance_list = sorted(zip(feature_names, importances), reverse=True)
for feature, importance in feature_importance_list:
    print(f"{feature} --> {importance:.4f}")


teacher_quality --> 0.0693
study_hours_per_week --> 0.0123
socioeconomic_status --> 0.0091
school_type --> 0.1497
school_location --> 0.1365
parent_involvement --> 0.0093
parent_education_level --> 0.0000
it_knowledge --> 0.0104
gender --> 0.0124
extra_tutorials --> 0.0135
distance_to_school --> 0.0315
attendance_rate --> 0.0103
assignments_completed --> 0.0155
age --> 0.0177
access_to_learning_materials --> 0.0091


In [12]:
question = """
Question 6
Now let's train an XGBoost model! For this question, we'll tune the eta parameter:
Install XGBoost
Create DMatrix for train and validation
Create a watchlist
Train a model with these parameters for 100 rounds:

xgb_params = {
    'eta': 0.3, 'max_depth': 6,  'min_child_weight': 1,'objective': 'reg:squarederror',
    'nthread': 8,  'seed': 1, 'verbosity': 1 }
"""

import xgboost as xgb

In [13]:
dtrain = xgb.DMatrix(x_train, label=y_train, feature_names=dv.feature_names_)
dval = xgb.DMatrix(x_val, label=y_val, feature_names=dv.feature_names_)

# Create the watchlist for monitoring
watchlist = [(dtrain, 'train'), (dval, 'eval')]

In [14]:
xgb_params = {
    'eta': 0.3, 'max_depth': 6,
    'min_child_weight': 1,
    'objective': 'reg:squarederror',
    'nthread': 8,'seed': 1,'verbosity': 1
}

model = xgb.train(xgb_params, dtrain, num_boost_round=100,evals=watchlist, verbose_eval=10)

[0]	train-rmse:42.69552	eval-rmse:44.86028
[10]	train-rmse:31.55119	eval-rmse:40.83684
[20]	train-rmse:27.24424	eval-rmse:41.80313
[30]	train-rmse:24.12906	eval-rmse:42.28557
[40]	train-rmse:21.45994	eval-rmse:42.77917
[50]	train-rmse:19.30407	eval-rmse:43.11630
[60]	train-rmse:17.42414	eval-rmse:43.40830
[70]	train-rmse:15.74173	eval-rmse:43.77894
[80]	train-rmse:14.28350	eval-rmse:44.08528
[90]	train-rmse:12.90222	eval-rmse:44.30365
[99]	train-rmse:11.56417	eval-rmse:44.43210


In [15]:
xgb_params = {
    'eta': 0.1, 'max_depth': 6,
    'min_child_weight': 1,
    'objective': 'reg:squarederror',
    'nthread': 8,'seed': 1,'verbosity': 1
}

model = xgb.train(xgb_params, dtrain, num_boost_round=100,evals=watchlist, verbose_eval=10)

[0]	train-rmse:45.50072	eval-rmse:46.99373
[10]	train-rmse:37.11353	eval-rmse:41.55631
[20]	train-rmse:33.57997	eval-rmse:40.37859
[30]	train-rmse:31.47315	eval-rmse:40.20963
[40]	train-rmse:29.89807	eval-rmse:40.15747
[50]	train-rmse:28.58793	eval-rmse:40.28533
[60]	train-rmse:27.26360	eval-rmse:40.55054
[70]	train-rmse:26.05959	eval-rmse:40.73555
[80]	train-rmse:25.13835	eval-rmse:40.82813
[90]	train-rmse:23.93958	eval-rmse:40.89645
[99]	train-rmse:23.14487	eval-rmse:41.04335
