## Homework 6
 - Decision Trees and Ensemble Learning

In [71]:
import pandas as pd

## Getting the dataset and preparing it

In [72]:
# Location of the JAMB exam results CSV (GitHub-hosted dataset).
data = 'https://github.com/alexeygrigorev/datasets/raw/refs/heads/master/jamb_exam_results.csv'

In [73]:
!curl -o jamb_exam_results.csv $data

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed

  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
  0     0    0     0    0     0      0      0 --:--:--  0:00:01 --:--:--     0
  0     0    0     0    0     0      0      0 --:--:--  0:00:01 --:--:--     0


In [74]:
# Load the dataset straight from the URL stored in `data`.
df = pd.read_csv(filepath_or_buffer=data)

In [75]:
# Normalise the column names: lower-case them and turn spaces into underscores.
df.columns = (
    df.columns
    .str.lower()
    .str.replace(' ', '_')
)

In [76]:
# Preview the first five rows.
df.head(n=5)

Unnamed: 0,jamb_score,study_hours_per_week,attendance_rate,teacher_quality,distance_to_school,school_type,school_location,extra_tutorials,access_to_learning_materials,parent_involvement,it_knowledge,student_id,age,gender,socioeconomic_status,parent_education_level,assignments_completed
0,192,22,78,4,12.4,Public,Urban,Yes,Yes,High,Medium,1,17,Male,Low,Tertiary,2
1,207,14,88,4,2.7,Public,Rural,No,Yes,High,High,2,15,Male,High,,1
2,182,29,87,2,9.6,Public,Rural,Yes,Yes,High,Medium,3,20,Female,High,Tertiary,2
3,210,29,99,2,2.6,Public,Urban,No,Yes,Medium,High,4,22,Female,Medium,Tertiary,1
4,199,12,98,3,8.8,Public,Urban,No,Yes,Medium,Medium,5,22,Female,Medium,Tertiary,1


In [89]:
# Show the dtype of every column (object columns still need encoding).
df.dtypes

jamb_score                        int64
study_hours_per_week              int64
attendance_rate                   int64
teacher_quality                   int64
distance_to_school              float64
school_type                       int64
school_location                  object
extra_tutorials                  object
access_to_learning_materials     object
parent_involvement               object
it_knowledge                     object
age                               int64
gender                           object
socioeconomic_status             object
parent_education_level           object
assignments_completed             int64
dtype: object

In [97]:
# Count how often each parent education level occurs.
df['parent_education_level'].value_counts()

parent_education_level
Secondary    1556
Primary      1335
Tertiary     1218
0             891
Name: count, dtype: int64

In [77]:
# student_id is just a row identifier, so remove it from the feature set.
df = df.drop(columns=['student_id'])

In [98]:
from sklearn.preprocessing import LabelEncoder

# Fill missing values *before* encoding. get_dummies ignores NaN by default, so
# with drop_first=True a row with a missing category would otherwise become
# indistinguishable from a row holding the dropped first category.
df = df.fillna(0)

# school_type is loaded as text, but the models trained below need numeric
# input. Label-encode it; the guard keeps the cell safe if the column has
# already been converted.
if not pd.api.types.is_numeric_dtype(df['school_type']):
    df['school_type'] = LabelEncoder().fit_transform(df['school_type'])

# One-hot encode the remaining categorical columns, dropping the first level of
# each so the dummy columns are not redundant.
categorical_columns = [
    'school_location', 'extra_tutorials', 'access_to_learning_materials',
    'parent_involvement', 'it_knowledge', 'gender', 'socioeconomic_status',
    'parent_education_level'
]

df = pd.get_dummies(df, columns=categorical_columns, drop_first=True)

In [99]:
# Replace any remaining missing values with 0.
df = df.fillna(value=0)

In [100]:
# Sanity check: number of missing values per column (expected to be all zero).
df.isna().sum()

jamb_score                          0
study_hours_per_week                0
attendance_rate                     0
teacher_quality                     0
distance_to_school                  0
school_type                         0
age                                 0
assignments_completed               0
school_location_Urban               0
extra_tutorials_Yes                 0
access_to_learning_materials_Yes    0
parent_involvement_Low              0
parent_involvement_Medium           0
it_knowledge_Low                    0
it_knowledge_Medium                 0
gender_Male                         0
socioeconomic_status_Low            0
socioeconomic_status_Medium         0
parent_education_level_Primary      0
parent_education_level_Secondary    0
parent_education_level_Tertiary     0
dtype: int64

## Question 1
Let's train a decision tree regressor to predict the jamb_score variable.
 - Train a model with max_depth=1.

Which feature is used for splitting the data?

In [146]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.feature_extraction import DictVectorizer

# 60/20/20 split: hold out 20% for test, then take 25% of the remaining 80%
# (i.e. 20% of the total) for validation.
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=1)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=1)

# Pull the target 'jamb_score' out of each split as a NumPy array ...
y_train, y_val, y_test = (
    part['jamb_score'].values for part in (df_train, df_val, df_test)
)

# ... and keep only the feature columns in the frames themselves.
df_train, df_val, df_test = (
    part.drop(columns='jamb_score') for part in (df_train, df_val, df_test)
)


In [102]:
# Convert each row to a {column: value} dict and vectorise it.
dv = DictVectorizer(sparse=True)

train_dict = df_train.to_dict(orient='records')
X_train = dv.fit_transform(train_dict)

# The validation rows must go through the vectoriser that was fitted on the
# training data: use transform(), not fit_transform(), so the feature mapping
# learned from the training set is not overwritten.
val_dict = df_val.to_dict(orient='records')
X_val = dv.transform(val_dict)

In [103]:
# A depth-1 tree (a single split) is enough to see which feature is used first.
dt = DecisionTreeRegressor(
    random_state=1,
    max_depth=1,
)
dt.fit(X=X_train, y=y_train)

In [104]:
# tree_.feature holds, per node, the index of the feature that node splits on;
# node 0 is the root. (Despite its name, `feature_importance` is that index.)
root_node = 0
feature_importance = dt.tree_.feature[root_node]

# Translate the index back into a column name via the vectoriser's vocabulary.
feature_name = dv.feature_names_[feature_importance]
print("Feature used for splitting:", feature_name)

Feature used for splitting: study_hours_per_week


## Question 2
 Train a random forest regressor with these parameters:
  - n_estimators=10
  - random_state=1
  - n_jobs=-1 (optional - to make training faster)

What's the RMSE of this model on the validation data?

In [105]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import numpy as np

In [106]:
# Check the distribution of the encoded school_type column.
df['school_type'].value_counts()

school_type
1    3735
0    1265
Name: count, dtype: int64

In [107]:
# Baseline random forest: 10 trees, fixed seed, all CPU cores.
rf = RandomForestRegressor(
    n_estimators=10,
    random_state=1,
    n_jobs=-1,
)

In [108]:
# Fit the forest on the training features and target.
rf.fit(X=df_train, y=y_train)

In [109]:
# Predict jamb_score for the validation rows.
y_pred = rf.predict(X=df_val)

In [110]:
# Mean squared error of the validation predictions.
mse = mean_squared_error(y_true=y_val, y_pred=y_pred)

In [113]:
# RMSE is the square root of the MSE computed in the previous cell.
rmse = np.sqrt(mse)
print(f'RMSE: {rmse}')

RMSE: 42.71064984286706


## Question 3
Now let's experiment with the n_estimators parameter
 - Try different values of this parameter from 10 to 200 with step 10.
 - Set random_state to 1.
 - Evaluate the model on the validation dataset.

After which value of n_estimators does RMSE stop improving? Consider 3 decimal places for calculating the answer.

In [114]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import numpy as np

In [119]:
# Candidate forest sizes: 10, 20, ..., 200 (upper bound inclusive).
n_estimators_range = range(10, 200 + 1, 10)

In [120]:
# Validation RMSE for each forest size in n_estimators_range.
rmse_values = []

for n in n_estimators_range:
    rf = RandomForestRegressor(n_estimators=n, random_state=1, n_jobs=-1)
    rf.fit(df_train, y_train)

    y_pred_val = rf.predict(df_val)

    rmse = np.sqrt(mean_squared_error(y_val, y_pred_val))
    rmse_values.append(rmse)

# Report to 3 decimal places, as the question asks. Iterate over the same
# range used for training so every RMSE is paired with its n_estimators value.
for n, rmse in zip(n_estimators_range, rmse_values):
    print(f"n_estimators={n}: RMSE={rmse:.3f}")

## Question 4
Let's select the best max_depth:
 - Try different values of max_depth: [10, 15, 20, 25]
 - For each of these values,
   - try different values of n_estimators from 10 to 200 (with step 10)
   - calculate the mean RMSE
 - Fix the random seed: random_state=1
   
What's the best max_depth, using the mean RMSE?

In [125]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import numpy as np

In [126]:
# Maps each max_depth value to its mean validation RMSE (filled in below).
mean_rmse_by_depth = dict()

In [132]:
# For every max_depth, average the validation RMSE over
# n_estimators = 10, 20, ..., 200.
for max_depth in [10, 15, 20, 25]:
    rmse_list = []

    for n_estimators in range(10, 201, 10):
        rf = RandomForestRegressor(
            n_estimators=n_estimators,
            max_depth=max_depth,
            random_state=1,
            n_jobs=-1
        )

        rf.fit(df_train, y_train)

        y_pred = rf.predict(df_val)

        # Take the square root of the MSE explicitly instead of passing
        # squared=False: that argument is deprecated and absent from newer
        # scikit-learn releases, and np.sqrt matches the earlier cells.
        rmse = np.sqrt(mean_squared_error(y_val, y_pred))
        rmse_list.append(rmse)

    mean_rmse_by_depth[max_depth] = np.mean(rmse_list)

# Identify the max_depth with the lowest mean RMSE
best_max_depth = min(mean_rmse_by_depth, key=mean_rmse_by_depth.get)
print("Best max_depth:", best_max_depth)
print("Mean RMSE values by depth:", mean_rmse_by_depth)




Best max_depth: 10
Mean RMSE values by depth: {10: np.float64(40.84626775939854), 15: np.float64(41.14050546304692), 20: np.float64(41.14969405550623), 25: np.float64(41.14308591478132)}




## Question 5

For this homework question, we'll find the most important feature:
 - Train the model with these parameters:
   - n_estimators=10,
   - max_depth=20,
   - random_state=1,
   - n_jobs=-1 (optional)
 - Get the feature importance information from this model

What's the most important feature (among these 4)?

In [133]:
from sklearn.ensemble import RandomForestRegressor
import pandas as pd

In [134]:
# Model for the feature-importance question: 10 trees, depth capped at 20.
rf_settings = dict(n_estimators=10, max_depth=20, random_state=1, n_jobs=-1)
rf = RandomForestRegressor(**rf_settings)

In [135]:
# Fit on the training split; importances are read from this fitted model.
rf.fit(X=df_train, y=y_train)

In [137]:
# One importance value per training column, in the column order of df_train.
feature_importances = rf.feature_importances_

In [138]:
# Map feature names to their importance values
features = ['study_hours_per_week', 'attendance_rate', 'distance_to_school', 'teacher_quality']
importance_dict = dict(zip(features, feature_importances))

In [139]:
# Find the most important feature
most_important_feature = max(importance_dict, key=importance_dict.get)

print("Feature Importances:", importance_dict)
print("Most Important Feature:", most_important_feature)

Feature Importances: {'study_hours_per_week': np.float64(0.24768623216170282), 'attendance_rate': np.float64(0.1532353163390145), 'distance_to_school': np.float64(0.08999694643740938), 'teacher_quality': np.float64(0.14606468111501192)}
Most Important Feature: study_hours_per_week


## Question 6
 - Now let's train an XGBoost model! For this question, we'll tune the eta parameter:
   - Install XGBoost
   - Create DMatrix for train and validation
   - Create a watchlist
   - Train a model with these parameters for 100 rounds:

Which eta leads to the best RMSE score on the validation dataset?

In [140]:
import xgboost as xgb

In [141]:
# Wrap the train / validation features and targets in XGBoost's DMatrix format.
dtrain = xgb.DMatrix(data=df_train, label=y_train)
dval = xgb.DMatrix(data=df_val, label=y_val)

In [142]:
# Datasets whose RMSE is reported after every boosting round.
watchlist = [
    (dtrain, 'train'),
    (dval, 'eval'),
]

In [143]:
# Booster settings for the first run; eta is switched to 0.1 in a later cell
# for comparison.
xgb_params = dict(
    eta=0.3,
    max_depth=6,
    min_child_weight=1,
    objective='reg:squarederror',
    nthread=8,
    seed=1,
    verbosity=1,
)

# Train for 100 rounds, logging train / eval RMSE each round.
model_0_3 = xgb.train(
    params=xgb_params,
    dtrain=dtrain,
    num_boost_round=100,
    evals=watchlist,
)


[0]	train-rmse:42.78831	eval-rmse:44.95753
[1]	train-rmse:40.08410	eval-rmse:42.76326
[2]	train-rmse:38.12516	eval-rmse:41.67294
[3]	train-rmse:36.81284	eval-rmse:41.17275
[4]	train-rmse:35.75232	eval-rmse:40.94766
[5]	train-rmse:34.94044	eval-rmse:40.82090
[6]	train-rmse:34.03303	eval-rmse:40.50850
[7]	train-rmse:33.48451	eval-rmse:40.42920
[8]	train-rmse:32.80749	eval-rmse:40.55814
[9]	train-rmse:32.03266	eval-rmse:40.45150
[10]	train-rmse:31.43684	eval-rmse:40.50210
[11]	train-rmse:30.91739	eval-rmse:40.57134
[12]	train-rmse:30.66449	eval-rmse:40.62269
[13]	train-rmse:30.35380	eval-rmse:40.63890
[14]	train-rmse:29.84232	eval-rmse:40.73470
[15]	train-rmse:29.29614	eval-rmse:40.70167
[16]	train-rmse:29.03259	eval-rmse:40.76230
[17]	train-rmse:28.39362	eval-rmse:40.75085
[18]	train-rmse:28.24640	eval-rmse:40.78001
[19]	train-rmse:27.95836	eval-rmse:40.87794
[20]	train-rmse:27.58339	eval-rmse:40.99888
[21]	train-rmse:27.14693	eval-rmse:41.07116
[22]	train-rmse:26.64486	eval-rmse:41.0961

In [144]:
# Evaluate the eta=0.3 model on the validation DMatrix.
rmse_0_3 = model_0_3.eval(data=dval)
print("RMSE for eta=0.3:", rmse_0_3)

RMSE for eta=0.3: [0]	eval-rmse:43.74012926609951535


In [145]:
# Re-use the same settings with a smaller learning rate (updates xgb_params in place).
xgb_params.update(eta=0.1)
model_0_1 = xgb.train(
    params=xgb_params,
    dtrain=dtrain,
    num_boost_round=100,
    evals=watchlist,
)

# Get RMSE for eta=0.1
rmse_0_1 = model_0_1.eval(data=dval)
print("RMSE for eta=0.1:", rmse_0_1)


[0]	train-rmse:45.53299	eval-rmse:47.03599
[1]	train-rmse:44.20720	eval-rmse:45.95183
[2]	train-rmse:43.05263	eval-rmse:45.09411
[3]	train-rmse:42.04756	eval-rmse:44.35138
[4]	train-rmse:41.17903	eval-rmse:43.70407
[5]	train-rmse:40.38276	eval-rmse:43.20616
[6]	train-rmse:39.65735	eval-rmse:42.78925
[7]	train-rmse:39.04996	eval-rmse:42.41836
[8]	train-rmse:38.46425	eval-rmse:42.13914
[9]	train-rmse:37.89896	eval-rmse:41.84058
[10]	train-rmse:37.44185	eval-rmse:41.65068
[11]	train-rmse:37.02981	eval-rmse:41.46666
[12]	train-rmse:36.63476	eval-rmse:41.25700
[13]	train-rmse:36.22168	eval-rmse:41.05441
[14]	train-rmse:35.85790	eval-rmse:40.90915
[15]	train-rmse:35.47438	eval-rmse:40.81025
[16]	train-rmse:35.17298	eval-rmse:40.69717
[17]	train-rmse:34.84378	eval-rmse:40.63673
[18]	train-rmse:34.56737	eval-rmse:40.54731
[19]	train-rmse:34.28293	eval-rmse:40.44529
[20]	train-rmse:33.96908	eval-rmse:40.40349
[21]	train-rmse:33.69815	eval-rmse:40.34203
[22]	train-rmse:33.46800	eval-rmse:40.2698