In [2]:
import os
# Work from the directory that holds the dataset. The location can be
# overridden with the JAMB_DATA_DIR environment variable; when it is unset
# the original path is used, so existing behaviour is unchanged.
os.chdir(os.environ.get('JAMB_DATA_DIR', '/Users/utente/downloads'))

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.feature_extraction import DictVectorizer
import xgboost as xgb
import numpy as np


In [64]:
!pip install xgboost

Collecting xgboost
  Obtaining dependency information for xgboost from https://files.pythonhosted.org/packages/ab/a2/876d56ae72d7472b7a4228b880f1aaaa9c01817e05b4943674c9384ff20a/xgboost-2.1.2-py3-none-macosx_10_15_x86_64.macosx_11_0_x86_64.macosx_12_0_x86_64.whl.metadata
  Downloading xgboost-2.1.2-py3-none-macosx_10_15_x86_64.macosx_11_0_x86_64.macosx_12_0_x86_64.whl.metadata (2.1 kB)
Downloading xgboost-2.1.2-py3-none-macosx_10_15_x86_64.macosx_11_0_x86_64.macosx_12_0_x86_64.whl (2.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: xgboost
Successfully installed xgboost-2.1.2


In [21]:
!wget https://github.com/alexeygrigorev/datasets/raw/refs/heads/master/jamb_exam_results.csv

--2024-11-05 16:44:29--  https://github.com/alexeygrigorev/datasets/raw/refs/heads/master/jamb_exam_results.csv
Resolving github.com (github.com)... 140.82.121.3
Connecting to github.com (github.com)|140.82.121.3|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/alexeygrigorev/datasets/refs/heads/master/jamb_exam_results.csv [following]
--2024-11-05 16:44:29--  https://raw.githubusercontent.com/alexeygrigorev/datasets/refs/heads/master/jamb_exam_results.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.111.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 391501 (382K) [text/plain]
Saving to: ‘jamb_exam_results.csv.2’


2024-11-05 16:44:29 (11,5 MB/s) - ‘jamb_exam_results.csv.2’ saved [391501/391501]



## Preparing the dataset



In [5]:
# Read the exam results, normalise the column names (lowercase, spaces
# replaced by underscores), discard the student_id column and replace
# every missing value with zero.
df = (
    pd.read_csv("jamb_exam_results.csv")
    .rename(columns=lambda name: name.lower().replace(' ', '_'))
    .drop(columns=['student_id'])
    .fillna(0)
)


In [7]:
# 60/20/20 split: hold out 40% of the rows first, then halve that hold-out
# into the validation and test sets.
df_train, df_temp = train_test_split(df, test_size=0.4, random_state=1)
df_val, df_test = train_test_split(df_temp, test_size=0.5, random_state=1)

# Extract the target arrays while the column is still present
y_train, y_val, y_test = (
    part['jamb_score'].values for part in (df_train, df_val, df_test)
)

# Feature frames: the same rows with the target column removed
df_train, df_val, df_test = (
    part.drop(columns=['jamb_score']) for part in (df_train, df_val, df_test)
)

# Vectorise the per-row record dicts; the vectoriser is fitted on the
# training rows only and then reused for validation and test.
dv = DictVectorizer(sparse=True)
X_train = dv.fit_transform(df_train.to_dict(orient='records'))
X_val, X_test = (
    dv.transform(part.to_dict(orient='records')) for part in (df_val, df_test)
)



## Question 1

In [47]:
# Train a depth-1 decision tree (at most one split). The seed matches the
# other models in this notebook and makes tie-breaking between equally good
# splits reproducible.
dt = DecisionTreeRegressor(max_depth=1, random_state=1)
dt.fit(X_train, y_train)

# Node 0 is the root. scikit-learn stores a negative sentinel in
# tree_.feature for leaf nodes, so a root that never split must not be used
# as an index (a negative index would silently pick a feature from the end
# of the name list).
root_feature = dt.tree_.feature[0]
if root_feature < 0:
    feature_split = None
    print("Feature used for splitting: none (the tree did not split)")
else:
    feature_split = dv.feature_names_[root_feature]
    print("Feature used for splitting:", feature_split)


Feature used for splitting: study_hours_per_week


## Question 2

In [9]:
# Train a 10-tree random forest with a fixed seed
rf = RandomForestRegressor(n_estimators=10, random_state=1, n_jobs=-1)
rf.fit(X_train, y_train)

# Predict on the validation set and compute RMSE.
# The `squared=False` flag of mean_squared_error was deprecated in
# scikit-learn 1.4 and removed in 1.6, so the square root of the MSE is
# taken explicitly; this works on old and new versions alike.
y_pred = rf.predict(X_val)
rmse = np.sqrt(mean_squared_error(y_val, y_pred))
print("RMSE:", rmse)


RMSE: 43.157758977963624




## Question 3

In [53]:
# Sweep n_estimators from 10 to 200 in steps of 10 and record the
# validation RMSE of each forest as (n_estimators, rmse) pairs.
rmse_values = []
for n in range(10, 201, 10):
    rf = RandomForestRegressor(n_estimators=n, random_state=1, n_jobs=-1)
    rf.fit(X_train, y_train)
    y_pred = rf.predict(X_val)
    # `squared=False` was deprecated in scikit-learn 1.4 and removed in 1.6;
    # take the square root of the MSE explicitly instead.
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    rmse_values.append((n, rmse))

# Print the whole curve; the point where RMSE stops improving is read off
# this listing by eye (nothing below selects it automatically).
for n, rmse in rmse_values:
    print(f"n_estimators={n}, RMSE={rmse}")




n_estimators=10, RMSE=43.157758977963624
n_estimators=20, RMSE=41.79043981582391
n_estimators=30, RMSE=41.555818472133225
n_estimators=40, RMSE=41.075631652173044
n_estimators=50, RMSE=40.9571573818301
n_estimators=60, RMSE=40.77368529456223
n_estimators=70, RMSE=40.587805985220214
n_estimators=80, RMSE=40.5027042403498
n_estimators=90, RMSE=40.43492224596255
n_estimators=100, RMSE=40.36491034549687
n_estimators=110, RMSE=40.347525479439874
n_estimators=120, RMSE=40.30191844844362
n_estimators=130, RMSE=40.285789466741
n_estimators=140, RMSE=40.26346078629849
n_estimators=150, RMSE=40.25426440073703
n_estimators=160, RMSE=40.1996656828838
n_estimators=170, RMSE=40.187325737485885
n_estimators=180, RMSE=40.13596272032919
n_estimators=190, RMSE=40.15216599857013
n_estimators=200, RMSE=40.138465594427




## Question 4

In [55]:
# Grid over max_depth and n_estimators; each entry of `results` is a
# (max_depth, n_estimators, validation RMSE) tuple.
results = []
for max_depth in [10, 15, 20, 25]:
    for n in range(10, 201, 10):
        rf = RandomForestRegressor(n_estimators=n, max_depth=max_depth, random_state=1, n_jobs=-1)
        rf.fit(X_train, y_train)
        y_pred = rf.predict(X_val)
        # `squared=False` was deprecated in scikit-learn 1.4 and removed in
        # 1.6; take the square root of the MSE explicitly instead.
        rmse = np.sqrt(mean_squared_error(y_val, y_pred))
        results.append((max_depth, n, rmse))

# Report the max_depth of the single combination with the lowest RMSE
# (not an average over n_estimators for each depth).
best_result = min(results, key=lambda x: x[2])
print("Best max_depth:", best_result[0], "with RMSE:", best_result[2])




Best max_depth: 10 with RMSE: 39.82318665099042




## Question 5

In [60]:
# Fit a 10-tree forest limited to depth 20, with a fixed seed
rf = RandomForestRegressor(n_estimators=10, max_depth=20, random_state=1, n_jobs=-1)
rf.fit(X_train, y_train)

# Pair each vectoriser feature name with its importance, then order the
# pairs from most to least important.
importances = rf.feature_importances_
feature_importance = list(zip(dv.feature_names_, importances))
feature_importance.sort(key=lambda pair: pair[1], reverse=True)

top_name, _top_score = feature_importance[0]
print("Most important feature:", top_name)


Most important feature: study_hours_per_week


## Question 6

In [68]:
# Wrap the train and validation feature matrices, together with their
# targets, in the DMatrix containers that xgb.train takes as input.
dtrain = xgb.DMatrix(data=X_train, label=y_train)
dval = xgb.DMatrix(data=X_val, label=y_val)


In [70]:
# Booster settings for the first run (learning rate eta=0.3)
xgb_params_1 = dict(
    eta=0.3,
    max_depth=6,
    min_child_weight=1,
    objective='reg:squarederror',
    nthread=8,
    seed=1,
    verbosity=1,
)

# Second run: identical settings apart from the smaller learning rate
xgb_params_2 = {**xgb_params_1, 'eta': 0.1}

# Validation set that is scored after every boosting round
evals = [(dval, 'validation')]

# eta=0.3: up to 100 rounds, with early stopping after 10 rounds
model_1 = xgb.train(
    params=xgb_params_1,
    dtrain=dtrain,
    num_boost_round=100,
    evals=evals,
    early_stopping_rounds=10,
)

# eta=0.1: same training budget and early-stopping setting
model_2 = xgb.train(
    params=xgb_params_2,
    dtrain=dtrain,
    num_boost_round=100,
    evals=evals,
    early_stopping_rounds=10,
)


[0]	validation-rmse:44.52338
[1]	validation-rmse:42.83406
[2]	validation-rmse:41.62607
[3]	validation-rmse:41.25491
[4]	validation-rmse:40.84075
[5]	validation-rmse:40.71677
[6]	validation-rmse:40.72669
[7]	validation-rmse:40.68822
[8]	validation-rmse:40.81273
[9]	validation-rmse:40.84939
[10]	validation-rmse:40.83759
[11]	validation-rmse:40.80575
[12]	validation-rmse:40.84238
[13]	validation-rmse:40.96020
[14]	validation-rmse:40.98775
[15]	validation-rmse:41.04798
[16]	validation-rmse:41.08375
[17]	validation-rmse:41.15979
[0]	validation-rmse:46.63724
[1]	validation-rmse:45.58724
[2]	validation-rmse:44.76209
[3]	validation-rmse:44.02498
[4]	validation-rmse:43.40640
[5]	validation-rmse:42.92195
[6]	validation-rmse:42.49211
[7]	validation-rmse:42.15780
[8]	validation-rmse:41.84104
[9]	validation-rmse:41.58026
[10]	validation-rmse:41.35829
[11]	validation-rmse:41.19143
[12]	validation-rmse:41.02571
[13]	validation-rmse:40.90308
[14]	validation-rmse:40.79701
[15]	validation-rmse:40.66274
