In [22]:
# !wget https://github.com/alexeygrigorev/datasets/raw/refs/heads/master/jamb_exam_results.csv

In [23]:
import pandas as pd
import numpy as np

import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import roc_auc_score

In [None]:
# Read the exam results and normalise the headers: lower-case, spaces -> underscores.
df = pd.read_csv("jamb_exam_results.csv")
df.columns = [header.lower().replace(' ', '_') for header in df.columns]
# Transposed preview so every column is visible as a row.
df.head().T

In [25]:
# Drop the student_id column. Using drop(..., errors="ignore") instead of `del`
# keeps this cell idempotent: re-running it no longer raises KeyError once the
# column is already gone.
df = df.drop(columns=['student_id'], errors='ignore')

In [None]:
# Count missing values per column.
df.isnull().sum()

In [27]:
# Replace every missing value with 0.
df = df.fillna(value=0)

In [28]:
# 60/20/20 split: hold out 20% for test, then take 25% of the remaining 80%
# (i.e. 20% of the whole) for validation.
df_full_train, df_test = train_test_split(
    df,
    test_size=0.2,
    random_state=1,
)
df_train, df_val = train_test_split(
    df_full_train,
    test_size=0.25,
    random_state=1,
)

In [None]:
# Row counts: full data, full train, train, test, validation.
tuple(len(part) for part in (df, df_full_train, df_train, df_test, df_val))

In [30]:
# Discard the shuffled original indices so each split is indexed 0..n-1.
df_train, df_val, df_test = (
    part.reset_index(drop=True) for part in (df_train, df_val, df_test)
)

In [31]:
# Pull the target column out of each split as a NumPy array.
y_train, y_val, y_test = (
    part["jamb_score"].values for part in (df_train, df_val, df_test)
)

In [32]:
# Remove the target from the feature frames so it cannot leak into the model inputs.
for part in (df_train, df_val, df_test):
    del part["jamb_score"]

In [33]:
# One dict per row, the input format DictVectorizer consumes.
train_dicts, val_dicts = (
    part.to_dict(orient="records") for part in (df_train, df_val)
)

In [None]:
# Peek at the first training record.
train_dicts[0:1]

In [35]:
# Learn the feature mapping on the training records only, then encode both splits
# as dense matrices.
dv = DictVectorizer(sparse=False)
dv.fit(train_dicts)
X_train = dv.transform(train_dicts)
X_val = dv.transform(val_dicts)

In [None]:
# Baseline tree with default hyper-parameters.
# The rest of the notebook models jamb_score as a regression target
# (RandomForestRegressor, RMSE), so a regressor is used here; a classifier would
# treat every distinct score as its own class.
dt = DecisionTreeRegressor()
dt.fit(X_train, y_train)

### Question 1

In [None]:
# Depth-1 tree (a single split) to reveal which feature the tree splits on first.
# The rest of the notebook models jamb_score as a regression target
# (RandomForestRegressor, RMSE), so a regressor is used here; a classifier would
# treat every distinct score as its own class.
dt = DecisionTreeRegressor(max_depth=1)
dt.fit(X_train, y_train)

In [None]:
from sklearn.tree import export_text

# Render the fitted tree as text, labelling splits with the vectorizer's feature names.
tree_rules = export_text(dt, feature_names=dv.get_feature_names_out())
print(tree_rules)

Answer: study_hours_per_week

### Question 2

In [66]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Random forest with 10 trees, a fixed seed, and all CPU cores.
rf = RandomForestRegressor(
    n_estimators=10,
    random_state=1,
    n_jobs=-1,
)
rf.fit(X_train, y_train)

In [71]:
# Predict on the validation matrix.
y_pred = rf.predict(X=X_val)

In [72]:
from sklearn.metrics import root_mean_squared_error

In [None]:
# Validation RMSE of the 10-tree forest.
root_mean_squared_error(y_true=y_val, y_pred=y_pred)

Answer: 42.13

### Question 3

In [None]:
# Candidate forest sizes: 10, 20, ..., 200.
n_estimators = np.arange(1, 21) * 10
n_estimators

In [84]:
# Fit one forest per candidate size and record (n_estimators, validation RMSE).
scores = []

for n_trees in n_estimators:
    rf = RandomForestRegressor(
        n_estimators=n_trees,
        random_state=1,
        n_jobs=-1,
    )
    rf.fit(X_train, y_train)

    y_pred = rf.predict(X_val)
    rmse = root_mean_squared_error(y_val, y_pred)

    scores.append((n_trees, rmse))


In [None]:
# Tabulate validation RMSE per forest size and plot it to see where it stops improving.
df_scores = pd.DataFrame(scores, columns=["n_estimators", "rmse"])

# Explicit figure/axes with labels so the chart is readable on its own.
fig, ax = plt.subplots()
ax.plot(df_scores.n_estimators, df_scores.rmse)
ax.set(
    xlabel="n_estimators",
    ylabel="validation RMSE",
    title="Random forest: RMSE vs. number of trees",
);

Answer: 80

### Question 4

In [87]:
# Grid over tree depth and forest size; each entry is (max_depth, n_estimators, RMSE).
max_depth = [10, 15, 20, 25]
n_estimators = np.arange(10, 201, 10)

scores = []

for depth in max_depth:
    for n_trees in n_estimators:
        rf = RandomForestRegressor(
            n_estimators=n_trees,
            random_state=1,
            n_jobs=-1,
            max_depth=depth,
        )
        rf.fit(X_train, y_train)

        y_pred = rf.predict(X_val)
        rmse = root_mean_squared_error(y_val, y_pred)

        scores.append((depth, n_trees, rmse))

In [88]:
# Collect the grid results into a frame, one row per (max_depth, n_estimators) pair.
score_columns = ["max_depth", "n_estimators", "rmse"]
df_scores = pd.DataFrame(data=scores, columns=score_columns)

In [None]:
# Average RMSE across all forest sizes for each depth; the smallest mean gives the best max_depth.
df_scores.groupby(by="max_depth")["rmse"].mean()

Answer: 10

### Question 5