# Homework 06


In [1]:
from pathlib import Path

import altair as alt
import matplotlib.pyplot as plt
import polars as pl
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import root_mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor, export_text

from ml_zoomcamp.boost import parse_xgb_output
from ml_zoomcamp.utils import clean_column_names, load_data

# ROOT_DIR is the parent of the current working directory; datasets live in
# its "data" sub-directory.
ROOT_DIR = Path.cwd().parent
DATA_DIR = ROOT_DIR / "data"

In [2]:
# Shared random seed, passed as `random_state` / `seed` to the splits and models below.
SEED = 1

### Getting the data


In [3]:
# Fetch the JAMB exam results (load_data receives the source URL and the local
# data directory) and pass the frame through clean_column_names.
csv_uri = "https://github.com/alexeygrigorev/datasets/raw/refs/heads/master/jamb_exam_results.csv"
df = clean_column_names(load_data(csv_uri, DATA_DIR))

#### Cleanup Columns


In [4]:
# `student_id` is removed before the train/validation/test split so it never reaches the models.
df = df.drop("student_id")

### Data Preparation


In [5]:
# Compact overview: one line per column with its dtype and first few values.
df.glimpse()

Rows: 5000
Columns: 16
$ jamb_score                   <i64> 192, 207, 182, 210, 199, 202, 251, 129, 220, 157
$ study_hours_per_week         <i64> 22, 14, 29, 29, 12, 25, 35, 27, 23, 15
$ attendance_rate              <i64> 78, 88, 87, 99, 98, 85, 85, 75, 85, 79
$ teacher_quality              <i64> 4, 4, 2, 2, 3, 2, 4, 3, 3, 3
$ distance_to_school           <f64> 12.4, 2.7, 9.6, 2.6, 8.8, 13.6, 2.6, 9.4, 4.6, 15.6
$ school_type                  <str> 'Public', 'Public', 'Public', 'Public', 'Public', 'Public', 'Public', 'Public', 'Public', 'Public'
$ school_location              <str> 'Urban', 'Rural', 'Rural', 'Urban', 'Urban', 'Urban', 'Urban', 'Urban', 'Rural', 'Rural'
$ extra_tutorials              <str> 'Yes', 'No', 'Yes', 'No', 'No', 'Yes', 'No', 'No', 'No', 'No'
$ access_to_learning_materials <str> 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'No', 'Yes', 'Yes', 'No', 'Yes'
$ parent_involvement           <str> 'High', 'High', 'High', 'Medium', 'Medium', 'Medium', 'Low', 'Low', 'Low', 'Low'
$

In [6]:
# Per-column null counts, reshaped to one row per column, keeping only the
# columns that actually contain nulls (largest count first).
(
    df.null_count()
    .transpose(include_header=True, column_names=["null_count"])
    .filter(pl.col("null_count") > 0)
    .sort("null_count", descending=True)
)

column,null_count
str,u32


There are no missing values: the filtered null-count table above is empty.


## Setting Up Validation Framework


In [7]:
# 60/20/20 split: hold out 20% of the rows for testing, then take 25% of the
# remaining 80% as the validation set.
df_full_train, df_test = train_test_split(
    df,
    test_size=0.2,
    random_state=SEED,
)
df_train, df_val = train_test_split(
    df_full_train,
    test_size=0.25,
    random_state=SEED,
)

# Sanity check: the three splits together account for every row.
assert len(df_train) + len(df_val) + len(df_test) == len(df)

In [8]:
# Sizes of the train / validation / test splits.
len(df_train), len(df_val), len(df_test)

(3000, 1000, 1000)

In [9]:
# Pull the target column out of each split as a NumPy array.
y_train, y_val, y_test = (
    split["jamb_score"].to_numpy() for split in (df_train, df_val, df_test)
)

In [10]:
# Remove the target from the feature frames so it cannot be used as a predictor.
df_train, df_val, df_test = (
    split.drop("jamb_score") for split in (df_train, df_val, df_test)
)

## EDA


## Encoding


In [11]:
# The vectorizer is fitted on the training rows only; the validation rows are
# transformed with that same fitted vectorizer.
dv = DictVectorizer(sparse=False)

dicts_train = df_train.to_dicts()
X_train = dv.fit_transform(dicts_train, y_train)

# Column names of the encoded matrix, reused when inspecting the models.
features = dv.feature_names_

dicts_val = df_val.to_dicts()
X_val = dv.transform(dicts_val)

## 1. Decision Tree Feature


In [12]:
# A depth-1 tree makes exactly one split, which reveals the feature it splits on.
# random_state is fixed so that ties between equally good splits are broken the
# same way on every run, consistent with the seeded random forests in this notebook.
dt = DecisionTreeRegressor(max_depth=1, random_state=SEED)
dt.fit(X_train, y_train)

In [13]:
# Print the fitted tree as text rules, labelled with the DictVectorizer feature names.
print(export_text(dt, feature_names=features))

|--- study_hours_per_week <= 18.50
|   |--- value: [155.24]
|--- study_hours_per_week >  18.50
|   |--- value: [188.59]



## 2. Random Forest


In [14]:
# Baseline forest: 10 trees, seeded, using all available cores.
rf = RandomForestRegressor(
    n_estimators=10,
    random_state=SEED,
    n_jobs=-1,
).fit(X_train, y_train)
rf

In [15]:
# RMSE of the baseline forest on the validation split.
y_pred = rf.predict(X_val)
root_mean_squared_error(y_true=y_val, y_pred=y_pred)

np.float64(41.82546234054084)

## 3. Random Forest n_estimators


In [16]:
# Sweep the forest size from 10 to 200 trees (step 10) and record the
# validation RMSE for each size as an (n_estimators, rmse) tuple.
scores = []
for n in range(10, 201, 10):
    rf = RandomForestRegressor(
        n_estimators=n,
        random_state=SEED,
        n_jobs=-1,
    ).fit(X_train, y_train)

    y_pred = rf.predict(X_val)
    score = root_mean_squared_error(y_val, y_pred)
    scores += [(n, score)]

In [17]:
# One row per (n_estimator, rmse) tuple collected in the sweep.
df_score = pl.DataFrame(scores, schema=["n_estimator", "rmse"], orient="row")

In [18]:
# Validation RMSE against forest size; zero=False is set so the axis is not forced to include 0.
df_score.plot.line(x="n_estimator", y="rmse").configure_scale(zero=False)

In [19]:
# Zoom in on forests of 70-100 trees to read off the exact RMSE values there.
df_score.filter(pl.col("n_estimator").is_between(70, 100))

n_estimator,rmse
i64,f64
70,40.534604
80,40.40394
90,40.376523
100,40.430283


## 4. Random Forest max_depth


In [20]:
# Grid over tree depth and forest size; each entry is a
# (max_depth, n_estimators, validation rmse) tuple.
scores = []
for d in [10, 15, 20, 25]:
    for n in range(10, 201, 10):
        rf = RandomForestRegressor(
            n_estimators=n,
            max_depth=d,
            random_state=SEED,
            n_jobs=-1,
        ).fit(X_train, y_train)

        y_pred = rf.predict(X_val)
        score = root_mean_squared_error(y_val, y_pred)
        scores += [(d, n, score)]

In [21]:
# One row per (max_depth, n_estimator, rmse) tuple collected in the grid sweep.
df_score = pl.DataFrame(
    scores, schema=["max_depth", "n_estimator", "rmse"], orient="row"
)

In [22]:
# Average the RMSE over all forest sizes for each depth; best (lowest) first.
(
    df_score.group_by("max_depth")
    .agg(mean_rmse=pl.col("rmse").mean())
    .sort("mean_rmse")
)

max_depth,mean_rmse
i64,f64
10,40.365023
25,40.611362
15,40.639766
20,40.684563


In [23]:
# Same per-depth mean RMSE, drawn as a line over the (ordinal) depth values.
(
    df_score.group_by("max_depth")
    .agg(mean_rmse=pl.col("rmse").mean())
    .plot.line(x="max_depth:O", y="mean_rmse:Q")
    .configure_scale(zero=False)
    .properties(width=300, height=300)
)

## 5. Feature Importance


In [24]:
# Forest used for the feature-importance question: 10 trees, depth capped at 20.
rf = RandomForestRegressor(
    n_estimators=10,
    max_depth=20,
    random_state=SEED,
    n_jobs=-1,
).fit(X_train, y_train)
rf

In [25]:
# Pair each encoded feature name with its importance and show the five largest.
pl.DataFrame(
    {
        "feature": features,
        "score": rf.feature_importances_,
    }
).top_k(5, by="score")

feature,score
str,f64
"""study_hours_per_week""",0.246331
"""attendance_rate""",0.148986
"""distance_to_school""",0.134925
"""teacher_quality""",0.082243
"""age""",0.069812


## 6. XGBoost


In [26]:
# Wrap the encoded matrices and their targets in DMatrix objects for xgb.train,
# keeping the DictVectorizer feature names.
dtrain = xgb.DMatrix(X_train, label=y_train, feature_names=features)
dval = xgb.DMatrix(X_val, label=y_val, feature_names=features)

In [27]:
# (dataset, name) pairs passed as `evals` to xgb.train so both splits are evaluated during training.
watchlist = [(dtrain, "train"), (dval, "val")]

In [28]:
# Collects one parsed result frame per XGBoost run, keyed by an "eta=<value>" label.
# Note: this rebinds `scores`, which held a list of tuples in the random-forest sections.
scores = {}

In [29]:
%%capture output

xgb_params = {
    "eta": 0.3,
    "max_depth": 6,
    "min_child_weight": 1,
    "objective": "reg:squarederror",
    "nthread": 8,
    "seed": SEED,
    "verbosity": 1,
}

model = xgb.train(
    xgb_params, dtrain, num_boost_round=100, verbose_eval=5, evals=watchlist
)

In [None]:
# Label the most recent run by its learning rate and store its parsed output
# together with that label in a `key` column.
# The inner quotes are single quotes on purpose: reusing the f-string's own
# double quotes inside the braces is only valid from Python 3.12 onwards and is
# a SyntaxError on earlier interpreters.
key = f"eta={xgb_params['eta']}"
scores[key] = parse_xgb_output(output, "rmse").with_columns(key=pl.lit(key))

In [31]:
%%capture output

xgb_params = {
    "eta": 0.1,
    "max_depth": 6,
    "min_child_weight": 1,
    "objective": "reg:squarederror",
    "nthread": 8,
    "seed": SEED,
    "verbosity": 1,
}

model = xgb.train(
    xgb_params, dtrain, num_boost_round=100, verbose_eval=5, evals=watchlist
)

In [None]:
# Label the most recent run by its learning rate and store its parsed output
# together with that label in a `key` column.
# The inner quotes are single quotes on purpose: reusing the f-string's own
# double quotes inside the braces is only valid from Python 3.12 onwards and is
# a SyntaxError on earlier interpreters.
key = f"eta={xgb_params['eta']}"
scores[key] = parse_xgb_output(output, "rmse").with_columns(key=pl.lit(key))

In [33]:
# Stack the per-run result frames into one long frame; the `key` column identifies the run.
df_score = pl.concat(list(scores.values()))

In [34]:
# Validation RMSE per boosting iteration, one line per learning rate, with a
# fixed colour for each run.
# The domain is materialised as a list: `scores.keys()` is a dict_keys view,
# not a plain list, and a scale domain is expected to be a list of values.
color_scale = alt.Scale(domain=list(scores.keys()), range=["blue", "orange"])
df_score.plot.line(
    x="num_iter:O", y="val_rmse:Q", color=alt.Color("key:O", scale=color_scale)
).configure_scale(zero=False)

`eta=0.1` resulted in a lower and more stable validation RMSE than `eta=0.3`.
