In [196]:
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeRegressor  
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import roc_auc_score
#from sklearn.metrics import root_mean_squared_error as RMSE
from sklearn.tree import export_text
import matplotlib.pyplot as plt
import seaborn as sns

In [197]:
random_state = 1

In [198]:
def RMSE(x, y):
    return np.sqrt(np.mean((x - y) ** 2))

In [199]:
df = pd.read_csv("jamb_exam_results.csv")
df.columns = df.columns.str.lower().str.replace(' ', '_')
df.head()

Unnamed: 0,jamb_score,study_hours_per_week,attendance_rate,teacher_quality,distance_to_school,school_type,school_location,extra_tutorials,access_to_learning_materials,parent_involvement,it_knowledge,student_id,age,gender,socioeconomic_status,parent_education_level,assignments_completed
0,192,22,78,4,12.4,Public,Urban,Yes,Yes,High,Medium,1,17,Male,Low,Tertiary,2
1,207,14,88,4,2.7,Public,Rural,No,Yes,High,High,2,15,Male,High,,1
2,182,29,87,2,9.6,Public,Rural,Yes,Yes,High,Medium,3,20,Female,High,Tertiary,2
3,210,29,99,2,2.6,Public,Urban,No,Yes,Medium,High,4,22,Female,Medium,Tertiary,1
4,199,12,98,3,8.8,Public,Urban,No,Yes,Medium,Medium,5,22,Female,Medium,Tertiary,1


In [200]:
df.parent_education_level = df.parent_education_level.fillna(0)

In [201]:
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=random_state)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=random_state)
assert len(df) == (len(df_train) + len(df_val) + len(df_test))


In [202]:
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

y_train = df_train.jamb_score.values
y_val = df_val.jamb_score.values
y_test = df_test.jamb_score.values

del df_train['jamb_score']
del df_val['jamb_score']
del df_test['jamb_score']

In [203]:
dv = DictVectorizer(sparse=True)
train_dict = df_train.to_dict(orient='records')
X_train = dv.fit_transform(train_dict)

In [204]:
dt = DecisionTreeRegressor(max_depth=1)
dt.fit(X_train, y_train)

In [205]:
val_dicts = df_val.to_dict(orient='records')
X_val = dv.transform(val_dicts)

In [206]:
y_pred = dt.predict(X_val)

In [207]:
print(export_text(dt, feature_names=list(dv.get_feature_names_out())))

|--- study_hours_per_week <= 18.50
|   |--- value: [155.24]
|--- study_hours_per_week >  18.50
|   |--- value: [188.59]



In [208]:
n_estimators=10

dforest = RandomForestRegressor(n_estimators=n_estimators, random_state=random_state, n_jobs=-1)
dforest.fit(X_train, y_train)

In [209]:
y_pred = dforest.predict(X_val)


In [174]:
RMSE(y_pred, y_val)

np.float64(41.35948718250747)

In [151]:
scores = []

for n in range(10, 61, 5):
    dforest = RandomForestRegressor(n_estimators=n, random_state=random_state, n_jobs=-1)
    dforest.fit(X_train, y_train)

    y_pred = dforest.predict(X_val)
    rmse = round(float(RMSE(y_pred, y_val)),3)
    print (n, rmse)
    scores.append((n, rmse))

10 41.359
15 41.297
20 41.116
25 40.93
30 40.796
35 40.725
40 40.532
45 40.553
50 40.513
55 40.556
60 40.595


In [148]:
scores

[(5, 43.667),
 (10, 41.359),
 (15, 41.297),
 (20, 41.116),
 (25, 40.93),
 (30, 40.796),
 (35, 40.725),
 (40, 40.532),
 (45, 40.553),
 (50, 40.513),
 (55, 40.556),
 (60, 40.595)]

In [188]:
scores = []
for n in range(10, 31, 10):
    print("n = ", n)
    for depth in [10, 15, 20, 25]:
        dforest = RandomForestRegressor(max_depth=depth, n_estimators=n, random_state=random_state, n_jobs=-1)
        dforest.fit(X_train, y_train)

        y_pred = dforest.predict(X_val)
        rmse = round(float(RMSE(y_pred, y_val)),3)
        print (depth, rmse)
        scores.append((n, depth, rmse))

n =  10
10 40.912
15 41.657
20 41.273
25 41.36
n =  20
10 40.657
15 41.379
20 41.177
25 41.151
n =  30
10 40.585
15 40.996
20 40.757
25 40.811


In [189]:
scores


[(10, 10, 40.912),
 (10, 15, 41.657),
 (10, 20, 41.273),
 (10, 25, 41.36),
 (20, 10, 40.657),
 (20, 15, 41.379),
 (20, 20, 41.177),
 (20, 25, 41.151),
 (30, 10, 40.585),
 (30, 15, 40.996),
 (30, 20, 40.757),
 (30, 25, 40.811)]

In [211]:
n_estimators=10
max_depth=20

dforest = RandomForestRegressor(max_depth = max_depth, n_estimators=n_estimators, random_state=random_state, n_jobs=-1)
dforest.fit(X_train, y_train)

In [215]:
dforest.feature_importances_

array([0.00899999, 0.007879  , 0.05624215, 0.02604739, 0.13494287,
       0.11063512, 0.00979022, 0.01058927, 0.0073589 , 0.00847942,
       0.01505077, 0.00973391, 0.00937252, 0.        , 0.0139754 ,
       0.01357768, 0.0117731 , 0.02096498, 0.01046491, 0.01144771,
       0.00724783, 0.00841969, 0.00584344, 0.00755391, 0.02594131,
       0.00994276, 0.0076226 , 0.11539527, 0.23427889, 0.08042899])

In [219]:
df.columns, dforest.feature_importances_

(Index(['jamb_score', 'study_hours_per_week', 'attendance_rate',
        'teacher_quality', 'distance_to_school', 'school_type',
        'school_location', 'extra_tutorials', 'access_to_learning_materials',
        'parent_involvement', 'it_knowledge', 'student_id', 'age', 'gender',
        'socioeconomic_status', 'parent_education_level',
        'assignments_completed'],
       dtype='object'),
 array([0.00899999, 0.007879  , 0.05624215, 0.02604739, 0.13494287,
        0.11063512, 0.00979022, 0.01058927, 0.0073589 , 0.00847942,
        0.01505077, 0.00973391, 0.00937252, 0.        , 0.0139754 ,
        0.01357768, 0.0117731 , 0.02096498, 0.01046491, 0.01144771,
        0.00724783, 0.00841969, 0.00584344, 0.00755391, 0.02594131,
        0.00994276, 0.0076226 , 0.11539527, 0.23427889, 0.08042899]))

0.007879