In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer

In [2]:
# Load the JAMB exam results from the CSV file in the working directory.
df = pd.read_csv(filepath_or_buffer='jamb_exam_results.csv')

# Display the first five rows as a quick look at the loaded columns.
df.head(n=5)

Unnamed: 0,JAMB_Score,Study_Hours_Per_Week,Attendance_Rate,Teacher_Quality,Distance_To_School,School_Type,School_Location,Extra_Tutorials,Access_To_Learning_Materials,Parent_Involvement,IT_Knowledge,Student_ID,Age,Gender,Socioeconomic_Status,Parent_Education_Level,Assignments_Completed
0,192,22,78,4,12.4,Public,Urban,Yes,Yes,High,Medium,1,17,Male,Low,Tertiary,2
1,207,14,88,4,2.7,Public,Rural,No,Yes,High,High,2,15,Male,High,,1
2,182,29,87,2,9.6,Public,Rural,Yes,Yes,High,Medium,3,20,Female,High,Tertiary,2
3,210,29,99,2,2.6,Public,Urban,No,Yes,Medium,High,4,22,Female,Medium,Tertiary,1
4,199,12,98,3,8.8,Public,Urban,No,Yes,Medium,Medium,5,22,Female,Medium,Tertiary,1


In [3]:
# Normalise the column headers: lower-case them, then turn spaces into underscores.
df.columns = (
    df.columns
    .str.lower()
    .str.replace(' ', '_')
)

In [4]:
# Remove the 'student_id' column; every other column is kept as-is.
df = df.drop('student_id', axis='columns')

In [5]:
# Replace every missing entry in the frame (any column) with 0.
df = df.fillna(value=0)



In [7]:
from sklearn.model_selection import train_test_split

# Two-stage split with a fixed seed: hold out 20% of the rows for testing,
# then take 25% of the remainder for validation (roughly 60/20/20 overall).
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=1)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=1)

# Discard the shuffled original row labels so each part is indexed from 0.
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

# pop() returns the target column and removes it from the feature frame in
# one step, so 'jamb_score' never ends up among the model inputs.
y_train = df_train.pop('jamb_score').values
y_val = df_val.pop('jamb_score').values
y_test = df_test.pop('jamb_score').values


In [8]:
from sklearn.feature_extraction import DictVectorizer

# Encode each row (as a dict of column -> value) into a dense numeric matrix.
dv = DictVectorizer(sparse=False)

# Learn the feature mapping from the training rows only.
train_dict = df_train.to_dict(orient='records')
X_train = dv.fit_transform(train_dict)

# Apply the mapping learned on the training rows to the validation rows.
# transform() (not fit_transform()) is required here: refitting on validation
# data would overwrite the training mapping, so X_val's columns could differ
# in number or order from the X_train columns the models are fitted on.
val_dict = df_val.to_dict(orient='records')
X_val = dv.transform(val_dict)

In [9]:
from sklearn.tree import DecisionTreeRegressor, export_text

# Fit a single-split tree (max_depth=1) on the training matrix.
model = DecisionTreeRegressor(max_depth=1)
model.fit(X_train, y_train)
y_pred = model.predict(X_val)

# DictVectorizer.get_feature_names() was removed in scikit-learn 1.2; use its
# replacement get_feature_names_out() when available and fall back to the old
# method on older installs. The names are converted to a plain list of str
# because export_text() in some versions rejects a NumPy array here.
if hasattr(dv, 'get_feature_names_out'):
    feature_names = [str(name) for name in dv.get_feature_names_out()]
else:
    feature_names = dv.get_feature_names()

# Print the learned split rule as text, labelled with the feature names.
print(export_text(model, feature_names=feature_names))


|--- study_hours_per_week <= 18.50
|   |--- value: [155.24]
|--- study_hours_per_week >  18.50
|   |--- value: [188.59]





In [10]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Baseline forest: 10 trees, fixed seed, all available cores.
model = RandomForestRegressor(
    n_estimators=10,
    random_state=1,
    n_jobs=-1,
)
model.fit(X_train, y_train)

# Score on the validation split; RMSE is the square root of the MSE.
y_pred = model.predict(X_val)
mse = mean_squared_error(y_val, y_pred)
rmse = mse ** 0.5
print(rmse)

41.82546234054084


In [13]:
# Sweep the forest size from 10 to 210 trees in steps of 10 and report the
# validation RMSE for each setting (seed held fixed across runs).
for n_trees in range(10, 211, 10):
    model = RandomForestRegressor(
        n_estimators=n_trees,
        random_state=1,
        n_jobs=-1,
    )
    model.fit(X_train, y_train)

    y_pred = model.predict(X_val)
    mse = mean_squared_error(y_val, y_pred)
    rmse = mse ** 0.5
    print("For estimators %s, RMSE is %.3f" % (n_trees, rmse))

For estimators 10, RMSE is 41.825
For estimators 20, RMSE is 41.196
For estimators 30, RMSE is 40.857
For estimators 40, RMSE is 40.666
For estimators 50, RMSE is 40.678
For estimators 60, RMSE is 40.604
For estimators 70, RMSE is 40.535
For estimators 80, RMSE is 40.404
For estimators 90, RMSE is 40.377
For estimators 100, RMSE is 40.430
For estimators 110, RMSE is 40.472
For estimators 120, RMSE is 40.522
For estimators 130, RMSE is 40.530
For estimators 140, RMSE is 40.465
For estimators 150, RMSE is 40.436
For estimators 160, RMSE is 40.462
For estimators 170, RMSE is 40.479
For estimators 180, RMSE is 40.482
For estimators 190, RMSE is 40.468
For estimators 200, RMSE is 40.454
For estimators 210, RMSE is 40.419


In [14]:

# Grid over tree depth and forest size: for each max_depth, refit forests of
# 10 to 200 trees (step 10) and print the validation RMSE of every combination.
for depth in (10, 15, 20, 25):
    print('For max_depth %s \n' % depth)
    for n_trees in range(10, 201, 10):
        model = RandomForestRegressor(
            n_estimators=n_trees,
            max_depth=depth,
            random_state=1,
            n_jobs=-1,
        )
        model.fit(X_train, y_train)

        y_pred = model.predict(X_val)
        mse = mean_squared_error(y_val, y_pred)
        rmse = mse ** 0.5
        print("For estimators %s, RMSE is %.3f" % (n_trees, rmse))

For max_depth 10 

For estimators 10, RMSE is 41.073
For estimators 20, RMSE is 40.853
For estimators 30, RMSE is 40.623
For estimators 40, RMSE is 40.334
For estimators 50, RMSE is 40.320
For estimators 60, RMSE is 40.262
For estimators 70, RMSE is 40.281
For estimators 80, RMSE is 40.228
For estimators 90, RMSE is 40.173
For estimators 100, RMSE is 40.226
For estimators 110, RMSE is 40.252
For estimators 120, RMSE is 40.276
For estimators 130, RMSE is 40.293
For estimators 140, RMSE is 40.258
For estimators 150, RMSE is 40.271
For estimators 160, RMSE is 40.310
For estimators 170, RMSE is 40.324
For estimators 180, RMSE is 40.332
For estimators 190, RMSE is 40.318
For estimators 200, RMSE is 40.293
For max_depth 15 

For estimators 10, RMSE is 41.864
For estimators 20, RMSE is 41.179
For estimators 30, RMSE is 40.911
For estimators 40, RMSE is 40.800
For estimators 50, RMSE is 40.758
For estimators 60, RMSE is 40.703
For estimators 70, RMSE is 40.639
For estimators 80, RMSE is 40.493

In [15]:
# Refit a forest with 10 trees and max_depth 20 (fixed seed, all cores);
# the fitted estimator stays bound to `model`.
model = RandomForestRegressor(
    n_estimators=10,
    max_depth=20,
    random_state=1,
    n_jobs=-1,
)
model.fit(X_train, y_train)

# Predictions for the validation split.
y_pred = model.predict(X_val)

In [16]:
# DictVectorizer.get_feature_names() was removed in scikit-learn 1.2; use its
# replacement get_feature_names_out() when available and fall back to the old
# method on older installs.
if hasattr(dv, 'get_feature_names_out'):
    feature_names = [str(name) for name in dv.get_feature_names_out()]
else:
    feature_names = dv.get_feature_names()

# Pair every encoded feature name with the fitted model's importance score.
d = {'feature': feature_names, 'values': model.feature_importances_}
feature_info_values = pd.DataFrame(data=d)

# Show the table ordered from most to least important.
feature_info_values.sort_values('values', ascending=False)



Unnamed: 0,feature,values
27,study_hours_per_week,0.246331
4,attendance_rate,0.148986
5,distance_to_school,0.134925
28,teacher_quality,0.082243
2,age,0.069812
3,assignments_completed,0.030493
24,socioeconomic_status=High,0.025701
17,parent_involvement=High,0.021997
10,it_knowledge=High,0.017578
14,parent_education_level=Primary,0.014976


In [17]:
# NOTE: xgboost is a third-party package and must be installed in the kernel's
# environment; the recorded run of this cell failed with ModuleNotFoundError.
import xgboost as xgb

# DictVectorizer.get_feature_names() was removed in scikit-learn 1.2; use its
# replacement get_feature_names_out() when available and fall back to the old
# method on older installs. Names are converted to plain Python str.
if hasattr(dv, 'get_feature_names_out'):
    features = [str(name) for name in dv.get_feature_names_out()]
else:
    features = dv.get_feature_names()

# Wrap the train/validation matrices and targets in DMatrix objects, carrying
# the feature names along.
dtrain = xgb.DMatrix(X_train, label=y_train, feature_names=features)
dval = xgb.DMatrix(X_val, label=y_val, feature_names=features)

ModuleNotFoundError: No module named 'xgboost'