In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.preprocessing import OneHotEncoder,LabelEncoder,OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

In [2]:
data = pd.read_csv('/kaggle/input/student-classification-dataset/student.csv')
data.sample(4)

Unnamed: 0.1,Unnamed: 0,Id,Student_Age,Sex,High_School_Type,Scholarship,Additional_Work,Sports_activity,Transportation,Weekly_Study_Hours,Attendance,Reading,Notes,Listening_in_Class,Project_work,Grade
25,25,5026,20,Male,State,50%,No,No,Bus,0,Always,No,Yes,No,No,BB
139,139,5140,18,Female,State,75%,Yes,No,Bus,2,Sometimes,No,No,Yes,Yes,Fail
130,130,5131,18,Female,State,50%,Yes,Yes,Private,0,Always,Yes,Yes,No,No,AA
44,44,5045,20,Male,Other,50%,No,No,Bus,0,Always,Yes,Yes,Yes,Yes,AA


In [3]:
data.drop(columns=['Unnamed: 0','Id'],inplace=True)

In [4]:
##since we have only one NAN value in scholarship removing that will not 
##effect us 
data.dropna(inplace=True)

In [5]:
## doing additional works will result in lower grades
data['Additional_Work'] = data['Additional_Work'].map({'Yes': 0, 'No' : 1})

In [6]:
data.drop(data[data['Attendance'] == '3'].index,inplace=True)

In [7]:
data['Sports_activity'] = data['Sports_activity'].map({'Yes': 1, 'No' : 0})
data['Listening_in_Class'] = data['Listening_in_Class'].map({'Yes': 1, 'No' : 0})
data['Project_work'] = data['Project_work'].map({'Yes': 1, 'No' : 0})
data['Notes'] = data['Notes'].map({'Yes': 1, 'No' : 0})
data['Reading'] = data['Reading'].map({'Yes': 1, 'No' : 0})
data['Sex'] = data['Sex'].map({'Male': 1, 'Female' : 0})
data['Transportation'] = data['Transportation'].map({'Private': 1, 'Bus' : 0})
data['Attendance'] = data['Attendance'].map({'Always': 3, 'Sometimes' : 2, 'Never' : 1})

In [8]:
data.sample(4)

Unnamed: 0,Student_Age,Sex,High_School_Type,Scholarship,Additional_Work,Sports_activity,Transportation,Weekly_Study_Hours,Attendance,Reading,Notes,Listening_in_Class,Project_work,Grade
36,19,1,Other,75%,0,0,0,12,3,1,1.0,1.0,1,BA
9,21,0,State,50%,1,0,0,12,1,0,1.0,0.0,0,Fail
105,18,1,State,75%,0,1,1,0,3,1,0.0,0.0,0,DD
31,25,1,State,50%,0,0,1,0,3,0,1.0,0.0,0,BB


In [9]:
ct = ColumnTransformer(
   [
       ('one-hot',OneHotEncoder(),['High_School_Type','Scholarship']),
       ('label-encoder',OrdinalEncoder(categories=[['AA','BA','BB','CC','DD','DC','CB','Fail']]),['Grade'])
   ]
)

trf = ct.fit_transform(data)
names = ct.get_feature_names_out()
df = pd.DataFrame(trf,columns=names)
df

Unnamed: 0,one-hot__High_School_Type_Other,one-hot__High_School_Type_Private,one-hot__High_School_Type_State,one-hot__Scholarship_100%,one-hot__Scholarship_25%,one-hot__Scholarship_50%,one-hot__Scholarship_75%,label-encoder__Grade
0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
3,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
4,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...
138,0.0,0.0,1.0,0.0,0.0,1.0,0.0,3.0
139,0.0,0.0,1.0,0.0,0.0,0.0,1.0,3.0
140,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
141,0.0,0.0,1.0,0.0,0.0,0.0,1.0,6.0


In [10]:
data = pd.concat([data,df],axis=1)
data.sample(4)

Unnamed: 0,Student_Age,Sex,High_School_Type,Scholarship,Additional_Work,Sports_activity,Transportation,Weekly_Study_Hours,Attendance,Reading,...,Project_work,Grade,one-hot__High_School_Type_Other,one-hot__High_School_Type_Private,one-hot__High_School_Type_State,one-hot__Scholarship_100%,one-hot__Scholarship_25%,one-hot__Scholarship_50%,one-hot__Scholarship_75%,label-encoder__Grade
52,22.0,0.0,State,50%,0.0,0.0,1.0,2.0,3.0,1.0,...,0.0,AA,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
33,21.0,0.0,State,50%,0.0,0.0,0.0,0.0,3.0,1.0,...,0.0,BA,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0
91,20.0,1.0,State,100%,0.0,1.0,0.0,0.0,3.0,0.0,...,0.0,DC,0.0,0.0,1.0,1.0,0.0,0.0,0.0,5.0
0,21.0,1.0,Other,50%,0.0,0.0,1.0,0.0,3.0,1.0,...,0.0,AA,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [11]:
data.drop(columns=['High_School_Type','Scholarship'],inplace=True)

In [12]:
data.sample(4)

Unnamed: 0,Student_Age,Sex,Additional_Work,Sports_activity,Transportation,Weekly_Study_Hours,Attendance,Reading,Notes,Listening_in_Class,Project_work,Grade,one-hot__High_School_Type_Other,one-hot__High_School_Type_Private,one-hot__High_School_Type_State,one-hot__Scholarship_100%,one-hot__Scholarship_25%,one-hot__Scholarship_50%,one-hot__Scholarship_75%,label-encoder__Grade
35,18.0,1.0,1.0,0.0,1.0,12.0,3.0,1.0,1.0,0.0,0.0,AA,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
49,18.0,1.0,1.0,0.0,1.0,2.0,1.0,1.0,0.0,1.0,1.0,BA,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0
56,20.0,1.0,1.0,1.0,1.0,0.0,3.0,1.0,1.0,1.0,0.0,CC,0.0,0.0,1.0,0.0,0.0,1.0,0.0,3.0
115,19.0,0.0,0.0,0.0,0.0,0.0,3.0,1.0,1.0,1.0,1.0,AA,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0


In [13]:
data.drop(columns=['Grade'],inplace=True)

In [14]:
data.rename(columns = {'label-encoder__Grade': 'Grade'},inplace=True)

In [15]:
data.corr()

Unnamed: 0,Student_Age,Sex,Additional_Work,Sports_activity,Transportation,Weekly_Study_Hours,Attendance,Reading,Notes,Listening_in_Class,Project_work,one-hot__High_School_Type_Other,one-hot__High_School_Type_Private,one-hot__High_School_Type_State,one-hot__Scholarship_100%,one-hot__Scholarship_25%,one-hot__Scholarship_50%,one-hot__Scholarship_75%,Grade
Student_Age,1.0,0.117924,-0.167715,-0.11539,0.060685,0.164856,-0.021974,0.00162,0.176138,-0.135005,0.04101,0.033573,-0.178847,0.126966,-0.001321,0.097048,0.178771,-0.224787,-0.206441
Sex,0.117924,1.0,0.190518,-0.229252,-0.113341,-0.017699,0.04447,0.042975,-0.064276,-0.023564,-0.105675,0.056897,-0.123662,0.064263,-0.177041,0.117911,0.141658,-0.051412,-0.062381
Additional_Work,-0.167715,0.190518,1.0,-0.240651,0.09894,-0.08715,-0.081257,0.079795,-0.180907,-0.151465,0.129096,-0.084902,-0.073035,0.120723,-0.075981,-0.00223,0.095172,-0.042931,0.029037
Sports_activity,-0.11539,-0.229252,-0.240651,1.0,0.009687,-0.110759,-0.015953,-0.199114,0.11984,-0.049012,0.091232,-0.021328,-0.004025,0.018282,0.044062,0.07884,-0.113294,0.063874,0.188559
Transportation,0.060685,-0.113341,0.09894,0.009687,1.0,-0.06055,0.017083,0.112045,-0.017778,-0.150281,0.022396,0.031505,0.054997,-0.068257,0.127005,0.025437,-0.231336,0.143808,0.024588
Weekly_Study_Hours,0.164856,-0.017699,-0.08715,-0.110759,-0.06055,1.0,-0.021497,0.049035,0.025878,-0.046699,-0.038846,0.077865,0.02847,-0.078327,-0.108905,0.110998,0.107403,-0.065891,-0.106283
Attendance,-0.021974,0.04447,-0.081257,-0.015953,0.017083,-0.021497,1.0,0.481156,0.145651,0.064845,-0.317706,0.135748,-0.032591,-0.067401,-0.044985,0.026867,0.031463,-0.007139,0.027567
Reading,0.00162,0.042975,0.079795,-0.199114,0.112045,0.049035,0.481156,1.0,-0.033286,0.001192,-0.259151,0.057444,0.035054,-0.069602,0.05437,0.05441,0.037291,-0.101031,-0.060476
Notes,0.176138,-0.064276,-0.180907,0.11984,-0.017778,0.025878,0.145651,-0.033286,1.0,0.011867,-0.148791,0.021777,0.008508,-0.022455,-0.028136,0.113239,0.061695,-0.074095,0.031354
Listening_in_Class,-0.135005,-0.023564,-0.151465,-0.049012,-0.150281,-0.046699,0.064845,0.001192,0.011867,1.0,0.042891,-0.015409,-0.038668,0.043316,0.020767,-0.154463,-0.016777,0.050947,-0.002764


In [16]:
data.dropna(inplace=True)

In [17]:
X = data.drop(columns=['Grade'])
y = data['Grade']
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2)

In [18]:
rf = RandomForestClassifier(n_estimators=100,max_depth=5)
rf.fit(X_train,y_train)

In [19]:
rf.score(X_train,y_train)

0.7818181818181819

In [20]:
y_preds = rf.predict(X_test)

In [21]:
print("Score:",accuracy_score(y_test,y_preds))

Score: 0.21428571428571427


# ***Hyperparameter Tuning***

In [22]:
params_grid={
    'n_estimators' : [20,50,100,200,500],
    'criterion': ['gini','entropy'],
    'max_depth' : [5,10,20,50],
    'min_samples_split' : [2,3,4,5],
    'max_features' : ['auto','sqrt','log2'],
    'bootstrap' : [True,False]
}

rf_classfier = RandomForestClassifier()
gscv = GridSearchCV(estimator=rf_classfier,param_grid=params_grid,cv=5)

gscv.fit(X_train,y_train)

  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


In [23]:
best_params = gscv.best_params_
best_params

{'bootstrap': True,
 'criterion': 'gini',
 'max_depth': 50,
 'max_features': 'sqrt',
 'min_samples_split': 4,
 'n_estimators': 20}

In [24]:
gscv.score(X_train,y_train)

0.9181818181818182

In [25]:
y_preds = gscv.predict(X_test)

In [26]:
print('Score:',accuracy_score(y_test,y_preds))

Score: 0.21428571428571427
