A local school district has a goal to reach a 95% graduation rate by the end of the decade by identifying students who need intervention before they drop out of school. As a software engineer contacted by the school district, your task is to model the factors that predict how likely a student is to pass their high school final exam, by constructing an intervention system that leverages supervised learning techniques. The board of supervisors has asked that you find the most effective model that uses the least amount of computation costs to save on the budget. You will need to analyze the dataset on students' performance and develop a model that will predict a given student will pass, quantifying whether an intervention is necessary

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
data = pd.read_csv('/content/drive/MyDrive/ICTAK/student-data.csv')

In [3]:
data.shape

(395, 31)

In [4]:
data.head()

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,internet,romantic,famrel,freetime,goout,Dalc,Walc,health,absences,passed
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,no,no,4,3,4,1,1,3,6,no
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,yes,no,5,3,3,1,1,3,4,no
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,yes,no,4,3,2,2,3,3,10,yes
3,GP,F,15,U,GT3,T,4,2,health,services,...,yes,yes,3,2,2,1,1,5,2,yes
4,GP,F,16,U,GT3,T,3,3,other,other,...,no,no,4,3,2,1,2,5,4,yes


In [5]:
data.isnull().sum()

school        0
sex           0
age           0
address       0
famsize       0
Pstatus       0
Medu          0
Fedu          0
Mjob          0
Fjob          0
reason        0
guardian      0
traveltime    0
studytime     0
failures      0
schoolsup     0
famsup        0
paid          0
activities    0
nursery       0
higher        0
internet      0
romantic      0
famrel        0
freetime      0
goout         0
Dalc          0
Walc          0
health        0
absences      0
passed        0
dtype: int64

In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 395 entries, 0 to 394
Data columns (total 31 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   school      395 non-null    object
 1   sex         395 non-null    object
 2   age         395 non-null    int64 
 3   address     395 non-null    object
 4   famsize     395 non-null    object
 5   Pstatus     395 non-null    object
 6   Medu        395 non-null    int64 
 7   Fedu        395 non-null    int64 
 8   Mjob        395 non-null    object
 9   Fjob        395 non-null    object
 10  reason      395 non-null    object
 11  guardian    395 non-null    object
 12  traveltime  395 non-null    int64 
 13  studytime   395 non-null    int64 
 14  failures    395 non-null    int64 
 15  schoolsup   395 non-null    object
 16  famsup      395 non-null    object
 17  paid        395 non-null    object
 18  activities  395 non-null    object
 19  nursery     395 non-null    object
 20  higher    

In [7]:
data.columns

Index(['school', 'sex', 'age', 'address', 'famsize', 'Pstatus', 'Medu', 'Fedu',
       'Mjob', 'Fjob', 'reason', 'guardian', 'traveltime', 'studytime',
       'failures', 'schoolsup', 'famsup', 'paid', 'activities', 'nursery',
       'higher', 'internet', 'romantic', 'famrel', 'freetime', 'goout', 'Dalc',
       'Walc', 'health', 'absences', 'passed'],
      dtype='object')

In [8]:
x = data.drop(['address','famsize','Pstatus','Mjob', 'Fjob', 'reason','guardian','schoolsup', 'famsup', 'paid','nursery',
       'higher', 'internet', 'romantic','passed'],axis=1)
y = pd.DataFrame(data['passed'])

In [9]:
x['school'].value_counts()

GP    349
MS     46
Name: school, dtype: int64

In [10]:
data['activities'].value_counts()

yes    201
no     194
Name: activities, dtype: int64

In [11]:
x = pd.get_dummies(x, columns=['school','activities','sex'])

In [12]:
y = pd.get_dummies(y, columns=['passed'])

In [13]:
x.head()

Unnamed: 0,age,Medu,Fedu,traveltime,studytime,failures,famrel,freetime,goout,Dalc,Walc,health,absences,school_GP,school_MS,activities_no,activities_yes,sex_F,sex_M
0,18,4,4,2,2,0,4,3,4,1,1,3,6,1,0,1,0,1,0
1,17,1,1,1,2,0,5,3,3,1,1,3,4,1,0,1,0,1,0
2,15,1,1,1,2,3,4,3,2,2,3,3,10,1,0,1,0,1,0
3,15,4,2,1,3,0,3,2,2,1,1,5,2,1,0,0,1,1,0
4,16,3,3,1,2,0,4,3,2,1,2,5,4,1,0,1,0,1,0


In [14]:
x.shape

(395, 19)

In [15]:
x.columns

Index(['age', 'Medu', 'Fedu', 'traveltime', 'studytime', 'failures', 'famrel',
       'freetime', 'goout', 'Dalc', 'Walc', 'health', 'absences', 'school_GP',
       'school_MS', 'activities_no', 'activities_yes', 'sex_F', 'sex_M'],
      dtype='object')

In [16]:
x['pedu'] = x['Medu'] + x['Fedu']
x['freetimes'] = x['freetime'] + x['goout']
x = x.drop(['freetime', 'goout','Medu', 'Fedu'],axis=1)

In [17]:
x.head()

Unnamed: 0,age,traveltime,studytime,failures,famrel,Dalc,Walc,health,absences,school_GP,school_MS,activities_no,activities_yes,sex_F,sex_M,pedu,freetimes
0,18,2,2,0,4,1,1,3,6,1,0,1,0,1,0,8,7
1,17,1,2,0,5,1,1,3,4,1,0,1,0,1,0,2,6
2,15,1,2,3,4,2,3,3,10,1,0,1,0,1,0,2,5
3,15,1,3,0,3,1,1,5,2,1,0,0,1,1,0,6,4
4,16,1,2,0,4,1,2,5,4,1,0,1,0,1,0,6,5


In [18]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.3,random_state=2)

In [19]:
x_train.head()

Unnamed: 0,age,traveltime,studytime,failures,famrel,Dalc,Walc,health,absences,school_GP,school_MS,activities_no,activities_yes,sex_F,sex_M,pedu,freetimes
10,15,1,2,0,3,1,2,2,0,1,0,1,0,1,0,8,6
112,16,1,2,1,3,1,1,5,6,1,0,0,1,1,0,4,3
338,18,1,4,0,5,1,1,1,7,1,0,1,0,1,0,6,6
292,18,1,2,1,5,1,1,5,12,1,0,1,0,1,0,3,7
305,18,1,2,1,4,1,1,3,8,1,0,0,1,1,0,6,7


In [20]:
x_test.head()

Unnamed: 0,age,traveltime,studytime,failures,famrel,Dalc,Walc,health,absences,school_GP,school_MS,activities_no,activities_yes,sex_F,sex_M,pedu,freetimes
94,15,1,4,0,4,1,1,4,6,1,0,0,1,0,1,4,7
32,15,1,2,0,4,1,1,5,0,1,0,0,1,0,1,7,7
222,16,1,2,0,2,1,1,3,2,1,0,1,0,1,0,5,4
329,17,2,3,0,4,1,2,4,4,1,0,1,0,1,0,8,6
369,18,3,2,0,3,4,2,5,10,0,1,1,0,1,0,8,4


In [21]:
y_train.head()

Unnamed: 0,passed_no,passed_yes
10,1,0
112,0,1
338,0,1
292,0,1
305,0,1


In [22]:
y_test.head()

Unnamed: 0,passed_no,passed_yes
94,0,1
32,0,1
222,0,1
329,0,1
369,0,1


In [23]:
from sklearn.ensemble import RandomForestClassifier
rf=RandomForestClassifier()
rf.fit(x_train,y_train)

y_pred =rf.predict(x_test)

In [24]:
y_pred

array([[0, 1],
       [0, 1],
       [0, 1],
       [0, 1],
       [0, 1],
       [1, 0],
       [0, 1],
       [0, 1],
       [1, 0],
       [0, 1],
       [0, 1],
       [1, 0],
       [0, 1],
       [0, 1],
       [0, 1],
       [0, 1],
       [0, 1],
       [0, 1],
       [0, 1],
       [1, 0],
       [0, 1],
       [0, 1],
       [0, 1],
       [0, 1],
       [0, 1],
       [0, 1],
       [0, 1],
       [1, 0],
       [0, 1],
       [0, 1],
       [0, 1],
       [1, 0],
       [0, 1],
       [0, 1],
       [0, 1],
       [0, 1],
       [0, 1],
       [0, 1],
       [0, 1],
       [0, 1],
       [1, 0],
       [0, 1],
       [0, 1],
       [0, 1],
       [0, 1],
       [0, 1],
       [0, 1],
       [0, 1],
       [0, 1],
       [1, 0],
       [1, 0],
       [0, 1],
       [0, 1],
       [0, 1],
       [1, 0],
       [0, 1],
       [0, 1],
       [1, 0],
       [0, 1],
       [0, 1],
       [0, 1],
       [0, 1],
       [0, 1],
       [0, 1],
       [1, 0],
       [0, 1],
       [0,

In [25]:
from sklearn.metrics import f1_score,confusion_matrix
print("f1_score : ",f1_score(y_test, y_pred,average='weighted'))
print("confusion_matrix : ",confusion_matrix(y_test.values.argmax(axis=1),y_pred.argmax(axis=1)))

f1_score :  0.6498371375302083
confusion_matrix :  [[10 29]
 [ 8 72]]


In [26]:
pd.Series(rf.feature_importances_,index=x.columns).sort_values(ascending=False)*100

absences          12.376943
failures          11.047036
pedu              10.391023
age                9.380749
freetimes          9.304173
health             7.688787
Walc               7.249616
studytime          6.472865
famrel             6.064884
traveltime         4.603265
Dalc               3.183483
activities_yes     2.639086
sex_F              2.461021
activities_no      2.283875
sex_M              2.220699
school_MS          1.378032
school_GP          1.254463
dtype: float64