In [1]:
import pandas as pd
import numpy as np

In [2]:
df=pd.read_csv('student-data.csv') #reading the dataset
df.head()

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,internet,romantic,famrel,freetime,goout,Dalc,Walc,health,absences,passed
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,no,no,4,3,4,1,1,3,6,no
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,yes,no,5,3,3,1,1,3,4,no
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,yes,no,4,3,2,2,3,3,10,yes
3,GP,F,15,U,GT3,T,4,2,health,services,...,yes,yes,3,2,2,1,1,5,2,yes
4,GP,F,16,U,GT3,T,3,3,other,other,...,no,no,4,3,2,1,2,5,4,yes


In [3]:
df.shape #shape of dataset

(395, 31)

There are 395 rows and 31 columns in the dataset

In [4]:
# checking for null values in dataset
df.isna().sum().sum()

0

There are 0 null values in the dataset

In [13]:
df.columns

Index(['school', 'sex', 'age', 'address', 'famsize', 'Pstatus', 'Medu', 'Fedu',
       'Mjob', 'Fjob', 'reason', 'guardian', 'traveltime', 'studytime',
       'failures', 'schoolsup', 'famsup', 'paid', 'activities', 'nursery',
       'higher', 'internet', 'romantic', 'famrel', 'freetime', 'goout', 'Dalc',
       'Walc', 'health', 'absences', 'passed'],
      dtype='object')

In [5]:
df.dtypes #checking the datatype of columns

school        object
sex           object
age            int64
address       object
famsize       object
Pstatus       object
Medu           int64
Fedu           int64
Mjob          object
Fjob          object
reason        object
guardian      object
traveltime     int64
studytime      int64
failures       int64
schoolsup     object
famsup        object
paid          object
activities    object
nursery       object
higher        object
internet      object
romantic      object
famrel         int64
freetime       int64
goout          int64
Dalc           int64
Walc           int64
health         int64
absences       int64
passed        object
dtype: object

In [6]:
#checking the unique values in the categorical columns
df_obj=[f for f in df.columns if df[f].dtypes=='O']
for d in df_obj:
    print(d,df[d].unique())

school ['GP' 'MS']
sex ['F' 'M']
address ['U' 'R']
famsize ['GT3' 'LE3']
Pstatus ['A' 'T']
Mjob ['at_home' 'health' 'other' 'services' 'teacher']
Fjob ['teacher' 'other' 'services' 'health' 'at_home']
reason ['course' 'other' 'home' 'reputation']
guardian ['mother' 'father' 'other']
schoolsup ['yes' 'no']
famsup ['no' 'yes']
paid ['no' 'yes']
activities ['no' 'yes']
nursery ['yes' 'no']
higher ['yes' 'no']
internet ['no' 'yes']
romantic ['no' 'yes']
passed ['no' 'yes']


In [7]:
df_obj.pop() #removing target column

'passed'

In [8]:
df['passed']=df['passed'].map({'yes':1,'no':0}) #target value encoding

In [14]:
#checking the pass percentage of the entire dataset to get an idea
round((df['passed'].value_counts()[1])/(len(df['passed']))*100,2)

67.09

### Preprocessing

In [17]:
#Setting the feature X
X=df.drop(['passed'],axis=1)
#Setting target
y=df['passed']

In [18]:
# dropping irrelevant columns from X
X.drop(['Fedu','traveltime','activities', 'nursery','paid'], axis=1,inplace=True)

#### One hot encoding on categorical feature columns

In [19]:
df_obj=[f for f in X.columns if X[f].dtypes=='O']
X=pd.get_dummies(X,columns=df_obj,drop_first=True)
X.head()

Unnamed: 0,age,Medu,studytime,failures,famrel,freetime,goout,Dalc,Walc,health,...,reason_home,reason_other,reason_reputation,guardian_mother,guardian_other,schoolsup_yes,famsup_yes,higher_yes,internet_yes,romantic_yes
0,18,4,2,0,4,3,4,1,1,3,...,0,0,0,1,0,1,0,1,0,0
1,17,1,2,0,5,3,3,1,1,3,...,0,0,0,0,0,0,1,1,1,0
2,15,1,2,3,4,3,2,2,3,3,...,0,1,0,1,0,1,0,1,1,0
3,15,4,3,0,3,2,2,1,1,5,...,1,0,0,1,0,0,1,1,1,1
4,16,3,2,0,4,3,2,1,2,5,...,1,0,0,0,0,0,1,1,0,0


#### Train test splitting

In [20]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,random_state=42,test_size=.3) #30% data is for testing

### Model Building and evaluation

In [21]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
log_model=LogisticRegression(solver='newton-cg')
log_model.fit(X_train,y_train)
prediction=log_model.predict(X_test)
print("F1_score :",f1_score(y_test,prediction))
print("Accuracy score :",accuracy_score(y_test,prediction))
print("Confusion_matrix :",confusion_matrix(y_test,prediction))

F1_score : 0.7976190476190476
Accuracy score : 0.7142857142857143
Confusion_matrix : [[18 28]
 [ 6 67]]


With logistic regression model we are able to obtain an f1_score of 79.76% and as per the confusion matrix there are 34 misclassifications. The logistic regression method has given a fast response and can be considered as having least computational costs involved.