Problem: We are going to classify whether a student will pass the exam or not, based on the given input features.

In [1]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier 

Importing all classification models from sklearn

In [2]:
import pandas as pd

# Load ./data/stud.csv into the working frame `df`.
df = pd.read_csv(filepath_or_buffer='./data/stud.csv')

In [3]:
# Collapse the three subject scores into one floor-divided average, then drop
# the individual score columns so only `avg_score` remains.
score_columns = ['math_score', 'reading_score', 'writing_score']
total_score = df['math_score'] + df['reading_score'] + df['writing_score']
df['avg_score'] = total_score // 3
df.drop(columns=score_columns, inplace=True)

In [4]:
# Inspect the column dtypes to see which features still need encoding.
df.dtypes

gender                         object
race_ethnicity                 object
parental_level_of_education    object
lunch                          object
test_preparation_course        object
avg_score                       int64
dtype: object

Label Encoding vs One Hot Encoding

One-hot encoding is preferred for features such as race/ethnicity and parental level of education, because these are nominal and their categories have no definite relationship between them.

In [5]:
# Target variable: `pop` returns the avg_score column and removes it from the
# feature frame in the same step.
target = df.pop('avg_score')


In [6]:
# Set aside the columns that will be one-hot encoded and remove them from the
# feature frame; they are concatenated back once encoded.
nominal_columns = ['parental_level_of_education', 'race_ethnicity']
one_hot_category = df.loc[:, nominal_columns]
df.drop(columns=nominal_columns, inplace=True)

In [7]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

# Replace each remaining categorical column with integer codes. The single
# encoder is re-fitted per column, so afterwards `le` only remembers the
# classes of the last column in the list.
columns = ['lunch', 'gender', 'test_preparation_course']
for column_name in columns:
    encoded_values = le.fit_transform(df[column_name])
    df[column_name] = encoded_values

# Expand the columns set aside for one-hot encoding into 0/1 indicator
# columns and attach them to the label-encoded features.
new_df = pd.get_dummies(one_hot_category, dtype=int)
df = pd.concat([df, new_df], axis=1)
df


Unnamed: 0,gender,lunch,test_preparation_course,parental_level_of_education_associate's degree,parental_level_of_education_bachelor's degree,parental_level_of_education_high school,parental_level_of_education_master's degree,parental_level_of_education_some college,parental_level_of_education_some high school,race_ethnicity_group A,race_ethnicity_group B,race_ethnicity_group C,race_ethnicity_group D,race_ethnicity_group E
0,0,1,1,0,1,0,0,0,0,0,1,0,0,0
1,0,1,0,0,0,0,0,1,0,0,0,1,0,0
2,0,1,1,0,0,0,1,0,0,0,1,0,0,0
3,1,0,1,1,0,0,0,0,0,1,0,0,0,0
4,1,1,1,0,0,0,0,1,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,0,1,0,0,0,0,1,0,0,0,0,0,0,1
996,1,0,1,0,0,1,0,0,0,0,0,1,0,0
997,0,0,0,0,0,1,0,0,0,0,0,1,0,0
998,0,1,0,0,0,0,0,1,0,0,0,0,1,0


We have successfully converted our dataframe into one-hot encoded form.

In [8]:
# Binarise the label: 1 = average score below 50 (fail), 0 = 50 or above (pass).
target = pd.DataFrame(target, columns=['avg_score'])
is_fail = target['avg_score'] < 50
target['avg_score'] = is_fail.astype('int64')




In [9]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing.
# - random_state makes the split reproducible across kernel restarts
#   (the original call produced a different split on every run).
# - stratify keeps the class-1 / class-0 ratio the same in the train and test
#   partitions, which matters because the two classes are imbalanced.
x_train, x_test, y_train, y_test = train_test_split(
    df,
    target['avg_score'],
    test_size=0.2,
    random_state=42,
    stratify=target['avg_score'],
)

In [47]:
from imblearn.over_sampling import SMOTE

# Oversample the training split only (the test split is left untouched).
# The fixed seed keeps the synthetic rows identical between runs.
smote = SMOTE(random_state=42, sampling_strategy='auto')
resampled_pair = smote.fit_resample(x_train, y_train)
x_train_resampled, y_train_resampled = resampled_pair

In [46]:
# Display the oversampled training features. The variable created by the SMOTE
# cell is lower-case `x_train_resampled`; the capitalised name raised a
# NameError on a fresh kernel.
x_train_resampled

Unnamed: 0,gender,lunch,test_preparation_course,parental_level_of_education_associate's degree,parental_level_of_education_bachelor's degree,parental_level_of_education_high school,parental_level_of_education_master's degree,parental_level_of_education_some college,parental_level_of_education_some high school,race_ethnicity_group A,race_ethnicity_group B,race_ethnicity_group C,race_ethnicity_group D,race_ethnicity_group E
0,0,1,1,0,0,1,0,0,0,0,0,1,0,0
1,0,1,1,1,0,0,0,0,0,0,0,1,0,0
2,0,0,1,0,1,0,0,0,0,0,1,0,0,0
3,0,1,1,0,0,0,0,1,0,0,0,1,0,0
4,1,1,1,0,0,0,0,1,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1425,1,0,1,0,0,0,0,0,0,1,0,0,0,0
1426,1,0,1,0,0,0,0,0,0,0,1,0,0,0
1427,1,0,1,0,0,0,0,1,0,0,0,1,0,0
1428,0,0,1,0,0,0,0,0,1,0,0,1,0,0


The metric used here is precision, so that the students who will fail the exam are identified precisely.

In [18]:
# Candidate classifiers, one per class imported at the top of the notebook.
# The previous version of this cell listed regressors (Lasso, Ridge,
# RandomForestRegressor, SVR) that were never imported, and its name list had
# a different length from the model list. `model_name[k]` now labels `models[k]`.
model_name = [
  'LogisticRegression',
  'DecisionTreeClassifier',
  'RandomForestClassifier',
  'SVC',
  'GradientBoostingClassifier',
]
models = [
  LogisticRegression(),
  DecisionTreeClassifier(),
  RandomForestClassifier(),
  SVC(),
  GradientBoostingClassifier(),
]

Using Logistic Regression

Setting right parameters for param grid <br>
Solve the data imbalance problem <br>
Choose right model

In [54]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV

# Candidate classifiers and the hyper-parameter grid searched for each one.
model_type = {
    'svm': {
        'model': SVC(gamma='auto'),
        'params': {
            'kernel': ['linear', 'rbf'],
            'C': [1, 5, 10, 20, 30],
        },
    },
    'lr': {
        'model': LogisticRegression(),
        'params': {
            'C': [1, 5, 10, 20, 30],
        },
    },
    'rf': {
        # Seeded so the forest, and therefore the search result, is the same
        # on every run.
        'model': RandomForestClassifier(random_state=42),
        'params': {
            'n_estimators': [1, 5, 10, 20, 30],
        },
    },
}

# One summary record per candidate. Initialised here so that re-running the
# cell starts from an empty list instead of appending duplicates.
l = []

for model, model_params in model_type.items():
    # 5-fold grid search on the oversampled training data, ranked by
    # precision (scikit-learn's default positive label is 1).
    # NOTE(review): the folds are cut from data that was oversampled before
    # cross-validation, so `best_score_` is probably optimistic; the
    # classification report on the untouched test split is the safer number.
    gsv = GridSearchCV(model_params['model'], model_params['params'], cv=5, scoring='precision')
    gsv.fit(x_train_resampled, y_train_resampled)

    test_predictions = gsv.best_estimator_.predict(x_test)
    l.append({
        'model': model,
        'score': gsv.best_score_,
        'bestparamas': gsv.best_params_,
        'classification_report': classification_report(y_true=y_test, y_pred=test_predictions),
    })


In [55]:
# Tabulate the per-model search summaries, one row per candidate.
model_df = pd.DataFrame.from_records(l)
model_df

Unnamed: 0,model,score,bestparamas,classification_report
0,svm,0.847959,"{'C': 30, 'kernel': 'rbf'}",precision recall f1-score ...
1,lr,0.830797,{'C': 30},precision recall f1-score ...
2,rf,0.827818,{'n_estimators': 1},precision recall f1-score ...


In [59]:
# Predict on the held-out test split.
# NOTE(review): `gsv` is whatever the grid-search loop left behind, i.e. the
# search for the last key of `model_type` ('rf'), not necessarily the model
# with the highest score in `model_df`.
y_pred = gsv.best_estimator_.predict(x_test)

In [74]:
# Recall of class 1 on the test split: of the rows whose true label is 1,
# the fraction that the model also predicted as 1.
#   c    -> number of test rows with true label 1
#   sum_ -> how many of those rows were predicted as 1
actual_positive = (y_test == 1).to_numpy()
c = int(actual_positive.sum())
sum_ = int((y_pred[actual_positive] == 1).sum())

# Guard against a test split with no label-1 rows, which previously raised
# ZeroDivisionError.
sum_ / c if c else float('nan')

0.3888888888888889