In [15]:
import pandas as pd
import numpy as np

In [16]:
df = pd.read_csv('../database/combined_cleaned/c_to_h.csv')
df = df.drop(['index'], axis=1)

## Select interet subjects

In [17]:
y = 1
df['Course_le'] = ([y if x == 'PCM'  else y-1 for x in df['Course']])
df

Unnamed: 0,AGGT,DIV,CIV,HIST,GEO,KISW,ENGL,PHY,CHEM,BIO,B/MATH,Course,Course_le
0,19,II,C,B,C,B,C,F,C,C,C,CBG,0
1,20,II,C,C,C,B,C,F,C,C,C,CBG,0
2,17,I,C,B,C,B,C,C,C,C,A,HGL,0
3,25,III,D,D,D,C,D,F,D,C,C,COMMUNITY DEVELOPMENT,0
4,19,II,C,C,D,C,C,D,B,B,C,PCB,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
7494,21,II,C,C,C,C,C,D,C,C,D,HKL,0
7495,25,III,D,D,D,D,C,D,C,C,D,ACCOUNTANCY,0
7496,11,I,B,B,C,A,A,C,B,A,B,PCB,0
7497,22,III,C,D,C,C,C,F,D,C,F,ACCOUNTANCY,0


In [18]:
from sklearn.preprocessing import LabelEncoder 

In [19]:
le = LabelEncoder()

In [20]:
cols = df.columns
cols = list(cols)
cols.pop()
cols[2:]

df['DIV'] = le.fit_transform(df['DIV'])
df['CIV'] = le.fit_transform(df['CIV'])
df['HIST'] = le.fit_transform(df['HIST'])
df['GEO'] = le.fit_transform(df['GEO'])
df['KISW'] = le.fit_transform(df['KISW'])
df['ENGL'] = le.fit_transform(df['ENGL'])
df['PHY'] = le.fit_transform(df['PHY'])
df['CHEM'] = le.fit_transform(df['CHEM'])
df['BIO'] = le.fit_transform(df['BIO'])
df['B/MATH'] = le.fit_transform(df['B/MATH'])

In [21]:
df

Unnamed: 0,AGGT,DIV,CIV,HIST,GEO,KISW,ENGL,PHY,CHEM,BIO,B/MATH,Course,Course_le
0,19,1,2,1,2,1,2,4,2,2,2,CBG,0
1,20,1,2,2,2,1,2,4,2,2,2,CBG,0
2,17,0,2,1,2,1,2,2,2,2,0,HGL,0
3,25,2,3,3,3,2,3,4,3,2,2,COMMUNITY DEVELOPMENT,0
4,19,1,2,2,3,2,2,3,1,1,2,PCB,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
7494,21,1,2,2,2,2,2,3,2,2,3,HKL,0
7495,25,2,3,3,3,3,2,3,2,2,3,ACCOUNTANCY,0
7496,11,0,1,1,2,0,0,2,1,0,1,PCB,0
7497,22,2,2,3,2,2,2,4,3,2,4,ACCOUNTANCY,0


In [22]:
df = df[df['Course'].isin(['PCM', 'PCB', 'EGM','CBG'])]
df

Unnamed: 0,AGGT,DIV,CIV,HIST,GEO,KISW,ENGL,PHY,CHEM,BIO,B/MATH,Course,Course_le
0,19,1,2,1,2,1,2,4,2,2,2,CBG,0
1,20,1,2,2,2,1,2,4,2,2,2,CBG,0
4,19,1,2,2,3,2,2,3,1,1,2,PCB,0
11,20,1,1,3,2,1,2,4,2,2,4,CBG,0
13,20,1,2,3,2,1,1,3,2,2,3,CBG,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
7491,20,1,2,3,3,2,1,3,2,2,2,PCM,1
7492,19,1,2,2,2,1,1,3,2,2,2,PCM,1
7493,19,1,2,2,2,1,1,4,2,2,3,CBG,0
7496,11,0,1,1,2,0,0,2,1,0,1,PCB,0


In [23]:
X_values = df.drop(['Course', 'Course_le'], axis=1)
y_values = df['Course_le']

In [24]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB

In [25]:
models_selection = {
    'svm': {
        'model': SVC(gamma='auto'),
        'params': {
            'C': [17, 20, 50],
            'kernel': ['linear', 'rbf'],
            'gamma': ['auto', 'scale'],
            'probability': [True, False],
            'class_weight': ['balanced', None]
        }
    },
    'Decision Tree': {
        'model': DecisionTreeClassifier(),
        'params': {}
    },
    'Random Forest': {
        'model': RandomForestClassifier(),
        'params': {
            'n_estimators': [2, 11, 15, 12]
        }
    },
    'knn': {
        'model': KNeighborsClassifier(),
        'params': {
            'n_neighbors':[20, 80],
            'metric' : ['euclidean', 'minkowski']
        }
    },
    'Logistic Reg': {
        'model' : LogisticRegression(solver='liblinear', multi_class='auto'),
        'params': {
            'C': [96, 65, 50]
        }
    },
    'Gaussian Bayes':{
        'model': GaussianNB(),
        'params':{
            
        }
    }
}

In [26]:
results = []
for model, model_values in models_selection.items():
    best_model = GridSearchCV(model_values['model'], model_values['params'], scoring="roc_auc")
    best_model.fit(X_values, y_values)
    results.append({
        'model' : model,
        'best score' : best_model.best_score_,
        'best params' : best_model.best_params_
    })

In [27]:
results_tabel = pd.DataFrame(results)
results_tabel

Unnamed: 0,model,best score,best params
0,svm,0.777683,"{'C': 50, 'class_weight': 'balanced', 'gamma':..."
1,Decision Tree,0.631731,{}
2,Random Forest,0.725608,{'n_estimators': 15}
3,knn,0.762205,"{'metric': 'euclidean', 'n_neighbors': 80}"
4,Logistic Reg,0.762732,{'C': 50}
5,Gaussian Bayes,0.690815,{}


In [28]:
results

[{'model': 'svm',
  'best score': 0.7776833219821004,
  'best params': {'C': 50,
   'class_weight': 'balanced',
   'gamma': 'scale',
   'kernel': 'rbf',
   'probability': True}},
 {'model': 'Decision Tree',
  'best score': 0.6317312234506928,
  'best params': {}},
 {'model': 'Random Forest',
  'best score': 0.7256080848373262,
  'best params': {'n_estimators': 15}},
 {'model': 'knn',
  'best score': 0.7622054316915374,
  'best params': {'metric': 'euclidean', 'n_neighbors': 80}},
 {'model': 'Logistic Reg',
  'best score': 0.7627319631012519,
  'best params': {'C': 50}},
 {'model': 'Gaussian Bayes',
  'best score': 0.6908148294144658,
  'best params': {}}]