# Evaluating who made the test (ENEM 2016) just for trainning

In [1]:
# Packages
import pandas as pd
import numpy as np
from sklearn import linear_model
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import statsmodels.api as sm
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, roc_auc_score

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  import pandas.util.testing as tm


### Loading Train and Test data

In [2]:
notas_treino = pd.read_csv('train.csv')
notas_teste = pd.read_csv('test.csv')

### Using correlation, it was selected the variables that has 25%, at least, of correlation with 'IN_TREINEIRO'

In [3]:
corr1 = notas_treino.corr()[notas_treino.corr()['IN_TREINEIRO']<-0.25]['IN_TREINEIRO']
corr2 = notas_treino.corr()[notas_treino.corr()['IN_TREINEIRO']>0.25]['IN_TREINEIRO']
f_corr1 = corr1.index.to_list()
f_corr2 = corr2.index.to_list()
feat_corr = f_corr1 + f_corr2
feat_corr_test = feat_corr[0:3]
print(feat_corr)

['NU_IDADE', 'TP_ANO_CONCLUIU', 'TP_ST_CONCLUSAO', 'IN_TREINEIRO']


In [4]:
df_notas_treino = notas_treino[feat_corr]
df_notas_treino.dtypes

NU_IDADE           int64
TP_ANO_CONCLUIU    int64
TP_ST_CONCLUSAO    int64
IN_TREINEIRO       int64
dtype: object

In [5]:
print(df_notas_treino.isna().sum())

NU_IDADE           0
TP_ANO_CONCLUIU    0
TP_ST_CONCLUSAO    0
IN_TREINEIRO       0
dtype: int64


#### Check if the target variable is unbalanced

In [6]:
df_notas_treino["IN_TREINEIRO"].value_counts()

0    11947
1     1783
Name: IN_TREINEIRO, dtype: int64

## As we can see, the target variable is unbalanced. Let's use SMOTE technique to balance it.

In [7]:
smote = SMOTE()
X_smote, y_smote = smote.fit_resample(df_notas_treino.iloc[:,:-1], df_notas_treino["IN_TREINEIRO"])

## Now, the target variable is balanced

In [8]:
y_smote.value_counts()

1    11947
0    11947
Name: IN_TREINEIRO, dtype: int64

### Now we can divide the data into train and test 

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X_smote, y_smote, test_size=0.2, random_state = 42)

## First model: Logistic Regression

In [81]:
model_v1 = LogisticRegression()
model_v1.fit(X_train, y_train)
predict_train_v1 = model_v1.predict(X_test)

In [84]:
LogReg_v1 = {'Modelo':'Logistic Regression',
               'Precision':precision_score(predict_train_v1, y_test),
               'Recall':recall_score(predict_train_v1, y_test),
               'F1 Score':f1_score(predict_train_v1, y_test),
               'Acurácia':accuracy_score(predict_train_v1, y_test),
               'AUC':roc_auc_score(y_test, predict_train_v1)}
LogReg_v1

{'Modelo': 'Logistic Regression',
 'Precision': 0.9773109243697479,
 'Recall': 0.9935924818453652,
 'F1 Score': 0.9853844524465156,
 'Acurácia': 0.9855618330194601,
 'AUC': 0.9855291595587798}

In [11]:
df_notas_test = notas_teste[feat_corr_test]
predict_teste_v1 = model_v1.predict(df_notas_test)
predict_teste_v1

array([0, 0, 1, ..., 0, 0, 0], dtype=int64)

### Using Logistic Regression, we had a predict accuracy of 99,037%. Maybe we can do better using another tecnique...

## Second model: AdaBoost Classifier

In [85]:
model_v2 = AdaBoostClassifier(DecisionTreeClassifier(max_depth=None),
                          n_estimators=400)
model_v2.fit(X_train, y_train)
predict_train_v2 = model_v2.predict(X_test)

In [87]:
AdaBClas_v1 = {'Modelo':'AdaBoostClassifier',
               'Precision':precision_score(predict_train_v2, y_test),
               'Recall':recall_score(predict_train_v2, y_test),
               'F1 Score':f1_score(predict_train_v2, y_test),
               'Acurácia':accuracy_score(predict_train_v2, y_test),
               'AUC':roc_auc_score(y_test, predict_train_v2)}
AdaBClas_v1

{'Modelo': 'AdaBoostClassifier',
 'Precision': 1.0,
 'Recall': 0.9875518672199171,
 'F1 Score': 0.9937369519832986,
 'Acurácia': 0.9937225360954175,
 'AUC': 0.9937473947478116}

In [88]:
predict_teste_v2 = model_v2.predict(df_notas_test)
predict_teste_v2

array([0, 0, 1, ..., 0, 0, 0], dtype=int64)

In [None]:
#98,731

### Using AdaBoost Classifier, we had a predict accuracy of 98,731%. This one was a little bit worse than Logistic Regression.
### Let's try another on.

## Third model: Support Vector Machine (SVM)

In [141]:
model_v3 = svm.SVC(kernel = 'linear')
model_v3.fit(X_train, y_train)
predict_train_v3 = model_v3.predict(X_test)

In [142]:
SVM_v1 = {'Modelo':'SVM',
               'Precision':precision_score(predict_train_v3, y_test),
               'Recall':recall_score(predict_train_v3, y_test),
               'F1 Score':f1_score(predict_train_v3, y_test),
               'Acurácia':accuracy_score(predict_train_v3, y_test),
               'AUC':roc_auc_score(y_test, predict_train_v3)}
SVM_v1

{'Modelo': 'SVM',
 'Precision': 0.9773109243697479,
 'Recall': 0.9935924818453652,
 'F1 Score': 0.9853844524465156,
 'Acurácia': 0.9855618330194601,
 'AUC': 0.9855291595587798}

In [143]:
predict_teste_v3 = model_v3.predict(df_notas_test)
predict_teste_v3

array([0, 0, 1, ..., 0, 0, 0], dtype=int64)

### Using SVM, we had a predict accuracy of 99,038%, better than Logistic Regression.
### But I'll play my last card!!!

## Fourth model: KNN

In [126]:
model_v4 = KNeighborsClassifier(n_neighbors=3)
model_v4.fit(X_train, y_train)
predict_train_v4 = model_v4.predict(X_test)

In [127]:
KNNClas_v1 = {'Modelo':'KNeighborsClassifier',
               'Precision':precision_score(predict_train_v4, y_test),
               'Recall':recall_score(predict_train_v4, y_test),
               'F1 Score':f1_score(predict_train_v4, y_test),
               'Acurácia':accuracy_score(predict_train_v4, y_test),
               'AUC':roc_auc_score(y_test, predict_train_v4)}
KNNClas_v1

{'Modelo': 'KNeighborsClassifier',
 'Precision': 0.9773109243697479,
 'Recall': 1.0,
 'F1 Score': 0.9885252868678283,
 'Acurácia': 0.9887005649717514,
 'AUC': 0.988655462184874}

In [128]:
predict_teste_v4 = model_v4.predict(df_notas_test)
predict_teste_v4

array([0, 0, 1, ..., 0, 0, 0], dtype=int64)

## Very impressive!!! Using KNN, we had a predict accuracy of 99,69%!!!
### Just save this result in a CSV file

In [144]:
answer = notas_teste
answer = answer.iloc[:,:1]
answer.insert(1, "IN_TREINEIRO", predict_teste_v4, True) 
answer.to_csv('answer.csv')