In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import preprocessing
from sklearn.utils import shuffle
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_validate
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

from imblearn.combine import SMOTEENN

from sklearn.linear_model import LogisticRegression
from sklearn.cluster import KMeans
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

%matplotlib inline

##### Normal Data

In [2]:
def _split_features_target(frame):
    """Split one loaded dataset into (features, target).

    The features are every column except 'salary' and 'fnlwgt'; the target
    is the 'salary' column. The input frame is left untouched.
    """
    features = frame.drop(columns=['salary', 'fnlwgt'])
    target = frame['salary']
    return features, target


# "0null" variant of the dataset (presumably rows with nulls removed — confirm
# against the preprocessing notebook).
treino_0null = pd.read_csv("training_0null.csv")
teste_0null = pd.read_csv("test_0null.csv")
treino_0null_X, treino_0null_Y = _split_features_target(treino_0null)
teste_0null_X, teste_0null_Y = _split_features_target(teste_0null)

# "mode" variant (presumably missing values imputed with the mode — confirm).
treino_mode = pd.read_csv("training_mode.csv")
teste_mode = pd.read_csv("test_mode.csv")
treino_mode_X, treino_mode_Y = _split_features_target(treino_mode)
teste_mode_X, teste_mode_Y = _split_features_target(teste_mode)

# "knn" variant (presumably missing values imputed with kNN — confirm).
treino_knn = pd.read_csv("training_knn.csv")
teste_knn = pd.read_csv("test_knn.csv")
treino_knn_X, treino_knn_Y = _split_features_target(treino_knn)
teste_knn_X, teste_knn_Y = _split_features_target(teste_knn)

##### Scaled Data

In [3]:
# Load the variant of the dataset whose categorical columns are still strings.
treino_strings = pd.read_csv("training_0null_strings.csv")
teste_strings = pd.read_csv("test_0null_strings.csv")

# Target is 'salary'; features are every other column except 'fnlwgt'.
treino_strings_Y = treino_strings['salary']
teste_strings_Y = teste_strings['salary']
treino_strings_X = treino_strings.drop(columns=['salary', 'fnlwgt'])
teste_strings_X = teste_strings.drop(columns=['salary', 'fnlwgt'])

# Integer-encode each categorical column. The encoder is fitted on the
# training column only and then reused for the test column, so both splits
# share the same category -> integer mapping.
categorical = ['workclass', 'maritalstatus', 'occupation', 'relationship', 'race', 'sex', 'nativecountry']
for feature in categorical:
    le = preprocessing.LabelEncoder()
    le.fit(treino_strings_X[feature])
    treino_strings_X[feature] = le.transform(treino_strings_X[feature])
    teste_strings_X[feature] = le.transform(teste_strings_X[feature])

# Standardise every feature column. The scaler statistics come from the
# training split only and are applied unchanged to the test split.
scaler = StandardScaler()
scaler.fit(treino_strings_X)

treino_strings_X = pd.DataFrame(
    scaler.transform(treino_strings_X),
    columns=treino_strings_X.columns,
)
teste_strings_X = pd.DataFrame(
    scaler.transform(teste_strings_X),
    columns=teste_strings_X.columns,
)

###### Apply SMOTEENN

In [4]:
# One independent SMOTEENN resampler per dataset, each seeded with
# random_state=0 (four separate instances, not one shared object).
onull, mode, knn, strings = (SMOTEENN(random_state=0) for _ in range(4))

In [5]:
def _resample_and_shuffle(sampler, features, target, random_state=0):
    """Resample one training split with ``sampler`` and shuffle the result.

    Parameters
    ----------
    sampler : resampler exposing ``fit_resample(X, y)`` (here: SMOTEENN).
    features, target : training features and labels to resample.
    random_state : seed passed to ``sklearn.utils.shuffle`` so the row order
        is the same on every run.

    Returns
    -------
    The resampled and shuffled ``(features, target)`` pair.
    """
    # ``fit_sample`` was the old imbalanced-learn spelling and no longer
    # exists in current releases; ``fit_resample`` is the supported method.
    features, target = sampler.fit_resample(features, target)
    return shuffle(features, target, random_state=random_state)


# NOTE: this cell overwrites the training variables with their resampled
# versions. Running it twice resamples already-resampled data, so re-run the
# loading cells first if this cell has to be executed again.
treino_0null_X, treino_0null_Y = _resample_and_shuffle(onull, treino_0null_X, treino_0null_Y)

treino_mode_X, treino_mode_Y = _resample_and_shuffle(mode, treino_mode_X, treino_mode_Y)

treino_knn_X, treino_knn_Y = _resample_and_shuffle(knn, treino_knn_X, treino_knn_Y)

treino_strings_X, treino_strings_Y = _resample_and_shuffle(strings, treino_strings_X, treino_strings_Y)

###### Function to run

In [6]:
def _clusters_to_class_labels(train_clusters, train_labels, test_clusters):
    """Translate KMeans cluster ids into class labels.

    KMeans numbers its clusters arbitrarily, so cluster id 0 is not
    necessarily class 0. Each cluster id is mapped to the most frequent
    training label among the training rows assigned to that cluster. A
    cluster that received no training rows falls back to the overall most
    frequent training label. Only training labels are used, so the test
    labels never influence the mapping.

    Note that both clusters may map to the same class, in which case every
    prediction is that class.
    """
    labels = pd.Series(np.asarray(train_labels))
    majority_by_cluster = labels.groupby(np.asarray(train_clusters)).agg(
        lambda group: group.mode().iloc[0]
    )
    fallback = labels.mode().iloc[0]
    mapped = pd.Series(np.asarray(test_clusters)).map(majority_by_cluster).fillna(fallback)
    # Keep the dtype of the original labels so the report shows one label set.
    return mapped.astype(labels.dtype).to_numpy()


def evaluator(xTrain, yTrain, xTest, yTest, random_state=0):
    """Fit a fixed set of models and print their test-set metrics.

    For each model this prints its name, the classification report, a
    confusion table (actual vs. predicted, with margins) and the accuracy.

    Parameters
    ----------
    xTrain, yTrain : training features and labels (pandas objects; the
        KMeans branch concatenates xTrain with xTest via ``pd.concat``).
    xTest, yTest : test features and labels used for every report.
    random_state : seed for the models that are otherwise non-deterministic
        (RandomForest and KMeans), so repeated runs give the same numbers.

    Returns
    -------
    None. All results are printed.
    """
    models = [
        # NOTE(review): the recorded runs on the unscaled datasets still emit
        # a convergence warning for this model at max_iter=500.
        ("LogisticRegression", LogisticRegression(class_weight='balanced', max_iter=500)),
        ("KNeighbours", KNeighborsClassifier(n_neighbors=5)),
        ("SVM", SVC(class_weight='balanced')),
        ("Gaussian NaiveBayes", GaussianNB()),
        ("RandomForest", RandomForestClassifier(max_depth=20, class_weight='balanced',
                                                random_state=random_state)),
        ("KMeans", KMeans(n_clusters=2, random_state=random_state)),
    ]

    for name, alg in models:
        if isinstance(alg, KMeans):
            # KMeans is unsupervised: it is fitted on the train and test
            # features together and never sees the labels. The combined frame
            # is kept in a local variable so xTrain/yTrain stay intact
            # regardless of where KMeans sits in the list.
            all_features = pd.concat([xTrain, xTest], ignore_index=True)
            alg.fit(all_features)
            # Raw cluster ids are arbitrary; convert them to class labels
            # before scoring them against yTest.
            predicted = _clusters_to_class_labels(
                alg.predict(xTrain), yTrain, alg.predict(xTest)
            )
        else:
            alg.fit(xTrain, yTrain)
            predicted = alg.predict(xTest)

        print(name)
        #---------------------------------------------------
        print(classification_report(yTest, predicted))
        print(pd.crosstab(yTest, predicted, rownames=['Actual'], colnames=['Predicted'], margins=True))
        #---------------------------------------------------
        score = accuracy_score(yTest, predicted)
        print('Accuracy:{0:f}'.format(score))
        print("#---------------------------------------------------")

In [7]:
# Report every model on the "0null" training/test variables.
evaluator(
    xTrain=treino_0null_X,
    yTrain=treino_0null_Y,
    xTest=teste_0null_X,
    yTest=teste_0null_Y,
)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression
              precision    recall  f1-score   support

           0       0.90      0.73      0.80     11360
           1       0.47      0.74      0.57      3700

    accuracy                           0.73     15060
   macro avg       0.68      0.73      0.69     15060
weighted avg       0.79      0.73      0.75     15060

Predicted     0     1    All
Actual                      
0          8244  3116  11360
1           951  2749   3700
All        9195  5865  15060
Accuracy:0.729947
#---------------------------------------------------
KNeighbours
              precision    recall  f1-score   support

           0       0.93      0.76      0.84     11360
           1       0.53      0.84      0.65      3700

    accuracy                           0.78     15060
   macro avg       0.73      0.80      0.74     15060
weighted avg       0.83      0.78      0.79     15060

Predicted     0     1    All
Actual                      
0          8624  2736  11360
1           

In [8]:
# Report every model on the "mode" training/test variables.
evaluator(
    xTrain=treino_mode_X,
    yTrain=treino_mode_Y,
    xTest=teste_mode_X,
    yTest=teste_mode_Y,
)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression
              precision    recall  f1-score   support

           0       0.90      0.72      0.80     11360
           1       0.47      0.75      0.58      3700

    accuracy                           0.73     15060
   macro avg       0.68      0.74      0.69     15060
weighted avg       0.79      0.73      0.74     15060

Predicted     0     1    All
Actual                      
0          8181  3179  11360
1           918  2782   3700
All        9099  5961  15060
Accuracy:0.727955
#---------------------------------------------------
KNeighbours
              precision    recall  f1-score   support

           0       0.93      0.76      0.84     11360
           1       0.53      0.83      0.65      3700

    accuracy                           0.78     15060
   macro avg       0.73      0.80      0.74     15060
weighted avg       0.83      0.78      0.79     15060

Predicted     0     1    All
Actual                      
0          8680  2680  11360
1           

In [9]:
# Report every model on the "knn" training/test variables.
evaluator(
    xTrain=treino_knn_X,
    yTrain=treino_knn_Y,
    xTest=teste_knn_X,
    yTest=teste_knn_Y,
)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression
              precision    recall  f1-score   support

         0.0       0.91      0.70      0.79     11360
         1.0       0.46      0.78      0.58      3700

    accuracy                           0.72     15060
   macro avg       0.68      0.74      0.68     15060
weighted avg       0.80      0.72      0.74     15060

Predicted   0.0   1.0    All
Actual                      
0.0        7965  3395  11360
1.0         819  2881   3700
All        8784  6276  15060
Accuracy:0.720186
#---------------------------------------------------
KNeighbours
              precision    recall  f1-score   support

         0.0       0.94      0.74      0.83     11360
         1.0       0.51      0.85      0.64      3700

    accuracy                           0.77     15060
   macro avg       0.73      0.79      0.73     15060
weighted avg       0.83      0.77      0.78     15060

Predicted   0.0   1.0    All
Actual                      
0.0        8372  2988  11360
1.0         

In [10]:
# Report every model on the label-encoded, standardised "strings" variables.
evaluator(
    xTrain=treino_strings_X,
    yTrain=treino_strings_Y,
    xTest=teste_strings_X,
    yTest=teste_strings_Y,
)

LogisticRegression
              precision    recall  f1-score   support

           0       0.91      0.73      0.81     11360
           1       0.49      0.78      0.60      3700

    accuracy                           0.74     15060
   macro avg       0.70      0.76      0.71     15060
weighted avg       0.81      0.74      0.76     15060

Predicted     0     1    All
Actual                      
0          8336  3024  11360
1           817  2883   3700
All        9153  5907  15060
Accuracy:0.744954
#---------------------------------------------------
KNeighbours
              precision    recall  f1-score   support

           0       0.93      0.77      0.84     11360
           1       0.53      0.82      0.65      3700

    accuracy                           0.78     15060
   macro avg       0.73      0.79      0.74     15060
weighted avg       0.83      0.78      0.79     15060

Predicted     0     1    All
Actual                      
0          8693  2667  11360
1           