In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.metrics import classification_report, confusion_matrix
from sklearn.naive_bayes import GaussianNB

from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler

from sklearn.metrics import accuracy_score

# Naive Bayes - Gaussian

# 1 - Without nulls

#### Read csv created in Data_preprocessing

In [2]:
# Load the train/test splits written by the Data_preprocessing notebook
# (the "without nulls" variant).
treino_0null, teste_0null = (
    pd.read_csv(csv_path) for csv_path in ("training_0null.csv", "test_0null.csv")
)

In [3]:
# Separate the 'salary' target column from the feature columns in both splits.
treino_0null_Y = treino_0null['salary']
treino_0null_X = treino_0null.drop('salary', axis=1)
teste_0null_Y = teste_0null['salary']
teste_0null_X = teste_0null.drop('salary', axis=1)

#### Create the model to predict values

In [4]:
# Fit a Gaussian Naive Bayes classifier (default settings) on the training split.
gnb_0null = GaussianNB()
gnb_0null.fit(X=treino_0null_X, y=treino_0null_Y)

GaussianNB()

#### Predict values

In [5]:
# Predict the salary class for every row of the test features.
predictNull = gnb_0null.predict(X=teste_0null_X)

### Check accuracy of prediction

In [6]:
# Per-class precision, recall and F1 of the predictions against the true test labels.
print(classification_report(y_true=teste_0null_Y, y_pred=predictNull))

              precision    recall  f1-score   support

           0       0.81      0.95      0.87     11360
           1       0.65      0.31      0.42      3700

    accuracy                           0.79     15060
   macro avg       0.73      0.63      0.64     15060
weighted avg       0.77      0.79      0.76     15060



In [7]:
# Confusion matrix (rows = actual class, columns = predicted class) with totals.
confusionNotNull = pd.crosstab(
    index=teste_0null_Y,
    columns=predictNull,
    rownames=['Actual'],
    colnames=['Predicted'],
    margins=True,
)
print(confusionNotNull)

Predicted      0     1    All
Actual                       
0          10738   622  11360
1           2562  1138   3700
All        13300  1760  15060


In [8]:
# Overall fraction of test rows whose predicted class matches the true class.
score = accuracy_score(y_true=teste_0null_Y, y_pred=predictNull)
print('Accuracy:{0:f}'.format(score))

Accuracy:0.788579


In [9]:
# Count the test rows whose predicted class differs from the true class.
# The continuation line must not start with a pasted "..." doctest prompt:
# that is only tolerated by IPython and is a SyntaxError in plain Python.
print("Number of mislabeled points out of a total %d points : %d"
      % (teste_0null_X.shape[0], (teste_0null_Y != predictNull).sum()))

Number of mislabeled points out of a total 15060 points : 3184


### Undersampling and Oversampling

In [10]:
# Rebalance the training data only (the test split is left untouched):
# first oversample the minority class up to a 0.4 minority/majority ratio,
# then undersample the majority class until the ratio reaches 0.7.
# Resampling starts from the raw `treino_0null` frame rather than from the
# current X/Y, so re-running this cell never resamples already-resampled data,
# and a fixed random_state makes the random draws reproducible.
oversample = RandomOverSampler(sampling_strategy=0.4, random_state=42)
treino_0null_X, treino_0null_Y = oversample.fit_resample(
    treino_0null.drop(columns=['salary']), treino_0null['salary'])

undersample = RandomUnderSampler(sampling_strategy=0.7, random_state=42)
treino_0null_X, treino_0null_Y = undersample.fit_resample(treino_0null_X, treino_0null_Y)

In [11]:
# Train a fresh Gaussian Naive Bayes classifier on the resampled training data.
gnb_0null = GaussianNB()
gnb_0null.fit(X=treino_0null_X, y=treino_0null_Y)

GaussianNB()

In [12]:
# Predict the test set again with the model trained on resampled data.
predictNull = gnb_0null.predict(X=teste_0null_X)

In [13]:
# Per-class precision, recall and F1 after training on resampled data.
print(classification_report(y_true=teste_0null_Y, y_pred=predictNull))

              precision    recall  f1-score   support

           0       0.81      0.94      0.87     11360
           1       0.64      0.31      0.42      3700

    accuracy                           0.79     15060
   macro avg       0.72      0.63      0.64     15060
weighted avg       0.77      0.79      0.76     15060



In [14]:
# Confusion matrix (rows = actual class, columns = predicted class) with totals.
confusionNotNull = pd.crosstab(
    index=teste_0null_Y,
    columns=predictNull,
    rownames=['Actual'],
    colnames=['Predicted'],
    margins=True,
)
print(confusionNotNull)

Predicted      0     1    All
Actual                       
0          10712   648  11360
1           2559  1141   3700
All        13271  1789  15060


In [15]:
# Overall fraction of test rows whose predicted class matches the true class.
score = accuracy_score(y_true=teste_0null_Y, y_pred=predictNull)
print('Accuracy:{0:f}'.format(score))

Accuracy:0.787052


In [16]:
# Count the test rows whose predicted class differs from the true class.
# The continuation line must not start with a pasted "..." doctest prompt:
# that is only tolerated by IPython and is a SyntaxError in plain Python.
print("Number of mislabeled points out of a total %d points : %d"
      % (teste_0null_X.shape[0], (teste_0null_Y != predictNull).sum()))

Number of mislabeled points out of a total 15060 points : 3207


# 2 - Null values replaced by the mode

#### Read csv created in Data_preprocessing

In [17]:
# Load the train/test splits written by the Data_preprocessing notebook
# (the "mode" variant).
treino_mode, teste_mode = (
    pd.read_csv(csv_path) for csv_path in ("training_mode.csv", "test_mode.csv")
)

In [18]:
# Separate the 'salary' target column from the feature columns in both splits.
treino_mode_Y = treino_mode['salary']
treino_mode_X = treino_mode.drop('salary', axis=1)
teste_mode_Y = teste_mode['salary']
teste_mode_X = teste_mode.drop('salary', axis=1)

#### Create the model to predict values

In [19]:
# Fit a Gaussian Naive Bayes classifier (default settings) on the training split.
gnb_mode = GaussianNB()
gnb_mode.fit(X=treino_mode_X, y=treino_mode_Y)

GaussianNB()

#### Predict values

In [20]:
# Predict the salary class for every row of the test features.
predictMode = gnb_mode.predict(X=teste_mode_X)

### Check accuracy of prediction

In [21]:
# Per-class precision, recall and F1 of the predictions against the true test labels.
print(classification_report(y_true=teste_mode_Y, y_pred=predictMode))

              precision    recall  f1-score   support

           0       0.81      0.95      0.87     11360
           1       0.65      0.31      0.42      3700

    accuracy                           0.79     15060
   macro avg       0.73      0.63      0.64     15060
weighted avg       0.77      0.79      0.76     15060



In [22]:
# Confusion matrix (rows = actual class, columns = predicted class) with totals.
confusionMode = pd.crosstab(
    index=teste_mode_Y,
    columns=predictMode,
    rownames=['Actual'],
    colnames=['Predicted'],
    margins=True,
)
print(confusionMode)

Predicted      0     1    All
Actual                       
0          10754   606  11360
1           2568  1132   3700
All        13322  1738  15060


In [23]:
# Overall fraction of test rows whose predicted class matches the true class.
score = accuracy_score(y_true=teste_mode_Y, y_pred=predictMode)
print('Accuracy:{0:f}'.format(score))

Accuracy:0.789243


In [24]:
# Count the test rows whose predicted class differs from the true class.
# The continuation line must not start with a pasted "..." doctest prompt:
# that is only tolerated by IPython and is a SyntaxError in plain Python.
print("Number of mislabeled points out of a total %d points : %d"
      % (teste_mode_X.shape[0], (teste_mode_Y != predictMode).sum()))

Number of mislabeled points out of a total 15060 points : 3174


### Undersampling and Oversampling

In [25]:
# Rebalance the training data only (the test split is left untouched):
# first oversample the minority class up to a 0.4 minority/majority ratio,
# then undersample the majority class until the ratio reaches 0.7.
# Resampling starts from the raw `treino_mode` frame rather than from the
# current X/Y, so re-running this cell never resamples already-resampled data,
# and a fixed random_state makes the random draws reproducible.
oversample = RandomOverSampler(sampling_strategy=0.4, random_state=42)
treino_mode_X, treino_mode_Y = oversample.fit_resample(
    treino_mode.drop(columns=['salary']), treino_mode['salary'])

undersample = RandomUnderSampler(sampling_strategy=0.7, random_state=42)
treino_mode_X, treino_mode_Y = undersample.fit_resample(treino_mode_X, treino_mode_Y)

In [26]:
# Train a fresh Gaussian Naive Bayes classifier on the resampled training data.
gnb_mode = GaussianNB()
gnb_mode.fit(X=treino_mode_X, y=treino_mode_Y)

GaussianNB()

In [27]:
# Predict the test set again with the model trained on resampled data.
predictMode = gnb_mode.predict(X=teste_mode_X)

In [28]:
# Per-class precision, recall and F1 after training on resampled data.
print(classification_report(y_true=teste_mode_Y, y_pred=predictMode))

              precision    recall  f1-score   support

           0       0.81      0.94      0.87     11360
           1       0.64      0.31      0.42      3700

    accuracy                           0.79     15060
   macro avg       0.72      0.63      0.64     15060
weighted avg       0.77      0.79      0.76     15060



In [29]:
# Confusion matrix (rows = actual class, columns = predicted class) with totals.
confusionMode = pd.crosstab(
    index=teste_mode_Y,
    columns=predictMode,
    rownames=['Actual'],
    colnames=['Predicted'],
    margins=True,
)
print(confusionMode)

Predicted      0     1    All
Actual                       
0          10723   637  11360
1           2559  1141   3700
All        13282  1778  15060


In [30]:
# Overall fraction of test rows whose predicted class matches the true class.
score = accuracy_score(y_true=teste_mode_Y, y_pred=predictMode)
print('Accuracy:{0:f}'.format(score))

Accuracy:0.787782


In [31]:
# Count the test rows whose predicted class differs from the true class.
# The continuation line must not start with a pasted "..." doctest prompt:
# that is only tolerated by IPython and is a SyntaxError in plain Python.
print("Number of mislabeled points out of a total %d points : %d"
      % (teste_mode_X.shape[0], (teste_mode_Y != predictMode).sum()))

Number of mislabeled points out of a total 15060 points : 3196


# 3 - Null values imputed with KNN

#### Read csv created in Data_preprocessing

In [32]:
# Load the train/test splits written by the Data_preprocessing notebook
# (the "knn" variant).
treino_knn, teste_knn = (
    pd.read_csv(csv_path) for csv_path in ("training_knn.csv", "test_knn.csv")
)

In [33]:
# Separate the 'salary' target column from the feature columns in both splits.
treino_knn_Y = treino_knn['salary']
treino_knn_X = treino_knn.drop('salary', axis=1)
teste_knn_Y = teste_knn['salary']
teste_knn_X = teste_knn.drop('salary', axis=1)

#### Create the model to predict values


In [34]:
# Fit a Gaussian Naive Bayes classifier (default settings) on the training split.
gnb_knn = GaussianNB()
gnb_knn.fit(X=treino_knn_X, y=treino_knn_Y)

GaussianNB()

#### Predict values

In [35]:
# Predict the salary class for every row of the test features.
predictKNN = gnb_knn.predict(X=teste_knn_X)

### Check accuracy of prediction

In [36]:
# Per-class precision, recall and F1 of the predictions against the true test labels.
print(classification_report(y_true=teste_knn_Y, y_pred=predictKNN))

              precision    recall  f1-score   support

         0.0       0.81      0.95      0.87     11360
         1.0       0.65      0.31      0.42      3700

    accuracy                           0.79     15060
   macro avg       0.73      0.63      0.64     15060
weighted avg       0.77      0.79      0.76     15060



In [37]:
# Confusion matrix (rows = actual class, columns = predicted class) with totals.
confusionKNN = pd.crosstab(
    index=teste_knn_Y,
    columns=predictKNN,
    rownames=['Actual'],
    colnames=['Predicted'],
    margins=True,
)
print(confusionKNN)

Predicted    0.0   1.0    All
Actual                       
0.0        10755   605  11360
1.0         2569  1131   3700
All        13324  1736  15060


In [38]:
# Overall fraction of test rows whose predicted class matches the true class.
score = accuracy_score(y_true=teste_knn_Y, y_pred=predictKNN)
print('Accuracy:{0:f}'.format(score))

Accuracy:0.789243


In [39]:
# Count the test rows whose predicted class differs from the true class.
# The continuation line must not start with a pasted "..." doctest prompt:
# that is only tolerated by IPython and is a SyntaxError in plain Python.
print("Number of mislabeled points out of a total %d points : %d"
      % (teste_knn_X.shape[0], (teste_knn_Y != predictKNN).sum()))

Number of mislabeled points out of a total 15060 points : 3174


### Undersampling and Oversampling

In [40]:
# Rebalance the training data only (the test split is left untouched):
# first oversample the minority class up to a 0.4 minority/majority ratio,
# then undersample the majority class until the ratio reaches 0.7.
# Resampling starts from the raw `treino_knn` frame rather than from the
# current X/Y, so re-running this cell never resamples already-resampled data,
# and a fixed random_state makes the random draws reproducible.
oversample = RandomOverSampler(sampling_strategy=0.4, random_state=42)
treino_knn_X, treino_knn_Y = oversample.fit_resample(
    treino_knn.drop(columns=['salary']), treino_knn['salary'])

undersample = RandomUnderSampler(sampling_strategy=0.7, random_state=42)
treino_knn_X, treino_knn_Y = undersample.fit_resample(treino_knn_X, treino_knn_Y)

In [41]:
# Train a fresh Gaussian Naive Bayes classifier on the resampled training data.
gnb_knn = GaussianNB()
gnb_knn.fit(X=treino_knn_X, y=treino_knn_Y)

GaussianNB()

In [42]:
# Predict the test set again with the model trained on resampled data.
predictKNN = gnb_knn.predict(X=teste_knn_X)

In [43]:
# Per-class precision, recall and F1 after training on resampled data.
print(classification_report(y_true=teste_knn_Y, y_pred=predictKNN))

              precision    recall  f1-score   support

         0.0       0.81      0.94      0.87     11360
         1.0       0.64      0.31      0.42      3700

    accuracy                           0.79     15060
   macro avg       0.72      0.63      0.64     15060
weighted avg       0.77      0.79      0.76     15060



In [44]:
# Confusion matrix (rows = actual class, columns = predicted class) with totals.
confusionKNN = pd.crosstab(
    index=teste_knn_Y,
    columns=predictKNN,
    rownames=['Actual'],
    colnames=['Predicted'],
    margins=True,
)
print(confusionKNN)

Predicted    0.0   1.0    All
Actual                       
0.0        10707   653  11360
1.0         2559  1141   3700
All        13266  1794  15060


In [45]:
# Overall fraction of test rows whose predicted class matches the true class.
score = accuracy_score(y_true=teste_knn_Y, y_pred=predictKNN)
print('Accuracy:{0:f}'.format(score))

Accuracy:0.786720


In [46]:
# Count the test rows whose predicted class differs from the true class.
# The continuation line must not start with a pasted "..." doctest prompt:
# that is only tolerated by IPython and is a SyntaxError in plain Python.
print("Number of mislabeled points out of a total %d points : %d"
      % (teste_knn_X.shape[0], (teste_knn_Y != predictKNN).sum()))

Number of mislabeled points out of a total 15060 points : 3212


# 4 - Without nulls, using scaled features instead of integer codes

#### Read csv created in Data_preprocessing

In [47]:
# Load the train/test splits written by the Data_preprocessing notebook
# (the "0null_strings" variant).
treino_strings, teste_strings = (
    pd.read_csv(csv_path)
    for csv_path in ("training_0null_strings.csv", "test_0null_strings.csv")
)

In [48]:
# Separate the 'salary' target column from the feature columns in both splits.
treino_strings_Y = treino_strings['salary']
treino_strings_X = treino_strings.drop('salary', axis=1)
teste_strings_Y = teste_strings['salary']
teste_strings_X = teste_strings.drop('salary', axis=1)

#### Scale the data

In [49]:
from sklearn import preprocessing

# Integer-encode each categorical column. The encoder is fitted on the training
# column only and then reused for the test column, so both splits share one
# mapping (a category absent from training makes `transform` raise).
categorical = ['workclass', 'maritalstatus', 'occupation', 'relationship', 'race', 'sex', 'nativecountry']
for feature in categorical:
    le = preprocessing.LabelEncoder()
    le.fit(treino_strings_X[feature])
    treino_strings_X[feature] = le.transform(treino_strings_X[feature])
    teste_strings_X[feature] = le.transform(teste_strings_X[feature])

In [50]:
from sklearn.preprocessing import StandardScaler

# Standardize every feature column. The scaler statistics are learned on the
# training features only and then applied to both splits; the results are
# wrapped back into DataFrames so the column names are kept.
scaler = StandardScaler()
scaler.fit(treino_strings_X)

treino_strings_X = pd.DataFrame(
    scaler.transform(treino_strings_X),
    columns=treino_strings_X.columns,
)
teste_strings_X = pd.DataFrame(
    scaler.transform(teste_strings_X),
    columns=teste_strings_X.columns,
)

#### Create the model to predict values

In [51]:
# Fit a Gaussian Naive Bayes classifier (default settings) on the scaled training split.
gnb_strings = GaussianNB()
gnb_strings.fit(X=treino_strings_X, y=treino_strings_Y)

GaussianNB()

#### Predict values

In [52]:
# Predict the salary class for every row of the scaled test features.
predictStrings = gnb_strings.predict(X=teste_strings_X)

### Check accuracy of prediction

In [53]:
# Per-class precision, recall and F1 of the predictions against the true test labels.
print(classification_report(y_true=teste_strings_Y, y_pred=predictStrings))

              precision    recall  f1-score   support

           0       0.81      0.95      0.88     11360
           1       0.68      0.32      0.44      3700

    accuracy                           0.80     15060
   macro avg       0.74      0.64      0.66     15060
weighted avg       0.78      0.80      0.77     15060



In [54]:
# Confusion matrix (rows = actual class, columns = predicted class) with totals.
confusionStrings = pd.crosstab(
    index=teste_strings_Y,
    columns=predictStrings,
    rownames=['Actual'],
    colnames=['Predicted'],
    margins=True,
)
print(confusionStrings)

Predicted      0     1    All
Actual                       
0          10787   573  11360
1           2500  1200   3700
All        13287  1773  15060


In [55]:
# Overall fraction of test rows whose predicted class matches the true class.
score = accuracy_score(y_true=teste_strings_Y, y_pred=predictStrings)
print('Accuracy:{0:f}'.format(score))

Accuracy:0.795950


In [56]:
# Count the test rows whose predicted class differs from the true class.
# The continuation line must not start with a pasted "..." doctest prompt:
# that is only tolerated by IPython and is a SyntaxError in plain Python.
print("Number of mislabeled points out of a total %d points : %d"
      % (teste_strings_X.shape[0], (teste_strings_Y != predictStrings).sum()))

Number of mislabeled points out of a total 15060 points : 3073


### Undersampling and Oversampling

In [57]:
# Rebalance the training data only (the test split is left untouched):
# first oversample the minority class up to a 0.4 minority/majority ratio,
# then undersample the majority class until the ratio reaches 0.7.
# A fixed random_state makes the random draws reproducible.
# NOTE: this cell overwrites its own inputs, so run it only once per kernel
# session; re-running it would resample the already-resampled training data.
oversample = RandomOverSampler(sampling_strategy=0.4, random_state=42)
treino_strings_X, treino_strings_Y = oversample.fit_resample(treino_strings_X, treino_strings_Y)

undersample = RandomUnderSampler(sampling_strategy=0.7, random_state=42)
treino_strings_X, treino_strings_Y = undersample.fit_resample(treino_strings_X, treino_strings_Y)

In [58]:
# Train a fresh Gaussian Naive Bayes classifier on the resampled training data.
gnb_strings = GaussianNB()
gnb_strings.fit(X=treino_strings_X, y=treino_strings_Y)

GaussianNB()

In [59]:
# Predict the test set again with the model trained on resampled data.
predictStrings = gnb_strings.predict(X=teste_strings_X)

In [60]:
# Per-class precision, recall and F1 after training on resampled data.
print(classification_report(y_true=teste_strings_Y, y_pred=predictStrings))

              precision    recall  f1-score   support

           0       0.83      0.94      0.88     11360
           1       0.68      0.39      0.50      3700

    accuracy                           0.81     15060
   macro avg       0.75      0.67      0.69     15060
weighted avg       0.79      0.81      0.79     15060



In [61]:
# Confusion matrix (rows = actual class, columns = predicted class) with totals.
confusionStrings = pd.crosstab(
    index=teste_strings_Y,
    columns=predictStrings,
    rownames=['Actual'],
    colnames=['Predicted'],
    margins=True,
)
print(confusionStrings)

Predicted      0     1    All
Actual                       
0          10683   677  11360
1           2246  1454   3700
All        12929  2131  15060


In [62]:
# Overall fraction of test rows whose predicted class matches the true class.
score = accuracy_score(y_true=teste_strings_Y, y_pred=predictStrings)
print('Accuracy:{0:f}'.format(score))

Accuracy:0.805910


In [63]:
# Count the test rows whose predicted class differs from the true class.
# The continuation line must not start with a pasted "..." doctest prompt:
# that is only tolerated by IPython and is a SyntaxError in plain Python.
print("Number of mislabeled points out of a total %d points : %d"
      % (teste_strings_X.shape[0], (teste_strings_Y != predictStrings).sum()))

Number of mislabeled points out of a total 15060 points : 2923
