In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix

from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler

from sklearn.metrics import accuracy_score

# SVM

# 1 - Without nulls

#### Read csv created in Data_preprocessing

In [2]:
treino_0null = pd.read_csv("training_0null.csv")
teste_0null = pd.read_csv("test_0null.csv")

In [3]:
treino_0null_X = treino_0null.drop(columns=['salary']) 
treino_0null_Y = treino_0null['salary']
teste_0null_X = teste_0null.drop(columns=['salary'])  
teste_0null_Y = teste_0null['salary']

#### Create the model to predict values

In [4]:
svc_model = SVC()

In [5]:
svc_model.fit(treino_0null_X,treino_0null_Y)

SVC()

#### Test accuracy

In [6]:
predictions = svc_model.predict(teste_0null_X)

In [7]:
print(classification_report(teste_0null_Y,predictions))

              precision    recall  f1-score   support

           0       0.78      1.00      0.88     11360
           1       0.97      0.15      0.27      3700

    accuracy                           0.79     15060
   macro avg       0.88      0.58      0.57     15060
weighted avg       0.83      0.79      0.73     15060



In [8]:
confusionNotNull = pd.crosstab(teste_0null_Y,predictions, rownames=['Actual'], colnames=['Predicted'], margins=True)
print(confusionNotNull)

Predicted      0    1    All
Actual                      
0          11341   19  11360
1           3131  569   3700
All        14472  588  15060


In [9]:
score = accuracy_score(teste_0null_Y,predictions)
print('Accuracy:{0:f}'.format(score))

Accuracy:0.790837


#### Undersampling e Oversampling

In [10]:
oversample = RandomOverSampler(sampling_strategy=0.4)
treino_0null_X,treino_0null_Y = oversample.fit_resample(treino_0null_X,treino_0null_Y)

undersample = RandomUnderSampler(sampling_strategy=0.7)
treino_0null_X,treino_0null_Y = undersample.fit_resample(treino_0null_X,treino_0null_Y)

In [11]:
svc_model = SVC()

In [12]:
svc_model.fit(treino_0null_X,treino_0null_Y)

SVC()

In [13]:
print(classification_report(teste_0null_Y,predictions))

              precision    recall  f1-score   support

           0       0.78      1.00      0.88     11360
           1       0.97      0.15      0.27      3700

    accuracy                           0.79     15060
   macro avg       0.88      0.58      0.57     15060
weighted avg       0.83      0.79      0.73     15060



In [14]:
confusionNotNull = pd.crosstab(teste_0null_Y,predictions, rownames=['Actual'], colnames=['Predicted'], margins=True)
print(confusionNotNull)

Predicted      0    1    All
Actual                      
0          11341   19  11360
1           3131  569   3700
All        14472  588  15060


In [15]:
score = accuracy_score(teste_0null_Y,predictions)
print('Accuracy:{0:f}'.format(score))

Accuracy:0.790837


# 2 - Mode as null values

#### Read csv created in Data_preprocessing

In [16]:
treino_mode = pd.read_csv("training_mode.csv")
teste_mode = pd.read_csv("test_mode.csv")

In [17]:
treino_mode_X = treino_mode.drop(columns=['salary']) 
treino_mode_Y = treino_mode['salary']
teste_mode_X = teste_mode.drop(columns=['salary'])  
teste_mode_Y = teste_mode['salary']

#### Train a Model

In [18]:
svc_model_mode = SVC()

In [19]:
svc_model_mode.fit(treino_mode_X,treino_mode_Y)

SVC()

#### Test accuracy

In [20]:
predictions_mode = svc_model_mode.predict(teste_mode_X)

In [21]:
print(classification_report(teste_mode_Y,predictions_mode))

              precision    recall  f1-score   support

           0       0.78      1.00      0.88     11360
           1       0.96      0.16      0.27      3700

    accuracy                           0.79     15060
   macro avg       0.87      0.58      0.57     15060
weighted avg       0.83      0.79      0.73     15060



In [22]:
confusionMode = pd.crosstab(teste_mode_Y,predictions_mode, rownames=['Actual'], colnames=['Predicted'], margins=True)
print(confusionMode)

Predicted      0    1    All
Actual                      
0          11339   21  11360
1           3124  576   3700
All        14463  597  15060


In [23]:
score = accuracy_score(teste_mode_Y,predictions_mode)
print('Accuracy:{0:f}'.format(score))

Accuracy:0.791169


#### Undersampling e Oversampling

In [24]:
oversample = RandomOverSampler(sampling_strategy=0.4)
treino_mode_X,treino_mode_Y = oversample.fit_resample(treino_mode_X,treino_mode_Y)

undersample = RandomUnderSampler(sampling_strategy=0.7)
treino_mode_X,treino_mode_Y = undersample.fit_resample(treino_mode_X,treino_mode_Y)

In [25]:
svc_model_mode = SVC()

In [26]:
svc_model_mode.fit(treino_mode_X,treino_mode_Y)

SVC()

In [27]:
predictions_mode = svc_model_mode.predict(teste_mode_X)

In [28]:
print(classification_report(teste_mode_Y,predictions_mode))

              precision    recall  f1-score   support

           0       0.79      0.99      0.88     11360
           1       0.89      0.18      0.30      3700

    accuracy                           0.79     15060
   macro avg       0.84      0.59      0.59     15060
weighted avg       0.81      0.79      0.74     15060



In [29]:
confusionMode = pd.crosstab(teste_mode_Y,predictions_mode, rownames=['Actual'], colnames=['Predicted'], margins=True)
print(confusionMode)

Predicted      0    1    All
Actual                      
0          11282   78  11360
1           3044  656   3700
All        14326  734  15060


In [30]:
score = accuracy_score(teste_mode_Y,predictions_mode)
print('Accuracy:{0:f}'.format(score))

Accuracy:0.792696


# 3 - KNN as null values

#### Read csv created in Data_preprocessing

In [31]:
treino_knn = pd.read_csv("training_knn.csv")
teste_knn = pd.read_csv("test_knn.csv")

In [32]:
treino_knn_X = treino_knn.drop(columns=['salary']) 
treino_knn_Y = treino_knn['salary']
teste_knn_X = teste_knn.drop(columns=['salary'])  
teste_knn_Y = teste_knn['salary']

#### Train a Model

In [33]:
svc_model_knn = SVC()

In [34]:
svc_model_knn.fit(teste_knn_X,teste_knn_Y)

SVC()

#### Test accuracy

In [35]:
predictions_knn = svc_model_knn.predict(teste_knn_X)

In [36]:
print(classification_report(teste_knn_Y,predictions_knn))

              precision    recall  f1-score   support

         0.0       0.78      1.00      0.87     11360
         1.0       0.99      0.11      0.20      3700

    accuracy                           0.78     15060
   macro avg       0.88      0.56      0.54     15060
weighted avg       0.83      0.78      0.71     15060



In [37]:
confusionKNN = pd.crosstab(teste_knn_Y,predictions_knn, rownames=['Actual'], colnames=['Predicted'], margins=True)
print(confusionKNN)

Predicted    0.0  1.0    All
Actual                      
0.0        11354    6  11360
1.0         3290  410   3700
All        14644  416  15060


In [38]:
score = accuracy_score(teste_knn_Y,predictions_knn)
print('Accuracy:{0:f}'.format(score))

Accuracy:0.781142


#### Undersampling e Oversampling

In [39]:
oversample = RandomOverSampler(sampling_strategy=0.4)
treino_knn_X,treino_knn_Y = oversample.fit_resample(treino_knn_X,treino_knn_Y)

undersample = RandomUnderSampler(sampling_strategy=0.7)
treino_knn_X,treino_knn_Y = undersample.fit_resample(treino_knn_X,treino_knn_Y)

In [40]:
svc_model_knn = SVC()

In [41]:
svc_model_knn.fit(teste_knn_X,teste_knn_Y)

SVC()

In [42]:
predictions_knn = svc_model_knn.predict(teste_knn_X)

In [43]:
print(classification_report(teste_knn_Y,predictions_knn))

              precision    recall  f1-score   support

         0.0       0.78      1.00      0.87     11360
         1.0       0.99      0.11      0.20      3700

    accuracy                           0.78     15060
   macro avg       0.88      0.56      0.54     15060
weighted avg       0.83      0.78      0.71     15060



In [44]:
confusionKNN = pd.crosstab(teste_knn_Y,predictions_knn, rownames=['Actual'], colnames=['Predicted'], margins=True)
print(confusionKNN)

Predicted    0.0  1.0    All
Actual                      
0.0        11354    6  11360
1.0         3290  410   3700
All        14644  416  15060


In [45]:
score = accuracy_score(teste_knn_Y,predictions_knn)
print('Accuracy:{0:f}'.format(score))

Accuracy:0.781142


# 4 - No null but using scale instead of ints

#### Read csv created in Data_preprocessing

In [46]:
treino_strings = pd.read_csv("training_0null_strings.csv")
teste_strings = pd.read_csv("test_0null_strings.csv")

In [47]:
treino_strings_X = treino_strings.drop(columns=['salary']) 
treino_strings_Y = treino_strings['salary']
teste_strings_X = teste_strings.drop(columns=['salary'])  
teste_strings_Y = teste_strings['salary']

#### Scale the data

In [48]:
from sklearn import preprocessing

categorical = ['workclass', 'maritalstatus', 'occupation', 'relationship', 'race', 'sex', 'nativecountry']
for feature in categorical:
        le = preprocessing.LabelEncoder()
        treino_strings_X[feature] = le.fit_transform(treino_strings_X[feature])
        teste_strings_X[feature] = le.transform(teste_strings_X[feature])

In [49]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

treino_strings_X = pd.DataFrame(scaler.fit_transform(treino_strings_X), columns = treino_strings_X.columns)

teste_strings_X = pd.DataFrame(scaler.transform(teste_strings_X), columns = teste_strings_X.columns)

#### Train a Model

In [50]:
svc_model_str = SVC()

In [51]:
svc_model_str.fit(treino_strings_X,treino_strings_Y)

SVC()

#### Test accuracy

In [52]:
predictions_scale = svc_model_str.predict(teste_strings_X)

In [53]:
print(classification_report(teste_strings_Y,predictions_scale))

              precision    recall  f1-score   support

           0       0.86      0.94      0.90     11360
           1       0.76      0.54      0.63      3700

    accuracy                           0.85     15060
   macro avg       0.81      0.74      0.77     15060
weighted avg       0.84      0.85      0.84     15060



In [54]:
confusionScale = pd.crosstab(teste_strings_Y,predictions_scale, rownames=['Actual'], colnames=['Predicted'], margins=True)
print(confusionScale)

Predicted      0     1    All
Actual                       
0          10719   641  11360
1           1686  2014   3700
All        12405  2655  15060


In [55]:
score = accuracy_score(teste_strings_Y,predictions_scale)
print('Accuracy:{0:f}'.format(score))

Accuracy:0.845485


#### Undersampling e Oversampling

In [56]:
oversample = RandomOverSampler(sampling_strategy=0.4)
treino_strings_X,treino_strings_Y = oversample.fit_resample(treino_strings_X,treino_strings_Y)

undersample = RandomUnderSampler(sampling_strategy=0.7)
treino_strings_X,treino_strings_Y = undersample.fit_resample(treino_strings_X,treino_strings_Y)

In [57]:
svc_model_str = SVC()

In [58]:
svc_model_str.fit(treino_strings_X,treino_strings_Y)

SVC()

In [59]:
predictions_scale = svc_model_str.predict(teste_strings_X)

In [60]:
print(classification_report(teste_strings_Y,predictions_scale))

              precision    recall  f1-score   support

           0       0.93      0.82      0.87     11360
           1       0.60      0.80      0.68      3700

    accuracy                           0.82     15060
   macro avg       0.76      0.81      0.78     15060
weighted avg       0.84      0.82      0.83     15060



In [61]:
confusionScale = pd.crosstab(teste_strings_Y,predictions_scale, rownames=['Actual'], colnames=['Predicted'], margins=True)
print(confusionScale)

Predicted      0     1    All
Actual                       
0           9362  1998  11360
1            749  2951   3700
All        10111  4949  15060


In [62]:
score = accuracy_score(teste_strings_Y,predictions_scale)
print('Accuracy:{0:f}'.format(score))

Accuracy:0.817596
