In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler

from sklearn.metrics import accuracy_score

# Logistic Regression

# 1 - Without nulls

#### Read csv created in Data_preprocessing

In [2]:
# Load the train and test splits of the "0null" dataset variant from the working directory.
treino_0null, teste_0null = (
    pd.read_csv(csv_path) for csv_path in ("training_0null.csv", "test_0null.csv")
)

In [3]:
# 'salary' is the target column: everything else becomes the feature matrix (X),
# and 'salary' alone becomes the label vector (Y), for both splits.
treino_0null_X, treino_0null_Y = treino_0null.drop(columns=['salary']), treino_0null['salary']
teste_0null_X, teste_0null_Y = teste_0null.drop(columns=['salary']), teste_0null['salary']

#### Create the model to predict values

In [4]:
# Baseline logistic regression on the "0null" training split (solver capped at 500 iterations).
logmodel1 = LogisticRegression(max_iter=500)
# fit() is deliberately the last expression so the fitted estimator is echoed as the cell output.
logmodel1.fit(X=treino_0null_X, y=treino_0null_Y)

LogisticRegression(max_iter=500)

#### Predict values

In [5]:
# Predicted class labels for the held-out "0null" test features.
predictNotNull = logmodel1.predict(X=teste_0null_X)

### Check accuracy of prediction

In [6]:
# Per-class precision / recall / F1 of the baseline model on the test split.
print(classification_report(y_true=teste_0null_Y, y_pred=predictNotNull))

              precision    recall  f1-score   support

           0       0.81      0.94      0.87     11360
           1       0.63      0.30      0.41      3700

    accuracy                           0.79     15060
   macro avg       0.72      0.62      0.64     15060
weighted avg       0.76      0.79      0.76     15060



In [7]:
# Confusion matrix (actual rows vs. predicted columns) including the "All" row/column totals.
confusionNotNull = pd.crosstab(
    index=teste_0null_Y,
    columns=predictNotNull,
    rownames=['Actual'],
    colnames=['Predicted'],
    margins=True,
)
print(confusionNotNull)

Predicted      0     1    All
Actual                       
0          10717   643  11360
1           2591  1109   3700
All        13308  1752  15060


In [8]:
# Overall accuracy of the baseline "0null" model on the test split.
score = accuracy_score(y_true=teste_0null_Y, y_pred=predictNotNull)
print('Accuracy:{0:f}'.format(score))

Accuracy:0.785259


#### Oversampling and Undersampling

In [9]:
# Rebalance the TRAINING split only; the test split is never resampled.
# A float sampling_strategy is the target minority/majority ratio after resampling:
# first oversample the minority class up to 0.4, then undersample the majority down to 0.7.
# Both samplers are seeded so the resampled set (and every metric computed from it)
# is reproducible across "Restart Kernel -> Run All".
oversample = RandomOverSampler(sampling_strategy=0.4, random_state=42)
undersample = RandomUnderSampler(sampling_strategy=0.7, random_state=42)

# Start from the frame loaded from CSV rather than from treino_0null_X / treino_0null_Y,
# so re-running this cell does not resample data that was already resampled.
treino_0null_X, treino_0null_Y = oversample.fit_resample(
    treino_0null.drop(columns=['salary']),
    treino_0null['salary'],
)
treino_0null_X, treino_0null_Y = undersample.fit_resample(treino_0null_X, treino_0null_Y)

In [10]:
# Refit the same model configuration, this time on the resampled "0null" training data.
# NOTE: this rebinds logmodel1, replacing the baseline model fitted earlier.
logmodel1 = LogisticRegression(max_iter=500)
# Last expression on purpose: the fitted estimator is echoed as the cell output.
logmodel1.fit(X=treino_0null_X, y=treino_0null_Y)

LogisticRegression(max_iter=500)

In [11]:
# Test-set predictions from the model trained on resampled data (overwrites the baseline predictions).
predictNotNull = logmodel1.predict(X=teste_0null_X)

In [12]:
# Per-class precision / recall / F1 after resampling the training data.
print(classification_report(y_true=teste_0null_Y, y_pred=predictNotNull))

              precision    recall  f1-score   support

           0       0.80      0.93      0.87     11360
           1       0.60      0.30      0.40      3700

    accuracy                           0.78     15060
   macro avg       0.70      0.62      0.63     15060
weighted avg       0.76      0.78      0.75     15060



In [13]:
# Confusion matrix (actual rows vs. predicted columns, with totals) for the resampled-training model.
confusionNotNull = pd.crosstab(
    index=teste_0null_Y,
    columns=predictNotNull,
    rownames=['Actual'],
    colnames=['Predicted'],
    margins=True,
)
print(confusionNotNull)

Predicted      0     1    All
Actual                       
0          10619   741  11360
1           2573  1127   3700
All        13192  1868  15060


In [14]:
# Overall test accuracy of the "0null" model trained on resampled data.
score = accuracy_score(y_true=teste_0null_Y, y_pred=predictNotNull)
print('Accuracy:{0:f}'.format(score))

Accuracy:0.779947


# 2 - Null values replaced by the mode

#### Read csv created in Data_preprocessing

In [15]:
# Load the train and test splits of the "mode" dataset variant from the working directory.
treino_mode, teste_mode = (
    pd.read_csv(csv_path) for csv_path in ("training_mode.csv", "test_mode.csv")
)

In [16]:
# Separate the 'salary' target (Y) from the remaining feature columns (X) for both splits.
treino_mode_X, treino_mode_Y = treino_mode.drop(columns=['salary']), treino_mode['salary']
teste_mode_X, teste_mode_Y = teste_mode.drop(columns=['salary']), teste_mode['salary']

#### Create the model to predict values

In [17]:
# Baseline logistic regression on the "mode" training split (solver capped at 500 iterations).
logmodel2 = LogisticRegression(max_iter=500)
# Last expression on purpose: the fitted estimator is echoed as the cell output.
logmodel2.fit(X=treino_mode_X, y=treino_mode_Y)

LogisticRegression(max_iter=500)

#### Predict values

In [18]:
# Predicted class labels for the held-out "mode" test features.
predictYMode = logmodel2.predict(X=teste_mode_X)

### Check accuracy of prediction

In [19]:
# Per-class precision / recall / F1 of the baseline "mode" model on the test split.
print(classification_report(y_true=teste_mode_Y, y_pred=predictYMode))

              precision    recall  f1-score   support

           0       0.80      0.97      0.88     11360
           1       0.73      0.26      0.39      3700

    accuracy                           0.79     15060
   macro avg       0.76      0.62      0.63     15060
weighted avg       0.78      0.79      0.76     15060



In [20]:
# Confusion matrix (actual rows vs. predicted columns) including the "All" row/column totals.
confusionMode = pd.crosstab(
    index=teste_mode_Y,
    columns=predictYMode,
    rownames=['Actual'],
    colnames=['Predicted'],
    margins=True,
)
print(confusionMode)

Predicted      0     1    All
Actual                       
0          10992   368  11360
1           2722   978   3700
All        13714  1346  15060


In [21]:
# Overall accuracy of the baseline "mode" model on the test split.
score = accuracy_score(y_true=teste_mode_Y, y_pred=predictYMode)
print('Accuracy:{0:f}'.format(score))

Accuracy:0.794821


#### Oversampling and Undersampling

In [22]:
# Rebalance the TRAINING split only; the test split is never resampled.
# A float sampling_strategy is the target minority/majority ratio after resampling:
# first oversample the minority class up to 0.4, then undersample the majority down to 0.7.
# Both samplers are seeded so the resampled set (and every metric computed from it)
# is reproducible across "Restart Kernel -> Run All".
oversample = RandomOverSampler(sampling_strategy=0.4, random_state=42)
undersample = RandomUnderSampler(sampling_strategy=0.7, random_state=42)

# Start from the frame loaded from CSV rather than from treino_mode_X / treino_mode_Y,
# so re-running this cell does not resample data that was already resampled.
treino_mode_X, treino_mode_Y = oversample.fit_resample(
    treino_mode.drop(columns=['salary']),
    treino_mode['salary'],
)
treino_mode_X, treino_mode_Y = undersample.fit_resample(treino_mode_X, treino_mode_Y)

In [23]:
# Refit the same model configuration, this time on the resampled "mode" training data.
# NOTE: this rebinds logmodel2, replacing the baseline model fitted earlier.
logmodel2 = LogisticRegression(max_iter=500)
# Last expression on purpose: the fitted estimator is echoed as the cell output.
logmodel2.fit(X=treino_mode_X, y=treino_mode_Y)

LogisticRegression(max_iter=500)

In [24]:
# Test-set predictions from the model trained on resampled data (overwrites the baseline predictions).
predictYMode = logmodel2.predict(X=teste_mode_X)

In [25]:
# Per-class precision / recall / F1 after resampling the training data.
print(classification_report(y_true=teste_mode_Y, y_pred=predictYMode))

              precision    recall  f1-score   support

           0       0.82      0.85      0.83     11360
           1       0.48      0.42      0.44      3700

    accuracy                           0.74     15060
   macro avg       0.65      0.63      0.64     15060
weighted avg       0.73      0.74      0.74     15060



In [26]:
# Confusion matrix (actual rows vs. predicted columns, with totals) for the resampled-training model.
confusionMode = pd.crosstab(
    index=teste_mode_Y,
    columns=predictYMode,
    rownames=['Actual'],
    colnames=['Predicted'],
    margins=True,
)
print(confusionMode)

Predicted      0     1    All
Actual                       
0           9663  1697  11360
1           2158  1542   3700
All        11821  3239  15060


In [27]:
# Overall test accuracy of the "mode" model trained on resampled data.
score = accuracy_score(y_true=teste_mode_Y, y_pred=predictYMode)
print('Accuracy:{0:f}'.format(score))

Accuracy:0.744024


# 3 - Null values imputed with KNN

#### Read csv created in Data_preprocessing

In [28]:
# Load the train and test splits of the "knn" dataset variant from the working directory.
treino_knn, teste_knn = (
    pd.read_csv(csv_path) for csv_path in ("training_knn.csv", "test_knn.csv")
)

In [29]:
# Separate the 'salary' target (Y) from the remaining feature columns (X) for both splits.
treino_knn_X, treino_knn_Y = treino_knn.drop(columns=['salary']), treino_knn['salary']
teste_knn_X, teste_knn_Y = teste_knn.drop(columns=['salary']), teste_knn['salary']

#### Create the model to predict values

In [30]:
# Baseline logistic regression on the "knn" training split (solver capped at 500 iterations).
logmodel3 = LogisticRegression(max_iter=500)
# Last expression on purpose: the fitted estimator is echoed as the cell output.
logmodel3.fit(X=treino_knn_X, y=treino_knn_Y)

LogisticRegression(max_iter=500)

#### Predict values


In [31]:
# Predicted class labels for the held-out "knn" test features.
predictYKNN = logmodel3.predict(X=teste_knn_X)

### Check accuracy of prediction


In [32]:
# Per-class precision / recall / F1 of the baseline "knn" model on the test split.
print(classification_report(y_true=teste_knn_Y, y_pred=predictYKNN))

              precision    recall  f1-score   support

         0.0       0.80      0.95      0.87     11360
         1.0       0.64      0.28      0.39      3700

    accuracy                           0.78     15060
   macro avg       0.72      0.62      0.63     15060
weighted avg       0.76      0.78      0.75     15060



In [33]:
# Confusion matrix (actual rows vs. predicted columns) including the "All" row/column totals.
confusionKNN = pd.crosstab(
    index=teste_knn_Y,
    columns=predictYKNN,
    rownames=['Actual'],
    colnames=['Predicted'],
    margins=True,
)
print(confusionKNN)

Predicted    0.0   1.0    All
Actual                       
0.0        10766   594  11360
1.0         2654  1046   3700
All        13420  1640  15060


In [34]:
# Overall accuracy of the baseline "knn" model on the test split.
score = accuracy_score(y_true=teste_knn_Y, y_pred=predictYKNN)
print('Accuracy:{0:f}'.format(score))

Accuracy:0.784329


#### Oversampling and Undersampling

In [35]:
# Rebalance the TRAINING split only; the test split is never resampled.
# A float sampling_strategy is the target minority/majority ratio after resampling:
# first oversample the minority class up to 0.4, then undersample the majority down to 0.7.
# Both samplers are seeded so the resampled set (and every metric computed from it)
# is reproducible across "Restart Kernel -> Run All".
oversample = RandomOverSampler(sampling_strategy=0.4, random_state=42)
undersample = RandomUnderSampler(sampling_strategy=0.7, random_state=42)

# Start from the frame loaded from CSV rather than from treino_knn_X / treino_knn_Y,
# so re-running this cell does not resample data that was already resampled.
treino_knn_X, treino_knn_Y = oversample.fit_resample(
    treino_knn.drop(columns=['salary']),
    treino_knn['salary'],
)
treino_knn_X, treino_knn_Y = undersample.fit_resample(treino_knn_X, treino_knn_Y)

In [36]:
# Refit the same model configuration, this time on the resampled "knn" training data.
# NOTE: this rebinds logmodel3, replacing the baseline model fitted earlier.
logmodel3 = LogisticRegression(max_iter=500)
# Last expression on purpose: the fitted estimator is echoed as the cell output.
logmodel3.fit(X=treino_knn_X, y=treino_knn_Y)

LogisticRegression(max_iter=500)

In [37]:
# Test-set predictions from the model trained on resampled data (overwrites the baseline predictions).
predictYKNN = logmodel3.predict(X=teste_knn_X)

In [38]:
# Per-class precision / recall / F1 after resampling the training data.
print(classification_report(y_true=teste_knn_Y, y_pred=predictYKNN))

              precision    recall  f1-score   support

         0.0       0.82      0.85      0.83     11360
         1.0       0.48      0.42      0.45      3700

    accuracy                           0.74     15060
   macro avg       0.65      0.63      0.64     15060
weighted avg       0.73      0.74      0.74     15060



In [39]:
# Confusion matrix (actual rows vs. predicted columns, with totals) for the resampled-training model.
confusionKNN = pd.crosstab(
    index=teste_knn_Y,
    columns=predictYKNN,
    rownames=['Actual'],
    colnames=['Predicted'],
    margins=True,
)
print(confusionKNN)

Predicted    0.0   1.0    All
Actual                       
0.0         9651  1709  11360
1.0         2146  1554   3700
All        11797  3263  15060


In [40]:
# Overall test accuracy of the "knn" model trained on resampled data.
score = accuracy_score(y_true=teste_knn_Y, y_pred=predictYKNN)
print('Accuracy:{0:f}'.format(score))

Accuracy:0.744024


# 4 - No nulls, using scaled features instead of integers

#### Read csv created in Data_preprocessing

In [41]:
# Load the train and test splits of the "0null_strings" dataset variant from the working directory.
treino_strings, teste_strings = (
    pd.read_csv(csv_path)
    for csv_path in ("training_0null_strings.csv", "test_0null_strings.csv")
)

In [42]:
# Separate the 'salary' target (Y) from the remaining feature columns (X) for both splits.
treino_strings_X, treino_strings_Y = treino_strings.drop(columns=['salary']), treino_strings['salary']
teste_strings_X, teste_strings_Y = teste_strings.drop(columns=['salary']), teste_strings['salary']

#### Scale the data

In [43]:
from sklearn import preprocessing

# Columns that are integer-encoded before scaling.
categorical = ['workclass', 'maritalstatus', 'occupation', 'relationship', 'race', 'sex', 'nativecountry']

# OrdinalEncoder is the feature-matrix counterpart of LabelEncoder (which is intended for
# target labels). It assigns one integer code per category, learned from the TRAINING split
# only. Categories that appear only in the test split are mapped to -1 instead of raising,
# which is what LabelEncoder.transform would do.
# NOTE(review): the codes impose an arbitrary order on nominal categories; one-hot encoding
# may suit a linear model better - left unchanged so results stay comparable.
encoder = preprocessing.OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
treino_strings_X[categorical] = encoder.fit_transform(treino_strings_X[categorical])
teste_strings_X[categorical] = encoder.transform(teste_strings_X[categorical])

In [44]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling parameters from the training features only, then apply that same
# fitted scaler to both splits so no test-set statistics leak into the transformation.
scaler = StandardScaler().fit(treino_strings_X)

# The scaler returns plain arrays; wrap them back into DataFrames with the original column names.
treino_strings_X = pd.DataFrame(
    scaler.transform(treino_strings_X),
    columns=treino_strings_X.columns,
)
teste_strings_X = pd.DataFrame(
    scaler.transform(teste_strings_X),
    columns=teste_strings_X.columns,
)

#### Create the model to predict values

In [45]:
# Baseline logistic regression on the encoded and scaled training split (solver capped at 500 iterations).
logmodel4 = LogisticRegression(max_iter=500)
# Last expression on purpose: the fitted estimator is echoed as the cell output.
logmodel4.fit(X=treino_strings_X, y=treino_strings_Y)

LogisticRegression(max_iter=500)

#### Predict values


In [46]:
# Predicted class labels for the scaled test features.
predictScale = logmodel4.predict(X=teste_strings_X)

### Check accuracy of prediction


In [47]:
# Per-class precision / recall / F1 of the baseline scaled-feature model on the test split.
print(classification_report(y_true=teste_strings_Y, y_pred=predictScale))

              precision    recall  f1-score   support

           0       0.84      0.94      0.89     11360
           1       0.70      0.46      0.56      3700

    accuracy                           0.82     15060
   macro avg       0.77      0.70      0.72     15060
weighted avg       0.81      0.82      0.81     15060



In [48]:
# Confusion matrix (actual rows vs. predicted columns) including the "All" row/column totals.
confusionScale = pd.crosstab(
    index=teste_strings_Y,
    columns=predictScale,
    rownames=['Actual'],
    colnames=['Predicted'],
    margins=True,
)
print(confusionScale)

Predicted      0     1    All
Actual                       
0          10643   717  11360
1           1996  1704   3700
All        12639  2421  15060


In [49]:
# Overall accuracy of the baseline scaled-feature model on the test split.
score = accuracy_score(y_true=teste_strings_Y, y_pred=predictScale)
print('Accuracy:{0:f}'.format(score))

Accuracy:0.819854


#### Oversampling and Undersampling

In [50]:
# Rebalance the TRAINING split only; the test split is never resampled.
# A float sampling_strategy is the target minority/majority ratio after resampling:
# first oversample the minority class up to 0.4, then undersample the majority down to 0.7.
# Both samplers are seeded so the resampled set (and every metric computed from it)
# is reproducible across "Restart Kernel -> Run All".
oversample = RandomOverSampler(sampling_strategy=0.4, random_state=42)
undersample = RandomUnderSampler(sampling_strategy=0.7, random_state=42)

# NOTE(review): this cell overwrites its own inputs (the encoded and scaled training data),
# so it should be run exactly once per kernel session; re-running it would resample
# already-resampled data.
treino_strings_X, treino_strings_Y = oversample.fit_resample(treino_strings_X, treino_strings_Y)
treino_strings_X, treino_strings_Y = undersample.fit_resample(treino_strings_X, treino_strings_Y)

In [51]:
# Refit the same model configuration, this time on the resampled scaled training data.
# NOTE: this rebinds logmodel4, replacing the baseline model fitted earlier.
logmodel4 = LogisticRegression(max_iter=500)
# Last expression on purpose: the fitted estimator is echoed as the cell output.
logmodel4.fit(X=treino_strings_X, y=treino_strings_Y)

LogisticRegression(max_iter=500)

In [52]:
# Test-set predictions from the model trained on resampled data (overwrites the baseline predictions).
predictScale = logmodel4.predict(X=teste_strings_X)

In [53]:
# Per-class precision / recall / F1 after resampling the training data.
print(classification_report(y_true=teste_strings_Y, y_pred=predictScale))

              precision    recall  f1-score   support

           0       0.88      0.84      0.86     11360
           1       0.57      0.65      0.61      3700

    accuracy                           0.79     15060
   macro avg       0.73      0.75      0.73     15060
weighted avg       0.80      0.79      0.80     15060



In [54]:
# Confusion matrix (actual rows vs. predicted columns, with totals) for the resampled-training model.
confusionScale = pd.crosstab(
    index=teste_strings_Y,
    columns=predictScale,
    rownames=['Actual'],
    colnames=['Predicted'],
    margins=True,
)
print(confusionScale)

Predicted      0     1    All
Actual                       
0           9562  1798  11360
1           1296  2404   3700
All        10858  4202  15060


In [55]:
# Overall test accuracy of the scaled-feature model trained on resampled data.
score = accuracy_score(y_true=teste_strings_Y, y_pred=predictScale)
print('Accuracy:{0:f}'.format(score))

Accuracy:0.794555
