In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.cluster import KMeans
from sklearn.metrics import classification_report, confusion_matrix

from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler

from sklearn.metrics import accuracy_score

# K Means Clustering

# 1 - Without nulls

#### Read csv created in Data_preprocessing

In [2]:
treino_0null = pd.read_csv("training_0null.csv")
teste_0null = pd.read_csv("test_0null.csv")

In [3]:
merged = pd.concat([treino_0null, teste_0null], ignore_index=True)
merged_X = merged.drop(columns=['salary']) 
merged_Y = merged['salary']

#### Create K Means Clustering model

In [4]:
kmeans = KMeans(n_clusters=2)

In [5]:
kmeans.fit(merged_X)

KMeans(n_clusters=2)

#### Predict values and check accuracy

In [6]:
score = accuracy_score(merged_Y,kmeans.predict(merged_X))
print('Accuracy:{0:f}'.format(score))

Accuracy:0.614325


In [7]:
print(classification_report(merged_Y,kmeans.predict(merged_X)))

              precision    recall  f1-score   support

           0       0.75      0.73      0.74     34014
           1       0.24      0.25      0.24     11208

    accuracy                           0.61     45222
   macro avg       0.49      0.49      0.49     45222
weighted avg       0.62      0.61      0.62     45222



In [8]:
confusionNotNull = pd.crosstab(merged_Y, kmeans.predict(merged_X), rownames=['Actual'], colnames=['Predicted'], margins=True)
print(confusionNotNull)

Predicted      0      1    All
Actual                        
0          24994   9020  34014
1           8421   2787  11208
All        33415  11807  45222


### Undersampling e Oversampling

In [9]:
oversample = RandomOverSampler(sampling_strategy=0.4)
merged_X,merged_Y = oversample.fit_resample(merged_X,merged_Y)

undersample = RandomUnderSampler(sampling_strategy=0.7)
merged_X,merged_Y = undersample.fit_resample(merged_X,merged_Y)

In [10]:
kmeans = KMeans(n_clusters=2)

In [11]:
kmeans.fit(merged_X)

KMeans(n_clusters=2)

In [12]:
score = accuracy_score(merged_Y,kmeans.predict(merged_X))
print('Accuracy:{0:f}'.format(score))

Accuracy:0.466495


In [13]:
print(classification_report(merged_Y,kmeans.predict(merged_X)))

              precision    recall  f1-score   support

           0       0.60      0.27      0.37     19435
           1       0.42      0.75      0.54     13605

    accuracy                           0.47     33040
   macro avg       0.51      0.51      0.45     33040
weighted avg       0.53      0.47      0.44     33040



In [14]:
confusionNotNull = pd.crosstab(merged_Y, kmeans.predict(merged_X), rownames=['Actual'], colnames=['Predicted'], margins=True)
print(confusionNotNull)

Predicted     0      1    All
Actual                       
0          5219  14216  19435
1          3411  10194  13605
All        8630  24410  33040


# 2 - Mode as null values

#### Read csv created in Data_preprocessing

In [15]:
treino_mode = pd.read_csv("training_mode.csv")
teste_mode = pd.read_csv("test_mode.csv")

In [16]:
merged = pd.concat([treino_mode, teste_mode], ignore_index=True)
merged_X = merged.drop(columns=['salary']) 
merged_Y = merged['salary']

#### Create K Means Clustering model

In [17]:
kmeans = KMeans(n_clusters=2)

In [18]:
kmeans.fit(merged_X)

KMeans(n_clusters=2)

#### Predict values and check accuracy

In [19]:
score = accuracy_score(merged_Y,kmeans.predict(merged_X))
print('Accuracy:{0:f}'.format(score))

Accuracy:0.617501


In [20]:
print(classification_report(merged_Y,kmeans.predict(merged_X)))

              precision    recall  f1-score   support

           0       0.75      0.74      0.74     36080
           1       0.23      0.25      0.24     11541

    accuracy                           0.62     47621
   macro avg       0.49      0.49      0.49     47621
weighted avg       0.63      0.62      0.62     47621



In [21]:
confusionMode = pd.crosstab(merged_Y, kmeans.predict(merged_X), rownames=['Actual'], colnames=['Predicted'], margins=True)
print(confusionMode)

Predicted      0      1    All
Actual                        
0          26537   9543  36080
1           8672   2869  11541
All        35209  12412  47621


### Undersampling e Oversampling

In [22]:
oversample = RandomOverSampler(sampling_strategy=0.4)
merged_X,merged_Y = oversample.fit_resample(merged_X,merged_Y)

undersample = RandomUnderSampler(sampling_strategy=0.7)
merged_X,merged_Y = undersample.fit_resample(merged_X,merged_Y)

In [23]:
kmeans = KMeans(n_clusters=2)

In [24]:
kmeans.fit(merged_X)

KMeans(n_clusters=2)

In [25]:
score = accuracy_score(merged_Y,kmeans.predict(merged_X))
print('Accuracy:{0:f}'.format(score))

Accuracy:0.534309


In [26]:
print(classification_report(merged_Y,kmeans.predict(merged_X)))

              precision    recall  f1-score   support

           0       0.58      0.74      0.65     20617
           1       0.39      0.24      0.30     14432

    accuracy                           0.53     35049
   macro avg       0.49      0.49      0.48     35049
weighted avg       0.50      0.53      0.51     35049



In [27]:
confusionMode = pd.crosstab(merged_Y, kmeans.predict(merged_X), rownames=['Actual'], colnames=['Predicted'], margins=True)
print(confusionMode)

Predicted      0     1    All
Actual                       
0          15200  5417  20617
1          10905  3527  14432
All        26105  8944  35049


# 3 - KNN as null values

#### Read csv created in Data_preprocessing

In [28]:
treino_knn = pd.read_csv("training_knn.csv")
teste_knn = pd.read_csv("test_knn.csv")

In [29]:
merged_knn = pd.concat([treino_knn, teste_knn], ignore_index=True)
merged_knn_X = merged_knn.drop(columns=['salary']) 
merged_knn_Y = merged_knn['salary']

#### Create K Means Clustering model

In [30]:
kmeans = KMeans(n_clusters=2)

In [31]:
kmeans.fit(merged_knn_X)

KMeans(n_clusters=2)

#### Predict values and check accuracy

In [32]:
score = accuracy_score(merged_knn_Y,kmeans.predict(merged_knn_X))
print('Accuracy:{0:f}'.format(score))

Accuracy:0.617375


In [33]:
print(classification_report(merged_knn_Y,kmeans.predict(merged_knn_X)))

              precision    recall  f1-score   support

         0.0       0.75      0.74      0.74     36080
         1.0       0.23      0.25      0.24     11541

    accuracy                           0.62     47621
   macro avg       0.49      0.49      0.49     47621
weighted avg       0.63      0.62      0.62     47621



In [34]:
confusionKNN = pd.crosstab(merged_knn_Y, kmeans.predict(merged_knn_X), rownames=['Actual'], colnames=['Predicted'], margins=True)
print(confusionKNN)

Predicted      0      1    All
Actual                        
0.0        26530   9550  36080
1.0         8671   2870  11541
All        35201  12420  47621


### Undersampling e Oversampling

In [35]:
oversample = RandomOverSampler(sampling_strategy=0.4)
merged_knn_X,merged_knn_Y = oversample.fit_resample(merged_knn_X,merged_knn_Y)

undersample = RandomUnderSampler(sampling_strategy=0.7)
merged_knn_X,merged_knn_Y = undersample.fit_resample(merged_knn_X,merged_knn_Y)

In [36]:
kmeans = KMeans(n_clusters=2)

In [37]:
kmeans.fit(merged_knn_X)

KMeans(n_clusters=2)

In [38]:
score = accuracy_score(merged_knn_Y,kmeans.predict(merged_knn_X))
print('Accuracy:{0:f}'.format(score))

Accuracy:0.536278


In [39]:
print(classification_report(merged_knn_Y,kmeans.predict(merged_knn_X)))

              precision    recall  f1-score   support

         0.0       0.58      0.74      0.65     20617
         1.0       0.40      0.25      0.30     14432

    accuracy                           0.54     35049
   macro avg       0.49      0.49      0.48     35049
weighted avg       0.51      0.54      0.51     35049



In [40]:
confusionKNN = pd.crosstab(merged_knn_Y, kmeans.predict(merged_knn_X), rownames=['Actual'], colnames=['Predicted'], margins=True)
print(confusionKNN)

Predicted      0     1    All
Actual                       
0.0        15257  5360  20617
1.0        10893  3539  14432
All        26150  8899  35049


# 4 - No null but using scale instead of ints

#### Read csv created in Data_preprocessing

In [41]:
treino_strings = pd.read_csv("training_0null_strings.csv")
teste_strings = pd.read_csv("test_0null_strings.csv")

In [42]:
merged_scale = pd.concat([treino_strings, teste_strings], ignore_index=True)
merged_scale_X = merged_scale.drop(columns=['salary']) 
merged_scale_Y = merged_scale['salary']

#### Scale the data

In [43]:
from sklearn import preprocessing

categorical = ['workclass', 'maritalstatus', 'occupation', 'relationship', 'race', 'sex', 'nativecountry']
for feature in categorical:
        le = preprocessing.LabelEncoder()
        merged_scale_X[feature] = le.fit_transform(merged_scale_X[feature])

In [44]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

merged_scale_X = pd.DataFrame(scaler.fit_transform(merged_scale_X), columns = merged_scale_X.columns)

#### Create K Means Clustering model

In [45]:
kmeans = KMeans(n_clusters=2)

In [46]:
kmeans.fit(merged_scale_X)

KMeans(n_clusters=2)

#### Predict values and check accuracy

In [47]:
score = accuracy_score(merged_scale_Y,kmeans.predict(merged_scale_X))
print('Accuracy:{0:f}'.format(score))

Accuracy:0.593848


In [48]:
print(classification_report(merged_scale_Y,kmeans.predict(merged_scale_X)))

              precision    recall  f1-score   support

           0       0.92      0.51      0.65     34014
           1       0.36      0.86      0.51     11208

    accuracy                           0.59     45222
   macro avg       0.64      0.68      0.58     45222
weighted avg       0.78      0.59      0.62     45222



In [49]:
confusionScale = pd.crosstab(merged_scale_Y, kmeans.predict(merged_scale_X), rownames=['Actual'], colnames=['Predicted'], margins=True)
print(confusionScale)

Predicted      0      1    All
Actual                        
0          17213  16801  34014
1           1566   9642  11208
All        18779  26443  45222


### Undersampling e Oversampling

In [50]:
oversample = RandomOverSampler(sampling_strategy=0.4)
merged_scale_X,merged_scale_Y = oversample.fit_resample(merged_scale_X,merged_scale_Y)

undersample = RandomUnderSampler(sampling_strategy=0.7)
merged_scale_X,merged_scale_Y = undersample.fit_resample(merged_scale_X,merged_scale_Y)

In [51]:
kmeans = KMeans(n_clusters=2)

In [52]:
kmeans.fit(merged_scale_X)

KMeans(n_clusters=2)

In [53]:
score = accuracy_score(merged_scale_Y,kmeans.predict(merged_scale_X))
print('Accuracy:{0:f}'.format(score))

Accuracy:0.658081


In [54]:
print(classification_report(merged_scale_Y,kmeans.predict(merged_scale_X)))

              precision    recall  f1-score   support

           0       0.84      0.52      0.64     19435
           1       0.55      0.86      0.67     13605

    accuracy                           0.66     33040
   macro avg       0.70      0.69      0.66     33040
weighted avg       0.72      0.66      0.65     33040



In [55]:
confusionScale = pd.crosstab(merged_scale_Y, kmeans.predict(merged_scale_X), rownames=['Actual'], colnames=['Predicted'], margins=True)
print(confusionScale)

Predicted      0      1    All
Actual                        
0          10043   9392  19435
1           1905  11700  13605
All        11948  21092  33040
