In [1]:
import pandas as pd
import numpy as np
from scipy.stats import spearmanr, pearsonr
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import MinMaxScaler
from scipy.stats import spearmanr, pearsonr
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# read data and show it

In [2]:
#read data
train_data = pd.read_csv("date set/train_data.csv")
test_data = pd.read_csv("date set/test_data.csv")

In [4]:
train_data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,Income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


# Preprocess data

In [4]:

class preprocess_data:

     # set all column names to lower case and delete spaces from whole table symbols
     @staticmethod
     def clean_table_from_spaces_and_symbols(df):
         # clean column names
         df.columns = df.columns.str.lower().str.replace(' ', '')
         df.columns = df.columns.str.lower().str.replace('-', '')
         df.columns = df.columns.str.lower().str.replace('.', '')
         # clean values
         for col in df.columns:
             # if column type is object
             if df[col].dtype == 'object':
                 df[col] = df[col].str.lower().str.replace(' ', '')
                 df[col] = df[col].str.lower().str.replace('-', '')
                 df[col] = df[col].str.lower().str.replace('.', '')
         return df



     # fill missing values with most frequent value
     @staticmethod
     def fill_missing_values(df):
         df = df.replace('?', np.nan)
         for col in df.columns:
             df[col].fillna(df[col].mode()[0], inplace=True)
         return df

In [5]:
#clean_tables_from_spaces_and_symbols
train_data = preprocess_data.clean_table_from_spaces_and_symbols(train_data)
test_data = preprocess_data.clean_table_from_spaces_and_symbols(test_data)
train_data.head()

Unnamed: 0,age,workclass,fnlwgt,education,educationnum,maritalstatus,occupation,relationship,race,sex,capitalgain,capitalloss,hoursperweek,nativecountry,income
0,39,stategov,77516,bachelors,13,nevermarried,admclerical,notinfamily,white,male,2174,0,40,unitedstates,<=50k
1,50,selfempnotinc,83311,bachelors,13,marriedcivspouse,execmanagerial,husband,white,male,0,0,13,unitedstates,<=50k
2,38,private,215646,hsgrad,9,divorced,handlerscleaners,notinfamily,white,male,0,0,40,unitedstates,<=50k
3,53,private,234721,11th,7,marriedcivspouse,handlerscleaners,husband,black,male,0,0,40,unitedstates,<=50k
4,28,private,338409,bachelors,13,marriedcivspouse,profspecialty,wife,black,female,0,0,40,cuba,<=50k


In [6]:
# show unique values
for col in train_data.columns:
    if train_data[col].dtype == 'object':
        print(col, train_data[col].unique())
        print('\n')

workclass ['stategov' 'selfempnotinc' 'private' 'federalgov' 'localgov' '?'
 'selfempinc' 'withoutpay' 'neverworked']


education ['bachelors' 'hsgrad' '11th' 'masters' '9th' 'somecollege' 'assocacdm'
 'assocvoc' '7th8th' 'doctorate' 'profschool' '5th6th' '10th' '1st4th'
 'preschool' '12th']


maritalstatus ['nevermarried' 'marriedcivspouse' 'divorced' 'marriedspouseabsent'
 'separated' 'marriedafspouse' 'widowed']


occupation ['admclerical' 'execmanagerial' 'handlerscleaners' 'profspecialty'
 'otherservice' 'sales' 'craftrepair' 'transportmoving' 'farmingfishing'
 'machineopinspct' 'techsupport' '?' 'protectiveserv' 'armedforces'
 'privhouseserv']


relationship ['notinfamily' 'husband' 'wife' 'ownchild' 'unmarried' 'otherrelative']


race ['white' 'black' 'asianpacislander' 'amerindianeskimo' 'other']


sex ['male' 'female']


nativecountry ['unitedstates' 'cuba' 'jamaica' 'india' '?' 'mexico' 'south' 'puertorico'
 'honduras' 'england' 'canada' 'germany' 'iran' 'philippines' 'italy'

In [7]:
#number of missing values in each columns
print(train_data.isin(['?']).sum())

age                 0
workclass        1836
fnlwgt              0
education           0
educationnum        0
maritalstatus       0
occupation       1843
relationship        0
race                0
sex                 0
capitalgain         0
capitalloss         0
hoursperweek        0
nativecountry     583
income              0
dtype: int64


In [8]:
# fill missing values with most frequent value
train_data = preprocess_data.fill_missing_values(train_data)
test_data = preprocess_data.fill_missing_values(test_data)

In [9]:
#number of missing values in each columns
print(train_data.isin(['?']).sum())

age              0
workclass        0
fnlwgt           0
education        0
educationnum     0
maritalstatus    0
occupation       0
relationship     0
race             0
sex              0
capitalgain      0
capitalloss      0
hoursperweek     0
nativecountry    0
income           0
dtype: int64


In [10]:
train_data.shape

(32561, 15)

In [11]:
# drop duplicates
train_data = train_data.drop_duplicates()
train_data.shape

(32537, 15)

## Feature Engineering

In [12]:
# 75% of capitalgain and capitalloss is 0, so we can drop it
print(train_data.describe(include='all'))

                 age workclass        fnlwgt education  educationnum   
count   32537.000000     32537  3.253700e+04     32537  32537.000000  \
unique           NaN         8           NaN        16           NaN   
top              NaN   private           NaN    hsgrad           NaN   
freq             NaN     24509           NaN     10494           NaN   
mean       38.585549       NaN  1.897808e+05       NaN     10.081815   
std        13.637984       NaN  1.055565e+05       NaN      2.571633   
min        17.000000       NaN  1.228500e+04       NaN      1.000000   
25%        28.000000       NaN  1.178270e+05       NaN      9.000000   
50%        37.000000       NaN  1.783560e+05       NaN     10.000000   
75%        48.000000       NaN  2.369930e+05       NaN     12.000000   
max        90.000000       NaN  1.484705e+06       NaN     16.000000   

           maritalstatus     occupation relationship   race    sex   
count              32537          32537        32537  32537  3253

In [13]:
def replace_maritalstatus(df):
        df = df.replace(['divorced', 'marriedafspouse',
                'marriedcivspouse', 'marriedspouseabsent',
                 'nevermarried', 'separated', 'widowed'],
                  ['notmarried', 'married', 'married', 'married',
                    'notmarried', 'notmarried', 'notmarried'])
        return df

In [14]:
 # replace values with another meaning
train_data = replace_maritalstatus(train_data)
test_data = replace_maritalstatus(test_data)

In [15]:
train_data.head()

Unnamed: 0,age,workclass,fnlwgt,education,educationnum,maritalstatus,occupation,relationship,race,sex,capitalgain,capitalloss,hoursperweek,nativecountry,income
0,39,stategov,77516,bachelors,13,notmarried,admclerical,notinfamily,white,male,2174,0,40,unitedstates,<=50k
1,50,selfempnotinc,83311,bachelors,13,married,execmanagerial,husband,white,male,0,0,13,unitedstates,<=50k
2,38,private,215646,hsgrad,9,notmarried,handlerscleaners,notinfamily,white,male,0,0,40,unitedstates,<=50k
3,53,private,234721,11th,7,married,handlerscleaners,husband,black,male,0,0,40,unitedstates,<=50k
4,28,private,338409,bachelors,13,married,profspecialty,wife,black,female,0,0,40,cuba,<=50k


In [16]:
#get coll between income and featuressss
class correlation:

    def corr(data, self=None):
        data = self.Feature_Encoder(data)
        col_names = data.columns
        param = []
        correlation = []
        abs_correlation = []
        for c in col_names:
            if c != 'income':
                if len(data[c].unique()) <= 2:
                    corr = spearmanr(data['income'], data[c])[0]
                else:
                    corr = pearsonr(data['income'], data[c])[0]
                param.append(c)
                correlation.append(corr)
                abs_correlation.append(abs(corr))
        param_df = pd.DataFrame({'correlation': correlation, 'parameter': param, 'abs_correlation': abs_correlation})
        param_df = param_df.sort_values(by=['abs_correlation'], ascending=False)
        return param_df



    def Feature_Encoder(X,self=None):
        cols = X.columns
        for c in cols:
            if X[c].dtype == 'object':
                lbl = LabelEncoder()
                lbl.fit(list(X[c].values))
                X[c] = lbl.transform(list(X[c].values))
        return X

In [17]:
correlation.corr(train_data.copy(), correlation )

Unnamed: 0,correlation,parameter,abs_correlation
5,-0.434848,maritalstatus,0.434848
4,0.335272,educationnum,0.335272
7,-0.250948,relationship,0.250948
0,0.234037,age,0.234037
12,0.229658,hoursperweek,0.229658
10,0.223336,capitalgain,0.223336
9,0.215969,sex,0.215969
11,0.150501,capitalloss,0.150501
3,0.079366,education,0.079366
8,0.071847,race,0.071847


In [71]:
# so we can drop with corr < 0.1
drop_features = ['education', 'race', 'occupation', 'nativecountry', 'fnlwgt', 'workclass']
train_data = train_data.drop(drop_features, axis=1)
test_data = test_data.drop(drop_features, axis=1)

In [72]:
test_data.head()

In [20]:

class process:
     @staticmethod
     def normalize(df, numerical_columns):
         scaler = MinMaxScaler()
         df[numerical_columns] = scaler.fit_transform(df[numerical_columns])
         return df

    
   
     # get one hot vectors
     @staticmethod
     def get_one_hot(df, column):
         # get min(10, len(df[column].unique())) most frequent values
         top_n = [x for x in
                  df[column].value_counts().sort_values(ascending=False).head(min(100, len(df[column].unique()))).index]
         for label in top_n:
             df[column + '_' + label] = np.where(df[column] == label, 1, 0)
         df.drop(column, axis=1, inplace=True)
         return df
    

     # get one hot vectors for categorical columns
     @staticmethod
     def get_one_hot_all(df, categorical_columns, self=None):
         for column in categorical_columns:
             df = self.get_one_hot(df, column)
         return df

In [21]:
# Normalize numerical columns by MinMax normalization
numerical_columns = ['age', 'educationnum', 'capitalgain', 'capitalloss', 'hoursperweek']
train_data = process.normalize(train_data, numerical_columns)
test_data = process.normalize(test_data, numerical_columns)

In [22]:
train_data.head()

Unnamed: 0,age,educationnum,maritalstatus,relationship,sex,capitalgain,capitalloss,hoursperweek,income
0,0.30137,0.8,notmarried,notinfamily,male,0.02174,0.0,0.397959,<=50k
1,0.452055,0.8,married,husband,male,0.0,0.0,0.122449,<=50k
2,0.287671,0.533333,notmarried,notinfamily,male,0.0,0.0,0.397959,<=50k
3,0.493151,0.4,married,husband,male,0.0,0.0,0.397959,<=50k
4,0.150685,0.8,married,wife,female,0.0,0.0,0.397959,<=50k


In [23]:
# get one hot vectors for categorical columns
categorical_columns = [ 'maritalstatus',  'relationship',  'sex']
train_data = process.get_one_hot_all(train_data, categorical_columns,process)
test_data = process.get_one_hot_all(test_data, categorical_columns, process)

In [24]:
train_data.head()

Unnamed: 0,age,educationnum,capitalgain,capitalloss,hoursperweek,income,maritalstatus_notmarried,maritalstatus_married,relationship_husband,relationship_notinfamily,relationship_ownchild,relationship_unmarried,relationship_wife,relationship_otherrelative,sex_male,sex_female
0,0.30137,0.8,0.02174,0.0,0.397959,<=50k,1,0,0,1,0,0,0,0,1,0
1,0.452055,0.8,0.0,0.0,0.122449,<=50k,0,1,1,0,0,0,0,0,1,0
2,0.287671,0.533333,0.0,0.0,0.397959,<=50k,1,0,0,1,0,0,0,0,1,0
3,0.493151,0.4,0.0,0.0,0.397959,<=50k,0,1,1,0,0,0,0,0,1,0
4,0.150685,0.8,0.0,0.0,0.397959,<=50k,0,1,0,0,0,0,1,0,0,1


In [25]:
print(train_data.shape)
print(test_data.shape)

(32537, 16)
(16281, 16)


In [26]:
# get labels
YTrain = train_data['income'].replace(['<=50k', '>50k'], [0, 1])

# get features
XTrain = train_data.drop(['income'], axis=1)
XTrain.head()

Unnamed: 0,age,educationnum,capitalgain,capitalloss,hoursperweek,maritalstatus_notmarried,maritalstatus_married,relationship_husband,relationship_notinfamily,relationship_ownchild,relationship_unmarried,relationship_wife,relationship_otherrelative,sex_male,sex_female
0,0.30137,0.8,0.02174,0.0,0.397959,1,0,0,1,0,0,0,0,1,0
1,0.452055,0.8,0.0,0.0,0.122449,0,1,1,0,0,0,0,0,1,0
2,0.287671,0.533333,0.0,0.0,0.397959,1,0,0,1,0,0,0,0,1,0
3,0.493151,0.4,0.0,0.0,0.397959,0,1,1,0,0,0,0,0,1,0
4,0.150685,0.8,0.0,0.0,0.397959,0,1,0,0,0,0,1,0,0,1


In [27]:
# get labels
YTest = test_data['income'].replace(['<=50k', '>50k'], [0, 1])

# get features
XTest = test_data.drop(['income'], axis=1)
XTest.head()

Unnamed: 0,age,educationnum,capitalgain,capitalloss,hoursperweek,maritalstatus_notmarried,maritalstatus_married,relationship_husband,relationship_notinfamily,relationship_ownchild,relationship_unmarried,relationship_wife,relationship_otherrelative,sex_male,sex_female
0,0.109589,0.4,0.0,0.0,0.397959,1,0,0,0,1,0,0,0,1,0
1,0.287671,0.533333,0.0,0.0,0.5,0,1,1,0,0,0,0,0,1,0
2,0.150685,0.733333,0.0,0.0,0.397959,0,1,1,0,0,0,0,0,1,0
3,0.369863,0.6,0.076881,0.0,0.397959,0,1,1,0,0,0,0,0,1,0
4,0.013699,0.6,0.0,0.0,0.295918,1,0,0,0,1,0,0,0,0,1


# classifiers

In [28]:
class classifiers:
    def Evaluation(YTest, YPred, method=''):
        print('====================== ' + method + ' ======================')
        acc = accuracy_score(YTest, YPred) * 100
        print('Accuracy is %.3f%%.' % acc)
        conf = confusion_matrix(YTest, YPred)
        print(pd.DataFrame(conf, columns=['Pred-Neg', 'Pred-Pos'], index=['Actl-Neg', 'Actl-Pos']))
        precision = conf[1][1] / (conf[0][1] + conf[1][1]) if (conf[0][1] + conf[1][1]) else 0
        recall = conf[1][1] / (conf[1][0] + conf[1][1]) if (conf[1][0] + conf[1][1]) else 0
        F1 = 2 * precision * recall / (precision + recall) if (precision + recall) else 0
        print('Precision is %.3f.' % precision)
        print('Recall is %.3f.' % recall)
        print('F1 score is %.3f.' % F1)
        conf = confusion_matrix(YTest, YPred)
        print('confusion_matrix :')
        print('TP = ', conf[0][0])#TP = True Positive
        print('FP = ', conf[0][1])#FP = False Positive
        print('FN = ', conf[1][0])#FN = False Negative
        print('TN = ', conf[1][1])#TN = True Negative



    @staticmethod
    def LogisticRegression(XTrain, YTrain, XTest, YTest, self=None):
        model = LogisticRegression()
        model.fit(XTrain, YTrain)
        YPred = model.predict(XTest)
        self.Evaluation(YTest, YPred, 'Logistic Regression')

    @staticmethod
    def SVM(XTrain, YTrain, XTest, YTest, self=None):
        model = SVC()
        model.fit(XTrain, YTrain)
        YPred = model.predict(XTest)
        self.Evaluation(YTest, YPred, 'SVM')

    @staticmethod
    def DecisionTree(XTrain, YTrain, XTest, YTest, self=None):
        model = DecisionTreeClassifier()
        model.fit(XTrain, YTrain)
        YPred = model.predict(XTest)
        self.Evaluation(YTest, YPred, 'Decision Tree')

In [29]:
classifiers.LogisticRegression(XTrain, YTrain, XTest, YTest, classifiers)

Accuracy is 84.067%.
          Pred-Neg  Pred-Pos
Actl-Neg     11531       904
Actl-Pos      1690      2156
Precision is 0.705.
Recall is 0.561.
F1 score is 0.624.
confusion_matrix :
TP =  11531
FP =  904
FN =  1690
TN =  2156


In [31]:
classifiers.SVM(XTrain, YTrain, XTest, YTest, classifiers)

Accuracy is 84.436%.
          Pred-Neg  Pred-Pos
Actl-Neg     11708       727
Actl-Pos      1807      2039
Precision is 0.737.
Recall is 0.530.
F1 score is 0.617.
confusion_matrix :
TP =  11708
FP =  727
FN =  1807
TN =  2039


In [30]:
classifiers.DecisionTree(XTrain, YTrain, XTest, YTest, classifiers)

Accuracy is 81.647%.
          Pred-Neg  Pred-Pos
Actl-Neg     11289      1146
Actl-Pos      1842      2004
Precision is 0.636.
Recall is 0.521.
F1 score is 0.573.
confusion_matrix :
TP =  11289
FP =  1146
FN =  1842
TN =  2004
