In [1]:
import pandas as pd
import numpy as np
import math
import statsmodels as sm
import sklearn as skl
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
import sklearn.linear_model as linear_model
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
import sklearn.metrics as metrics
import sklearn.tree as tree
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
Train_set = pd.read_csv('http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data', header = None, sep=' *, *', engine='python')
Test_set = pd.read_csv('http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test', skiprows = 1, sep=' *, *', engine='python', header = None)

In [3]:
pd.set_option('display.max_columns', None)
col = ['age', 'workclass', 'fnlwgt', 'education', 'education_num', 'marital_status','occupation','relationship', 'race', 'sex', 'capital_gain', 'capital_loss', 'hours_per_week','native_country', 'wage_class']

In [4]:
Train_set.columns=col
Test_set.columns=col

In [5]:
Train_set.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,wage_class
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [6]:
Train_set.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
age               32561 non-null int64
workclass         32561 non-null object
fnlwgt            32561 non-null int64
education         32561 non-null object
education_num     32561 non-null int64
marital_status    32561 non-null object
occupation        32561 non-null object
relationship      32561 non-null object
race              32561 non-null object
sex               32561 non-null object
capital_gain      32561 non-null int64
capital_loss      32561 non-null int64
hours_per_week    32561 non-null int64
native_country    32561 non-null object
wage_class        32561 non-null object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


In [7]:
Test_set.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,wage_class
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K.
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K.
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K.
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K.
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,<=50K.


In [8]:
Test_set.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16281 entries, 0 to 16280
Data columns (total 15 columns):
age               16281 non-null int64
workclass         16281 non-null object
fnlwgt            16281 non-null int64
education         16281 non-null object
education_num     16281 non-null int64
marital_status    16281 non-null object
occupation        16281 non-null object
relationship      16281 non-null object
race              16281 non-null object
sex               16281 non-null object
capital_gain      16281 non-null int64
capital_loss      16281 non-null int64
hours_per_week    16281 non-null int64
native_country    16281 non-null object
wage_class        16281 non-null object
dtypes: int64(6), object(9)
memory usage: 1.9+ MB


In [9]:
#Train_data.replace('?', 'Unknown', inplace=True)

In [10]:
#Test_data.replace('?', 'Unknown', inplace=True)

In [11]:
Train_set['Age_new'] = pd.qcut(Train_set.age, 4, labels=['very_low','low','high','very_high'])

In [12]:
Train_set['Fnlwgt_new'] = pd.qcut(Train_set.fnlwgt, 3, labels=['low','medium','high'])

In [13]:
cap_gain = Train_set['capital_gain']
Train_set['Capital_gain_new'] = list(map(lambda cap_gain:"low" if(cap_gain==0) else "high", cap_gain))

In [14]:
cap_loss = Train_set['capital_loss']
Train_set['Capital_loss_new'] = list(map(lambda cap_loss:"low" if(cap_loss==0) else "high", cap_loss))

In [15]:
hpw = Train_set['hours_per_week']
Train_set['Hours_per_week_new'] = list(map(lambda hpw:"low" if(hpw<=30) else "high" if (hpw>30 and hpw<=40) else "very high", hpw))

In [16]:
Train_dummies_age_new = pd.get_dummies(Train_set.Age_new, drop_first=True, prefix='Age')
Train_dummies_fnlwgt_new = pd.get_dummies(Train_set.Fnlwgt_new, drop_first=True, prefix='Fnlwgt')
Train_dummies_capital_gain_new = pd.get_dummies(Train_set['Capital_gain_new'], drop_first=True, prefix='Capital_gain')
Train_dummies_capital_loss_new = pd.get_dummies(Train_set['Capital_loss_new'], drop_first=True, prefix='Capital_loss')
Train_dummies_hours_per_week_new = pd.get_dummies(Train_set['Hours_per_week_new'], drop_first=True, prefix='Hours_per_week')

In [17]:
Train_set = Train_set.join([Train_dummies_age_new,Train_dummies_fnlwgt_new,Train_dummies_capital_gain_new,Train_dummies_capital_loss_new,Train_dummies_hours_per_week_new])

In [18]:
Train_set.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,wage_class,Age_new,Fnlwgt_new,Capital_gain_new,Capital_loss_new,Hours_per_week_new,Age_low,Age_high,Age_very_high,Fnlwgt_medium,Fnlwgt_high,Capital_gain_low,Capital_loss_low,Hours_per_week_low,Hours_per_week_very high
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K,high,low,high,low,high,0,1,0,0,0,0,1,0,0
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K,very_high,low,low,low,low,0,0,1,0,0,1,1,1,0
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K,high,high,low,low,high,0,1,0,0,1,1,1,0,0
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K,very_high,high,low,low,high,0,0,1,0,1,1,1,0,0
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K,very_low,high,low,low,high,0,0,0,0,1,1,1,0,0


In [19]:
Train_set = Train_set.drop(['age','Age_new','fnlwgt','Fnlwgt_new','capital_gain','Capital_gain_new','capital_loss','Capital_loss_new','hours_per_week','Hours_per_week_new'],axis=1)

In [20]:
Train_set.head()

Unnamed: 0,workclass,education,education_num,marital_status,occupation,relationship,race,sex,native_country,wage_class,Age_low,Age_high,Age_very_high,Fnlwgt_medium,Fnlwgt_high,Capital_gain_low,Capital_loss_low,Hours_per_week_low,Hours_per_week_very high
0,State-gov,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,United-States,<=50K,0,1,0,0,0,0,1,0,0
1,Self-emp-not-inc,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,United-States,<=50K,0,0,1,0,0,1,1,1,0
2,Private,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,United-States,<=50K,0,1,0,0,1,1,1,0,0
3,Private,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,United-States,<=50K,0,0,1,0,1,1,1,0,0
4,Private,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,Cuba,<=50K,0,0,0,0,1,1,1,0,0


In [21]:
Test_set['Age_new'] = pd.qcut(Test_set.age, 4, labels=['very_low','low','high','very_high'])

In [22]:
Test_set['Fnlwgt_new'] = pd.qcut(Test_set.fnlwgt, 3, labels=['low','medium','high'])

In [23]:
cap_gain = Test_set['capital_gain']
Test_set['Capital_gain_new'] = list(map(lambda cap_gain:"low" if(cap_gain==0) else "high", cap_gain))

In [24]:
cap_loss = Test_set['capital_loss']
Test_set['Capital_loss_new'] = list(map(lambda cap_loss:"low" if(cap_loss==0) else "high", cap_loss))

In [25]:
hpw = Test_set['hours_per_week']
Test_set['Hours_per_week_new'] = list(map(lambda hpw:"low" if(hpw<=30) else "high" if (hpw>30 and hpw<=40) else "very high", hpw))

In [26]:
Test_dummies_age_new = pd.get_dummies(Test_set.Age_new, drop_first=True, prefix='Age')
Test_dummies_fnlwgt_new = pd.get_dummies(Test_set.Fnlwgt_new, drop_first=True, prefix='Fnlwgt')
Test_dummies_capital_gain_new = pd.get_dummies(Test_set['Capital_gain_new'], drop_first=True, prefix='Capital_gain')
Test_dummies_capital_loss_new = pd.get_dummies(Test_set['Capital_loss_new'], drop_first=True, prefix='Capital_loss')
Test_dummies_hours_per_week_new = pd.get_dummies(Test_set['Hours_per_week_new'], drop_first=True, prefix='Hours_per_week')

In [28]:
Test_set = Test_set.join([Train_dummies_age_new,Train_dummies_fnlwgt_new,Train_dummies_capital_gain_new,Train_dummies_capital_loss_new,Train_dummies_hours_per_week_new])

In [29]:
Test_set = Test_set.drop(['age','Age_new','fnlwgt','Fnlwgt_new','capital_gain','Capital_gain_new','capital_loss','Capital_loss_new','hours_per_week','Hours_per_week_new'],axis=1)

In [30]:
Test_set.head()

Unnamed: 0,workclass,education,education_num,marital_status,occupation,relationship,race,sex,native_country,wage_class,Age_low,Age_high,Age_very_high,Fnlwgt_medium,Fnlwgt_high,Capital_gain_low,Capital_loss_low,Hours_per_week_low,Hours_per_week_very high
0,Private,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,United-States,<=50K.,0,1,0,0,0,0,1,0,0
1,Private,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,United-States,<=50K.,0,0,1,0,0,1,1,1,0
2,Local-gov,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,United-States,>50K.,0,1,0,0,1,1,1,0,0
3,Private,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,United-States,>50K.,0,0,1,0,1,1,1,0,0
4,?,Some-college,10,Never-married,?,Own-child,White,Female,United-States,<=50K.,0,0,0,0,1,1,1,0,0


In [31]:
categories_dict = {"United-States":"North America","Mexico":"North America","Unknown":"Unknown","Philippines":"Asia","Germany":"Europe","Canada":"North America","Puerto-Rico":"Central America","El-Salvador":"Central America","India":"Asia","Cuba":"Central America","England":"Europe","Jamaica":"Central America","South":"Asia","China":"Asia","Italy":"Europe","Dominican-Republic":"Central America","Vietnam":"Asia","Guatemala":"Central America","Japan":"Asia","Poland":"Europe","Columbia":"South America","Taiwan":"Asia",
"Haiti":"Central America","Iran":"Asia","Portugal":"Europe","Nicaragua":"Central America","Peru":"South America","Greece":"Europe","France":"Europe","Ecuador":"South America","Ireland":"Europe","Hong":"Asia","Cambodia":"Asia","Trinadad&Tobago":"Central America","Laos":"Asia","Thailand":"Asia","Yugoslavia":"Europe","Outlying-US(Guam-USVI-etc)":"North America","Honduras":"Central America","Hungary":"Europe","Scotland":"Europe","Holand-Netherlands":"Europe"}
Train_set['Continent'] = Train_set['native_country'].map(categories_dict)
Test_set['Continent'] = Test_set['native_country'].map(categories_dict)

In [32]:
Train_set = Train_set.drop(['native_country'],axis=1)
Test_set = Test_set.drop(['native_country'],axis=1)

In [34]:
Train_set.head()

Unnamed: 0,workclass,education,education_num,marital_status,occupation,relationship,race,sex,wage_class,Age_low,Age_high,Age_very_high,Fnlwgt_medium,Fnlwgt_high,Capital_gain_low,Capital_loss_low,Hours_per_week_low,Hours_per_week_very high,Continent
0,State-gov,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,<=50K,0,1,0,0,0,0,1,0,0,North America
1,Self-emp-not-inc,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,<=50K,0,0,1,0,0,1,1,1,0,North America
2,Private,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,<=50K,0,1,0,0,1,1,1,0,0,North America
3,Private,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,<=50K,0,0,1,0,1,1,1,0,0,North America
4,Private,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,<=50K,0,0,0,0,1,1,1,0,0,Central America


In [37]:
Test_set.head()

Unnamed: 0,workclass,education,education_num,marital_status,occupation,relationship,race,sex,wage_class,Age_low,Age_high,Age_very_high,Fnlwgt_medium,Fnlwgt_high,Capital_gain_low,Capital_loss_low,Hours_per_week_low,Hours_per_week_very high,Continent
0,Private,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,<=50K.,0,1,0,0,0,0,1,0,0,North America
1,Private,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,<=50K.,0,0,1,0,0,1,1,1,0,North America
2,Local-gov,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,>50K.,0,1,0,0,1,1,1,0,0,North America
3,Private,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,>50K.,0,0,1,0,1,1,1,0,0,North America
4,?,Some-college,10,Never-married,?,Own-child,White,Female,<=50K.,0,0,0,0,1,1,1,0,0,North America


In [40]:
Train_dummies_continent = pd.get_dummies(Train_set.Continent, drop_first=True, prefix='Continent')
Test_dummies_continent = pd.get_dummies(Test_set.Continent, drop_first=True, prefix='Continent')

In [41]:
Train_set = Train_set.join(Train_dummies_continent)
Test_set = Test_set.join(Test_dummies_continent)

In [42]:
Train_set.head()

Unnamed: 0,workclass,education,education_num,marital_status,occupation,relationship,race,sex,wage_class,Age_low,Age_high,Age_very_high,Fnlwgt_medium,Fnlwgt_high,Capital_gain_low,Capital_loss_low,Hours_per_week_low,Hours_per_week_very high,Continent,Continent_Central America,Continent_Europe,Continent_North America,Continent_South America
0,State-gov,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,<=50K,0,1,0,0,0,0,1,0,0,North America,0,0,1,0
1,Self-emp-not-inc,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,<=50K,0,0,1,0,0,1,1,1,0,North America,0,0,1,0
2,Private,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,<=50K,0,1,0,0,1,1,1,0,0,North America,0,0,1,0
3,Private,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,<=50K,0,0,1,0,1,1,1,0,0,North America,0,0,1,0
4,Private,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,<=50K,0,0,0,0,1,1,1,0,0,Central America,1,0,0,0


In [43]:
Test_set.head()

Unnamed: 0,workclass,education,education_num,marital_status,occupation,relationship,race,sex,wage_class,Age_low,Age_high,Age_very_high,Fnlwgt_medium,Fnlwgt_high,Capital_gain_low,Capital_loss_low,Hours_per_week_low,Hours_per_week_very high,Continent,Continent_Central America,Continent_Europe,Continent_North America,Continent_South America
0,Private,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,<=50K.,0,1,0,0,0,0,1,0,0,North America,0,0,1,0
1,Private,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,<=50K.,0,0,1,0,0,1,1,1,0,North America,0,0,1,0
2,Local-gov,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,>50K.,0,1,0,0,1,1,1,0,0,North America,0,0,1,0
3,Private,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,>50K.,0,0,1,0,1,1,1,0,0,North America,0,0,1,0
4,?,Some-college,10,Never-married,?,Own-child,White,Female,<=50K.,0,0,0,0,1,1,1,0,0,North America,0,0,1,0


In [44]:
Train_set = Train_set.drop(['Continent'],axis=1)
Test_set = Test_set.drop(['Continent'],axis=1)

In [45]:
Train_set.drop(['education_num'], axis=1, inplace=True)

In [46]:
Train_set = pd.get_dummies(Train_set,columns=['workclass','education','marital_status','occupation','relationship','race','sex'],drop_first=True)

In [47]:
Train_set['wage_class'] = Train_set['wage_class'].map({'<=50K':0,'>50K':1})

In [48]:
Train_set.head()

Unnamed: 0,wage_class,Age_low,Age_high,Age_very_high,Fnlwgt_medium,Fnlwgt_high,Capital_gain_low,Capital_loss_low,Hours_per_week_low,Hours_per_week_very high,Continent_Central America,Continent_Europe,Continent_North America,Continent_South America,workclass_Federal-gov,workclass_Local-gov,workclass_Never-worked,workclass_Private,workclass_Self-emp-inc,workclass_Self-emp-not-inc,workclass_State-gov,workclass_Without-pay,education_11th,education_12th,education_1st-4th,education_5th-6th,education_7th-8th,education_9th,education_Assoc-acdm,education_Assoc-voc,education_Bachelors,education_Doctorate,education_HS-grad,education_Masters,education_Preschool,education_Prof-school,education_Some-college,marital_status_Married-AF-spouse,marital_status_Married-civ-spouse,marital_status_Married-spouse-absent,marital_status_Never-married,marital_status_Separated,marital_status_Widowed,occupation_Adm-clerical,occupation_Armed-Forces,occupation_Craft-repair,occupation_Exec-managerial,occupation_Farming-fishing,occupation_Handlers-cleaners,occupation_Machine-op-inspct,occupation_Other-service,occupation_Priv-house-serv,occupation_Prof-specialty,occupation_Protective-serv,occupation_Sales,occupation_Tech-support,occupation_Transport-moving,relationship_Not-in-family,relationship_Other-relative,relationship_Own-child,relationship_Unmarried,relationship_Wife,race_Asian-Pac-Islander,race_Black,race_Other,race_White,sex_Male
0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,1
1,0,0,0,1,0,0,1,1,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1
2,0,0,1,0,0,1,1,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,1
3,0,0,0,1,0,1,1,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1
4,0,0,0,0,0,1,1,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1,0,0,0


In [49]:
Test_set = pd.get_dummies(Test_set,columns=['workclass','education','marital_status','occupation','relationship','race','sex'],drop_first=True)

In [50]:
Test_set['wage_class'] = Test_set['wage_class'].map({'<=50K.':0,'>50K.':1})

In [51]:
Test_set.head()

Unnamed: 0,education_num,wage_class,Age_low,Age_high,Age_very_high,Fnlwgt_medium,Fnlwgt_high,Capital_gain_low,Capital_loss_low,Hours_per_week_low,Hours_per_week_very high,Continent_Central America,Continent_Europe,Continent_North America,Continent_South America,workclass_Federal-gov,workclass_Local-gov,workclass_Never-worked,workclass_Private,workclass_Self-emp-inc,workclass_Self-emp-not-inc,workclass_State-gov,workclass_Without-pay,education_11th,education_12th,education_1st-4th,education_5th-6th,education_7th-8th,education_9th,education_Assoc-acdm,education_Assoc-voc,education_Bachelors,education_Doctorate,education_HS-grad,education_Masters,education_Preschool,education_Prof-school,education_Some-college,marital_status_Married-AF-spouse,marital_status_Married-civ-spouse,marital_status_Married-spouse-absent,marital_status_Never-married,marital_status_Separated,marital_status_Widowed,occupation_Adm-clerical,occupation_Armed-Forces,occupation_Craft-repair,occupation_Exec-managerial,occupation_Farming-fishing,occupation_Handlers-cleaners,occupation_Machine-op-inspct,occupation_Other-service,occupation_Priv-house-serv,occupation_Prof-specialty,occupation_Protective-serv,occupation_Sales,occupation_Tech-support,occupation_Transport-moving,relationship_Not-in-family,relationship_Other-relative,relationship_Own-child,relationship_Unmarried,relationship_Wife,race_Asian-Pac-Islander,race_Black,race_Other,race_White,sex_Male
0,7,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,1
1,9,0,0,0,1,0,0,1,1,1,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1
2,12,1,0,1,0,0,1,1,1,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,1
3,10,1,0,0,1,0,1,1,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1
4,10,0,0,0,0,0,1,1,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0


In [52]:
Test_set.drop(['education_num'],axis=1,inplace=True)

In [53]:
Test_set.head()

Unnamed: 0,wage_class,Age_low,Age_high,Age_very_high,Fnlwgt_medium,Fnlwgt_high,Capital_gain_low,Capital_loss_low,Hours_per_week_low,Hours_per_week_very high,Continent_Central America,Continent_Europe,Continent_North America,Continent_South America,workclass_Federal-gov,workclass_Local-gov,workclass_Never-worked,workclass_Private,workclass_Self-emp-inc,workclass_Self-emp-not-inc,workclass_State-gov,workclass_Without-pay,education_11th,education_12th,education_1st-4th,education_5th-6th,education_7th-8th,education_9th,education_Assoc-acdm,education_Assoc-voc,education_Bachelors,education_Doctorate,education_HS-grad,education_Masters,education_Preschool,education_Prof-school,education_Some-college,marital_status_Married-AF-spouse,marital_status_Married-civ-spouse,marital_status_Married-spouse-absent,marital_status_Never-married,marital_status_Separated,marital_status_Widowed,occupation_Adm-clerical,occupation_Armed-Forces,occupation_Craft-repair,occupation_Exec-managerial,occupation_Farming-fishing,occupation_Handlers-cleaners,occupation_Machine-op-inspct,occupation_Other-service,occupation_Priv-house-serv,occupation_Prof-specialty,occupation_Protective-serv,occupation_Sales,occupation_Tech-support,occupation_Transport-moving,relationship_Not-in-family,relationship_Other-relative,relationship_Own-child,relationship_Unmarried,relationship_Wife,race_Asian-Pac-Islander,race_Black,race_Other,race_White,sex_Male
0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,1
1,0,0,0,1,0,0,1,1,1,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1
2,1,0,1,0,0,1,1,1,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,1
3,1,0,0,1,0,1,1,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1
4,0,0,0,0,0,1,1,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0


In [54]:
X_Test = Test_set.iloc[:,1:]
y_Test = Test_set.iloc[:,0]

In [55]:
X_Train = Train_set.iloc[:,1:]
y_Train = Train_set.iloc[:,0]

# XG Boost Classifier

In [56]:
import xgboost as xgb

In [57]:
data_dmatrix = xgb.DMatrix(data=X_Train,label=y_Train)
xg_class = xgb.XGBClassifier(objective ='binary:logistic', colsample_bytree = 0.3, learning_rate = 0.1, max_depth = 5, alpha = 10, n_estimators = 100)
xg_class.fit(X_Train,y_Train)
predicted = xg_class.predict(X_Test)

  if diff:


In [58]:
accuracy = metrics.accuracy_score(y_Test, predicted)
print("Accuracy is %.2f%%" % (accuracy * 100.0))

Accuracy is 81.04%


# Logistic Regression

In [59]:
lm= LogisticRegression()
lm= lm.fit(X_Train, y_Train)

In [60]:
y_pred=lm.predict(X_Test)

In [61]:
print("Accuracy is ", metrics.accuracy_score(y_Test, y_pred))


Accuracy is  0.8034518764203673


# Decision Tree

In [62]:
from sklearn.tree import DecisionTreeClassifier

In [63]:
dtree = DecisionTreeClassifier(criterion='gini', random_state=0)
dtree.fit(X_Train, y_Train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=0,
            splitter='best')

In [64]:
y_pred=dtree.predict(X_Test)

In [65]:
count_misclassified = (y_Test != y_pred).sum()
print('Misclassified samples: {}'.format(count_misclassified))
accuracy = metrics.accuracy_score(y_Test, y_pred)
print('Accuracy: {:.2f}'.format(accuracy))

Misclassified samples: 3902
Accuracy: 0.76


# Random Forest

In [66]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(random_state=1)
rf.fit(X_Train, y_Train)
predicted = rf.predict(X_Test)
print(roc_auc_score(y_Test, predicted))

0.657937342825438


# Naive Bayes

In [67]:
from sklearn.naive_bayes import GaussianNB
nb = GaussianNB()
nb.fit(X_Train,y_Train)
y_pred = nb.predict(X_Test)
print("Accuracy is ", metrics.accuracy_score(y_Test, y_pred))

Accuracy is  0.5739819421411462


# K-Nearest Neighbour Classifier

In [None]:
from sklearn.neighbors import KNeighborsClassifier
for K in range(25):
    K_value = K+1
    neigh = KNeighborsClassifier(n_neighbors = K_value, weights='uniform', algorithm='auto')
    neigh.fit(X_Train, y_Train) 
    predicted = neigh.predict(X_Test)
    print("Accuracy is ", metrics.accuracy_score(y_Test,predicted))

Accuracy is  0.7643879368589153
