In [76]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn import datasets
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, AdaBoostClassifier
from sklearn import tree
from sklearn.model_selection import KFold, cross_val_score
from sklearn.model_selection import train_test_split

In [1]:
#read the data source file
train_set = pd.read_csv('http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data', header = None)

In [2]:
train_set.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [3]:
#change the coumn names
train_set.columns =['age', 'workclass', 'fnlwgt', 'education','education_num', 'marital_status','occupation', 'relationship','race', 'sex','capital_gain', 'capital_loss','hours_per_week', 'native_country', 'salary_slab']

In [4]:
#check for the top rows
train_set.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,salary_slab
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [22]:
#check salary slab count
train_set.salary_slab.value_counts()

 <=50K    24720
 >50K      7841
Name: salary_slab, dtype: int64

In [23]:
#analyse the data
train_set.describe()

Unnamed: 0,age,fnlwgt,education_num,capital_gain,capital_loss,hours_per_week
count,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0
mean,38.581647,189778.4,10.080679,1077.648844,87.30383,40.437456
std,13.640433,105550.0,2.57272,7385.292085,402.960219,12.347429
min,17.0,12285.0,1.0,0.0,0.0,1.0
25%,28.0,117827.0,9.0,0.0,0.0,40.0
50%,37.0,178356.0,10.0,0.0,0.0,40.0
75%,48.0,237051.0,12.0,0.0,0.0,45.0
max,90.0,1484705.0,16.0,99999.0,4356.0,99.0


In [24]:
#replace ? with NaN
dataset = train_set.replace('?',np.NaN)
dataset.shape

(32561, 15)

In [25]:
#replace null values with 0
dataset.fillna(0, inplace=True)

In [26]:
X = dataset[['age', 'workclass', 'fnlwgt', 'education', 'education_num',
       'marital_status', 'occupation', 'relationship', 'race', 'sex',
       'capital_gain', 'capital_loss', 'hours_per_week', 'native_country']]
Y = dataset['salary_slab']

In [65]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

X = pd.get_dummies(X, drop_first=True)

In [66]:
#convert ndarray to dataframe
X = pd.DataFrame(X)
X.head()

Unnamed: 0,age,fnlwgt,education_num,capital_gain,capital_loss,hours_per_week,workclass_ Federal-gov,workclass_ Local-gov,workclass_ Never-worked,workclass_ Private,...,native_country_ Portugal,native_country_ Puerto-Rico,native_country_ Scotland,native_country_ South,native_country_ Taiwan,native_country_ Thailand,native_country_ Trinadad&Tobago,native_country_ United-States,native_country_ Vietnam,native_country_ Yugoslavia
0,39,77516,13,2174,0,40,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,50,83311,13,0,0,13,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,38,215646,9,0,0,40,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
3,53,234721,7,0,0,40,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
4,28,338409,13,0,0,40,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [67]:
from sklearn.model_selection import train_test_split

#split into train and test sets
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)

In [68]:
from xgboost import XGBClassifier

#create train and test data
classifier = XGBClassifier()
classifier.fit(X_train, Y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [69]:
#predict the test data with created model
y_pred = classifier.predict(X_test)

  if diff:


In [70]:
from sklearn.metrics import confusion_matrix
#check right predictions in confusion matix
cm = confusion_matrix(Y_test, y_pred)
cm

array([[4643,  275],
       [ 633,  962]], dtype=int64)

In [71]:
#efficiency of the model will be
(cm[1][1]+cm[0][0])/(cm[0][1]+cm[0][0]+cm[1][0]+cm[1][1])

0.860586519269154

In [72]:
df_2 = pd.get_dummies(dataset,drop_first=True)

In [73]:
pf=df_2.corr().sort_values('salary_slab', ascending=False)
pf['salary_slab'].head(10)

salary_slab                           1.000000
marital_status_ Married-civ-spouse    0.444696
education_num                         0.335154
age                                   0.234037
hours_per_week                        0.229689
capital_gain                          0.223329
sex_ Male                             0.215980
occupation_ Exec-managerial           0.214861
occupation_ Prof-specialty            0.185866
education_ Bachelors                  0.180485
Name: salary_slab, dtype: float64

<h4><b>Hence the top important factors are <i>marital_status, education_num, age, hours_per_week, capital_gain</i></b></h1>

In [81]:
names = ["Decision Tree", "Random Forest", "AdaBoost", "Logisic", "XGBClassifier", "GBM"]

In [82]:
classifiers = [
    DecisionTreeClassifier(max_depth=5),
    RandomForestClassifier(n_estimators=100),
    AdaBoostClassifier(n_estimators=100),
    LogisticRegression(),
    XGBClassifier(),
    GradientBoostingClassifier(n_estimators=100)]

In [83]:
for clf in zip(names, classifiers):
    clf[1].fit(X_train, Y_train) #[imp_cols]
    print(clf[0], clf[1].score(X_test, Y_test)) #[imp_cols]

Decision Tree 0.8412405957316137
Random Forest 0.8476892369107938
AdaBoost 0.8596652848149854
Logisic 0.7931828650391525


  if diff:


XGBClassifier 0.860586519269154
GBM 0.8622754491017964


<h4><b>Hence, Ada Boost and Gradient Boost classifier are best for this dataset</b></h4>