In [2]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split,cross_val_score,KFold,GridSearchCV
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score

In [3]:
train_set = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data', header = None)
test_set = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test' , skiprows = 1, header = None)
col_labels = ['age', 'workclass', 'fnlwgt', 'education', 'education_num','marital_status', 'occupation','relationship', 'race', 'sex', 'capital_gain','capital_loss', 'hours_per_week', 'native_country', 'wage_class']
train_set.columns = col_labels
test_set.columns = col_labels

In [4]:
train_set.describe()

Unnamed: 0,age,fnlwgt,education_num,capital_gain,capital_loss,hours_per_week
count,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0
mean,38.581647,189778.4,10.080679,1077.648844,87.30383,40.437456
std,13.640433,105550.0,2.57272,7385.292085,402.960219,12.347429
min,17.0,12285.0,1.0,0.0,0.0,1.0
25%,28.0,117827.0,9.0,0.0,0.0,40.0
50%,37.0,178356.0,10.0,0.0,0.0,40.0
75%,48.0,237051.0,12.0,0.0,0.0,45.0
max,90.0,1484705.0,16.0,99999.0,4356.0,99.0


In [5]:
train_set.isna().sum()

age               0
workclass         0
fnlwgt            0
education         0
education_num     0
marital_status    0
occupation        0
relationship      0
race              0
sex               0
capital_gain      0
capital_loss      0
hours_per_week    0
native_country    0
wage_class        0
dtype: int64

In [6]:
test_set.isna().sum()

age               0
workclass         0
fnlwgt            0
education         0
education_num     0
marital_status    0
occupation        0
relationship      0
race              0
sex               0
capital_gain      0
capital_loss      0
hours_per_week    0
native_country    0
wage_class        0
dtype: int64

In [7]:
train_set.dtypes

age                int64
workclass         object
fnlwgt             int64
education         object
education_num      int64
marital_status    object
occupation        object
relationship      object
race              object
sex               object
capital_gain       int64
capital_loss       int64
hours_per_week     int64
native_country    object
wage_class        object
dtype: object

In [31]:
dict_workclass={}
count=0
for i in train_set.workclass.unique():
    dict_workclass[i] = count
    count+=1

dict_edu={}
count=0
for i in train_set.education.unique():
    dict_edu[i] = count
    count+=1

dict_ms={}
count=0
for i in train_set.marital_status.unique():
    dict_ms[i] = count
    count+=1

dict_occ={}
count=0
for i in train_set.occupation.unique():
    dict_occ[i] = count
    count+=1

dict_realt={}
count=0
for i in train_set.relationship.unique():
    dict_realt[i] = count
    count+=1

dict_race={}
count=0
for i in train_set.race.unique():
    dict_race[i] = count
    count+=1

dict_sex={}
count=0
for i in train_set.sex.unique():
    dict_sex[i] = count
    count+=1

dict_nc={}
count=0
for i in train_set.native_country.unique():
    dict_nc[i] = count
    count+=1

dict_wagec={}
count=0
for i in train_set.wage_class.unique():
    dict_wagec[i] = count
    count+=1    

In [32]:
train_set['wage_class'] = train_set['wage_class'].map(dict_wagec)
train_set['native_country'] = train_set['native_country'].map(dict_nc)
train_set['sex'] = train_set['sex'].map(dict_sex)
train_set['race'] = train_set['race'].map(dict_race)
train_set['relationship'] = train_set['relationship'].map(dict_realt)
train_set['occupation'] = train_set['occupation'].map(dict_occ)
train_set['marital_status'] = train_set['marital_status'].map(dict_ms)
train_set['education'] = train_set['education'].map(dict_edu)
train_set['workclass'] = train_set['workclass'].map(dict_workclass)

In [33]:
train_set.dtypes

age               int64
workclass         int64
fnlwgt            int64
education         int64
education_num     int64
marital_status    int64
occupation        int64
relationship      int64
race              int64
sex               int64
capital_gain      int64
capital_loss      int64
hours_per_week    int64
native_country    int64
wage_class        int64
dtype: object

In [48]:
dictT_workclass={}
count=0
for i in test_set.workclass.unique():
    dictT_workclass[i] = count
    count+=1

dictT_edu={}
count=0
for i in test_set.education.unique():
    dictT_edu[i] = count
    count+=1

dictT_ms={}
count=0
for i in test_set.marital_status.unique():
    dictT_ms[i] = count
    count+=1

dictT_occ={}
count=0
for i in test_set.occupation.unique():
    dictT_occ[i] = count
    count+=1

dictT_realt={}
count=0
for i in test_set.relationship.unique():
    dictT_realt[i] = count
    count+=1

dictT_race={}
count=0
for i in test_set.race.unique():
    dictT_race[i] = count
    count+=1

dictT_sex={}
count=0
for i in test_set.sex.unique():
    dictT_sex[i] = count
    count+=1

dictT_nc={}
count=0
for i in test_set.native_country.unique():
    dictT_nc[i] = count
    count+=1

dictT_wagec={}
count=0
for i in test_set.wage_class.unique():
    dictT_wagec[i] = count
    count+=1 

In [49]:
test_set['wage_class'] = test_set['wage_class'].map(dictT_wagec)
test_set['native_country'] = test_set['native_country'].map(dictT_nc)
test_set['sex'] = test_set['sex'].map(dictT_sex)
test_set['race'] = test_set['race'].map(dictT_race)
test_set['relationship'] = test_set['relationship'].map(dictT_realt)
test_set['occupation'] = test_set['occupation'].map(dictT_occ)
test_set['marital_status'] = test_set['marital_status'].map(dictT_ms)
test_set['education'] = test_set['education'].map(dictT_edu)
test_set['workclass'] = test_set['workclass'].map(dictT_workclass)

In [50]:
test_set 

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,wage_class
0,25,0,226802,0,7,0,0,0,0,0,0,0,40,0,0
1,38,0,89814,0,9,0,0,0,0,0,0,0,50,0,0
2,28,1,336951,0,12,0,0,0,0,0,0,0,40,0,0
3,44,0,160323,0,10,0,0,0,0,0,7688,0,40,0,0
4,18,2,103497,0,10,0,0,0,0,0,0,0,30,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16276,39,0,215419,0,13,0,0,0,0,0,0,0,36,0,0
16277,64,2,321403,0,9,0,0,0,0,0,0,0,40,0,0
16278,38,0,374983,0,13,0,0,0,0,0,0,0,50,0,0
16279,44,0,83891,0,13,0,0,0,0,0,5455,0,40,0,0


In [51]:
Xtrain_set = train_set.drop('wage_class',axis=1)
Ytrain_set = train_set['wage_class']

Xtest_set = test_set.drop('wage_class',axis=1)
Ytest_set = test_set['wage_class']

In [52]:
import xgboost as xgb
from xgboost import XGBClassifier
model = XGBClassifier(objective='binary:logistic')
model.fit(Xtrain_set, Ytrain_set)

XGBClassifier()

In [53]:
y_pred = model.predict(Xtrain_set)
predictions = [round(value) for value in y_pred]
accuracy = accuracy_score(Ytrain_set,predictions)
accuracy

0.8648075919044256

In [54]:
y_pred = model.predict(Xtest_set)
predictions = [round(value) for value in y_pred]
accuracy = accuracy_score(Ytest_set,predictions)
accuracy

0.9513543394140409

In [55]:
from sklearn.model_selection import GridSearchCV

In [59]:
param_grid={
   
    ' learning_rate':[1,0.5,0.1],
    'max_depth': [3,5,10],
    'n_estimators':[10,50,100]
    
}

In [60]:
grid= GridSearchCV(XGBClassifier(objective='binary:logistic'),param_grid, verbose=3)

In [61]:
grid.fit(Xtrain_set,Ytrain_set)

Fitting 5 folds for each of 27 candidates, totalling 135 fits
[CV 1/5] END . learning_rate=1, max_depth=3, n_estimators=10; total time=   0.1s
[CV 2/5] END . learning_rate=1, max_depth=3, n_estimators=10; total time=   0.1s
[CV 3/5] END . learning_rate=1, max_depth=3, n_estimators=10; total time=   0.1s
[CV 4/5] END . learning_rate=1, max_depth=3, n_estimators=10; total time=   0.1s
[CV 5/5] END . learning_rate=1, max_depth=3, n_estimators=10; total time=   0.1s
[CV 1/5] END . learning_rate=1, max_depth=3, n_estimators=50; total time=   0.6s
[CV 2/5] END . learning_rate=1, max_depth=3, n_estimators=50; total time=   0.6s
[CV 3/5] END . learning_rate=1, max_depth=3, n_estimators=50; total time=   0.6s
[CV 4/5] END . learning_rate=1, max_depth=3, n_estimators=50; total time=   0.6s
[CV 5/5] END . learning_rate=1, max_depth=3, n_estimators=50; total time=   0.6s
[CV 1/5] END  learning_rate=1, max_depth=3, n_estimators=100; total time=   1.3s
[CV 2/5] END  learning_rate=1, max_depth=3, n_e

GridSearchCV(estimator=XGBClassifier(),
             param_grid={' learning_rate': [1, 0.5, 0.1],
                         'max_depth': [3, 5, 10],
                         'n_estimators': [10, 50, 100]},
             verbose=3)

In [62]:
grid.best_params_

{' learning_rate': 1, 'max_depth': 10, 'n_estimators': 100}

In [63]:
new_model=XGBClassifier(learning_rate= 1, max_depth= 10, n_estimators= 100)
new_model.fit(Xtrain_set, Ytrain_set)

XGBClassifier(learning_rate=1, max_depth=10)

In [64]:
y_pred_new = new_model.predict(Xtest_set)
predictions_new = [round(value) for value in y_pred_new]
accuracy_new = accuracy_score(Ytest_set,predictions_new)
accuracy_new

0.9369817578772802