# XGBoost Assignemet 

In [1]:
import numpy as np
import pandas as pd
import xgboost as xgb
import pickle
from sklearn import datasets
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
train_url = '''https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data''' 
test_url =  '''https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test'''
train_set  =  pd.read_csv(train_url, header = None) 
test_set  =  pd.read_csv(test_url, skiprows = 1, header = None) 
col_labels = ['age', 'workclass', 'fnlwgt', 'education', 'education_num', 
              'marital_status',  'occupation','relationship',  'race',  'sex', 'capital_gain',
              'capital_loss', 'hours_per_week', 'native_country', 'wage_class']
train_set.columns = col_labels
test_set.columns = col_labels
print(train_set.shape)
print(test_set.shape)

(32561, 15)
(16281, 15)


In [3]:
train_set.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,wage_class
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [4]:
train = train_set.copy()
test = test_set.copy()
train.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,wage_class
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [5]:
train.wage_class.unique()

array([' <=50K', ' >50K'], dtype=object)

In [6]:
wage_class = {" <=50K" : 0," >50K" : 1}
train['wage_class'] = train['wage_class'].map(wage_class)
train['wage_class'].head(5)

0    0
1    0
2    0
3    0
4    0
Name: wage_class, dtype: int64

In [7]:
test.wage_class.unique()

array([' <=50K.', ' >50K.'], dtype=object)

In [8]:
wage_class = {" <=50K." : 0," >50K." : 1}
test['wage_class'] = test['wage_class'].map(wage_class)
test['wage_class'].head(5)

0    0
1    0
2    1
3    1
4    0
Name: wage_class, dtype: int64

In [9]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32561 non-null  int64 
 1   workclass       32561 non-null  object
 2   fnlwgt          32561 non-null  int64 
 3   education       32561 non-null  object
 4   education_num   32561 non-null  int64 
 5   marital_status  32561 non-null  object
 6   occupation      32561 non-null  object
 7   relationship    32561 non-null  object
 8   race            32561 non-null  object
 9   sex             32561 non-null  object
 10  capital_gain    32561 non-null  int64 
 11  capital_loss    32561 non-null  int64 
 12  hours_per_week  32561 non-null  int64 
 13  native_country  32561 non-null  object
 14  wage_class      32561 non-null  int64 
dtypes: int64(7), object(8)
memory usage: 3.7+ MB


In [10]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16281 entries, 0 to 16280
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             16281 non-null  int64 
 1   workclass       16281 non-null  object
 2   fnlwgt          16281 non-null  int64 
 3   education       16281 non-null  object
 4   education_num   16281 non-null  int64 
 5   marital_status  16281 non-null  object
 6   occupation      16281 non-null  object
 7   relationship    16281 non-null  object
 8   race            16281 non-null  object
 9   sex             16281 non-null  object
 10  capital_gain    16281 non-null  int64 
 11  capital_loss    16281 non-null  int64 
 12  hours_per_week  16281 non-null  int64 
 13  native_country  16281 non-null  object
 14  wage_class      16281 non-null  int64 
dtypes: int64(7), object(8)
memory usage: 1.9+ MB


In [11]:
train.describe()

Unnamed: 0,age,fnlwgt,education_num,capital_gain,capital_loss,hours_per_week,wage_class
count,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0
mean,38.581647,189778.4,10.080679,1077.648844,87.30383,40.437456,0.24081
std,13.640433,105550.0,2.57272,7385.292085,402.960219,12.347429,0.427581
min,17.0,12285.0,1.0,0.0,0.0,1.0,0.0
25%,28.0,117827.0,9.0,0.0,0.0,40.0,0.0
50%,37.0,178356.0,10.0,0.0,0.0,40.0,0.0
75%,48.0,237051.0,12.0,0.0,0.0,45.0,0.0
max,90.0,1484705.0,16.0,99999.0,4356.0,99.0,1.0


In [12]:
obj_col = [feature for feature in train.columns if train[feature].dtype == 'object']
obj_col

['workclass',
 'education',
 'marital_status',
 'occupation',
 'relationship',
 'race',
 'sex',
 'native_country']

In [13]:
train[obj_col].describe()

Unnamed: 0,workclass,education,marital_status,occupation,relationship,race,sex,native_country
count,32561,32561,32561,32561,32561,32561,32561,32561
unique,9,16,7,15,6,5,2,42
top,Private,HS-grad,Married-civ-spouse,Prof-specialty,Husband,White,Male,United-States
freq,22696,10501,14976,4140,13193,27816,21790,29170


In [14]:
print('workclass',train.workclass.unique(),'\n')
print('education',train.education.unique(),'\n')
print('marital_status',train.marital_status.unique(),'\n')
print('occupation',train.occupation.unique(),'\n')
print('relationship',train.relationship.unique(),'\n')
print('race',train.race.unique(),'\n')
print('sex',train.sex.unique(),'\n')
print('native_country',train.native_country.unique(),'\n')

workclass [' State-gov' ' Self-emp-not-inc' ' Private' ' Federal-gov' ' Local-gov'
 ' ?' ' Self-emp-inc' ' Without-pay' ' Never-worked'] 

education [' Bachelors' ' HS-grad' ' 11th' ' Masters' ' 9th' ' Some-college'
 ' Assoc-acdm' ' Assoc-voc' ' 7th-8th' ' Doctorate' ' Prof-school'
 ' 5th-6th' ' 10th' ' 1st-4th' ' Preschool' ' 12th'] 

marital_status [' Never-married' ' Married-civ-spouse' ' Divorced'
 ' Married-spouse-absent' ' Separated' ' Married-AF-spouse' ' Widowed'] 

occupation [' Adm-clerical' ' Exec-managerial' ' Handlers-cleaners' ' Prof-specialty'
 ' Other-service' ' Sales' ' Craft-repair' ' Transport-moving'
 ' Farming-fishing' ' Machine-op-inspct' ' Tech-support' ' ?'
 ' Protective-serv' ' Armed-Forces' ' Priv-house-serv'] 

relationship [' Not-in-family' ' Husband' ' Wife' ' Own-child' ' Unmarried'
 ' Other-relative'] 

race [' White' ' Black' ' Asian-Pac-Islander' ' Amer-Indian-Eskimo' ' Other'] 

sex [' Male' ' Female'] 

native_country [' United-States' ' Cuba' ' Jamai

In [15]:
print(train['workclass'].value_counts())
print(train['occupation'].value_counts())
print(train['native_country'].value_counts())

 Private             22696
 Self-emp-not-inc     2541
 Local-gov            2093
 ?                    1836
 State-gov            1298
 Self-emp-inc         1116
 Federal-gov           960
 Without-pay            14
 Never-worked            7
Name: workclass, dtype: int64
 Prof-specialty       4140
 Craft-repair         4099
 Exec-managerial      4066
 Adm-clerical         3770
 Sales                3650
 Other-service        3295
 Machine-op-inspct    2002
 ?                    1843
 Transport-moving     1597
 Handlers-cleaners    1370
 Farming-fishing       994
 Tech-support          928
 Protective-serv       649
 Priv-house-serv       149
 Armed-Forces            9
Name: occupation, dtype: int64
 United-States                 29170
 Mexico                          643
 ?                               583
 Philippines                     198
 Germany                         137
 Canada                          121
 Puerto-Rico                     114
 El-Salvador                    

In [16]:
print(train.shape)
print(train.replace(' ?', np.nan).dropna().shape)
print(test.shape)
print(test.replace(' ?', np.nan).dropna().shape)

(32561, 15)
(30162, 15)
(16281, 15)
(15060, 15)


In [17]:
train_n = train.replace(' ?', np.nan).dropna()
test_n = test.replace(' ?', np.nan).dropna()
print(train_n.shape)
print(test_n.shape)

(30162, 15)
(15060, 15)


In [18]:
train_n.head(5)

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,wage_class
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,0
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,0
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,0
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,0
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,0


In [19]:
train_n.age = train_n.age.astype(float)
train_n.fnlwgt = train_n.fnlwgt.astype(float)
train_n.education_num = train_n.education_num.astype(float)
train_n.capital_gain = train_n.capital_gain.astype(float)
train_n.capital_loss = train_n.capital_loss.astype(float)
train_n.hours_per_week = train_n.hours_per_week.astype(float)
train_n.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 30162 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   age             30162 non-null  float64
 1   workclass       30162 non-null  object 
 2   fnlwgt          30162 non-null  float64
 3   education       30162 non-null  object 
 4   education_num   30162 non-null  float64
 5   marital_status  30162 non-null  object 
 6   occupation      30162 non-null  object 
 7   relationship    30162 non-null  object 
 8   race            30162 non-null  object 
 9   sex             30162 non-null  object 
 10  capital_gain    30162 non-null  float64
 11  capital_loss    30162 non-null  float64
 12  hours_per_week  30162 non-null  float64
 13  native_country  30162 non-null  object 
 14  wage_class      30162 non-null  int64  
dtypes: float64(6), int64(1), object(8)
memory usage: 3.7+ MB


In [20]:
test_n.age = test_n.age.astype(float)
test_n.fnlwgt = test_n.fnlwgt.astype(float)
test_n.education_num = test_n.education_num.astype(float)
test_n.capital_gain = test_n.capital_gain.astype(float)
test_n.capital_loss = test_n.capital_loss.astype(float)
test_n.hours_per_week = test_n.hours_per_week.astype(float)
test_n.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 15060 entries, 0 to 16280
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   age             15060 non-null  float64
 1   workclass       15060 non-null  object 
 2   fnlwgt          15060 non-null  float64
 3   education       15060 non-null  object 
 4   education_num   15060 non-null  float64
 5   marital_status  15060 non-null  object 
 6   occupation      15060 non-null  object 
 7   relationship    15060 non-null  object 
 8   race            15060 non-null  object 
 9   sex             15060 non-null  object 
 10  capital_gain    15060 non-null  float64
 11  capital_loss    15060 non-null  float64
 12  hours_per_week  15060 non-null  float64
 13  native_country  15060 non-null  object 
 14  wage_class      15060 non-null  int64  
dtypes: float64(6), int64(1), object(8)
memory usage: 1.8+ MB


In [21]:
train_n[obj_col] = train_n[obj_col]
train_n[obj_col]

Unnamed: 0,workclass,education,marital_status,occupation,relationship,race,sex,native_country
0,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,United-States
1,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,United-States
2,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,United-States
3,Private,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,United-States
4,Private,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,Cuba
...,...,...,...,...,...,...,...,...
32556,Private,Assoc-acdm,Married-civ-spouse,Tech-support,Wife,White,Female,United-States
32557,Private,HS-grad,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,United-States
32558,Private,HS-grad,Widowed,Adm-clerical,Unmarried,White,Female,United-States
32559,Private,HS-grad,Never-married,Adm-clerical,Own-child,White,Male,United-States


In [22]:
for feature in obj_col:
    train_n[feature] = pd.Categorical(train_n[feature]).codes
for feature in obj_col:
    test_n[feature] = pd.Categorical(test_n[feature]).codes

In [23]:
x_train = train_n.drop(columns=['wage_class'])
y_train = train_n['wage_class']
x_test = test_n.drop(columns=['wage_class'])
y_test = test_n['wage_class']
model = XGBClassifier(objective='binary:logistic')
model.fit(x_train, y_train)
y_train_pred = model.predict(x_train)
y_test_pred = model.predict(x_test)
print(accuracy_score(y_train,y_train_pred))
print(accuracy_score(y_test,y_test_pred))

0.9037862210728731
0.8681938911022576
