In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.model_selection import train_test_split
import xgboost as xgb
import sklearn
from xgboost.sklearn import XGBClassifier
from sklearn.metrics import accuracy_score

  data = yaml.load(f.read()) or {}
  import pandas.util.testing as tm
  defaults = yaml.load(f)


In [2]:
train = pd.read_csv("adult.data", header = None)
test = pd.read_csv("adult.test", skiprows = 1, header = None)

In [3]:
print(train.head(5))
print(test.head(5))

   0                  1       2           3   4                    5   \
0  39          State-gov   77516   Bachelors  13        Never-married   
1  50   Self-emp-not-inc   83311   Bachelors  13   Married-civ-spouse   
2  38            Private  215646     HS-grad   9             Divorced   
3  53            Private  234721        11th   7   Married-civ-spouse   
4  28            Private  338409   Bachelors  13   Married-civ-spouse   

                   6               7       8        9     10  11  12  \
0        Adm-clerical   Not-in-family   White     Male  2174   0  40   
1     Exec-managerial         Husband   White     Male     0   0  13   
2   Handlers-cleaners   Not-in-family   White     Male     0   0  40   
3   Handlers-cleaners         Husband   Black     Male     0   0  40   
4      Prof-specialty            Wife   Black   Female     0   0  40   

               13      14  
0   United-States   <=50K  
1   United-States   <=50K  
2   United-States   <=50K  
3   United-State

### We can see we dont have a column header for our data

In [4]:
col_labels = ['age', 'workclass', 'fnlwgt', 'education', 'education_num', 'marital_status', 'occupation', 
              'relationship', 'race', 'sex', 'capital_gain', 'capital_loss', 'hours_per_week', 'native_country',
             'wage_class']

In [5]:
# Apply these col to both dataset
train.columns = col_labels
test.columns = col_labels

### Lets start EDA (Exploratory Data Anallysis)

In [6]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32561 non-null  int64 
 1   workclass       32561 non-null  object
 2   fnlwgt          32561 non-null  int64 
 3   education       32561 non-null  object
 4   education_num   32561 non-null  int64 
 5   marital_status  32561 non-null  object
 6   occupation      32561 non-null  object
 7   relationship    32561 non-null  object
 8   race            32561 non-null  object
 9   sex             32561 non-null  object
 10  capital_gain    32561 non-null  int64 
 11  capital_loss    32561 non-null  int64 
 12  hours_per_week  32561 non-null  int64 
 13  native_country  32561 non-null  object
 14  wage_class      32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


In [7]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16281 entries, 0 to 16280
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             16281 non-null  int64 
 1   workclass       16281 non-null  object
 2   fnlwgt          16281 non-null  int64 
 3   education       16281 non-null  object
 4   education_num   16281 non-null  int64 
 5   marital_status  16281 non-null  object
 6   occupation      16281 non-null  object
 7   relationship    16281 non-null  object
 8   race            16281 non-null  object
 9   sex             16281 non-null  object
 10  capital_gain    16281 non-null  int64 
 11  capital_loss    16281 non-null  int64 
 12  hours_per_week  16281 non-null  int64 
 13  native_country  16281 non-null  object
 14  wage_class      16281 non-null  object
dtypes: int64(6), object(9)
memory usage: 1.9+ MB


In [8]:
train.isnull().sum()

age               0
workclass         0
fnlwgt            0
education         0
education_num     0
marital_status    0
occupation        0
relationship      0
race              0
sex               0
capital_gain      0
capital_loss      0
hours_per_week    0
native_country    0
wage_class        0
dtype: int64

In [9]:
# From above we can see there is no missing values
# now lets check unique values and count of it which having type as object
train.workclass.value_counts()

 Private             22696
 Self-emp-not-inc     2541
 Local-gov            2093
 ?                    1836
 State-gov            1298
 Self-emp-inc         1116
 Federal-gov           960
 Without-pay            14
 Never-worked            7
Name: workclass, dtype: int64

In [10]:
test.workclass.value_counts()

 Private             11210
 Self-emp-not-inc     1321
 Local-gov            1043
 ?                     963
 State-gov             683
 Self-emp-inc          579
 Federal-gov           472
 Without-pay             7
 Never-worked            3
Name: workclass, dtype: int64

In [11]:
# both train and test data has ? so lets drop ? and then see its impact
train.replace(' ?', np.nan).dropna().shape 

(30162, 15)

In [12]:
test.replace(' ?', np.nan).dropna().shape 

(15060, 15)

In [13]:
train_nomissing = train.replace(' ?', np.nan).dropna()
test_nomissing = test.replace(' ?', np.nan).dropna()

In [14]:

test_nomissing['wage_class'] = test_nomissing.wage_class.replace({' <=50K.': ' <=50K', ' >50K.':' >50K'})

In [15]:
# Checking the unique values from each set, we can see if they now match
test_nomissing.wage_class.unique()

array([' <=50K', ' >50K'], dtype=object)

In [16]:
train_nomissing.wage_class.unique()

array([' <=50K', ' >50K'], dtype=object)

In [17]:
combined_set = pd.concat([train_nomissing, test_nomissing], axis = 0) # Stacks them vertically

In [18]:
combined_set.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 45222 entries, 0 to 16280
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             45222 non-null  int64 
 1   workclass       45222 non-null  object
 2   fnlwgt          45222 non-null  int64 
 3   education       45222 non-null  object
 4   education_num   45222 non-null  int64 
 5   marital_status  45222 non-null  object
 6   occupation      45222 non-null  object
 7   relationship    45222 non-null  object
 8   race            45222 non-null  object
 9   sex             45222 non-null  object
 10  capital_gain    45222 non-null  int64 
 11  capital_loss    45222 non-null  int64 
 12  hours_per_week  45222 non-null  int64 
 13  native_country  45222 non-null  object
 14  wage_class      45222 non-null  object
dtypes: int64(6), object(9)
memory usage: 5.5+ MB


In [19]:
for feature in combined_set.columns: # Loop through all columns in the dataframe
    if combined_set[feature].dtype == 'object': # Only apply for columns with categorical strings
        combined_set[feature] = pd.Categorical(combined_set[feature]).codes # Replace strings with an integer

In [20]:
combined_set.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 45222 entries, 0 to 16280
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype
---  ------          --------------  -----
 0   age             45222 non-null  int64
 1   workclass       45222 non-null  int8 
 2   fnlwgt          45222 non-null  int64
 3   education       45222 non-null  int8 
 4   education_num   45222 non-null  int64
 5   marital_status  45222 non-null  int8 
 6   occupation      45222 non-null  int8 
 7   relationship    45222 non-null  int8 
 8   race            45222 non-null  int8 
 9   sex             45222 non-null  int8 
 10  capital_gain    45222 non-null  int64
 11  capital_loss    45222 non-null  int64
 12  hours_per_week  45222 non-null  int64
 13  native_country  45222 non-null  int8 
 14  wage_class      45222 non-null  int8 
dtypes: int64(6), int8(9)
memory usage: 2.8 MB


In [21]:
combined_set.shape

(45222, 15)

In [22]:
combined_set.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,wage_class
0,39,5,77516,9,13,4,0,1,4,1,2174,0,40,38,0
1,50,4,83311,9,13,2,3,0,4,1,0,0,13,38,0
2,38,2,215646,11,9,0,5,1,4,1,0,0,40,38,0
3,53,2,234721,1,7,2,5,0,2,1,0,0,40,38,0
4,28,2,338409,9,13,2,9,5,2,0,0,0,40,4,0


In [23]:
# creation of X and Y variable
x=combined_set.drop(["wage_class"],1)
x.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country
0,39,5,77516,9,13,4,0,1,4,1,2174,0,40,38
1,50,4,83311,9,13,2,3,0,4,1,0,0,13,38
2,38,2,215646,11,9,0,5,1,4,1,0,0,40,38
3,53,2,234721,1,7,2,5,0,2,1,0,0,40,38
4,28,2,338409,9,13,2,9,5,2,0,0,0,40,4


In [24]:
y = combined_set["wage_class"]
y.head()

0    0
1    0
2    0
3    0
4    0
Name: wage_class, dtype: int8

In [25]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.30,random_state=150)
x_train.shape ,x_test.shape, y_train.shape , y_test.shape

((31655, 14), (13567, 14), (31655,), (13567,))

In [26]:
model= XGBClassifier()
model.fit(x_train,y_train)

XGBClassifier(base_score=0.5, booster=None, colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
       importance_type='gain', interaction_constraints=None,
       learning_rate=0.300000012, max_delta_step=0, max_depth=6,
       min_child_weight=1, missing=nan, monotone_constraints=None,
       n_estimators=100, n_jobs=0, num_parallel_tree=1,
       objective='binary:logistic', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method=None,
       validate_parameters=False, verbosity=None)

In [27]:
y_pred = model.predict(x_test)
y_pred

array([1, 1, 1, ..., 0, 1, 0], dtype=int8)

In [28]:
accuracy = accuracy_score(y_test,y_pred)
accuracy

0.8696100832903368

In [29]:
param_grid = {
      'learning_rate':[1,0.5,0.1,0.01,0.001],
       'max_depth': [3,5,10,20],
       'n_estimators':[10,50,100,200]
}

In [30]:
from sklearn.model_selection import GridSearchCV
grid= GridSearchCV(XGBClassifier(objective='binary:logistic'),param_grid, verbose=3)

In [31]:
grid.fit(x_train,y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 3 folds for each of 80 candidates, totalling 240 fits
[CV] learning_rate=1, max_depth=3, n_estimators=10 ...................
[CV]  learning_rate=1, max_depth=3, n_estimators=10, score=0.8606083578129442, total=   0.2s
[CV] learning_rate=1, max_depth=3, n_estimators=10 ...................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.2s remaining:    0.0s


[CV]  learning_rate=1, max_depth=3, n_estimators=10, score=0.858117713960762, total=   0.1s
[CV] learning_rate=1, max_depth=3, n_estimators=10 ...................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.5s remaining:    0.0s


[CV]  learning_rate=1, max_depth=3, n_estimators=10, score=0.85716993649891, total=   0.1s
[CV] learning_rate=1, max_depth=3, n_estimators=50 ...................
[CV]  learning_rate=1, max_depth=3, n_estimators=50, score=0.8609873969487349, total=   0.8s
[CV] learning_rate=1, max_depth=3, n_estimators=50 ...................
[CV]  learning_rate=1, max_depth=3, n_estimators=50, score=0.8640887119704294, total=   0.8s
[CV] learning_rate=1, max_depth=3, n_estimators=50 ...................
[CV]  learning_rate=1, max_depth=3, n_estimators=50, score=0.8620983793005402, total=   0.8s
[CV] learning_rate=1, max_depth=3, n_estimators=100 ..................
[CV]  learning_rate=1, max_depth=3, n_estimators=100, score=0.8601345588932057, total=   1.7s
[CV] learning_rate=1, max_depth=3, n_estimators=100 ..................
[CV]  learning_rate=1, max_depth=3, n_estimators=100, score=0.8653208226708369, total=   1.6s
[CV] learning_rate=1, max_depth=3, n_estimators=100 ..................
[CV]  learning_r

[CV]  learning_rate=0.5, max_depth=3, n_estimators=50, score=0.8671215998483556, total=   0.8s
[CV] learning_rate=0.5, max_depth=3, n_estimators=50 .................
[CV]  learning_rate=0.5, max_depth=3, n_estimators=50, score=0.8649417116860961, total=   0.8s
[CV] learning_rate=0.5, max_depth=3, n_estimators=100 ................
[CV]  learning_rate=0.5, max_depth=3, n_estimators=100, score=0.8651568274424334, total=   1.6s
[CV] learning_rate=0.5, max_depth=3, n_estimators=100 ................
[CV]  learning_rate=0.5, max_depth=3, n_estimators=100, score=0.8663633778788741, total=   1.6s
[CV] learning_rate=0.5, max_depth=3, n_estimators=100 ................
[CV]  learning_rate=0.5, max_depth=3, n_estimators=100, score=0.8648469339399109, total=   1.7s
[CV] learning_rate=0.5, max_depth=3, n_estimators=200 ................
[CV]  learning_rate=0.5, max_depth=3, n_estimators=200, score=0.8626930730597934, total=   3.3s
[CV] learning_rate=0.5, max_depth=3, n_estimators=200 ................


[CV]  learning_rate=0.1, max_depth=3, n_estimators=100, score=0.858807921917938, total=   1.7s
[CV] learning_rate=0.1, max_depth=3, n_estimators=100 ................
[CV]  learning_rate=0.1, max_depth=3, n_estimators=100, score=0.8612453795848735, total=   1.8s
[CV] learning_rate=0.1, max_depth=3, n_estimators=100 ................
[CV]  learning_rate=0.1, max_depth=3, n_estimators=100, score=0.8597289356459104, total=   1.8s
[CV] learning_rate=0.1, max_depth=3, n_estimators=200 ................
[CV]  learning_rate=0.1, max_depth=3, n_estimators=200, score=0.8661044252819103, total=   3.6s
[CV] learning_rate=0.1, max_depth=3, n_estimators=200 ................
[CV]  learning_rate=0.1, max_depth=3, n_estimators=200, score=0.8649417116860961, total=   3.6s
[CV] learning_rate=0.1, max_depth=3, n_estimators=200 ................
[CV]  learning_rate=0.1, max_depth=3, n_estimators=200, score=0.8656051559093925, total=   3.7s
[CV] learning_rate=0.1, max_depth=5, n_estimators=10 .................

[CV]  learning_rate=0.01, max_depth=3, n_estimators=100, score=0.8384987205004265, total=   1.7s
[CV] learning_rate=0.01, max_depth=3, n_estimators=200 ...............
[CV]  learning_rate=0.01, max_depth=3, n_estimators=200, score=0.8398559651283996, total=   3.2s
[CV] learning_rate=0.01, max_depth=3, n_estimators=200 ...............
[CV]  learning_rate=0.01, max_depth=3, n_estimators=200, score=0.8430480523173159, total=   3.3s
[CV] learning_rate=0.01, max_depth=3, n_estimators=200 ...............
[CV]  learning_rate=0.01, max_depth=3, n_estimators=200, score=0.8412472751397971, total=   3.3s
[CV] learning_rate=0.01, max_depth=5, n_estimators=10 ................
[CV]  learning_rate=0.01, max_depth=5, n_estimators=10, score=0.8467734293565811, total=   0.2s
[CV] learning_rate=0.01, max_depth=5, n_estimators=10 ................
[CV]  learning_rate=0.01, max_depth=5, n_estimators=10, score=0.849398161311724, total=   0.2s
[CV] learning_rate=0.01, max_depth=5, n_estimators=10 ............

[CV]  learning_rate=0.001, max_depth=3, n_estimators=200, score=0.8328437411162702, total=   4.7s
[CV] learning_rate=0.001, max_depth=3, n_estimators=200 ..............
[CV]  learning_rate=0.001, max_depth=3, n_estimators=200, score=0.8359397213534262, total=   3.6s
[CV] learning_rate=0.001, max_depth=3, n_estimators=200 ..............
[CV]  learning_rate=0.001, max_depth=3, n_estimators=200, score=0.8337598331911668, total=   3.7s
[CV] learning_rate=0.001, max_depth=5, n_estimators=10 ...............
[CV]  learning_rate=0.001, max_depth=5, n_estimators=10, score=0.8411826021036672, total=   0.3s
[CV] learning_rate=0.001, max_depth=5, n_estimators=10 ...............
[CV]  learning_rate=0.001, max_depth=5, n_estimators=10, score=0.8478817173727609, total=   0.2s
[CV] learning_rate=0.001, max_depth=5, n_estimators=10 ...............
[CV]  learning_rate=0.001, max_depth=5, n_estimators=10, score=0.8472182731494645, total=   0.2s
[CV] learning_rate=0.001, max_depth=5, n_estimators=50 .....

[Parallel(n_jobs=1)]: Done 240 out of 240 | elapsed: 16.3min finished


GridSearchCV(cv='warn', error_score='raise-deprecating',
       estimator=XGBClassifier(base_score=None, booster=None, colsample_bylevel=None,
       colsample_bynode=None, colsample_bytree=None, gamma=None,
       gpu_id=None, importance_type='gain', interaction_constraints=None,
       learning_rate=None, max_delta_step=None, max_depth=None,
       min_child_w...pos_weight=None, subsample=None,
       tree_method=None, validate_parameters=False, verbosity=None),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'learning_rate': [1, 0.5, 0.1, 0.01, 0.001], 'max_depth': [3, 5, 10, 20], 'n_estimators': [10, 50, 100, 200]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=3)

In [32]:
grid.best_params_

{'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 100}

In [33]:
new_model=XGBClassifier(learning_rate= 0.1, max_depth=20, n_estimators= 200)

In [34]:
new_model.fit(x_train,y_train)

XGBClassifier(base_score=0.5, booster=None, colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
       importance_type='gain', interaction_constraints=None,
       learning_rate=0.1, max_delta_step=0, max_depth=20,
       min_child_weight=1, missing=nan, monotone_constraints=None,
       n_estimators=200, n_jobs=0, num_parallel_tree=1,
       objective='binary:logistic', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method=None,
       validate_parameters=False, verbosity=None)

In [35]:
y_pred1=new_model.predict(x_test)

In [36]:
accuracy2 =accuracy_score(y_test,y_pred1)
accuracy2

0.8561214712169234