In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier


import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Load the raw data file. header=None tells pandas that no row of the file is a
# header, so the columns receive integer labels until they are renamed.
# NOTE(review): string fields keep their leading space (e.g. ' Private'); the
# later cells that look for the ' ?' placeholder rely on that.
train_data = pd.read_csv("adult.data", header=None)
# Preview the first five rows.
train_data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [3]:
# Descriptive names for the 15 columns of the data file, listed in file order.
col_labels = [
    'age',
    'workclass',
    'fnlwgt',
    'education',
    'education_num',
    'marital_status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'capital_gain',
    'capital_loss',
    'hours_per_week',
    'native_country',
    'wage_class',
]

In [4]:
# Replace the default integer column labels with the descriptive names.
train_data.columns = col_labels

In [5]:
# Summarise column dtypes and non-null counts of the raw frame.
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32561 non-null  int64 
 1   workclass       32561 non-null  object
 2   fnlwgt          32561 non-null  int64 
 3   education       32561 non-null  object
 4   education_num   32561 non-null  int64 
 5   marital_status  32561 non-null  object
 6   occupation      32561 non-null  object
 7   relationship    32561 non-null  object
 8   race            32561 non-null  object
 9   sex             32561 non-null  object
 10  capital_gain    32561 non-null  int64 
 11  capital_loss    32561 non-null  int64 
 12  hours_per_week  32561 non-null  int64 
 13  native_country  32561 non-null  object
 14  wage_class      32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


In [6]:
#train_data.isnull().sum()


In [7]:
# List the distinct values of every column so that odd entries stand out.
for name, values in train_data.items():
    print(name, ':', values.unique(), "\n")

age : [39 50 38 53 28 37 49 52 31 42 30 23 32 40 34 25 43 54 35 59 56 19 20 45
 22 48 21 24 57 44 41 29 18 47 46 36 79 27 67 33 76 17 55 61 70 64 71 68
 66 51 58 26 60 90 75 65 77 62 63 80 72 74 69 73 81 78 88 82 83 84 85 86
 87] 

workclass : [' State-gov' ' Self-emp-not-inc' ' Private' ' Federal-gov' ' Local-gov'
 ' ?' ' Self-emp-inc' ' Without-pay' ' Never-worked'] 

fnlwgt : [ 77516  83311 215646 ...  34066  84661 257302] 

education : [' Bachelors' ' HS-grad' ' 11th' ' Masters' ' 9th' ' Some-college'
 ' Assoc-acdm' ' Assoc-voc' ' 7th-8th' ' Doctorate' ' Prof-school'
 ' 5th-6th' ' 10th' ' 1st-4th' ' Preschool' ' 12th'] 

education_num : [13  9  7 14  5 10 12 11  4 16 15  3  6  2  1  8] 

marital_status : [' Never-married' ' Married-civ-spouse' ' Divorced'
 ' Married-spouse-absent' ' Separated' ' Married-AF-spouse' ' Widowed'] 

occupation : [' Adm-clerical' ' Exec-managerial' ' Handlers-cleaners' ' Prof-specialty'
 ' Other-service' ' Sales' ' Craft-repair' ' Transport-moving'
 ' Fa

As we can see, the data contains some invalid values, such as ' ?'.

In [8]:
# Number of records in the data set.
num_data = len(train_data)
num_data

32561

# Finding which columns have invalid data

In [9]:
# Report how often the " ?" placeholder occurs, per column and per row.
# One boolean mask is built up front and reused for both summaries.
invalid_mask = train_data.isin([" ?"])

# `count` is the total number of invalid cells. It can exceed the number of
# affected rows, because one row may hold " ?" in several columns.
count = 0
for col, invalid in invalid_mask.sum().items():
    count += invalid

    if invalid > 0:
        print(col)
        print("Invalid record :",invalid)
        print("percent of invalid record :" ,(float(invalid)/len(train_data) *100), " \n")

# Rows (not cells) are what gets dropped afterwards, so also report the share
# of rows that contain at least one invalid value.
invalid_rows = invalid_mask.any(axis=1).sum()
print("Rows with at least one invalid value :", invalid_rows)
print("percent of rows with invalid data :", (float(invalid_rows) / len(train_data) * 100))
   

workclass
Invalid record : 1836
percent of invalid record : 5.638647461687294  

occupation
Invalid record : 1843
percent of invalid record : 5.660145572924664  

native_country
Invalid record : 583
percent of invalid record : 1.7904855501980899  



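The per-column percentages add up to roughly 13%, but many rows are invalid in more than one column. In total about 7.4% of the rows (2,399 of 32,561) contain at least one ' ?' value, so I will remove those records.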

In [10]:
# Remove every row that contains the " ?" placeholder in any column.
# A single mask replaces the per-column loop, which re-filtered (and copied)
# the whole frame once per column and compared numeric columns to a string.
# The original index labels are kept. `.copy()` makes the result an independent
# frame, so later cells that add columns do not write into a filtered slice.
train_data = train_data[~train_data.isin([" ?"]).any(axis=1)].copy()

In [11]:
# Preview the first five rows after the invalid records were removed.
train_data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,wage_class
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [12]:
# (rows, columns) remaining after the invalid records were removed.
train_data.shape

(30162, 15)

# Exploring Numerical data


Capital gain and capital loss both affect income, so I will combine these two columns into a single net capital income feature (gain minus loss).

In [13]:
# Show only the numeric columns.
train_data.select_dtypes(np.number)


Unnamed: 0,age,fnlwgt,education_num,capital_gain,capital_loss,hours_per_week
0,39,77516,13,2174,0,40
1,50,83311,13,0,0,13
2,38,215646,9,0,0,40
3,53,234721,7,0,0,40
4,28,338409,13,0,0,40
...,...,...,...,...,...,...
32556,27,257302,12,0,0,38
32557,40,154374,9,0,0,40
32558,58,151910,9,0,0,40
32559,22,201490,9,0,0,20


In [14]:
# Net capital income: capital gain minus capital loss, appended as a new column.
train_data = train_data.assign(
    capital_income=train_data['capital_gain'].sub(train_data['capital_loss'])
)


In [15]:
# The two source columns are now summarised by capital_income, so drop them.
train_data = train_data.drop(columns=['capital_gain', 'capital_loss'])

In [16]:
# Preview the frame with the new capital_income column.
train_data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,hours_per_week,native_country,wage_class,capital_income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,40,United-States,<=50K,2174
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,13,United-States,<=50K,0
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,40,United-States,<=50K,0
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,40,United-States,<=50K,0
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,40,Cuba,<=50K,0


# Exploring Categorical data


In [17]:
# Show only the object-typed (string) columns.
train_data.select_dtypes(object)


Unnamed: 0,workclass,education,marital_status,occupation,relationship,race,sex,native_country,wage_class
0,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,United-States,<=50K
1,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,United-States,<=50K
2,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,United-States,<=50K
3,Private,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,United-States,<=50K
4,Private,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...
32556,Private,Assoc-acdm,Married-civ-spouse,Tech-support,Wife,White,Female,United-States,<=50K
32557,Private,HS-grad,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,United-States,>50K
32558,Private,HS-grad,Widowed,Adm-clerical,Unmarried,White,Female,United-States,<=50K
32559,Private,HS-grad,Never-married,Adm-clerical,Own-child,White,Male,United-States,<=50K


In [18]:
# Names of the object-typed (string) columns.
train_data.select_dtypes(object).columns

Index(['workclass', 'education', 'marital_status', 'occupation',
       'relationship', 'race', 'sex', 'native_country', 'wage_class'],
      dtype='object')

In [19]:
# 'education' and 'education_num' appear to carry the same information (text
# label vs. numeric code), so the text column 'education' is dropped and the
# numeric 'education_num' is kept.
train_data = train_data.drop(columns=['education'])

In [20]:
# 'relationship' is treated here as redundant with 'marital_status', so it is dropped.
# NOTE(review): the two columns are related but not identical - confirm the drop is intended.
train_data = train_data.drop(columns=['relationship'])

In [21]:
# List the distinct values of each remaining object-typed column.
for name, values in train_data.select_dtypes(object).items():
    print(name, ':', values.unique(), "\n")

workclass : [' State-gov' ' Self-emp-not-inc' ' Private' ' Federal-gov' ' Local-gov'
 ' Self-emp-inc' ' Without-pay'] 

marital_status : [' Never-married' ' Married-civ-spouse' ' Divorced'
 ' Married-spouse-absent' ' Separated' ' Married-AF-spouse' ' Widowed'] 

occupation : [' Adm-clerical' ' Exec-managerial' ' Handlers-cleaners' ' Prof-specialty'
 ' Other-service' ' Sales' ' Transport-moving' ' Farming-fishing'
 ' Machine-op-inspct' ' Tech-support' ' Craft-repair' ' Protective-serv'
 ' Armed-Forces' ' Priv-house-serv'] 

race : [' White' ' Black' ' Asian-Pac-Islander' ' Amer-Indian-Eskimo' ' Other'] 

sex : [' Male' ' Female'] 

native_country : [' United-States' ' Cuba' ' Jamaica' ' India' ' Mexico' ' Puerto-Rico'
 ' Honduras' ' England' ' Canada' ' Germany' ' Iran' ' Philippines'
 ' Poland' ' Columbia' ' Cambodia' ' Thailand' ' Ecuador' ' Laos'
 ' Taiwan' ' Haiti' ' Portugal' ' Dominican-Republic' ' El-Salvador'
 ' France' ' Guatemala' ' Italy' ' China' ' South' ' Japan' ' Yugoslav

# Converting the categorical value into numeric value

In [22]:
# Label-encode native_country, wage_class, sex and marital_status.
# native_country is label-encoded rather than one-hot encoded because the set
# of countries in the test data may differ from the training data, which would
# give the two sets different one-hot columns.
#
# Each column gets its OWN fitted LabelEncoder, stored in `label_encoders`.
# Re-fitting a single shared encoder would discard the mapping of every column
# but the last, making it impossible to encode test data consistently or to
# decode predictions later.
label_encoders = {}
for col in ['native_country', 'wage_class', 'sex', 'marital_status']:
    encoder = LabelEncoder()
    train_data[col] = encoder.fit_transform(train_data[col])
    label_encoders[col] = encoder

# Backward-compatible alias: as before, `labelencoder` ends up holding the
# encoder that was fitted last (marital_status).
labelencoder = label_encoders['marital_status']


In [23]:
# Preview the frame after label encoding.
train_data.head()

Unnamed: 0,age,workclass,fnlwgt,education_num,marital_status,occupation,race,sex,hours_per_week,native_country,wage_class,capital_income
0,39,State-gov,77516,13,4,Adm-clerical,White,1,40,38,0,2174
1,50,Self-emp-not-inc,83311,13,2,Exec-managerial,White,1,13,38,0,0
2,38,Private,215646,9,0,Handlers-cleaners,White,1,40,38,0,0
3,53,Private,234721,7,2,Handlers-cleaners,Black,1,40,38,0,0
4,28,Private,338409,13,2,Prof-specialty,Black,0,40,4,0,0


In [24]:
# One-hot encode the remaining categorical columns.
# drop_first=True omits the first category of each column from the result.
dummycol = ['workclass','occupation','race']
dummy = pd.get_dummies(train_data[dummycol],drop_first=True)
# Preview the indicator columns.
dummy.head()

Unnamed: 0,workclass_ Local-gov,workclass_ Private,workclass_ Self-emp-inc,workclass_ Self-emp-not-inc,workclass_ State-gov,workclass_ Without-pay,occupation_ Armed-Forces,occupation_ Craft-repair,occupation_ Exec-managerial,occupation_ Farming-fishing,...,occupation_ Priv-house-serv,occupation_ Prof-specialty,occupation_ Protective-serv,occupation_ Sales,occupation_ Tech-support,occupation_ Transport-moving,race_ Asian-Pac-Islander,race_ Black,race_ Other,race_ White
0,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,0,0,0,1,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
2,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,0,1,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,1,0,0


In [25]:
# Remove the original categorical columns that were one-hot encoded.
train_data = train_data.drop(columns=dummycol)

In [26]:
# Append the one-hot indicator columns to the right of the existing columns.
train_data = pd.concat((train_data, dummy), axis='columns')

In [27]:
# Summarise column dtypes and non-null counts after encoding.
train_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 30162 entries, 0 to 32560
Data columns (total 32 columns):
 #   Column                         Non-Null Count  Dtype
---  ------                         --------------  -----
 0   age                            30162 non-null  int64
 1   fnlwgt                         30162 non-null  int64
 2   education_num                  30162 non-null  int64
 3   marital_status                 30162 non-null  int32
 4   sex                            30162 non-null  int32
 5   hours_per_week                 30162 non-null  int64
 6   native_country                 30162 non-null  int32
 7   wage_class                     30162 non-null  int32
 8   capital_income                 30162 non-null  int64
 9   workclass_ Local-gov           30162 non-null  uint8
 10  workclass_ Private             30162 non-null  uint8
 11  workclass_ Self-emp-inc        30162 non-null  uint8
 12  workclass_ Self-emp-not-inc    30162 non-null  uint8
 13  workclass_ State

# Checking for imbalanced data

In [28]:
# Share of each wage_class value, as a percentage of all rows.
(train_data['wage_class'].value_counts() /len(train_data)) *100

0    75.107751
1    24.892249
Name: wage_class, dtype: float64

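As we can see, the data is imbalanced (about 75% vs. 25%). XGBoost usually copes reasonably well with this degree of imbalance, and it offers the `scale_pos_weight` parameter to compensate if needed; here the default (`scale_pos_weight=1`) is used, so the lower recall on the minority class should be kept in mind when reading the results.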

# Choosing labels and features

In [29]:
# Target vector: the encoded wage class.
label = train_data['wage_class']
# Feature matrix: every column except the target.
fea = train_data.drop(columns=['wage_class'])

# Standardizing features with StandardScaler

In [30]:
# Scaler used to standardise the features.
# StandardScaler is imported in the notebook's first (imports) cell, together
# with every other dependency, instead of in the middle of the notebook.
scalar = StandardScaler()


In [31]:
# Fit the scaler on the feature matrix and transform it in one step.
# NOTE(review): the scaler is fitted on all rows of `fea`, i.e. on rows that
# will end up in both the training and the test split - confirm this is intended.
scaled_fea =scalar.fit_transform(fea)
# Sanity check: (rows, features) of the scaled array.
scaled_fea.shape


(30162, 31)

# Train and Test implementation

In [32]:
# Split first, then standardise.
# Fitting the scaler on the complete feature matrix before splitting lets
# test-set statistics (mean / standard deviation) leak into the training data.
# Here the scaler is fitted on the training rows only and the test rows are
# transformed with those training statistics.
# With the same random_state and the same number of rows, the split assigns
# the same rows to train and test as splitting the pre-scaled array would.
xtrain, xtest, ytrain, ytest = train_test_split(fea, label, test_size=0.3, random_state=42)
xtrain = scalar.fit_transform(xtrain)
xtest = scalar.transform(xtest)

In [33]:
# Baseline model: an XGBoost classifier with the binary logistic objective;
# no other hyper-parameter is set explicitly.
model = XGBClassifier(objective='binary:logistic')

# Train on the training split.
model.fit(xtrain,ytrain)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=0, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [34]:
# Score the baseline model on the rows it was trained on: confusion matrix,
# accuracy and per-class report, printed in that order.
y_pred = model.predict(xtrain)
for metric in (confusion_matrix, accuracy_score, classification_report):
    print(metric(ytrain, y_pred))

[[15237   650]
 [ 1275  3951]]
0.9088239473310282
              precision    recall  f1-score   support

           0       0.92      0.96      0.94     15887
           1       0.86      0.76      0.80      5226

    accuracy                           0.91     21113
   macro avg       0.89      0.86      0.87     21113
weighted avg       0.91      0.91      0.91     21113



In [35]:
# Score the baseline model on the held-out test rows: confusion matrix,
# accuracy and per-class report, printed in that order.
y_pred = model.predict(xtest)
for metric in (confusion_matrix, accuracy_score, classification_report):
    print(metric(ytest, y_pred))

[[6323  444]
 [ 762 1520]]
0.8667256050392309
              precision    recall  f1-score   support

           0       0.89      0.93      0.91      6767
           1       0.77      0.67      0.72      2282

    accuracy                           0.87      9049
   macro avg       0.83      0.80      0.81      9049
weighted avg       0.86      0.87      0.86      9049



# Let's do hyperparameter tuning to improve the result


In [36]:
# Search space for the grid search: 5 learning rates x 4 tree depths x 3 tree
# construction methods = 60 candidate settings.
param_grid = dict(
    learning_rate=[0.1, 0.2, 0.3, 0.4, 0.5],
    max_depth=[3, 4, 5, 6],
    tree_method=['approx', 'hist', 'auto'],
)


In [37]:
# Exhaustive search over param_grid with the library's default cross-validation
# and scoring; verbose=3 prints the score of every individual fit.
grid = GridSearchCV(
    estimator=XGBClassifier(objective='binary:logistic'),
    param_grid=param_grid,
    verbose=3,
)
grid.fit(xtrain, ytrain)

Fitting 5 folds for each of 60 candidates, totalling 300 fits
[CV 1/5] END learning_rate=0.1, max_depth=3, tree_method=approx;, score=0.865 total time=   1.5s
[CV 2/5] END learning_rate=0.1, max_depth=3, tree_method=approx;, score=0.860 total time=   1.4s
[CV 3/5] END learning_rate=0.1, max_depth=3, tree_method=approx;, score=0.850 total time=   1.4s
[CV 4/5] END learning_rate=0.1, max_depth=3, tree_method=approx;, score=0.854 total time=   1.4s
[CV 5/5] END learning_rate=0.1, max_depth=3, tree_method=approx;, score=0.850 total time=   1.4s
[CV 1/5] END learning_rate=0.1, max_depth=3, tree_method=hist;, score=0.862 total time=   0.3s
[CV 2/5] END learning_rate=0.1, max_depth=3, tree_method=hist;, score=0.865 total time=   0.2s
[CV 3/5] END learning_rate=0.1, max_depth=3, tree_method=hist;, score=0.851 total time=   0.2s
[CV 4/5] END learning_rate=0.1, max_depth=3, tree_method=hist;, score=0.856 total time=   0.2s
[CV 5/5] END learning_rate=0.1, max_depth=3, tree_method=hist;, score=0.8

[CV 1/5] END learning_rate=0.2, max_depth=4, tree_method=auto;, score=0.870 total time=   1.1s
[CV 2/5] END learning_rate=0.2, max_depth=4, tree_method=auto;, score=0.873 total time=   1.1s
[CV 3/5] END learning_rate=0.2, max_depth=4, tree_method=auto;, score=0.859 total time=   1.1s
[CV 4/5] END learning_rate=0.2, max_depth=4, tree_method=auto;, score=0.863 total time=   1.1s
[CV 5/5] END learning_rate=0.2, max_depth=4, tree_method=auto;, score=0.867 total time=   1.1s
[CV 1/5] END learning_rate=0.2, max_depth=5, tree_method=approx;, score=0.869 total time=   1.9s
[CV 2/5] END learning_rate=0.2, max_depth=5, tree_method=approx;, score=0.868 total time=   1.9s
[CV 3/5] END learning_rate=0.2, max_depth=5, tree_method=approx;, score=0.857 total time=   1.9s
[CV 4/5] END learning_rate=0.2, max_depth=5, tree_method=approx;, score=0.854 total time=   1.9s
[CV 5/5] END learning_rate=0.2, max_depth=5, tree_method=approx;, score=0.863 total time=   1.9s
[CV 1/5] END learning_rate=0.2, max_dept

[CV 2/5] END learning_rate=0.3, max_depth=6, tree_method=hist;, score=0.872 total time=   0.3s
[CV 3/5] END learning_rate=0.3, max_depth=6, tree_method=hist;, score=0.863 total time=   0.3s
[CV 4/5] END learning_rate=0.3, max_depth=6, tree_method=hist;, score=0.856 total time=   0.3s
[CV 5/5] END learning_rate=0.3, max_depth=6, tree_method=hist;, score=0.862 total time=   0.3s
[CV 1/5] END learning_rate=0.3, max_depth=6, tree_method=auto;, score=0.865 total time=   1.6s
[CV 2/5] END learning_rate=0.3, max_depth=6, tree_method=auto;, score=0.865 total time=   1.6s
[CV 3/5] END learning_rate=0.3, max_depth=6, tree_method=auto;, score=0.859 total time=   1.7s
[CV 4/5] END learning_rate=0.3, max_depth=6, tree_method=auto;, score=0.855 total time=   1.6s
[CV 5/5] END learning_rate=0.3, max_depth=6, tree_method=auto;, score=0.859 total time=   1.6s
[CV 1/5] END learning_rate=0.4, max_depth=3, tree_method=approx;, score=0.870 total time=   1.4s
[CV 2/5] END learning_rate=0.4, max_depth=3, tre

[CV 3/5] END learning_rate=0.5, max_depth=4, tree_method=approx;, score=0.852 total time=   1.7s
[CV 4/5] END learning_rate=0.5, max_depth=4, tree_method=approx;, score=0.851 total time=   1.6s
[CV 5/5] END learning_rate=0.5, max_depth=4, tree_method=approx;, score=0.855 total time=   1.6s
[CV 1/5] END learning_rate=0.5, max_depth=4, tree_method=hist;, score=0.866 total time=   0.2s
[CV 2/5] END learning_rate=0.5, max_depth=4, tree_method=hist;, score=0.871 total time=   0.2s
[CV 3/5] END learning_rate=0.5, max_depth=4, tree_method=hist;, score=0.859 total time=   0.2s
[CV 4/5] END learning_rate=0.5, max_depth=4, tree_method=hist;, score=0.859 total time=   0.2s
[CV 5/5] END learning_rate=0.5, max_depth=4, tree_method=hist;, score=0.863 total time=   0.2s
[CV 1/5] END learning_rate=0.5, max_depth=4, tree_method=auto;, score=0.870 total time=   1.1s
[CV 2/5] END learning_rate=0.5, max_depth=4, tree_method=auto;, score=0.868 total time=   1.1s
[CV 3/5] END learning_rate=0.5, max_depth=4,

GridSearchCV(estimator=XGBClassifier(base_score=None, booster=None,
                                     colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=None, gamma=None,
                                     gpu_id=None, importance_type='gain',
                                     interaction_constraints=None,
                                     learning_rate=None, max_delta_step=None,
                                     max_depth=None, min_child_weight=None,
                                     missing=nan, monotone_constraints=None,
                                     n_estimators=100, n_jobs=None,
                                     num_parallel_tree=None, random_state=None,
                                     reg_alpha=None, reg_lambda=None,
                                     scale_pos_weight=None, subsample=None,
                                     tree_method=None, validate_parameter

In [38]:
# Parameter combination selected by the grid search.
grid.best_params_

{'learning_rate': 0.2, 'max_depth': 5, 'tree_method': 'auto'}

In [39]:
def _show_scores(title, features, targets):
    """Print the tuned model's scores for one data split and return its predictions.

    Prints, in order: the title, the confusion matrix, the accuracy and the
    per-class classification report.
    """
    print(title)
    predictions = grid.predict(features)
    print(confusion_matrix(targets, predictions))
    print("Accuracy score", accuracy_score(targets, predictions))
    print(classification_report(targets, predictions))
    return predictions


# Training split first, then the held-out test split, separated by a rule.
# As before, y2_pred ends up holding the test-set predictions.
y2_pred = _show_scores('accuracy of training data', xtrain, ytrain)
print("------------------------------------------------------")
y2_pred = _show_scores('accuracy of test data', xtest, ytest)

accuracy of training data
[[15104   783]
 [ 1621  3605]]
Accuracy score 0.8861365035759958
              precision    recall  f1-score   support

           0       0.90      0.95      0.93     15887
           1       0.82      0.69      0.75      5226

    accuracy                           0.89     21113
   macro avg       0.86      0.82      0.84     21113
weighted avg       0.88      0.89      0.88     21113

------------------------------------------------------
accuracy of test data
[[6349  418]
 [ 781 1501]]
Accuracy score 0.8674991711791358
              precision    recall  f1-score   support

           0       0.89      0.94      0.91      6767
           1       0.78      0.66      0.71      2282

    accuracy                           0.87      9049
   macro avg       0.84      0.80      0.81      9049
weighted avg       0.86      0.87      0.86      9049

