In [1]:
!pip install eli5
!pip install xgboost



## Import of Libraries needed

In [59]:

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from category_encoders import OrdinalEncoder
from xgboost import XGBClassifier
from sklearn.inspection import permutation_importance
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import classification_report, plot_confusion_matrix, plot_roc_curve
import matplotlib.pyplot as plt
from skopt import BayesSearchCV

## Import Datasets

In [24]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
census = pd.read_csv('census.csv')
print(train.shape, test.shape, census.shape)

(32561, 15) (16281, 15) (48842, 15)


## Begin EDA

In [25]:
#checking for null values and column types, interesting to see no 'missing' values I'll dive a little further.
census.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48842 entries, 0 to 48841
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             48842 non-null  int64 
 1   workclass       48842 non-null  object
 2   fnlwgt          48842 non-null  int64 
 3   education       48842 non-null  object
 4   education-num   48842 non-null  int64 
 5   marital-status  48842 non-null  object
 6   occupation      48842 non-null  object
 7   relationship    48842 non-null  object
 8   race            48842 non-null  object
 9   sex             48842 non-null  object
 10  capital-gain    48842 non-null  int64 
 11  capital-loss    48842 non-null  int64 
 12  hours-per-week  48842 non-null  int64 
 13  native-country  48842 non-null  object
 14  income          48842 non-null  object
dtypes: int64(6), object(9)
memory usage: 5.6+ MB


In [26]:
#Aha missing values are disguised as '?'.  Lets fix that.
census['workclass'].value_counts()

Private             33906
Self-emp-not-inc     3862
Local-gov            3136
?                    2799
State-gov            1981
Self-emp-inc         1695
Federal-gov          1432
Without-pay            21
Never-worked           10
Name: workclass, dtype: int64

In [27]:
#Found 3 Object Columns with '?' for missing values.  We will fill these with the top value of each row.
census.isin(['?']).sum()

age                  0
workclass         2799
fnlwgt               0
education            0
education-num        0
marital-status       0
occupation        2809
relationship         0
race                 0
sex                  0
capital-gain         0
capital-loss         0
hours-per-week       0
native-country     857
income               0
dtype: int64

In [28]:
#Time to make the 'missing' values into NaN so we can work with them
census.replace({'?': np.NaN}, inplace=True)

In [29]:
#No more '?'
census.workclass.value_counts()

Private             33906
Self-emp-not-inc     3862
Local-gov            3136
State-gov            1981
Self-emp-inc         1695
Federal-gov          1432
Without-pay            21
Never-worked           10
Name: workclass, dtype: int64

In [9]:
# They are now registered as NaN.  These will be replaced with the top value_counts in each column
census.isnull().sum()

age                  0
workclass         2799
fnlwgt               0
education            0
education-num        0
marital-status       0
occupation        2809
relationship         0
race                 0
sex                  0
capital-gain         0
capital-loss         0
hours-per-week       0
native-country     857
income               0
dtype: int64

In [30]:
census.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
4,18,,103497,Some-college,10,Never-married,,Own-child,White,Female,0,0,30,United-States,<=50K


In [31]:
#Printing Top Values to Fill NaNs
print('Top Value:',census['native-country'].describe())
print('Top Value:',census['occupation'].describe())
print('Top Value:',census['workclass'].describe())

Top Value: count             47985
unique               41
top       United-States
freq              43832
Name: native-country, dtype: object
Top Value: count              46033
unique                14
top       Prof-specialty
freq                6172
Name: occupation, dtype: object
Top Value: count       46043
unique          8
top       Private
freq        33906
Name: workclass, dtype: object


In [32]:
#filling NaN values
census['workclass'].replace({np.NaN : 'Private'},inplace=True)
census['occupation'].replace({np.NaN : 'Prof-specialty'}, inplace=True)
census['native-country'].replace({np.NaN : 'United-States'},inplace=True)

In [33]:
#Sanity check to assure NaNs have been fixed with working values.
census.isnull().sum()

age               0
workclass         0
fnlwgt            0
education         0
education-num     0
marital-status    0
occupation        0
relationship      0
race              0
sex               0
capital-gain      0
capital-loss      0
hours-per-week    0
native-country    0
income            0
dtype: int64

In [34]:
#checking for high cardinality in the dataset as well as seeing what to do with the features. Looks like 'fnlwgt' has a very high cardinality and isnt useful for the model
census.astype(object).nunique()

age                  74
workclass             8
fnlwgt            28523
education            16
education-num        16
marital-status        7
occupation           14
relationship          6
race                  5
sex                   2
capital-gain        123
capital-loss         99
hours-per-week       96
native-country       41
income                2
dtype: int64

#Working on the wrangle function.  Not sure how to get these three def/if/else functions wrapped into one working or multi working function inside of a wranglefunction🤔

In [35]:
#Create a New Feature that changes the income column into a 1 if they make more than 50K a year and 0 if they make 50K or less.  New Feature called 'makes-50K+'.
def over50K(census):
    if census['income'] == '>50K':
        val = 1
    else:
        val = 0
    return val
census['makes-50K+'] = census.apply(over50K, axis=1)

In [36]:
#Create a New Feature that changes the hours worked per week column into a 1 if they worked more than 40 hrs a week and 0 if they worked 40 or less.  New Feature called 'over40hrs'.
def over40(census):
    if census['hours-per-week'] >40:
        val = 1
    else:
        val = 0
    return val
census['over40hrs+'] = census.apply(over40, axis=1)

In [37]:
#Create a New Feature that changes the sex column into a 1 if they were Female and 0 if they were Male.  New Feature called 'gender-F/1-M/0'. This is new Target column.
def gender(census):
    if census['sex'] == 'Female':
            val = 1
    else:
            val = 0
    return val
census['gender-F/1-M/0'] = census.apply(gender, axis=1)

In [38]:
#checking to see new features were successful. They are all there.
census.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income,makes-50K+,over40hrs+,gender-F/1-M/0
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K,0,0,0
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K,0,1,0
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K,1,0,0
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K,1,0,0
4,18,Private,103497,Some-college,10,Never-married,Prof-specialty,Own-child,White,Female,0,0,30,United-States,<=50K,0,0,1


In [39]:
# Time to drop columns we don't need anylonger.  Feature'fnlwgt' is high card and Unnecessary while 'sex' would now become a leaky feature and income and hours per week are now redundant
census = census.drop(columns=['fnlwgt','income','hours-per-week','sex'])

In [40]:
census

Unnamed: 0,age,workclass,education,education-num,marital-status,occupation,relationship,race,capital-gain,capital-loss,native-country,makes-50K+,over40hrs+,gender-F/1-M/0
0,25,Private,11th,7,Never-married,Machine-op-inspct,Own-child,Black,0,0,United-States,0,0,0
1,38,Private,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,0,0,United-States,0,1,0
2,28,Local-gov,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,0,0,United-States,1,0,0
3,44,Private,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,7688,0,United-States,1,0,0
4,18,Private,Some-college,10,Never-married,Prof-specialty,Own-child,White,0,0,United-States,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48837,27,Private,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,0,0,United-States,0,0,1
48838,40,Private,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,0,0,United-States,1,0,0
48839,58,Private,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,0,0,United-States,0,0,1
48840,22,Private,HS-grad,9,Never-married,Adm-clerical,Own-child,White,0,0,United-States,0,0,0


#Splitting the Data

In [41]:
# I want to predict whether the dataset can tell if its a Male or Female based on the information given
target = 'gender-F/1-M/0'
y = census[target]
X = census.drop(target,axis=1)

In [43]:
#checking that target is no longer in X
X.head()

Unnamed: 0,age,workclass,education,education-num,marital-status,occupation,relationship,race,capital-gain,capital-loss,native-country,makes-50K+,over40hrs+
0,25,Private,11th,7,Never-married,Machine-op-inspct,Own-child,Black,0,0,United-States,0,0
1,38,Private,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,0,0,United-States,0,1
2,28,Local-gov,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,0,0,United-States,1,0
3,44,Private,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,7688,0,United-States,1,0
4,18,Private,Some-college,10,Never-married,Prof-specialty,Own-child,White,0,0,United-States,0,0


In [49]:
#Split data randomly with a 60/20/20 split

train, val, test = np.split(census.sample(frac=1), [int(.6*len(census)), int(.8*len(census))])
print('Training Set:',train.head(1))
print('Validation Set:',val.head(1))
print('Test Set',test.head(1))

Training Set:        age         workclass  education  education-num      marital-status  \
46209   47  Self-emp-not-inc  Doctorate             16  Married-civ-spouse   

             occupation relationship   race  capital-gain  capital-loss  \
46209  Transport-moving      Husband  White             0          2002   

      native-country  makes-50K+  over40hrs+  gender-F/1-M/0  
46209  United-States           0           1               0  
Validation Set:        age    workclass     education  education-num marital-status  \
14975   23  Federal-gov  Some-college             10  Never-married   

         occupation   relationship   race  capital-gain  capital-loss  \
14975  Adm-clerical  Not-in-family  White             0             0   

      native-country  makes-50K+  over40hrs+  gender-F/1-M/0  
14975  United-States           0           1               0  
Test Set        age workclass  education  education-num      marital-status  \
39020   27   Private  Bachelors          

In [51]:
#Split the data into X and y for training the model and making predictions
y_train = train[target]
X_train = train.drop(target,axis=1)

In [52]:
y_val = val[target]
X_val = val.drop(target,axis=1)

In [54]:
y_test = test[target]
X_test = test.drop(target,axis=1)

# Establishing the Baseline

In [56]:
#First I will check that the target feature is between 50-70%.  Its almost to far off but still within the parameters to continue.
y_train.value_counts(normalize=True)

0    0.666098
1    0.333902
Name: gender-F/1-M/0, dtype: float64

In [57]:
y_train.value_counts()

0    19520
1     9785
Name: gender-F/1-M/0, dtype: int64

In [58]:
print('Baseline Accuracy:', y_train.value_counts(normalize=True).max())

Baseline Accuracy: 0.6660979355058864


#Building the Model

In [68]:
#Starting with a pipeline. Using OrdinalEncoder for the object columns, we do not need and Imputer since they were all filled with top values and I am working with XGBClassifier.
model = make_pipeline(
    OrdinalEncoder(),
    XGBClassifier(n_jobs=-1)
)
model.fit(X_train,y_train)

Pipeline(steps=[('ordinalencoder',
                 OrdinalEncoder(cols=['workclass', 'education',
                                      'marital-status', 'occupation',
                                      'relationship', 'race',
                                      'native-country'],
                                mapping=[{'col': 'workclass',
                                          'data_type': dtype('O'),
                                          'mapping': Self-emp-not-inc    1
Private             2
Local-gov           3
State-gov           4
Federal-gov         5
Self-emp-inc        6
Never-worked        7
Without-pay         8
NaN                -2
dtype: int64},
                                         {'col': 'education',
                                          'data_ty...
                               colsample_bytree=1, gamma=0, gpu_id=-1,
                               importance_type='gain',
                               interaction_constraints='',
                

In [69]:
print('Training accuracy:', model.score(X_train, y_train))
print('Validation accuracy:', model.score(X_val, y_val))

Training accuracy: 0.8848660638116362
Validation accuracy: 0.8391687141687142


In [None]:
# Next I will work with a cross validation Known as BayesSearchCV to see if we can tune the model for better results

In [76]:
from sklearn.model_selection import StratifiedKFold

In [None]:
ore = OrdinalEncoder()
XTO_train = ore.fit_transform(X_train)

model = XGBClassifier()

model.fit(XTO_train,y_train)

In [97]:
import xgboost as xgb

In [99]:
model_cv = BayesSearchCV(estimator= xgb.XGBClassifier(
                                n_jobs = -1,
                                objective = 'binary:logistic',
                                eval_metric = 'auc',
                                learning_rate = 0.1,
                                n_estimators = (100,3000,100)
                                tree_method='approx'),
                        
    search_spaces = {
        'min_child_weight': (15, 20),
        'max_depth': (6, 8),
        'max_delta_step': (0, 20),
        'subsample': (0.01, 1.0, 'uniform'),
        'colsample_bytree': (0.01, 1.0, 'uniform'),
        'colsample_bylevel': (0.01, 1.0, 'uniform'),
        'reg_lambda': (1e-2, 1000, 'log-uniform'),
        'reg_alpha': (1e-2, 1.0, 'log-uniform'),
        'gamma': (1e-2, 0.5, 'log-uniform'),
        'min_child_weight': (0, 20),
        'scale_pos_weight': (1e-6, 500, 'log-uniform')
    },    
    scoring = 'roc_auc',
    cv = StratifiedKFold(
        n_splits=3,
        shuffle=True,
        random_state=42),
    n_jobs = 3,
    n_iter = 10,   
    verbose = 500,
    refit = True,
    random_state = 42)
_ = model_cv.fit(XTO_train, y_train)


Fitting 3 folds for each of 1 candidates, totalling 3 fits
[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done   1 tasks      | elapsed:  2.2min
[Parallel(n_jobs=3)]: Done   3 out of   3 | elapsed:  2.2min remaining:    0.0s
[Parallel(n_jobs=3)]: Done   3 out of   3 | elapsed:  2.2min finished
Fitting 3 folds for each of 1 candidates, totalling 3 fits
[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done   1 tasks      | elapsed:  2.7min
[Parallel(n_jobs=3)]: Done   3 out of   3 | elapsed:  2.7min remaining:    0.0s
[Parallel(n_jobs=3)]: Done   3 out of   3 | elapsed:  2.7min finished
Fitting 3 folds for each of 1 candidates, totalling 3 fits
[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done   1 tasks      | elapsed:  2.0min
[Parallel(n_jobs=3)]: Done   3 out of   3 | elapsed:  2.0min remaining:    0.0s
[Parallel(n_jobs=3)]: Done   3 out 

ValueError: All integer values shouldbe greater than 0.010000

In [104]:
print('Training accuracy:', model_cv.score(XTO_train, y_train))
print('Validation accuracy:', model_cv.best_score_)
      #(X_val, y_val))

AttributeError: 'BayesSearchCV' object has no attribute 'best_estimator_'

In [105]:
XTO_train


Unnamed: 0,age,workclass,education,education-num,marital-status,occupation,relationship,race,capital-gain,capital-loss,native-country,makes-50K+,over40hrs+
46209,47,1,1,16,1,1,1,1,0,2002,1,0,1
41682,48,1,2,10,2,2,2,1,3325,0,1,0,1
23902,44,2,3,14,1,3,1,1,0,0,1,1,0
21478,63,2,4,13,1,2,1,1,0,1740,1,0,0
31092,50,2,2,10,1,4,1,1,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
14390,60,2,5,9,5,7,6,2,0,0,31,0,0
4858,22,2,2,10,3,4,3,1,0,0,1,0,0
33806,27,3,4,13,3,4,2,1,0,0,1,0,1
22672,41,4,4,13,1,4,4,4,0,0,1,1,1
