In [1]:
!pip install eli5
!pip install xgboost



## Import of Libraries needed

In [34]:

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from category_encoders import OrdinalEncoder
from xgboost import XGBClassifier
from sklearn.inspection import permutation_importance
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import classification_report, plot_confusion_matrix, plot_roc_curve
import matplotlib.pyplot as plt
from skopt import BayesSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score

## Import Datasets

In [165]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
census = pd.read_csv('census.csv')
print(train.shape, test.shape, census.shape)

(32561, 15) (16281, 15) (48842, 15)


## Begin EDA

In [166]:
#checking for null values and column types, interesting to see no 'missing' values I'll dive a little further.
census.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48842 entries, 0 to 48841
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             48842 non-null  int64 
 1   workclass       48842 non-null  object
 2   fnlwgt          48842 non-null  int64 
 3   education       48842 non-null  object
 4   education-num   48842 non-null  int64 
 5   marital-status  48842 non-null  object
 6   occupation      48842 non-null  object
 7   relationship    48842 non-null  object
 8   race            48842 non-null  object
 9   sex             48842 non-null  object
 10  capital-gain    48842 non-null  int64 
 11  capital-loss    48842 non-null  int64 
 12  hours-per-week  48842 non-null  int64 
 13  native-country  48842 non-null  object
 14  income          48842 non-null  object
dtypes: int64(6), object(9)
memory usage: 5.6+ MB


In [167]:
#Aha missing values are disguised as '?'.  Lets fix that.
census['workclass'].value_counts()

Private             33906
Self-emp-not-inc     3862
Local-gov            3136
?                    2799
State-gov            1981
Self-emp-inc         1695
Federal-gov          1432
Without-pay            21
Never-worked           10
Name: workclass, dtype: int64

In [168]:
#Found 3 Object Columns with '?' for missing values.  We will fill these with the top value of each row.
census.isin(['?']).sum()

age                  0
workclass         2799
fnlwgt               0
education            0
education-num        0
marital-status       0
occupation        2809
relationship         0
race                 0
sex                  0
capital-gain         0
capital-loss         0
hours-per-week       0
native-country     857
income               0
dtype: int64

In [169]:
#Time to make the 'missing' values into NaN so we can work with them
census.replace({'?': np.NaN}, inplace=True)

In [170]:
#No more '?'
census.workclass.value_counts()

Private             33906
Self-emp-not-inc     3862
Local-gov            3136
State-gov            1981
Self-emp-inc         1695
Federal-gov          1432
Without-pay            21
Never-worked           10
Name: workclass, dtype: int64

In [171]:
# They are now registered as NaN.  These will be replaced with the top value_counts in each column
census.isnull().sum()

age                  0
workclass         2799
fnlwgt               0
education            0
education-num        0
marital-status       0
occupation        2809
relationship         0
race                 0
sex                  0
capital-gain         0
capital-loss         0
hours-per-week       0
native-country     857
income               0
dtype: int64

In [172]:
census.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
4,18,,103497,Some-college,10,Never-married,,Own-child,White,Female,0,0,30,United-States,<=50K


In [173]:
#Printing Top Values to Fill NaNs
print('Top Value:',census['native-country'].describe())
print('Top Value:',census['occupation'].describe())
print('Top Value:',census['workclass'].describe())

Top Value: count             47985
unique               41
top       United-States
freq              43832
Name: native-country, dtype: object
Top Value: count              46033
unique                14
top       Prof-specialty
freq                6172
Name: occupation, dtype: object
Top Value: count       46043
unique          8
top       Private
freq        33906
Name: workclass, dtype: object


In [174]:
#filling NaN values
census['workclass'].replace({np.NaN : 'Private'},inplace=True)
census['occupation'].replace({np.NaN : 'Prof-specialty'}, inplace=True)
census['native-country'].replace({np.NaN : 'United-States'},inplace=True)

In [175]:
#Sanity check to assure NaNs have been fixed with working values.
census.isnull().sum()

age               0
workclass         0
fnlwgt            0
education         0
education-num     0
marital-status    0
occupation        0
relationship      0
race              0
sex               0
capital-gain      0
capital-loss      0
hours-per-week    0
native-country    0
income            0
dtype: int64

In [176]:
#checking for high cardinality in the dataset as well as seeing what to do with the features. Looks like 'fnlwgt' has a very high cardinality and isnt useful for the model
census.astype(object).nunique()

age                  74
workclass             8
fnlwgt            28523
education            16
education-num        16
marital-status        7
occupation           14
relationship          6
race                  5
sex                   2
capital-gain        123
capital-loss         99
hours-per-week       96
native-country       41
income                2
dtype: int64

#Working on the wrangle function.  Not sure how to get these three def/if/else functions wrapped into one working or multi working function inside of a wranglefunction🤔

In [177]:
#Create a New Feature that changes the income column into a 1 if they make more than 50K a year and 0 if they make 50K or less.  New Feature called 'makes-50K+'.
def over50K(census):
    if census['income'] == '>50K':
        val = 1
    else:
        val = 0
    return val
census['makes-50K+'] = census.apply(over50K, axis=1)

In [178]:
#Create a New Feature that changes the hours worked per week column into a 1 if they worked more than 40 hrs a week and 0 if they worked 40 or less.  New Feature called 'over40hrs'.
def over40(census):
    if census['hours-per-week'] >40:
        val = 1
    else:
        val = 0
    return val
census['over40hrs+'] = census.apply(over40, axis=1)

In [179]:
#Create a New Feature that changes the sex column into a 1 if they were Female and 0 if they were Male.  New Feature called 'gender-F/1-M/0'. This is new Target column.
def gender(census):
    if census['sex'] == 'Female':
            val = 1
    else:
            val = 0
    return val
census['gender-F/1-M/0'] = census.apply(gender, axis=1)

In [180]:
#checking to see new features were successful. They are all there.
census.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income,makes-50K+,over40hrs+,gender-F/1-M/0
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K,0,0,0
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K,0,1,0
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K,1,0,0
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K,1,0,0
4,18,Private,103497,Some-college,10,Never-married,Prof-specialty,Own-child,White,Female,0,0,30,United-States,<=50K,0,0,1


In [181]:
# Time to drop columns we don't need anylonger.  Feature'fnlwgt' is high card and Unnecessary while 'sex' would now become a leaky feature and income and hours per week are now redundant
census = census.drop(columns=['fnlwgt','income','hours-per-week','sex'])

In [182]:
census

Unnamed: 0,age,workclass,education,education-num,marital-status,occupation,relationship,race,capital-gain,capital-loss,native-country,makes-50K+,over40hrs+,gender-F/1-M/0
0,25,Private,11th,7,Never-married,Machine-op-inspct,Own-child,Black,0,0,United-States,0,0,0
1,38,Private,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,0,0,United-States,0,1,0
2,28,Local-gov,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,0,0,United-States,1,0,0
3,44,Private,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,7688,0,United-States,1,0,0
4,18,Private,Some-college,10,Never-married,Prof-specialty,Own-child,White,0,0,United-States,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48837,27,Private,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,0,0,United-States,0,0,1
48838,40,Private,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,0,0,United-States,1,0,0
48839,58,Private,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,0,0,United-States,0,0,1
48840,22,Private,HS-grad,9,Never-married,Adm-clerical,Own-child,White,0,0,United-States,0,0,0


# Splitting the Data

In [183]:
#Split data randomly with a 60/20/20 split

train, val, test = np.split(census.sample(frac=1), [int(.6*len(census)), int(.8*len(census))])
print('Training Set:',train.head(1))
print('Validation Set:',val.head(1))
print('Test Set',test.head(1))

Training Set:        age workclass education  education-num marital-status       occupation  \
29024   49   Private   HS-grad              9       Divorced  Exec-managerial   

      relationship   race  capital-gain  capital-loss native-country  \
29024    Unmarried  White             0             0  United-States   

       makes-50K+  over40hrs+  gender-F/1-M/0  
29024           0           0               1  
Validation Set:       age workclass education  education-num marital-status      occupation  \
9425   24   Private   Masters             14  Never-married  Prof-specialty   

       relationship                race  capital-gain  capital-loss  \
9425  Not-in-family  Asian-Pac-Islander             0             0   

     native-country  makes-50K+  over40hrs+  gender-F/1-M/0  
9425         France           0           0               0  
Test Set        age workclass     education  education-num marital-status  \
22516   47   Private  Some-college             10       Divorce

In [184]:
#Split the data into X and y for training the model and making predictions
y_train = train[target]
X_train = train.drop(target,axis=1)

In [185]:
y_val = val[target]
X_val = val.drop(target,axis=1)

In [186]:
y_test = test[target]
X_test = test.drop(target,axis=1)

# Establishing the Baseline

In [187]:
#First I will check that the target feature is between 50-70%.  Its almost to far off but still within the parameters to continue.
y_train.value_counts(normalize=True)

0    0.669681
1    0.330319
Name: gender-F/1-M/0, dtype: float64

In [189]:
y_train.value_counts()

0    19625
1     9680
Name: gender-F/1-M/0, dtype: int64

In [188]:
print('Baseline Accuracy:', y_train.value_counts(normalize=True).max())

Baseline Accuracy: 0.6696809418188022


# Building the Model

In [190]:
#Starting with a pipeline. Using OrdinalEncoder for the object columns, we do not need and Imputer since they were all filled with top values and I am working with XGBClassifier.
modelxgb = make_pipeline(
    OrdinalEncoder(),
    XGBClassifier(n_jobs=-1)
)
modelxgb.fit(X_train,y_train)

Pipeline(steps=[('ordinalencoder',
                 OrdinalEncoder(cols=['workclass', 'education',
                                      'marital-status', 'occupation',
                                      'relationship', 'race',
                                      'native-country'],
                                mapping=[{'col': 'workclass',
                                          'data_type': dtype('O'),
                                          'mapping': Private             1
Local-gov           2
State-gov           3
Self-emp-not-inc    4
Federal-gov         5
Self-emp-inc        6
Never-worked        7
Without-pay         8
NaN                -2
dtype: int64},
                                         {'col': 'education',
                                          'data_ty...
                               colsample_bytree=1, gamma=0, gpu_id=-1,
                               importance_type='gain',
                               interaction_constraints='',
                

In [191]:
print('Training accuracy:', modelxgb.score(X_train, y_train))
print('Validation accuracy:', modelxgb.score(X_val, y_val))

Training accuracy: 0.8840470909401126
Validation accuracy: 0.8447993447993448


In [192]:
scores = cross_val_score(modelxgb, X_train, y_train, cv=20)
scores

  elif pd.api.types.is_categorical(cols):


array([0.83287858, 0.83969986, 0.83901774, 0.83765348, 0.8526603 ,
       0.85870307, 0.82730375, 0.85392491, 0.84982935, 0.86075085,
       0.82525597, 0.84778157, 0.8443686 , 0.85119454, 0.84368601,
       0.8334471 , 0.84300341, 0.84300341, 0.85802048, 0.8552901 ])

In [194]:
pipeline = make_pipeline(
    OrdinalEncoder(),
    RandomForestClassifier(random_state=42)
)

params = {
    'randomforestclassifier__n_estimators': range(50,500,50),
    'randomforestclassifier__max_depth': range(5,101,5),
    'randomforestclassifier__max_samples': np.arange(0.2, 0.7, 0.2)
}

model = RandomizedSearchCV(
    pipeline,
    param_distributions=params,
    cv=5,
    verbose=1,
    n_iter=5
)

In [195]:
model.fit(X_train,y_train)

Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  25 out of  25 | elapsed:   31.5s finished


RandomizedSearchCV(cv=5,
                   estimator=Pipeline(steps=[('ordinalencoder',
                                              OrdinalEncoder()),
                                             ('randomforestclassifier',
                                              RandomForestClassifier(random_state=42))]),
                   n_iter=5,
                   param_distributions={'randomforestclassifier__max_depth': range(5, 101, 5),
                                        'randomforestclassifier__max_samples': array([0.2, 0.4, 0.6]),
                                        'randomforestclassifier__n_estimators': range(50, 500, 50)},
                   verbose=1)

In [213]:
print('Training accuracy:', model.score(X_train, y_train))
print('Validation accuracy:', model.score(X_val, y_val))

Training accuracy: 0.9025081044190412
Validation accuracy: 0.8391687141687142


In [214]:

# make predictions for test data
y_pred = model.predict(X_test)

# evaluate predictions
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

Accuracy: 83.81%


In [212]:

# k-fold cross validation evaluation of xgboost model
from numpy import loadtxt
import xgboost
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
# load data
# split data into X and y
# CV model
model_w_kf = modelxg
kfold = KFold(n_splits=5, random_state=7)
results = cross_val_score(modelxgb, X_train, y_train, cv=kfold)
print("Accuracy: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))

Accuracy: 84.23% (0.28%)


In [201]:
from sklearn.linear_model import Ridge, LinearRegression, LogisticRegression
log_model = make_pipeline(
      OrdinalEncoder(),
      LogisticRegression(max_iter=5)
  )

log_model.fit(X_train, y_train)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Pipeline(steps=[('ordinalencoder',
                 OrdinalEncoder(cols=['workclass', 'education',
                                      'marital-status', 'occupation',
                                      'relationship', 'race',
                                      'native-country'],
                                mapping=[{'col': 'workclass',
                                          'data_type': dtype('O'),
                                          'mapping': Private             1
Local-gov           2
State-gov           3
Self-emp-not-inc    4
Federal-gov         5
Self-emp-inc        6
Never-worked        7
Without-pay         8
NaN                -2
dtype: int64},
                                         {'col': 'education',
                                          'data_ty...
Mexico                        13
Dominican-Republic            14
Columbia                      15
Germany                       16
China                         17
Cuba                          18
Eng

In [202]:
print('Training accuracy:', log_model.score(X_train, y_train))
print('Validation accuracy:', log_model.score(X_val, y_val))

Training accuracy: 0.6696809418188022
Validation accuracy: 0.6678951678951679


In [204]:
from sklearn.svm import SVC
svc_model = make_pipeline(
      OrdinalEncoder(),
      SVC()
  )

svc_model.fit(X_train, y_train)

Pipeline(steps=[('ordinalencoder',
                 OrdinalEncoder(cols=['workclass', 'education',
                                      'marital-status', 'occupation',
                                      'relationship', 'race',
                                      'native-country'],
                                mapping=[{'col': 'workclass',
                                          'data_type': dtype('O'),
                                          'mapping': Private             1
Local-gov           2
State-gov           3
Self-emp-not-inc    4
Federal-gov         5
Self-emp-inc        6
Never-worked        7
Without-pay         8
NaN                -2
dtype: int64},
                                         {'col': 'education',
                                          'data_ty...
Italy                          8
Hong                           9
Ireland                       10
India                         11
France                        12
Mexico                        13
Dom

In [205]:
print('Training accuracy:', svc_model.score(X_train, y_train))
print('Validation accuracy:', svc_model.score(X_val, y_val))

Training accuracy: 0.6696809418188022
Validation accuracy: 0.6678951678951679


In [207]:
lin_model = make_pipeline(
      OrdinalEncoder(),
      LinearRegression()
  )

lin_model.fit(X_train, y_train)

Pipeline(steps=[('ordinalencoder',
                 OrdinalEncoder(cols=['workclass', 'education',
                                      'marital-status', 'occupation',
                                      'relationship', 'race',
                                      'native-country'],
                                mapping=[{'col': 'workclass',
                                          'data_type': dtype('O'),
                                          'mapping': Private             1
Local-gov           2
State-gov           3
Self-emp-not-inc    4
Federal-gov         5
Self-emp-inc        6
Never-worked        7
Without-pay         8
NaN                -2
dtype: int64},
                                         {'col': 'education',
                                          'data_ty...
India                         11
France                        12
Mexico                        13
Dominican-Republic            14
Columbia                      15
Germany                       16
Chi

In [208]:
print('Training accuracy:', lin_model.score(X_train, y_train))
print('Validation accuracy:', lin_model.score(X_val, y_val))

Training accuracy: 0.2037651970148272
Validation accuracy: 0.2011317411636686


In [215]:
modelxgb.fit(X_train, y_train)
# make predictions for test data
y_pred = modelxgb.predict(X_test)
# evaluate predictions
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

  elif pd.api.types.is_categorical(cols):


Accuracy: 84.38%
