**Loading Data and creating benchmark model**

In [0]:
# Defining the path to the Github repository
file_url = 'https://raw.githubusercontent.com/PacktWorkshops/The-Data-Science-Workshop/master/Chapter17/adult.csv'

In [46]:
# Loading data using pandas
import pandas as pd
adultData = pd.read_csv(file_url,sep=",",na_values = " ?")
adultData.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours,native,label
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,0
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,0
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,0
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,0
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,0


In [50]:
adultData = adultData.dropna(axis = 0, how = 'any')
adultData.shape

(30162, 15)

In [0]:
# Removing the target variable
Y = adultData.pop('label')

In [0]:
from sklearn.model_selection import train_test_split

# Splitting the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(adultData, Y, test_size=0.3, random_state=123)

In [0]:
# Using pipeline to transform categorical variable and numeric variables
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

categorical_transformer = Pipeline(steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))])

numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])



In [0]:
# Defining data types for numeric and categorical features
numeric_features = adultData.select_dtypes(include=['int64', 'float64']).columns
categorical_features = adultData.select_dtypes(include=['object']).columns



In [0]:
# Defining preprocessor
from sklearn.compose import ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

In [0]:
# Defining the estimator for processing and classification
from sklearn.linear_model import LogisticRegression
estimator = Pipeline(steps=[('preprocessor', preprocessor),                      
                           ('classifier',LogisticRegression(random_state=123))])


In [57]:
# Fit the estimator on the training set
estimator.fit(X_train, y_train)  
print("model score: %.2f" % estimator.score(X_test, y_test)) 

model score: 0.85




In [0]:
# Predict on the test set
pred = estimator.predict(X_test)

In [59]:
# Generating classification report
from sklearn.metrics import classification_report

print(classification_report(pred,y_test))

              precision    recall  f1-score   support

           0       0.93      0.88      0.90      7188
           1       0.62      0.75      0.68      1861

    accuracy                           0.85      9049
   macro avg       0.77      0.81      0.79      9049
weighted avg       0.87      0.85      0.86      9049



**Establishing entities and relationship**

In [0]:
# Creating the Ids for parent entity
adultData['parentID'] = adultData.index.values

adultData['parentID'] = 'record' + adultData['parentID'].astype(str)

In [0]:
# Creating unique Ids for entity workclass
adultData.loc[adultData.workclass == ' Federal-gov','workId']= 1
adultData.loc[adultData.workclass == ' Local-gov','workId']= 2
adultData.loc[adultData.workclass == ' Private','workId']= 3
adultData.loc[adultData.workclass == ' Self-emp-inc','workId']= 4
adultData.loc[adultData.workclass == ' Self-emp-not-inc','workId']= 5
adultData.loc[adultData.workclass == ' State-gov','workId']= 6
adultData.loc[adultData.workclass == ' Without-pay','workId']= 7

In [0]:
# Creating unique IDs for occupation
adultData.loc[adultData.occupation == ' Adm-clerical','occuId']= 1
adultData.loc[adultData.occupation == ' Armed-Forces','occuId']= 2
adultData.loc[adultData.occupation == ' Craft-repair','occuId']= 3
adultData.loc[adultData.occupation == ' Exec-managerial','occuId']= 4
adultData.loc[adultData.occupation == ' Farming-fishing','occuId']= 5
adultData.loc[adultData.occupation == ' Handlers-cleaners','occuId']= 6
adultData.loc[adultData.occupation == ' Machine-op-inspct','occuId']= 7
adultData.loc[adultData.occupation == ' Other-service','occuId']= 8
adultData.loc[adultData.occupation == ' Priv-house-serv','occuId']= 9
adultData.loc[adultData.occupation == ' Prof-specialty','occuId']= 10
adultData.loc[adultData.occupation == ' Protective-serv','occuId']= 11
adultData.loc[adultData.occupation == ' Sales','occuId']= 12
adultData.loc[adultData.occupation == ' Tech-support','occuId']= 13
adultData.loc[adultData.occupation == ' Transport-moving','occuId']= 14

In [0]:
# Importing necessary libraries
import featuretools as ft
import numpy as np

In [0]:
# creating the entity set 'adultentities'
adultentities = ft.EntitySet(id = 'Adult')

In [72]:
# Mapping a dataframe to the entityset to form the parent entity
adultentities.entity_from_dataframe(entity_id = 'Parent Data', dataframe = adultData, index = 'parentID')

Entityset: Adult
  Entities:
    Parent Data [Rows: 30162, Columns: 17]
  Relationships:
    No relationships

In [73]:
# Mapping to parent entity and setting the relationship
adultentities.normalize_entity(base_entity_id='Parent Data', new_entity_id='education', index = 'education-num', 
additional_variables = ['education'])

adultentities.normalize_entity(base_entity_id='Parent Data', new_entity_id='Workclass', index = 'workId', 
additional_variables = ['workclass'])

adultentities.normalize_entity(base_entity_id='Parent Data', new_entity_id='Occupation', index = 'occuId', 
additional_variables = ['occupation'])


Entityset: Adult
  Entities:
    Parent Data [Rows: 30162, Columns: 14]
    education [Rows: 16, Columns: 2]
    Workclass [Rows: 7, Columns: 2]
    Occupation [Rows: 14, Columns: 2]
  Relationships:
    Parent Data.education-num -> education.education-num
    Parent Data.workId -> Workclass.workId
    Parent Data.occuId -> Occupation.occuId

**Feature Engineering**

In [0]:
# Creating aggregation and transformation primitives
aggPrimitives=[
        'std', 'min', 'max', 'mean', 
         'last', 'count'
        
]
tranPrimitives=[
        'percentile', 
         'subtract', 'divide']

In [75]:
# Defining the new set of features
feature_set, feature_names = ft.dfs(entityset=adultentities, 
target_entity = 'Parent Data',
agg_primitives=aggPrimitives,
trans_primitives=tranPrimitives, 
max_depth = 2, 
verbose = 1, 
n_jobs = 1)

Built 1080 features
Elapsed: 00:29 | Remaining: 00:00 | Progress: 100%|██████████| Calculated: 11/11 chunks


In [0]:
# Reindexing the feature_set
feature_set = feature_set.reindex(index=adultData['parentID'])
feature_set = feature_set.reset_index()

In [77]:
# Displaying the feature set 
feature_set.shape

(30162, 1081)

**Cleaning na values and infinity values**

In [0]:
# Dropping all Ids
X = feature_set[feature_set.columns[~feature_set.columns.str.contains(
    'parentID|education-num|workId|occuId')]]


In [0]:
# Replacing all columns with infinity with nan
X = X.replace([np.inf, -np.inf], np.nan)



In [81]:
# Dropping all columns with nan
X = X.dropna(axis=1, how='any')
X.shape

(30162, 897)

**Modelling phase**

In [0]:
# Splitting train and test sets
from sklearn.model_selection import train_test_split

# Splitting the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=123)

In [0]:
# Creating the preprocessing pipeline
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

categorical_transformer = Pipeline(steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))])

numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])

numeric_features = X.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X.select_dtypes(include=['object']).columns

from sklearn.compose import ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

In [84]:
# Creating the estimator function and fitting the training set
estimator = Pipeline(steps=[('preprocessor', preprocessor),                      
                           ('classifier',LogisticRegression(random_state=123))])
estimator.fit(X_train, y_train)  
print("model score: %.2f" % estimator.score(X_test, y_test)) 



model score: 0.86


In [0]:
# Predicting on the test set
pred = estimator.predict(X_test)

In [86]:
# Generating the classification report
from sklearn.metrics import classification_report

print(classification_report(pred,y_test))

              precision    recall  f1-score   support

           0       0.93      0.89      0.91      7138
           1       0.64      0.76      0.70      1911

    accuracy                           0.86      9049
   macro avg       0.79      0.82      0.80      9049
weighted avg       0.87      0.86      0.86      9049

