# Feature Engineering

### Feature Engineering Version 1 -- Kavin

Let us turn all of our features into versions that can be well-used within a model.

In [1]:
import numpy as np
import pandas as pd
import os

In [2]:
#data read + process
# Read the file through a path relative to the notebook's directory instead of
# temporarily switching the working directory with os.chdir: if read_csv raised,
# the old approach left the kernel's cwd pointing at the parent folder.
# The names list below is passed to read_csv as the column names.
# NOTE(review): string fields in the raw file may carry a leading space after
# each comma -- confirm whether skipinitialspace=True is wanted here.
df = pd.read_csv(os.path.join("..", "data", "adult.data"),
            index_col=False,
            names=['age',
                   'workclass',
                   'fnlwgt',
                   'education',
                   'education-num',
                   'marital-status',
                   'occupation',
                   'relationship',
                   'race',
                   'sex',
                   'capital-gain',
                   'capital-loss',
                   'hours-per-week',
                   'native-country',
                   'income'])
# Last expression of the cell: display the column index as a sanity check.
df.columns

Index(['age', 'workclass', 'fnlwgt', 'education', 'education-num',
       'marital-status', 'occupation', 'relationship', 'race', 'sex',
       'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
       'income'],
      dtype='object')

I think this would be a great time to one-hot encode categorical variables for our future model's sake.

##### One-hot Encoding:

In [3]:
# One-hot encode every categorical feature and append the indicator columns to df.
# Indicator columns are named by the raw category value (no prefix), so a value
# that occurs in more than one source column yields identically named columns.
categorical_columns = [
    "workclass",
    "education",
    "marital-status",
    "occupation",
    "relationship",
    "race",
    "sex",
    "native-country",
]
indicator_frames = [pd.get_dummies(df[column]) for column in categorical_columns]
df = pd.concat([df] + indicator_frames, axis=1)

df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,...,Portugal,Puerto-Rico,Scotland,South,Taiwan,Thailand,Trinadad&Tobago,United-States,Vietnam,Yugoslavia
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,...,0,0,0,0,0,0,0,1,0,0
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,...,0,0,0,0,0,0,0,1,0,0
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,...,0,0,0,0,0,0,0,1,0,0
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,...,0,0,0,0,0,0,0,1,0,0
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,...,0,0,0,0,0,0,0,0,0,0


Now, let us conduct some log-transforms to account for the weighting of numerical values.

##### Log-transforms:
Help from this article: https://towardsdatascience.com/feature-engineering-for-machine-learning-3a5e293a5114#3abe

One key transform is age. Older years make less of a difference in earnings potential than younger years, so we should account for this.

In [4]:
#log transform age
# Natural log of (age + 1), stored as a new column next to the original.
df['age log transformed'] = np.log(df['age'] + 1)
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,...,Puerto-Rico,Scotland,South,Taiwan,Thailand,Trinadad&Tobago,United-States,Vietnam,Yugoslavia,age log transformed
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,...,0,0,0,0,0,0,1,0,0,3.688879
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,...,0,0,0,0,0,0,1,0,0,3.931826
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,...,0,0,0,0,0,0,1,0,0,3.663562
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,...,0,0,0,0,0,0,1,0,0,3.988984
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,...,0,0,0,0,0,0,0,0,0,3.367296


The same logic can be applied to the number of years spent in education.

There is a diminishing returns aspect to the amount of years one spends in education and their earnings potential. As a result, a log transform will help us put a heavier emphasis on jumps in lower years spent in education.

In [5]:
#log transform education-num
# Natural log of (education-num + 1), stored as a new column.
df['years in education log transformed'] = np.log(df['education-num'] + 1)
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,...,Scotland,South,Taiwan,Thailand,Trinadad&Tobago,United-States,Vietnam,Yugoslavia,age log transformed,years in education log transformed
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,...,0,0,0,0,0,1,0,0,3.688879,2.639057
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,...,0,0,0,0,0,1,0,0,3.931826,2.639057
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,...,0,0,0,0,0,1,0,0,3.663562,2.302585
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,...,0,0,0,0,0,1,0,0,3.988984,2.079442
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,...,0,0,0,0,0,0,0,0,3.367296,2.639057


We see that there is a fairly large skew for capital-gain and capital-loss, where most people have 0, but a subset of people have much higher values than 0. To account for this extreme skew, we can apply a log-transform to both capital-gain and capital loss.

In [6]:
#log transform capital-gain
# The +1 shift keeps rows with a capital gain of 0 finite (log(1) == 0).
df['capital-gain log transformed'] = np.log(df['capital-gain'] + 1)
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,...,South,Taiwan,Thailand,Trinadad&Tobago,United-States,Vietnam,Yugoslavia,age log transformed,years in education log transformed,capital-gain log transformed
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,...,0,0,0,0,1,0,0,3.688879,2.639057,7.684784
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,...,0,0,0,0,1,0,0,3.931826,2.639057,0.0
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,...,0,0,0,0,1,0,0,3.663562,2.302585,0.0
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,...,0,0,0,0,1,0,0,3.988984,2.079442,0.0
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,...,0,0,0,0,0,0,0,3.367296,2.639057,0.0


In [7]:
#log transform capital-loss
# The +1 shift keeps rows with a capital loss of 0 finite (log(1) == 0).
df['capital-loss log transformed'] = np.log(df['capital-loss'] + 1)
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,...,Taiwan,Thailand,Trinadad&Tobago,United-States,Vietnam,Yugoslavia,age log transformed,years in education log transformed,capital-gain log transformed,capital-loss log transformed
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,...,0,0,0,1,0,0,3.688879,2.639057,7.684784,0.0
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,...,0,0,0,1,0,0,3.931826,2.639057,0.0,0.0
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,...,0,0,0,1,0,0,3.663562,2.302585,0.0,0.0
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,...,0,0,0,1,0,0,3.988984,2.079442,0.0,0.0
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,...,0,0,0,0,0,0,3.367296,2.639057,0.0,0.0


Finally, for similar reasons, we will look at the hours worked per week. In terms of earning \$50k+, there is a diminishing returns aspect to working more than a certain number of hours. Hours 40 and below are probably more important for figuring out <=50k or >50k. We could do a complicated metric for this, but I think the best way would be to just do a log-transform that gives higher weighting to these lower values.

In [8]:
#log transform hours-per-week
# Natural log of (hours-per-week + 1), stored as a new column.
df['hours-per-week log transformed'] = np.log(df['hours-per-week'] + 1)
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,...,Thailand,Trinadad&Tobago,United-States,Vietnam,Yugoslavia,age log transformed,years in education log transformed,capital-gain log transformed,capital-loss log transformed,hours-per-week log transformed
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,...,0,0,1,0,0,3.688879,2.639057,7.684784,0.0,3.713572
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,...,0,0,1,0,0,3.931826,2.639057,0.0,0.0,2.639057
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,...,0,0,1,0,0,3.663562,2.302585,0.0,0.0,3.713572
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,...,0,0,1,0,0,3.988984,2.079442,0.0,0.0,3.713572
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,...,0,0,0,0,0,3.367296,2.639057,0.0,0.0,3.713572


From here, I can make some combined features based on features I think will correlate well together.

##### Combined Features:

I think an interesting one is the comparison between years in education and hours worked per week. I assume that those with less education are all over that spectrum, but those with more education tend to work fewer hours. Those with higher ratios would be more likely to be higher earners. This could help especially with classifying those with >50k in income.

In [9]:
# Ratio of the two log-transformed features: education years over weekly hours.
education_log = df["years in education log transformed"]
hours_log = df["hours-per-week log transformed"]
df["years educated / hours worked"] = education_log.div(hours_log)
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,...,Trinadad&Tobago,United-States,Vietnam,Yugoslavia,age log transformed,years in education log transformed,capital-gain log transformed,capital-loss log transformed,hours-per-week log transformed,years educated / hours worked
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,...,0,1,0,0,3.688879,2.639057,7.684784,0.0,3.713572,0.710652
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,...,0,1,0,0,3.931826,2.639057,0.0,0.0,2.639057,1.0
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,...,0,1,0,0,3.663562,2.302585,0.0,0.0,3.713572,0.620046
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,...,0,1,0,0,3.988984,2.079442,0.0,0.0,3.713572,0.559957
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,...,0,0,0,0,3.367296,2.639057,0.0,0.0,3.713572,0.710652


Another interesting one could be capital-gain and age. I would expect them to correlate, where older people are more likely to have a higher capital-gains number. Hence, multiplying the two could be a useful feature for the model.

In [10]:
# Interaction term: product of the log-transformed capital gain and age.
gain_log = df["capital-gain log transformed"]
age_log = df["age log transformed"]
df["capital gains * age"] = gain_log.mul(age_log)
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,...,United-States,Vietnam,Yugoslavia,age log transformed,years in education log transformed,capital-gain log transformed,capital-loss log transformed,hours-per-week log transformed,years educated / hours worked,capital gains * age
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,...,1,0,0,3.688879,2.639057,7.684784,0.0,3.713572,0.710652,28.348242
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,...,1,0,0,3.931826,2.639057,0.0,0.0,2.639057,1.0,0.0
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,...,1,0,0,3.663562,2.302585,0.0,0.0,3.713572,0.620046,0.0
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,...,1,0,0,3.988984,2.079442,0.0,0.0,3.713572,0.559957,0.0
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,...,0,0,0,3.367296,2.639057,0.0,0.0,3.713572,0.710652,0.0


##### Adjust target variable

I think another useful thing to do would be to adjust our target variable of the income, to be a binary categorical variable. We can have it so >50k in income is 1 and anything below is 0.

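This can be done by using one-hot encoding and just dropping the >50k column. Note that the column that remains is `<=50K`, so in the code below the target is 1 when income is at most 50k and 0 when it is above 50k, i.e. the inverse of the encoding described above.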

We will also drop the original income column here.

In [11]:
#income
# One-hot encode the target and discard the last indicator column (>50K),
# leaving the other indicator as the final column of df.
income_flags = pd.get_dummies(df["income"]).iloc[:, :-1]

# Replace the original income column with the remaining indicator.
df = pd.concat([df.drop(columns=['income']), income_flags], axis=1)

df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,...,Vietnam,Yugoslavia,age log transformed,years in education log transformed,capital-gain log transformed,capital-loss log transformed,hours-per-week log transformed,years educated / hours worked,capital gains * age,<=50K
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,...,0,0,3.688879,2.639057,7.684784,0.0,3.713572,0.710652,28.348242,1
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,...,0,0,3.931826,2.639057,0.0,0.0,2.639057,1.0,0.0,1
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,...,0,0,3.663562,2.302585,0.0,0.0,3.713572,0.620046,0.0,1
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,...,0,0,3.988984,2.079442,0.0,0.0,3.713572,0.559957,0.0,1
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,...,0,0,3.367296,2.639057,0.0,0.0,3.713572,0.710652,0.0,1


#### Feature Dropping:

I will go ahead and now drop the columns that will not be used in our model.

In [14]:
# Raw inputs that have been replaced by engineered features (or are not used).
unused_columns = [
    'age', 'workclass', 'fnlwgt', 'education', 'education-num',
    'marital-status', 'occupation', 'relationship', 'race', 'sex',
    'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
]
df = df.drop(columns=unused_columns)

df.head()

Unnamed: 0,?,Federal-gov,Local-gov,Never-worked,Private,Self-emp-inc,Self-emp-not-inc,State-gov,Without-pay,10th,...,Vietnam,Yugoslavia,age log transformed,years in education log transformed,capital-gain log transformed,capital-loss log transformed,hours-per-week log transformed,years educated / hours worked,capital gains * age,<=50K
0,0,0,0,0,0,0,0,1,0,0,...,0,0,3.688879,2.639057,7.684784,0.0,3.713572,0.710652,28.348242,1
1,0,0,0,0,0,0,1,0,0,0,...,0,0,3.931826,2.639057,0.0,0.0,2.639057,1.0,0.0,1
2,0,0,0,0,1,0,0,0,0,0,...,0,0,3.663562,2.302585,0.0,0.0,3.713572,0.620046,0.0,1
3,0,0,0,0,1,0,0,0,0,0,...,0,0,3.988984,2.079442,0.0,0.0,3.713572,0.559957,0.0,1
4,0,0,0,0,1,0,0,0,0,0,...,0,0,3.367296,2.639057,0.0,0.0,3.713572,0.710652,0.0,1


Awesome, we now have an amazing dataframe to train our model on!

#### Now let us turn this feature engineering into a utility function:

For future convenience and calling this feature engineering in other notebooks, let us turn all the feature engineering done here into a utility function in our utils.py file.

We will write out the function here for clarity and then move it to our utils.py file.

In [15]:
def featureEngineeringKavinV1(df):
    """Apply the version-1 feature engineering steps to a raw data frame.

    Steps, in order:
      1. One-hot encode the categorical columns (indicator columns are named
         by the raw category value, without a prefix).
      2. Add log(x + 1) versions of age, education-num, capital-gain,
         capital-loss and hours-per-week.
      3. Add the combined features "years educated / hours worked" and
         "capital gains * age", built from the log-transformed columns.
      4. One-hot encode income and discard the last indicator column.
      5. Drop the raw columns that the model will not use.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns 'age', 'workclass', 'fnlwgt',
        'education', 'education-num', 'marital-status', 'occupation',
        'relationship', 'race', 'sex', 'capital-gain', 'capital-loss',
        'hours-per-week', 'native-country' and 'income'.

    Returns
    -------
    pandas.DataFrame
        A new frame with the engineered features; the frame passed in is
        not modified.
    """
    # One-hot encode each categorical column; pd.concat returns a new frame,
    # so the caller's frame is left untouched.
    categorical_columns = ["workclass", "education", "marital-status",
                           "occupation", "relationship", "race", "sex",
                           "native-country"]
    for column in categorical_columns:
        dummies = pd.get_dummies(df[column])
        df = pd.concat([df, dummies], axis=1)

    # log(x + 1) transforms. capital-gain and capital-loss were previously
    # missing here, which made the "capital gains * age" feature below fail
    # with a KeyError.
    log_transforms = [
        ('age', 'age log transformed'),
        ('education-num', 'years in education log transformed'),
        ('capital-gain', 'capital-gain log transformed'),
        ('capital-loss', 'capital-loss log transformed'),
        ('hours-per-week', 'hours-per-week log transformed'),
    ]
    for source_column, target_column in log_transforms:
        df[target_column] = (df[source_column] + 1).transform(np.log)

    # combined features
    df["years educated / hours worked"] = df["years in education log transformed"] / df["hours-per-week log transformed"]

    df["capital gains * age"] = df["capital-gain log transformed"] * df["age log transformed"]

    #income: encode the target and drop the last indicator column (>50K)
    income_dummies = pd.get_dummies(df["income"]).iloc[:, :-1]

    #drop original income column and append the remaining indicator
    df = pd.concat([df.drop(columns=['income']), income_dummies], axis=1)

    #dropping of columns that will be unused by model
    df = df.drop(columns=['age', 'workclass', 'fnlwgt', 'education', 'education-num',
                          'marital-status', 'occupation', 'relationship', 'race', 'sex',
                          'capital-gain', 'capital-loss', 'hours-per-week', 'native-country'])

    return df

### Feature Engineering Version 2 -- Naomi

In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.utils import resample

In [2]:
# Column labels supplied to read_csv through its names argument.
column_names = [
    'age', 'workclass', 'fnlwgt', 'education', 'education-num',
    'marital-status', 'occupation', 'relationship', 'race', 'sex',
    'capital-gain', 'capital-loss', 'hours-per-week',
    'native-country', 'income',
]
data = pd.read_csv("../data/adult.data", names=column_names)

In [3]:
# replace ? to NaN
# Raw string: '\?' in a normal string literal is an invalid escape sequence
# (a warning today, an error in future Python versions). With regex=True any
# cell whose text contains a literal '?' is replaced by NaN.
data = data.replace(r'\?', np.nan, regex=True)
# remove the rows that have NaN values
data = data.dropna()

In [4]:
# Label Encoding
# Collect the object-dtype columns first, then replace each one with the
# integer codes produced by a freshly fitted LabelEncoder.
object_columns = [name for name in data.columns if data[name].dtypes == 'object']
for name in object_columns:
    data[name] = LabelEncoder().fit_transform(data[name])

In [5]:
# Selecting and Scaling
# Split the frame into the feature matrix X and the target y.
X = data.drop(['income'], axis = 1)
y = data['income']

# Standardise every feature column; fit_transform returns a plain array,
# so the result is wrapped back into a DataFrame.
scaler = StandardScaler()
dataset = scaler.fit_transform(X)
# Keep the original row labels: rows were removed by dropna earlier, so a
# fresh 0..n-1 index would no longer line up with y's index.
X = pd.DataFrame(dataset, columns = X.columns, index = X.index)

In [6]:
# Count the rows in each encoded income class; the output below shows that
# the data is imbalanced (class 0 is roughly three times the size of class 1).
y.value_counts()

0    22654
1     7508
Name: income, dtype: int64

In [7]:
# Resampling
# Split the rows by encoded income class (0 is the larger class, 1 the smaller).
df_majority = data[(data['income'] == 0)]
df_minority = data[(data['income'] == 1)]

# Draw from the minority class with replacement until it matches the majority
# class. The target size is taken from the data instead of being hard-coded,
# so the cell stays correct if the upstream cleaning changes the row counts.
# NOTE(review): if a train/test split is made after this step, duplicated
# minority rows could end up on both sides of it -- confirm the intended order.
upsample = resample(df_minority, replace = True, n_samples = len(df_majority), random_state = 1 )
df_upsample = pd.concat([upsample, df_majority])

# Verify that both classes now have the same number of rows.
df_upsample['income'].value_counts()

1    22654
0    22654
Name: income, dtype: int64

### Feature Engineering Version 3 -- George

In [None]:
...

### Feature Engineering Version 4 -- Winston

In [None]:
...