# Create Regression Models to Predict Income
## **A Notebook for Using Demographic Data to Predict Income/Wage**

#### Import the data from pickles, and separate into the features/targets

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the preprocessed dataset from its pickle archive into a dataframe
# NOTE(review): unpickling can execute arbitrary code - only load this file if it comes from a trusted source
us_personal = pd.read_pickle('preprocessed_data/onehot_data.zip')
# Bare expression: shows the dataframe as the cell output when run in a notebook
us_personal

Unnamed: 0,AGEP,JWMNP,WAGP,WKHP,PINCP,DIVISION_East North Central,DIVISION_East South Central,DIVISION_Middle Atlantic,DIVISION_Mountain,DIVISION_New England,...,RAC1P_Pacific Islander,RAC1P_White,WAOB_Africa,WAOB_Asia,WAOB_Europe,WAOB_Latin America,WAOB_North America,WAOB_Oceania and at Sea,WAOB_PR and US Island,WAOB_US State
0,19,0.0,0.0,0.0,-1500.0,0.0,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,18,0.0,1600.0,21.0,1600.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,53,0.0,10000.0,40.0,10000.0,0.0,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,28,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,25,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3214534,9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3214535,7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3214536,49,5.0,18500.0,40.0,18500.0,0.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3214537,19,45.0,11500.0,40.0,11500.0,0.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [28]:
from sklearn.preprocessing import RobustScaler # a scaler to overcome outliers from the data
from sklearn.linear_model import ElasticNet, LogisticRegression # baseline regression models
from keras import Sequential # all relevant imports for Keras (TensorFlow based neural net)
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasRegressor

from sklearn.metrics import mean_squared_error, r2_score # metrics to evaluate the performance of our regressors
from sklearn.model_selection import GridSearchCV, PredefinedSplit

In [8]:
# Separate the training, validation, and testing sets
# Use a 70, 15, 15 percentage split for the three sets
def train_val_test_split(df, train_size, val_size, test_size, random_state=None):
    """Shuffle ``df`` and cut it into train, validation, and test partitions.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows to partition. Each row keeps its original index label.
    train_size, val_size, test_size : float
        Fraction of the rows assigned to each partition. The fractions must
        be non-negative and sum to 1 (within floating-point tolerance).
    random_state : int, optional
        Seed forwarded to ``DataFrame.sample`` so the shuffle can be
        reproduced. The default (None) keeps the shuffle unseeded.

    Returns
    -------
    tuple
        ``(train, validation, test)`` dataframes, in that order.

    Raises
    ------
    ValueError
        If a fraction is negative or the fractions do not sum to 1.
    """
    sizes = (train_size, val_size, test_size)
    # Rounding the sum would accept totals such as 1.2, so compare with a
    # floating-point tolerance instead; raise rather than assert so the check
    # still runs under ``python -O``.
    if min(sizes) < 0 or not np.isclose(sum(sizes), 1.0):
        raise ValueError(
            f"train/val/test sizes must be non-negative and sum to 1, got {sizes}"
        )

    # create the index boundaries for the dataframe split
    n_rows = len(df)
    idx_a = int(train_size * n_rows)
    idx_b = int((train_size + val_size) * n_rows)

    # Shuffle once, then slice positionally (avoids routing a DataFrame
    # through np.split).
    shuffled = df.sample(frac=1, random_state=random_state)
    train = shuffled.iloc[:idx_a]
    validation = shuffled.iloc[idx_a:idx_b]
    test = shuffled.iloc[idx_b:]
    return train, validation, test

train, validation, test = train_val_test_split(
    us_personal, train_size=0.70, val_size=0.15, test_size=0.15
)

In [27]:
# Create the predefined split for using grid search to tune hyperparameters:
# rows belonging to the validation set are assigned to fold 0, and every other
# row is marked -1, which PredefinedSplit never places in a test fold.
# NOTE(review): the mask spans all of us_personal, so the held-out test rows are
# also marked -1 and would be trained on if the search is fit on the full data -
# confirm this is intended.
# Index.isin does the membership test in one vectorised pass instead of a
# Python-level loop over every row label; tolist() keeps the result a plain list.
validation_split_index = np.where(us_personal.index.isin(validation.index), 0, -1).tolist()
predefined_split = PredefinedSplit(test_fold=validation_split_index)

In [31]:
# Separate the data into three groups: incomes or wages (targets), and features (all other columns)
# and return a corresponding dictionary with the three sets
def features_targets(df):
    """Split ``df`` into its feature columns and its two target columns.

    Returns a dictionary with three entries: ``'features'`` (``df`` without
    the 'PINCP' and 'WAGP' columns), ``'incomes'`` (the 'PINCP' column) and
    ``'wages'`` (the 'WAGP' column).
    """
    target_columns = ['PINCP', 'WAGP']
    separated = {'features': df.drop(columns=target_columns)}
    separated['incomes'] = df['PINCP']
    separated['wages'] = df['WAGP']
    return separated

# Build the features/targets dictionary for the full data and for each of the
# train, validation, and test partitions, keyed by partition name
dataset = {
    name: features_targets(frame)
    for name, frame in (
        ('full', us_personal),
        ('train', train),
        ('validation', validation),
        ('test', test),
    )
}

In [32]:
# Initialize the RobustScaler on the training data (fit before training each regression model).
# Fit on the training features only: fitting on the full dataset would let the
# validation and test rows influence the scaling statistics (data leakage).
robust_scaler = RobustScaler().fit(dataset['train']['features'])

## Linear Regression
Implemented with ElasticNet, which combines L1 and L2 regularization

In [34]:
# Hyperparameter grid for the ElasticNet: penalty strength (alpha) and the
# mix between L1 and L2 regularization (l1_ratio)
# NOTE(review): scikit-learn advises against alpha=0.0 for numerical reasons - confirm it should stay in the grid
linear_model_params = dict(
    alpha=[0.0, 0.01, 0.1, 1.0, 10.0],
    l1_ratio=[0.00, 0.25, 0.50, 0.75, 1.00],
)
# Grid search over the parameters above, scored on the predefined validation split
linear_model = GridSearchCV(
    estimator=ElasticNet(fit_intercept=True, max_iter=1000000),
    param_grid=linear_model_params,
    cv=predefined_split,
)

## Logistic Regression

## Keras Neural Net Regression