# Create Regression Models to Predict Income
## **A Notebook for Using Demographic Data to Predict Income/Wage**

#### Import the data from pickles, and separate into the features/targets

In [1]:
import pandas as pd
import numpy as np

In [2]:
from datetime import datetime

def log(message):
    """Print ``message`` prefixed with the current wall-clock time (HH:MM:SS)."""
    timestamp = datetime.now().strftime("%H:%M:%S -")
    print(timestamp, message)
    
def printnow():
    """Print the current wall-clock time as ``Current time: HH:MM:SS``."""
    now = datetime.now()
    print(now.strftime("Current time: %H:%M:%S"))

In [3]:
# Load the preprocessed, one-hot encoded data set from its pickle archive.
# NOTE(review): unpickling can execute arbitrary code -- only load files from a trusted source.
log('Reading data')
onehot_data_path = 'preprocessed_data/onehot_data.zip'
us_personal = pd.read_pickle(onehot_data_path)
log('Done\n')

22:42:00 - Reading data
22:42:16 - Done



In [4]:
from sklearn.preprocessing import RobustScaler # a scaler to overcome outliers from the data
from sklearn.linear_model import ElasticNet, SGDRegressor # baseline regression models
from keras import Sequential # all relevant imports for Keras (TensorFlow based neural net)
from keras.layers import Dense, LeakyReLU, ELU
from keras.wrappers.scikit_learn import KerasRegressor

# metrics to evaluate the performance of our regressors
from sklearn.metrics import mean_squared_error, median_absolute_error, r2_score
from sklearn.model_selection import GridSearchCV, train_test_split

Using TensorFlow backend.


In [5]:
def features_targets(df):
    """Split a DataFrame into its feature columns and its two target columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'PINCP' and 'WAGP'.

    Returns
    -------
    dict
        'features' -> ``df`` without the 'PINCP' and 'WAGP' columns,
        'incomes'  -> the 'PINCP' column,
        'wages'    -> the 'WAGP' column.
    """
    target_columns = ['PINCP', 'WAGP']
    return {
        'features': df.drop(columns=target_columns),
        'incomes': df['PINCP'],
        'wages': df['WAGP'],
    }

# Fixed seed for the train/test shuffle, so that a fresh "Restart & Run All"
# reproduces the same split (and therefore comparable metrics across runs).
SPLIT_SEED = 42

# Hold out 20% of the rows as a test set, then build a dictionary holding the
# features/targets of each of the two splits
log('Splitting Train/Test Data')
train, test = train_test_split(us_personal, test_size=0.2, random_state=SPLIT_SEED)
dataset = {
    'train':features_targets(train),
    'test':features_targets(test)}


# Fit the RobustScaler on the training features only, then apply the same fitted
# scaler to both splits so that no test-set statistics are used for scaling
log('Scaling Train/Test Features')
robust_scaler = RobustScaler().fit(dataset['train']['features'])
dataset['train']['features'] = robust_scaler.transform(dataset['train']['features'])
dataset['test']['features'] = robust_scaler.transform(dataset['test']['features'])
log('Done\n')

22:42:19 - Splitting Train/Test Data
22:42:40 - Scaling Train/Test Features
22:43:06 - Done



In [6]:
def fit_test_print(estimator, estimator_name, target, target_name, grid=1):
    """Fit an estimator on the training split and print train/test metrics.

    Reads the module-level ``dataset`` dictionary: the estimator is fitted on
    ``dataset['train']['features']`` against ``dataset['train'][target]`` and is
    then used to predict on both the training and the test features.

    Parameters
    ----------
    estimator : object exposing ``fit`` and ``predict``
        When ``grid == 1`` it must also expose ``best_params_`` after fitting.
    estimator_name : str
        Label used in the log line and in the results header.
    target : str
        Key of the target inside each split of ``dataset``
        (the call sites use 'incomes' or 'wages').
    target_name : str
        Human-readable target label used in the results header.
    grid : int, default 1
        Pass 1 to also print ``estimator.best_params_``; any other value skips it.
    """
    train_split = dataset['train']
    test_split = dataset['test']

    log(f'Training {estimator_name}')
    estimator.fit(train_split['features'], train_split[target].array)
    predictions = {
        'train': estimator.predict(train_split['features']),
        'test': estimator.predict(test_split['features']),
    }
    log('Done\n')

    print(f"Results for {target_name} with {estimator_name}")
    if grid == 1:
        print(f"- Best parameters: {estimator.best_params_}")

    # Same three metrics for each split, training set first.
    for heading, split_name in (("- Training Set", 'train'), ("- Test Set", 'test')):
        y_true = dataset[split_name][target]
        y_pred = predictions[split_name]
        print(heading)
        print(f"\tMean Squared Error: {mean_squared_error(y_true, y_pred)}")
        print(f"\tMedian Absolute Error: {median_absolute_error(y_true, y_pred)}")
        print(f"\tr-Squared: {r2_score(y_true, y_pred)}")

## Linear Regression
Implemented with ElasticNet, which combines L1 and L2 regularization

In [9]:
# Hyper-parameter grid shared by both ElasticNet searches.
linear_model_params = dict(
    alpha=[0.01, 0.1, 1, 10],
    l1_ratio=[0.25, 0.50, 0.75, 1],
)


def _new_elasticnet_search():
    """Return a fresh, unfitted 4-fold grid search over ``linear_model_params``.

    Two metrics are scored for every candidate; the final model is refit using 'r2'.
    """
    return GridSearchCV(
        estimator=ElasticNet(),
        param_grid=linear_model_params,
        scoring=['neg_median_absolute_error', 'r2'],
        refit='r2',
        cv=4,
    )


# One independent search per target (income / wage).
linear_model_i = _new_elasticnet_search()
linear_model_w = _new_elasticnet_search()

In [8]:
# Grid-search the ElasticNet model against the income target and report metrics.
fit_test_print(
    estimator=linear_model_i,
    estimator_name='Linear Model L1/L2 Regularized',
    target='incomes',
    target_name='Income',
)

16:14:08 - Training Linear Model L1/L2 Regularized
16:26:53 - Done

Results for Income with Linear Model L1/L2 Regularized
- Best parameters: {'alpha': 10, 'l1_ratio': 0.6}
- Training Set
	Mean Squared Error: 3274596194.857993
	Median Absolute Error: 22929.83070571476
	r-Squared: 0.11891055597779776
- Test Set
	Mean Squared Error: 3281850782.70632
	Median Absolute Error: 22924.74442348032
	r-Squared: 0.11853319389596961


In [10]:
# Grid-search the ElasticNet model against the wage target and report metrics.
fit_test_print(
    estimator=linear_model_w,
    estimator_name='Linear Model L1/L2 Regularized',
    target='wages',
    target_name='Wage',
)

16:29:04 - Training Linear Model L1/L2 Regularized
16:37:42 - Done

Results for Wage with Linear Model L1/L2 Regularized
- Best parameters: {'alpha': 10, 'l1_ratio': 0.6}
- Training Set
	Mean Squared Error: 2351956604.698796
	Median Absolute Error: 17558.869250567495
	r-Squared: 0.15201561556297183
- Test Set
	Mean Squared Error: 2333282440.602135
	Median Absolute Error: 17562.433428652887
	r-Squared: 0.15276740461628124


## SGD Regression
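Implemented with `SGDRegressor` using an elastic-net penalty and an adaptive learning rate. The intent is a Huber loss, to be more robust to outliers in income and wage; note, however, that the cell below does not pass `loss='huber'`, so the estimator's default loss is what is actually used.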

In [7]:
# Regularization strengths explored by both SGD searches.
sgd_model_params = dict(alpha=[0.000001, 0.00001, 0.0001, 0.001])


def _new_sgd_search():
    """Return a fresh, unfitted 4-fold grid search over ``sgd_model_params``.

    The base estimator uses an elastic-net penalty and an adaptive learning rate;
    no ``loss`` argument is passed, so the estimator's default loss applies.
    """
    base_estimator = SGDRegressor(penalty='elasticnet', learning_rate='adaptive')
    return GridSearchCV(estimator=base_estimator, param_grid=sgd_model_params, cv=4)


# One independent search per target (income / wage).
sgd_model_i = _new_sgd_search()
sgd_model_w = _new_sgd_search()

In [8]:
# Grid-search the SGD regressor against the income target and report metrics.
fit_test_print(
    estimator=sgd_model_i,
    estimator_name='SGD Model L1/L2 Regularized',
    target='incomes',
    target_name='Income',
)

22:43:06 - Training SGD Model L1/L2 Regularized
23:35:27 - Done

Results for Income with SGD Model L1/L2 Regularized
- Best parameters: {'alpha': 0.0001}
- Training Set
	Mean Squared Error: 2313730116.8329206
	Median Absolute Error: 12513.364387014095
	r-Squared: 0.3787176688793734
- Test Set
	Mean Squared Error: 2288257245.7425137
	Median Absolute Error: 12522.970271169928
	r-Squared: 0.380348437367074


In [10]:
# Grid-search the SGD regressor against the wage target and report metrics.
# The label matches the income run above: this estimator is the SGD regressor,
# not a logistic model, so the report must not call it one.
fit_test_print(sgd_model_w, 'SGD Model L1/L2 Regularized', 'wages', 'Wage')

23:35:28 - Training Logistic Model L1/L2 Regularized
00:21:39 - Done

Results for Wage with Logistic Model L1/L2 Regularized
- Best parameters: {'alpha': 0.0001}
- Training Set
	Mean Squared Error: 1565194628.2093012
	Median Absolute Error: 9285.146097658673
	r-Squared: 0.4363179455367291
- Test Set
	Mean Squared Error: 1536207636.0796754
	Median Absolute Error: 9283.257766323872
	r-Squared: 0.4396291032552606


## Keras Neural Net Regression
Implemented with feed-forward dense layers 

In [13]:
def init_keras():
    """Build and compile the feed-forward regression network.

    The network takes 151 input features and stacks Dense layers of width
    200 -> 200 -> 50 -> 20 -> 1. The first hidden layer is followed by an ELU
    activation, the remaining hidden layers by LeakyReLU, and the single output
    unit has no activation layer after it.

    Returns
    -------
    A ``Sequential`` model compiled with the 'huber_loss' loss and the 'adam'
    optimizer, tracking mean absolute error and mean squared error as metrics.
    """
    layer_stack = [
        Dense(200, input_dim=151),
        ELU(),
        Dense(200),
        LeakyReLU(),
        Dense(50),
        LeakyReLU(),
        Dense(20),
        LeakyReLU(),
        Dense(1),
    ]

    model = Sequential()
    for layer in layer_stack:
        model.add(layer)

    model.compile(
        loss='huber_loss',
        optimizer='adam',
        metrics=['mean_absolute_error', 'mean_squared_error'],
    )
    return model

# Training options shared by both neural-net regressors.
nn_fit_options = dict(epochs=50, batch_size=32, verbose=1)

# One independent scikit-learn style wrapper per target (income / wage).
nn_model_i = KerasRegressor(build_fn=init_keras, **nn_fit_options)
nn_model_w = KerasRegressor(build_fn=init_keras, **nn_fit_options)

In [14]:
# Train the neural net on the income target; grid=0 because there is no grid search
# (and hence no best_params_) to report.
fit_test_print(
    estimator=nn_model_i,
    estimator_name='Keras Neural Net',
    target='incomes',
    target_name='Income',
    grid=0,
)

00:29:41 - Training Keras Neural Net
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
03:11:25 - Done

Results for Income with Keras Neural Net
- Training Set
	Mean Squared Error: 2042122702.2769258
	Median Absolute Error: 5002.96484375
	r-Squared: 0.4516496355064821
- Test Set
	Mean Squared Error: 2101368726.2001605
	Median Absolute Error: 5240.8330078125
	r-Squared: 0.43095715427949255


In [15]:
# Train the neural net on the wage target; grid=0 because there is no grid search
# (and hence no best_params_) to report.
fit_test_print(
    estimator=nn_model_w,
    estimator_name='Keras Neural Net',
    target='wages',
    target_name='Wage',
    grid=0,
)

03:11:26 - Training Keras Neural Net
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
05:49:24 - Done

Results for Wage with Keras Neural Net
- Training Set
	Mean Squared Error: 1189567093.7948844
	Median Absolute Error: 13.535892486572266
	r-Squared: 0.5715947325225952
- Test Set
	Mean Squared Error: 1236934931.6712804
	Median Absolute Error: 13.763057708740234
	r-Squared: 0.5487964513414394
