# Create Regression Models to Predict Income
## **A Notebook for Using Demographic Data to Predict Income/Wage**

#### Import the data from pickles, and separate into the features/targets

In [None]:
import pandas as pd
import numpy as np

In [None]:
from datetime import datetime

def log(message):
    """Print ``message`` prefixed with the current time as ``HH:MM:SS -``."""
    stamp = datetime.now().strftime("%H:%M:%S -")
    print(stamp, message)
    
def printnow():
    """Print the current wall-clock time as ``Current time: HH:MM:SS``."""
    now = datetime.now()
    print(now.strftime("Current time: %H:%M:%S"))

In [None]:
# Load the preprocessed dataset (presumably one-hot encoded, per the file
# name) from its zipped pickle into a DataFrame.
# NOTE: unpickling can execute arbitrary code -- only load files you trust.
log('Reading data')
onehot_path = 'preprocessed_data/onehot_data.zip'
us_personal = pd.read_pickle(onehot_path)
log('Done\n')

In [None]:
from sklearn.preprocessing import RobustScaler # a scaler to overcome outliers from the data
from sklearn.linear_model import ElasticNet, LogisticRegression # baseline regression models
from keras import Sequential # all relevant imports for Keras (TensorFlow based neural net)
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasRegressor

# metrics to evaluate the performance of our regressors
from sklearn.metrics import mean_squared_error, median_absolute_error, r2_score
from sklearn.model_selection import GridSearchCV, train_test_split

In [None]:
def features_targets(df):
    """Split a DataFrame into its feature columns and its two target columns.

    Args:
        df: DataFrame containing the ``PINCP`` and ``WAGP`` columns alongside
            the feature columns.

    Returns:
        A dictionary with three entries: ``'features'`` (``df`` without the
        ``PINCP`` and ``WAGP`` columns), ``'incomes'`` (the ``PINCP`` column)
        and ``'wages'`` (the ``WAGP`` column).
    """
    target_columns = ['PINCP', 'WAGP']
    split = {'features': df.drop(columns=target_columns)}
    split['incomes'] = df['PINCP']
    split['wages'] = df['WAGP']
    return split

# Hold out 20% of the rows for testing, then split each partition into its
# features and its income/wage targets.  The split is seeded so that rerunning
# the notebook evaluates every model on the same train/test rows; without a
# seed the reported metrics change from run to run.
log('Splitting Train/Test Data')
SPLIT_SEED = 42
train, test = train_test_split(us_personal, test_size=0.2, random_state=SPLIT_SEED)
dataset = {
    'train': features_targets(train),
    'test': features_targets(test),
}


# Fit the RobustScaler on the training features only, then apply that same
# fitted transformation to both partitions, so the test rows do not influence
# the scaling statistics.
log('Scaling Train/Test Features')
robust_scaler = RobustScaler().fit(dataset['train']['features'])
dataset['train']['features'] = robust_scaler.transform(dataset['train']['features'])
dataset['test']['features'] = robust_scaler.transform(dataset['test']['features'])
log('Done\n')

In [None]:
def fit_test_print(estimator, estimator_name, target, target_name, grid=1):
    """Fit an estimator on the training split and print train/test metrics.

    Reads the module-level ``dataset`` dictionary and reports progress through
    ``log``.

    Args:
        estimator: Object exposing ``fit(X, y)`` and ``predict(X)``.
        estimator_name: Label for the estimator used in the printed output.
        target: Key of the target inside each split of ``dataset``
            (the callers in this notebook pass ``'incomes'`` or ``'wages'``).
        target_name: Label for the target used in the printed output.
        grid: When equal to 1, ``estimator.best_params_`` is printed as well,
            so the estimator must provide that attribute after fitting.
    """
    log(f'Training {estimator_name}')
    train_split, test_split = dataset['train'], dataset['test']
    estimator.fit(train_split['features'], train_split[target])
    # Predict on both splits before reporting; train first, then test.
    predictions = {
        'train': estimator.predict(train_split['features']),
        'test': estimator.predict(test_split['features']),
    }
    log('Done\n')

    print(f"Results for {target_name} with {estimator_name}")
    if grid == 1:
        print(f"- Best parameters: {estimator.best_params_}")

    # The same three metrics are reported for each split, under its own heading.
    for heading, split_name in (("- Training Set", 'train'), ("- Test Set", 'test')):
        y_true = dataset[split_name][target]
        y_pred = predictions[split_name]
        if split_name == 'test':
            print()  # blank line separating the two report sections
        print(heading)
        print(f"\tMean Squared Error: {mean_squared_error(y_true, y_pred)}")
        print(f"\tMedian Absolute Error: {median_absolute_error(y_true, y_pred)}")
        print(f"\tr-Squared: {r2_score(y_true, y_pred)}")

## Linear Regression
Implemented with ElasticNet, which combines L1 and L2 regularization

In [None]:
# Hyper-parameter grid for ElasticNet: penalty multiplier (alpha) and the
# L1/L2 mixing ratio.
linear_model_params = dict(
    alpha=[10, 100, 1000, 10000],
    l1_ratio=[0.40, 0.60],
)

# 4-fold grid search scored on both metrics; the candidate with the best r2
# is refit on the full training data.
linear_model = GridSearchCV(
    ElasticNet(),
    linear_model_params,
    scoring=['neg_mean_absolute_error', 'r2'],
    refit='r2',
    n_jobs=4,
    cv=4,
)

In [None]:
# Grid-search, fit and report the regularized linear model on the income target.
fit_test_print(
    estimator=linear_model,
    estimator_name='Linear Model L1/L2 Regularized',
    target='incomes',
    target_name='Income',
)

In [None]:
# Grid-search, fit and report the regularized linear model on the wage target.
fit_test_print(
    estimator=linear_model,
    estimator_name='Linear Model L1/L2 Regularized',
    target='wages',
    target_name='Wage',
)

## Logistic Regression

In [None]:
# Hyper-parameter grid: inverse regularization strength (C) and the
# elastic-net mixing ratio (0 = pure L2, 1 = pure L1).
logistic_model_params = {
    'C': [1, 10, 100, 1000],
    'l1_ratio': [0, 0.5, 1],
}

# `l1_ratio` only takes effect when penalty='elasticnet' (supported by the
# 'saga' solver).  With the default 'l2' penalty it is ignored, so every
# l1_ratio candidate in the grid would fit the same model.
# NOTE(review): LogisticRegression is a classifier, while the scoring below
# uses regression metrics -- confirm it is appropriate for the income/wage
# targets before relying on these results.
logistic_model = GridSearchCV(
    estimator=LogisticRegression(solver='saga', penalty='elasticnet'),
    param_grid=logistic_model_params,
    scoring=['neg_mean_absolute_error', 'r2'],
    refit='r2',
    n_jobs=4,
    cv=4,
)

In [None]:
# Grid-search, fit and report the logistic model on the income target.
fit_test_print(
    estimator=logistic_model,
    estimator_name='Logistic Model L1/L2 Regularized',
    target='incomes',
    target_name='Income',
)

In [None]:
# Grid-search, fit and report the logistic model on the wage target.
fit_test_print(
    estimator=logistic_model,
    estimator_name='Logistic Model L1/L2 Regularized',
    target='wages',
    target_name='Wage',
)

## Keras Neural Net Regression
Implemented with feed-forward dense layers 

In [None]:
def init_keras(input_dim=151):
    """Build and compile the feed-forward regression network.

    Args:
        input_dim: Number of input features fed to the first dense layer.
            Defaults to 151, the width previously hard-coded here; it must
            match the column count of the feature matrix being fitted.

    Returns:
        A compiled ``Sequential`` model with three ReLU hidden layers
        (200, 50 and 20 units) followed by a single output unit.
    """
    model = Sequential()
    model.add(Dense(200, input_dim=input_dim, activation='relu'))
    model.add(Dense(50, activation='relu'))
    model.add(Dense(20, activation='relu'))
    # One output unit with no activation: the network emits a single value.
    model.add(Dense(1))
    model.compile(loss=['huber_loss'], optimizer='adam', metrics=['mean_absolute_error', 'cosine_proximity'])
    return model

# Wrap the model builder in KerasRegressor so it can be passed to
# fit_test_print like the scikit-learn estimators (fit/predict interface).
neural_net_model = KerasRegressor(
    build_fn=init_keras,
    epochs=50,
    batch_size=32,
    verbose=1,
)

In [None]:
# Train and report the neural net on the income target; grid=0 skips the
# best-parameters line because no grid search is involved.
fit_test_print(
    estimator=neural_net_model,
    estimator_name='Keras Neural Net',
    target='incomes',
    target_name='Income',
    grid=0,
)

In [None]:
# Train and report the neural net on the wage target; grid=0 skips the
# best-parameters line because no grid search is involved.
fit_test_print(
    estimator=neural_net_model,
    estimator_name='Keras Neural Net',
    target='wages',
    target_name='Wage',
    grid=0,
)