In [1]:
# Import required modules

import pandas as pd
import numpy as np
import yaml

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error


In [2]:
# Import raw data

raw_df = pd.read_csv('nsmo_v50_1321_puf.csv')
raw_df.shape

(50542, 543)

In [3]:
# Load YAML files containing metadata into Python as dictionaries

# Load the variable labels
with open('variable_labels.yaml', 'r') as file:
    variable_labels_dict = yaml.safe_load(file)
    
# Load the the format of each variable into a dictionary
with open('variable_formats.yaml', 'r') as file:
    variable_formats_dict = yaml.safe_load(file)
    
# Load the categories for every categorical variable (exclude null categories)
with open('categorical_variables_categories.yaml', 'r') as file:
    categorical_variables_categories_dict = yaml.safe_load(file)

In [4]:
# Clean data by converting negative values and "." values (representing missing values) into null values

for col in raw_df.columns:
    # Exclude the Mortgage Performance Status variables because they have letters representing specific categories
    if variable_formats_dict[col] != 'PSTATFM':
        raw_df.loc[raw_df[col] < 0, col] = np.nan
        raw_df.loc[raw_df[col] == ".", col] = np.nan

In [5]:
# Check out a few obs after data cleaning

raw_df.tail()

Unnamed: 0,nsmoid,survey_wave,analysis_weight,x05a,x05b,x05c,x05d,x05e,x05f,x05g,...,mtmltv0621,mtmltv0921,mtmltv1221,mtmltv0322,mtmltv0622,mtmltv0922,mtmltv1222,mtmltv0323,mtmltv0623,mtmltv0923
50537,531289.0,34.0,2117.79,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,,,64.0,61.0,59.0,59.0,59.0,59.0,59.0,60.0
50538,546643.0,34.0,1738.92,3.0,3.0,2.0,2.0,2.0,1.0,3.0,...,,,79.0,77.0,74.0,72.0,72.0,71.0,71.0,71.0
50539,512993.0,34.0,2353.26,1.0,2.0,2.0,2.0,2.0,1.0,2.0,...,,,95.0,91.0,88.0,85.0,84.0,84.0,83.0,82.0
50540,518631.0,34.0,5283.75,3.0,3.0,3.0,3.0,3.0,3.0,3.0,...,,,56.0,53.0,50.0,49.0,49.0,49.0,48.0,48.0
50541,544740.0,34.0,1738.92,1.0,1.0,1.0,1.0,1.0,1.0,2.0,...,,,80.0,74.0,69.0,66.0,65.0,64.0,63.0,63.0


In [6]:
# Create a set of all variable formats

variable_formats_set = set(variable_formats_dict.values())

In [7]:
# Create a list of the categorical variables and a list of the numeric variables

categorical_variables = []
numeric_variables = []

categorical_variable_formats = set(categorical_variables_categories_dict.keys())
numeric_variable_formats = variable_formats_set - categorical_variable_formats

for col in raw_df.columns:
    if variable_formats_dict[col] in categorical_variable_formats:
        categorical_variables.append(col)
    elif variable_formats_dict[col] in numeric_variable_formats:
        numeric_variables.append(col)
    else:
        print("Error in bifurcation")

In [8]:
raw_df.tail()

Unnamed: 0,nsmoid,survey_wave,analysis_weight,x05a,x05b,x05c,x05d,x05e,x05f,x05g,...,mtmltv0621,mtmltv0921,mtmltv1221,mtmltv0322,mtmltv0622,mtmltv0922,mtmltv1222,mtmltv0323,mtmltv0623,mtmltv0923
50537,531289.0,34.0,2117.79,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,,,64.0,61.0,59.0,59.0,59.0,59.0,59.0,60.0
50538,546643.0,34.0,1738.92,3.0,3.0,2.0,2.0,2.0,1.0,3.0,...,,,79.0,77.0,74.0,72.0,72.0,71.0,71.0,71.0
50539,512993.0,34.0,2353.26,1.0,2.0,2.0,2.0,2.0,1.0,2.0,...,,,95.0,91.0,88.0,85.0,84.0,84.0,83.0,82.0
50540,518631.0,34.0,5283.75,3.0,3.0,3.0,3.0,3.0,3.0,3.0,...,,,56.0,53.0,50.0,49.0,49.0,49.0,48.0,48.0
50541,544740.0,34.0,1738.92,1.0,1.0,1.0,1.0,1.0,1.0,2.0,...,,,80.0,74.0,69.0,66.0,65.0,64.0,63.0,63.0


In [9]:
# Check out a few obs for just the categorical variable

raw_df[categorical_variables].tail()

Unnamed: 0,x05a,x05b,x05c,x05d,x05e,x05f,x05g,x06,x07,x08a,...,forb0621,forb0921,forb1221,forb0322,forb0622,forb0922,forb1222,forb0323,forb0623,forb0923
50537,1.0,1.0,1.0,1.0,1.0,1.0,1.0,3.0,2.0,1.0,...,,,,,2.0,2.0,2.0,2.0,2.0,2.0
50538,3.0,3.0,2.0,2.0,2.0,1.0,3.0,3.0,3.0,2.0,...,,,,2.0,2.0,2.0,2.0,2.0,2.0,2.0
50539,1.0,2.0,2.0,2.0,2.0,1.0,2.0,3.0,2.0,1.0,...,,,,,2.0,2.0,2.0,2.0,2.0,2.0
50540,3.0,3.0,3.0,3.0,3.0,3.0,3.0,1.0,1.0,1.0,...,,,,,2.0,2.0,2.0,2.0,2.0,2.0
50541,1.0,1.0,1.0,1.0,1.0,1.0,2.0,3.0,2.0,1.0,...,,,,2.0,2.0,2.0,2.0,2.0,2.0,2.0


In [10]:
# Check out a few obs for just the numeric variable

raw_df[numeric_variables].tail()

Unnamed: 0,nsmoid,survey_wave,analysis_weight,x74r,rate_spread,pmms,term,ltv,cltv,dti,...,mtmltv0621,mtmltv0921,mtmltv1221,mtmltv0322,mtmltv0622,mtmltv0922,mtmltv1222,mtmltv0323,mtmltv0623,mtmltv0923
50537,531289.0,34.0,2117.79,57.0,0.64,3.11,40.0,64.0,64.0,42.0,...,,,64.0,61.0,59.0,59.0,59.0,59.0,59.0,60.0
50538,546643.0,34.0,1738.92,37.0,0.03,3.1,30.0,79.0,79.0,33.0,...,,,79.0,77.0,74.0,72.0,72.0,71.0,71.0,71.0
50539,512993.0,34.0,2353.26,26.0,,3.1,30.0,95.0,95.0,35.0,...,,,95.0,91.0,88.0,85.0,84.0,84.0,83.0,82.0
50540,518631.0,34.0,5283.75,36.0,,3.1,20.0,56.0,56.0,46.0,...,,,56.0,53.0,50.0,49.0,49.0,49.0,48.0,48.0
50541,544740.0,34.0,1738.92,42.0,0.08,3.05,30.0,80.0,80.0,20.0,...,,,80.0,74.0,69.0,66.0,65.0,64.0,63.0,63.0


In [11]:
# View survey answers for any given observation in a human readable format using the YAML metadata

if False:
    one_obs = raw_df.iloc[50541]
    # Loop through all columns for one obs
    for col, value in one_obs.items():
        # if it's a categorical variable, then look up the category
        if not(pd.isna(value)) and col in categorical_variables:
            print(variable_labels_dict[col], ":", categorical_variables_categories_dict[variable_formats_dict[col]][value])
        # else it's a numeric variable or null
        else:
            print(variable_labels_dict[col], ":", value)

In [12]:
# Create dummy variables for each category for each categorical variable

processed_df = pd.get_dummies(raw_df, columns=categorical_variables)

In [13]:
# Remove the ".0" in many of the dummy variable due to the columns in the raw data being floats

new_columns_list = []
for col in processed_df.columns:
    new_col = col.replace(".0", "")
    new_columns_list.append(new_col)
    
processed_df.columns = new_columns_list

In [14]:
# Retrieve the names of the new categorical variables (i.e., the dummy variables)

new_categorical_variables = []
for col in processed_df.columns:
    if col not in numeric_variables:
        new_categorical_variables.append(col)

In [15]:
# Identify the target variable

target_variable = 'rate_spread'

In [16]:
# List the NSMO variables to be excluded

list_of_NSMO_variables = ['nsmoid',         # NSMO Identification Number
                          'survey_wave',    # NSMO Survey Wave (Quarterly)
                          'analysis_weight' # NSMO Analysis Weight (Sampling Weight x Non-response Adjustment)
                         ]

In [17]:
# Create a list of all excluded variables

list_of_excluded_variables = list([target_variable]) + list_of_NSMO_variables
print(list_of_excluded_variables)

['rate_spread', 'nsmoid', 'survey_wave', 'analysis_weight']


In [18]:
# Segregate the predictor variables from the target variable

X = processed_df.drop(columns=list_of_excluded_variables)
y = processed_df[target_variable]

In [19]:
# Impute missing values just for the target variable

y = y.fillna(y.mean())

In [20]:
# Split the data into training and testing partitions

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.80, random_state=0)

In [21]:
# Fit a Histogram-based Gradient Boosting Regression Tree on the training partition
# https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.HistGradientBoostingRegressor.html

params = {'max_depth': 4, 'learning_rate': 0.01, 'random_state':0}
regr = HistGradientBoostingRegressor(**params)
regr.fit(X_train, y_train)

In [22]:
# Compute the performance metrics on the holdout testing partition

mae = mean_absolute_error(y_test, regr.predict(X_test))
mse = mean_squared_error(y_test, regr.predict(X_test))
rmse = np.sqrt(mse)
print("Mean Absolute Error: ", mae)
print("Root Mean Squared Error: ", rmse)

Mean Absolute Error:  0.21811575627492227
Root Mean Squared Error:  0.42845198444621235
