In [5]:
# Import the required modules

import pandas as pd
pd.set_option('display.float_format', lambda x: '%.3f' % x)

import numpy as np
import scipy as sp

import yaml
import matplotlib.pyplot as plt

from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error
from sklearn.cluster import MiniBatchKMeans, KMeans
from sklearn.metrics.pairwise import pairwise_distances_argmin


In [6]:
# Read in the cleaned and joined data
processed_df = pd.read_csv('bin/cleaned_and_joined.csv')
processed_df.shape

(50542, 1696)

In [None]:
# # Load the labels of each variable into a dictionary
# # https://www.fhfa.gov/sites/default/files/2024-06/nsmo_v50_labels.sas

# with open('variable_labels.yaml', 'r', encoding='utf-8') as file:
#     variable_labels_dict = yaml.safe_load(file)

# # Load the the format of each variable into a dictionary
# # https://www.fhfa.gov/sites/default/files/2024-06/nsmo_v50_labels.sas

# with open('variable_formats.yaml', 'r', encoding='utf-8') as file:
#     variable_formats_dict = yaml.safe_load(file)

# # Load the categories for every categorical variable (exclude null categories) into a dictionary
# # https://www.fhfa.gov/sites/default/files/2024-06/nsmo_v50_formats.sas

# with open('categorical_variables_categories.yaml', 'r', encoding='utf-8') as file:
#     categorical_variables_categories_dict = yaml.safe_load(file)

# # Create a set of all variable formats

# variable_formats_set = set(variable_formats_dict.values())

# # Create a list of the categorical variables and a list of the numeric variables

# categorical_variables = []
# numeric_variables = []

# categorical_variable_formats = set(categorical_variables_categories_dict.keys())
# numeric_variable_formats = variable_formats_set - categorical_variable_formats

# for col in processed_df.columns:
#     if variable_formats_dict[col] in categorical_variable_formats:
#         categorical_variables.append(col)
#     elif variable_formats_dict[col] in numeric_variable_formats:
#         numeric_variables.append(col) 
#     else:
#         print("Error in bifurcation")

In [8]:
target_variable = 'Beta_winsorized'

In [9]:
# List variables to be excluded

exclude_variables = ['nsmoid',          # NSMO Identification Number
                     'survey_wave',     # NSMO Survey Wave (Quarterly)
                     'analysis_weight', # NSMO Analysis Weight (Sampling Weight x Non-response Adjustment)
                     'rate_spread',     # Mortgage Interest Rate Spread at Origination (Percent)
                     'pmms',            # Freddie Mac's Primary Mortgage Market Survey (PMMS) Rate at Origination (Percent)
                     'DGS30',           # Market Yield on U.S. Treasury Securities at 30-Year Constant Maturity, Quoted on an Investment Basis
                     'Beta'             # Original Beta before it was winsorized
                    ]

In [10]:
# Append the new numeric variables to the list of all numeric variables

numeric_variables.append('DGS30')
numeric_variables.append('Beta')
numeric_variables.append('Beta_winsorized')

NameError: name 'numeric_variables' is not defined

In [None]:
# Create a complete list of all excluded variables and exclude them from the list of numeric variables
# Updated this cell from the previous workbook to utilize the new cleaned and joined dataset

list_of_excluded_variables = list([target_variable]) + exclude_variables
print("Excluded variables:", list_of_excluded_variables)
print("Numeric variables before exclusion:", numeric_variables)

# Filter out excluded variables using list comprehension
numeric_variables = [var for var in numeric_variables if var not in list_of_excluded_variables]

print("Numeric variables after exclusion:", numeric_variables)

In [None]:
# Calculate the percentage of missing values in the target variable

sum_of_missing_values_in_target = processed_df[target_variable].isna().sum()
print("sum of missing values in target: ", sum_of_missing_values_in_target, 
      "\ntotal observations in data: ", processed_df.shape[0], 
      "\npercentage of target with missing values: ", round(sum_of_missing_values_in_target / processed_df.shape[0] * 100,1), "%")

In [None]:
# Drop observations in the data where the target variable has missing values

processed_df = processed_df.dropna(subset=target_variable)

In [None]:
# Segregate the predictor variables from the target variable

X = processed_df.drop(columns=list_of_excluded_variables)
y = processed_df[target_variable]

print(f"X shape: {X.shape}")
print(f"y shape: {y.shape}")


In [None]:
# Split the data into training and testing partitions

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.80, random_state=0)

print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"y_test shape: {y_test.shape}")


In [None]:
# Additional debugging to understand the data
print(f"X_train shape: {X_train.shape}")
print(f"X_train columns: {X_train.columns.tolist()}")
print(f"X_train dtypes:\n{X_train.dtypes}")

# Check what variables were excluded
print(f"\nTarget variable: {target_variable}")
print(f"Exclude variables: {exclude_variables}")
print(f"List of excluded variables: {list_of_excluded_variables}")

In [None]:
# Impute missing values for the numeric variables using the mean values

# Instantiate an imputer object
imputer = SimpleImputer(strategy='mean')
print(len(numeric_variables))

# Prevent "data leakage" by fitting the imputer on just the training data, and then apply it to the holdout testing data
# imputer.fit(X_train[numeric_variables])
# print(f"Imputer fitted for {len(numeric_variables)} numeric variables")

# # Transform both the training and testing data using the imputer fitted on just the training data
# X_train[numeric_variables] = imputer.transform(X_train[numeric_variables])
# X_test[numeric_variables] = imputer.transform(X_test[numeric_variables])

# print(f"Imputation completed for {len(numeric_variables)} numeric variables")
# print(f"X_train shape: {X_train.shape}")
# print(f"X_test shape: {X_test.shape}")


In [None]:
# Scale values for the numeric variables

# Instantiate a scaler object
scaler = StandardScaler()

# Prevent "data leakage" by fitting the scaler on just the imputed training data, and then apply it to the holdout testing data
scaler.fit(X_train[numeric_variables])

# Scale both the training data and testing data using the scaler fitted on just the training data
X_train[numeric_variables] = scaler.transform(X_train[numeric_variables])
X_test[numeric_variables] = scaler.transform(X_test[numeric_variables])