In [1]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import OneHotEncoder
import pandas as pd
import tensorflow as tf

# Import our input dataset
df_1991 = pd.read_csv('../data_csv/clean_1991_df.csv')
df_1991.head()

Unnamed: 0.1,Unnamed: 0,hpi_type,hpi_flavor,frequency,level,place_name,place_id,yr,period,index_nsa,index_sa
0,0,traditional,monthly,USA or Census Division,East North Central Division,DV_ENC,1991,purchase-only,1,100.0,100.0
1,1,traditional,monthly,USA or Census Division,East North Central Division,DV_ENC,1991,purchase-only,2,100.92,100.98
2,2,traditional,monthly,USA or Census Division,East North Central Division,DV_ENC,1991,purchase-only,3,101.31,100.93
3,3,traditional,monthly,USA or Census Division,East North Central Division,DV_ENC,1991,purchase-only,4,101.7,101.0
4,4,traditional,monthly,USA or Census Division,East North Central Division,DV_ENC,1991,purchase-only,5,102.32,101.37


In [2]:
# Drop the non-beneficial ID columns
df_1991 = df_1991.drop(columns = ['Unnamed: 0'], axis= 1)
df_1991.head()

Unnamed: 0,hpi_type,hpi_flavor,frequency,level,place_name,place_id,yr,period,index_nsa,index_sa
0,traditional,monthly,USA or Census Division,East North Central Division,DV_ENC,1991,purchase-only,1,100.0,100.0
1,traditional,monthly,USA or Census Division,East North Central Division,DV_ENC,1991,purchase-only,2,100.92,100.98
2,traditional,monthly,USA or Census Division,East North Central Division,DV_ENC,1991,purchase-only,3,101.31,100.93
3,traditional,monthly,USA or Census Division,East North Central Division,DV_ENC,1991,purchase-only,4,101.7,101.0
4,traditional,monthly,USA or Census Division,East North Central Division,DV_ENC,1991,purchase-only,5,102.32,101.37


In [3]:
#check for bucketing
# Generate our categorical variable list
cat_1991 = df_1991.dtypes[df_1991.dtypes == "object"].index.tolist()

# Check the number of unique values in each column
df_1991[cat_1991].nunique()

hpi_type        2
hpi_flavor      2
frequency       3
level         161
place_name    161
yr              2
dtype: int64

In [4]:
#11 uniques values needs to be checked 
# Check the unique value counts to see if binning is required
df_1991.level.value_counts()

East North Central Division    20
South Atlantic Division        20
East South Central Division    20
United States                  20
West North Central Division    20
                               ..
Fresno, CA                      4
Gary, IN (MSAD)                 4
Grand Rapids-Kentwood, MI       4
Greensboro-High Point, NC       4
Worcester, MA-CT                4
Name: level, Length: 161, dtype: int64

In [5]:
#there are a substantial number of datapoints. leave it alone no need to bucket
# Create a OneHotEncoder instance
enc = OneHotEncoder(sparse=False)

# Fit and transform the OneHotEncoder using the categorical variable list
encode_df = pd.DataFrame(enc.fit_transform(df_1991[cat_1991]))

# Add the encoded variable names to the DataFrame
encode_df.columns = enc.get_feature_names(cat_1991)
encode_df.head()

Unnamed: 0,hpi_type_distress-free,hpi_type_traditional,hpi_flavor_monthly,hpi_flavor_quarterly,frequency_MSA,frequency_State,frequency_USA or Census Division,"level_Akron, OH",level_Alabama,level_Alaska,...,place_name_USA,place_name_UT,place_name_VA,place_name_VT,place_name_WA,place_name_WI,place_name_WV,place_name_WY,yr_expanded-data,yr_purchase-only
0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [6]:
# Merge one-hot encoded features and drop the originals
df_1991 = df_1991.merge(encode_df,left_index=True, right_index=True)
df_1991 = df_1991.drop(cat_1991,1)
df_1991.head()

  df_1991 = df_1991.drop(cat_1991,1)


Unnamed: 0,place_id,period,index_nsa,index_sa,hpi_type_distress-free,hpi_type_traditional,hpi_flavor_monthly,hpi_flavor_quarterly,frequency_MSA,frequency_State,...,place_name_USA,place_name_UT,place_name_VA,place_name_VT,place_name_WA,place_name_WI,place_name_WV,place_name_WY,yr_expanded-data,yr_purchase-only
0,1991,1,100.0,100.0,0.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,1991,2,100.92,100.98,0.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,1991,3,101.31,100.93,0.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,1991,4,101.7,101.0,0.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,1991,5,102.32,101.37,0.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [7]:
# Remove loan status target from features data
y = df_1991.hpi_type_traditional
X = df_1991.drop(columns=["index_sa","index_nsa"])

# Split training/test datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify=y)

# Create a StandardScaler instance
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [8]:
# Create a random forest classifier.
rf_model = RandomForestClassifier(n_estimators=128, random_state=78)

# Fitting the model
rf_model = rf_model.fit(X_train_scaled, y_train)

# Evaluate the model
y_pred = rf_model.predict(X_test_scaled)
print(f" Random forest predictive accuracy: {accuracy_score(y_test,y_pred):.3f}")

 Random forest predictive accuracy: 1.000


In [9]:
# Define the model - deep neural net
number_input_features = len(X_train_scaled[0])
hidden_nodes_layer1 = 24
hidden_nodes_layer2 = 12

nn = tf.keras.models.Sequential()

# First hidden layer
nn.add(
    tf.keras.layers.Dense(units=hidden_nodes_layer1, input_dim=number_input_features, activation="relu")
)

# Second hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer2, activation="relu"))


# Output layer
nn.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Compile the Sequential model together and customize metrics
nn.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

# Train the model
fit_model = nn.fit(X_train_scaled, y_train, epochs=50)

# Evaluate the model using the test data
model_loss, model_accuracy = nn.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

2022-05-21 14:44:08.442237: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
10/10 - 0s - loss: 1.2596e-04 - accuracy: 1.0000 - 154ms/epoch - 15ms/step
Loss: 0.00012595854059327394, Accuracy: 1.0
