<a href="https://colab.research.google.com/github/RyakDL/Project-Loan-Charge-Offs/blob/main/RandomForests_final_v2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Import Dependencies
import pandas as pd
from pathlib import Path
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
%matplotlib inline

In [2]:
# Import DataFrame
main_df = pd.read_csv('Rates_MO.csv')
main_df.columns

Index(['Unnamed: 0', 'observation_date', 'C&I_DELNQ', 'CCARD_CO',
       'CCARD_DELNQ', 'CORP_DEBT_NET_WORTH', 'CORP_SAVINGS_LEVEL', 'CRE_CO',
       'CRE_DELNQ', 'GDP', 'Homeowner_Vacancy_rate', 'Household_DBT_Inc',
       'Mortgage_CO', 'Mortgage_DELNQ', 'Rental_Vacancy_Rate',
       'Consumer_Confidence', 'FEDFUNDS', 'Manufacturing_Confidence',
       'SAVINGS_RATE_MO', 'UNRATE', 'C&I_CO'],
      dtype='object')

In [3]:
# Create the group datasets: credit cards, mortgage loans, C&I loans, CRE loans
ccard_df = main_df[['observation_date', 'CCARD_CO', 'CCARD_DELNQ', 'GDP', 'Household_DBT_Inc', 'Consumer_Confidence', 'FEDFUNDS', 'SAVINGS_RATE_MO', 'UNRATE']]
mort_df = main_df[['observation_date', 'Mortgage_CO', 'Mortgage_DELNQ', 'GDP', 'Household_DBT_Inc', 'Consumer_Confidence', 'FEDFUNDS', 'SAVINGS_RATE_MO', 'UNRATE']]
CI_df = main_df[['observation_date', 'C&I_CO', 'C&I_DELNQ', 'GDP', 'CORP_DEBT_NET_WORTH', 'Manufacturing_Confidence', 'FEDFUNDS', 'CORP_SAVINGS_LEVEL', 'UNRATE']]
CRE_df = main_df[['observation_date', 'CRE_CO', 'CRE_DELNQ', 'Rental_Vacancy_Rate', 'GDP', 'CORP_DEBT_NET_WORTH', 'Manufacturing_Confidence', 'FEDFUNDS', 'CORP_SAVINGS_LEVEL', 'UNRATE']]


**GROUP 1: Credit Cards**

In [4]:
# Inspect data types
ccard_bin_df = ccard_df.copy()
ccard_bin_df.dtypes

observation_date        object
CCARD_CO               float64
CCARD_DELNQ            float64
GDP                    float64
Household_DBT_Inc      float64
Consumer_Confidence    float64
FEDFUNDS               float64
SAVINGS_RATE_MO        float64
UNRATE                 float64
dtype: object

In [5]:
# Binning process
# Binning the data for classification Question: should we be using pd cut to get more bins?
ccard_bin_df["CCARD_CO_BIN"] = pd.qcut(ccard_df['CCARD_CO'],4, labels= [1, 2, 3, 4])
ccard_bin_df["CCARD_DELNQ_BIN"] = pd.qcut(ccard_df['CCARD_DELNQ'],4, labels= ['low', 'medium-low', 'medium-high', 'high'])
ccard_bin_df["GDP_BIN"] = pd.qcut(ccard_df['GDP'],4, labels= ['low', 'medium-low', 'medium-high', 'high'])
ccard_bin_df["Household_DBT_Inc_BIN"] = pd.qcut(ccard_df['Household_DBT_Inc'],4, labels= ['low', 'medium-low', 'medium-high', 'high'])
ccard_bin_df["Consumer_Confidence_BIN"] = pd.qcut(ccard_df['Consumer_Confidence'],4, labels= ['low', 'medium-low', 'medium-high', 'high'])
ccard_bin_df["FEDFUNDS_BIN"] = pd.qcut(ccard_df['FEDFUNDS'],4, labels= ['low', 'medium-low', 'medium-high', 'high'])
ccard_bin_df["SAVINGS_RATE_MO_BIN"] = pd.qcut(ccard_df['SAVINGS_RATE_MO'],4, labels= ['low', 'medium-low', 'medium-high', 'high'])
ccard_bin_df["UNRATE_BIN"] = pd.qcut(ccard_df['UNRATE'],4, labels= ['low', 'medium-low', 'medium-high', 'high'])
ccard_bin_df

Unnamed: 0,observation_date,CCARD_CO,CCARD_DELNQ,GDP,Household_DBT_Inc,Consumer_Confidence,FEDFUNDS,SAVINGS_RATE_MO,UNRATE,CCARD_CO_BIN,CCARD_DELNQ_BIN,GDP_BIN,Household_DBT_Inc_BIN,Consumer_Confidence_BIN,FEDFUNDS_BIN,SAVINGS_RATE_MO_BIN,UNRATE_BIN
0,1991-01-01,4.16,5.26,-1.9,11.578032,66.8,6.91,9.4,6.4,3,high,low,medium-high,low,high,high,medium-high
1,1991-02-01,4.16,5.26,-1.9,11.578032,70.4,6.25,9.0,6.6,3,high,low,medium-high,low,high,high,medium-high
2,1991-03-01,4.16,5.26,-1.9,11.578032,87.7,6.12,8.1,6.8,3,high,low,medium-high,medium-low,high,high,high
3,1991-04-01,4.60,5.48,3.2,11.434237,81.8,5.91,8.7,6.7,3,high,medium-high,medium-high,medium-low,high,high,medium-high
4,1991-05-01,4.60,5.48,3.2,11.434237,78.3,5.78,8.5,6.9,3,high,medium-high,medium-high,medium-low,high,high,high
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
385,2023-02-01,2.89,2.43,2.2,9.848832,67.0,4.57,4.7,3.6,1,low,medium-low,low,low,medium-high,low,low
386,2023-03-01,2.89,2.43,2.2,9.848832,62.0,4.65,5.2,3.5,1,low,medium-low,low,low,medium-high,medium-low,low
387,2023-04-01,3.15,2.77,2.1,9.826692,63.5,4.83,5.2,3.4,1,medium-low,medium-low,low,low,medium-high,medium-low,low
388,2023-05-01,3.15,2.77,2.1,9.826692,59.2,5.06,5.3,3.7,1,medium-low,medium-low,low,low,high,medium-low,low


Obtain Scaled/Classified Target-Features and Run Random Forest

In [6]:
# Initialize the target variable (same target for both continuous and classified features)
y_ccard = ccard_bin_df["CCARD_CO_BIN"].astype(int).values
y_ccard[:5]

array([3, 3, 3, 3, 3])

In [7]:
## Obtain continous features
ccard_bin_data = ccard_bin_df.drop(columns=["observation_date", "CCARD_CO", "CCARD_DELNQ", "GDP", "Household_DBT_Inc", "Consumer_Confidence", "FEDFUNDS", "SAVINGS_RATE_MO", "UNRATE", "CCARD_CO_BIN"])
X_ccard = pd.get_dummies(ccard_bin_data)

Run the Random Forest Model

In [8]:
## Splitting into Train and Test Sets
X1_train, X1_test, y1_train, y1_test = train_test_split(X_ccard, y_ccard, random_state=78)

In [9]:
# Create the StandardScaler instance
scaler1 = StandardScaler()

# fit data for both types
X1_scaler = scaler1.fit(X1_train)

# Scaling data
X1_train_scaled = X1_scaler.transform(X1_train)
X1_test_scaled = X1_scaler.transform(X1_test)


In [10]:
# Create a random forest classifier
cc_model_class = RandomForestClassifier(n_estimators=100, random_state=78)

# fit the model
# cc_model = cc_model.fit(X1_train, y1_train)
cc_model_class = cc_model_class.fit(X1_train_scaled, y1_train)

# Make predictions based on the model's testing data
cc_predictions_class = cc_model_class.predict(X1_test_scaled)


Analyze Results

In [11]:
# Confusion Matrix
# cm1 = confusion_matrix(y1_test, cc_predictions)
cm1_class = confusion_matrix(y1_test, cc_predictions_class)
cm_class_df1 = pd.DataFrame(
    cm1_class, index=["Actual Low", "Actual Medium-Low", "Actual Medium-High", "High"], columns=["Predicted Low", "Predicted Medium-Low", "Predicted Medium-High", "Predicted High"]
)

# Accuracy score
acc_score1_class = accuracy_score(y1_test, cc_predictions_class)

In [12]:
# Displaying Class Feature results
print("Confusion Matrix")
display(cm_class_df1)
print(f"Accuracy Score : {acc_score1_class}")
print("Classification Report")
print(classification_report(y1_test, cc_predictions_class))

Confusion Matrix


Unnamed: 0,Predicted Low,Predicted Medium-Low,Predicted Medium-High,Predicted High
Actual Low,24,1,0,0
Actual Medium-Low,1,19,4,0
Actual Medium-High,0,0,21,3
High,0,1,1,23


Accuracy Score : 0.8877551020408163
Classification Report
              precision    recall  f1-score   support

           1       0.96      0.96      0.96        25
           2       0.90      0.79      0.84        24
           3       0.81      0.88      0.84        24
           4       0.88      0.92      0.90        25

    accuracy                           0.89        98
   macro avg       0.89      0.89      0.89        98
weighted avg       0.89      0.89      0.89        98



Run the Logistic Regression Model

In [13]:
# Create the logisitc regression instance
from sklearn.linear_model import LogisticRegression
# classifier1 = LogisticRegression(solver='lbfgs', random_state=1)
classifier1 = LogisticRegression(solver='lbfgs', random_state=1)

# Fit the model
classifier1.fit(X1_train, y1_train)

In [14]:
# Predict outcomes for test data set
cc_predictions_reg = classifier1.predict(X1_test)
pd.DataFrame({"Prediction": cc_predictions_reg, "Actual": y1_test})

Unnamed: 0,Prediction,Actual
0,1,1
1,4,3
2,4,4
3,4,4
4,4,4
...,...,...
93,1,1
94,1,1
95,4,3
96,1,1


In [15]:
# Display the accuracy score for the test dataset.
accuracy_score(y1_test, cc_predictions_reg)

0.8061224489795918

Run the Deep Learning Model

In [17]:
import tensorflow as tf

# Define the model - deep neural net, i.e., the number of input features and hidden nodes for each layer.
number_input_features = len(X1_train_scaled)
hidden_nodes_layers1 = 100
hidden_nodes_layers2 = 100
hidden_nodes_layers3 = 100

nn = tf.keras.models.Sequential()

# First hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layers1, input_dim=28, activation="relu"))

# Second hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layers2, activation="relu"))

# Third hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layers3, activation="relu"))

# Output layer
nn.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Check the structure of the model
nn.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 100)               2900      
                                                                 
 dense_1 (Dense)             (None, 100)               10100     
                                                                 
 dense_2 (Dense)             (None, 100)               10100     
                                                                 
 dense_3 (Dense)             (None, 1)                 101       
                                                                 
Total params: 23201 (90.63 KB)
Trainable params: 23201 (90.63 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [18]:
# Compile then train the model
nn.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

fit_nn = nn.fit(X1_train_scaled, y1_train, epochs=100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [21]:
# Evaluate the model
cc_model_loss, cc_model_accuracy = nn.evaluate(X1_test_scaled, y1_test,verbose=2)
print(f"Loss: {cc_model_loss}, Accuracy: {cc_model_accuracy}")

4/4 - 0s - loss: -5.8761e+07 - accuracy: 0.2551 - 50ms/epoch - 13ms/step
Loss: -58760572.0, Accuracy: 0.2551020383834839
