<a href="https://colab.research.google.com/github/RishabhKedia10/trackingRecession/blob/main/notebooks/Random-Forest.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd

# Load the recession dataset from a path relative to the working directory
# (presumably a mounted Google Drive in Colab -- confirm before running elsewhere).
df = pd.read_csv(filepath_or_buffer='drive/MyDrive/recession/recession_data.csv')

# Show (rows, columns) as a quick sanity check of the load.
df.shape

(787, 23)

In [None]:
# Rank the candidate features by how strongly they correlate with 'Recession'
# and keep the 12 strongest ones.
#
# NOTE(review): the correlations are computed on the full dataset, before the
# train/test split performed in a later cell; consider ranking on the training
# rows only so the test rows do not influence feature selection.

# Drop the 'Dates' column for the correlation calculation
df_no_dates = df.drop(columns=['Dates'])

# Compute the correlation matrix for the remaining columns
corr_matrix = df_no_dates.corr()

# Signed correlation of every column with 'Recession', highest first
corr_with_recession = corr_matrix['Recession'].sort_values(ascending=False)

# Rank by the *magnitude* of the correlation: a strongly negative correlation
# is as informative as a strongly positive one, whereas a signed descending
# sort would keep weak positive features and discard strong negative ones.
# The target is removed by label rather than by assuming it sits at position 0.
correlation_strength = (
    corr_with_recession
    .drop(labels='Recession')
    .abs()
    .sort_values(ascending=False)
)

# Keep the 12 features with the largest absolute correlation
top_12_features = correlation_strength.head(12).index
print("Top 12 features correlated with Recession:")
top_12_features

Top 12 features correlated with Recession:


Index(['S&P_500_3mo_vs_12mo', 'CPI_12mo_pct_chg', 'Unemployment_Rate_12mo_chg',
       '3M_10Y_Treasury_Spread_12mo_chg', 'CPI_3mo_pct_chg_annualized',
       '10Y_Treasury_Rate_12mo_chg', 'Unemployment_Rate',
       'Real_Fed_Funds_Rate', 'CPI_3mo_vs_12mo',
       'Real_Fed_Funds_Rate_12mo_chg', '3M_10Y_Treasury_Spread',
       '3M_Treasury_Rate_12mo_chg'],
      dtype='object')

In [None]:
from sklearn.model_selection import train_test_split

# Target column and the feature matrix restricted to the 12 selected columns.
y = df['Recession']
X = df.loc[:, top_12_features]

# Hold out 25% of the rows for testing; the fixed seed makes the split repeatable.
# NOTE(review): the split shuffles rows by default, and the data carries a
# 'Dates' column -- confirm that a random (non-chronological) split is intended.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

# Report how many rows ended up in each partition.
print(f'Training set size: {X_train.shape[0]} rows')
print(f'Test set size: {X_test.shape[0]} rows')

Training set size: 590 rows
Test set size: 197 rows


## 1. Row Sampling

In [None]:
# "max_samples" is left at its default here, so each tree's bootstrap sample is as large as the full training set (rows are drawn with replacement, so some repeat and some are left out).

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Row-sampling (bagging) forest: 13 decision trees, each fitted on a bootstrap
# sample of the training rows, with all features considered at every split.
rf_model = RandomForestClassifier(
    random_state=42,
    bootstrap=True,
    max_features=None,
    n_estimators=13,
)

# Fit on the training split, then predict labels for the held-out rows.
rf_model.fit(X_train, y_train)
y_pred = rf_model.predict(X_test)

In [None]:
# Display the labels predicted by the row-sampling forest for the test rows.
y_pred

array([0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [None]:
from sklearn.metrics import accuracy_score
# Share of test rows whose predicted label equals the true label.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
accuracy

0.9644670050761421

## 2. Feature Sampling

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Feature-sampling forest: `max_features` limits how many candidate features
# are examined at each split, and with bootstrapping disabled every tree is
# fitted on the whole training set.
rf_feature_sampling_model = RandomForestClassifier(
    random_state=42,
    bootstrap=False,  # no row sampling, only feature sampling
    max_features=5,   # 5 candidate features per split
    n_estimators=13,  # 13 decision trees
)

# Fit on the training split, then predict labels for the held-out rows.
rf_feature_sampling_model.fit(X_train, y_train)
y_pred_feature_sampling = rf_feature_sampling_model.predict(X_test)

In [None]:
# Display the labels predicted by the feature-sampling forest for the test rows.
y_pred_feature_sampling

array([0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [None]:
from sklearn.metrics import accuracy_score
# Share of test rows the feature-sampling forest labelled correctly.
accuracy_feature_sampling = accuracy_score(
    y_true=y_test,
    y_pred=y_pred_feature_sampling,
)
accuracy_feature_sampling

# print(f'Accuracy with Feature Sampling: {accuracy_feature_sampling:.2f}')

0.9746192893401016

## 3. Combined Sampling

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Combined forest: each tree is fitted on a bootstrap sample of the training
# rows, and only a subset of the features is examined at each split.
rf_combined_sampling_model = RandomForestClassifier(
    random_state=42,
    bootstrap=True,   # row sampling (bootstrapping)
    max_features=5,   # 5 candidate features per split
    n_estimators=13,  # 13 decision trees
)

# Fit on the training split, then predict labels for the held-out rows.
rf_combined_sampling_model.fit(X_train, y_train)
y_pred_combined_sampling = rf_combined_sampling_model.predict(X_test)

In [None]:
# Display the labels predicted by the combined-sampling forest for the test rows.
y_pred_combined_sampling

array([0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [None]:
from sklearn.metrics import accuracy_score
# Share of test rows the combined-sampling forest labelled correctly.
accuracy_combined_sampling = accuracy_score(
    y_true=y_test,
    y_pred=y_pred_combined_sampling,
)
accuracy_combined_sampling

0.9746192893401016