Random Forest
==========

In [1]:
import pandas as pd
import numpy as np
import os
import sys
import argparse
import warnings

warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=UserWarning)

In [2]:
# Read the dataset from the CSV in the working directory and preview
# its first three rows as the cell output.
data_path = "EBITDA_new.csv"
processed_df = pd.read_csv(data_path)
processed_df.head(n=3)

Unnamed: 0,ticker,fiscalQuarter,commonSharesOutstanding_,commonSharesTraded_,capitalExpenditure_,netIncome_,operatingIncome_,shareholdersEquity_,totalAssets_,EPS_,...,EBITDA,salePrice,realGDPSA,m2SA,m2m1GrowthGap,m2Velocity,primeRate,EBITDA_lag1,year,quarter
0,ACAN,2013Q1,-0.000933,12.247579,0.0,-0.073272,-0.095742,-0.203545,0.0,0.0,...,50.369,13.56496,-0.867729,9.261987,0.172473,1.581,3.25,,2013,1
1,ACAN,2013Q2,-0.000933,12.247579,0.0,-0.07321,-0.095754,-0.203545,0.0,0.0,...,42.617,13.651304,-0.849252,9.272702,-0.64055,1.572,3.25,50.369,2013,2
2,ACAN,2013Q3,-0.000933,12.247579,0.0,-0.073519,-0.095692,-0.203545,0.0,0.0,...,46.147,13.650265,-0.790315,9.286502,-0.328302,1.571,3.25,42.617,2013,3


In [3]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# --- Feature / target selection -------------------------------------------
# The model predicts `EBITDA`. The target itself and the listed identifier /
# label columns are never used as predictors; a name that is absent from the
# frame is simply never matched by the membership test below.
target = 'EBITDA'
non_feature_columns = {'EBITDA', 'ticker', 'companyName', 'fiscalQuarter'}

# Every remaining column becomes a predictor, kept in the frame's column order
# (the feature-importance table later relies on this ordering).
# NOTE(review): `ebitdaGrowth` ends up among the predictors; if it is computed
# from the current quarter's EBITDA it would leak the target — confirm.
features = []
for column_name in processed_df.columns:
    if column_name not in non_feature_columns:
        features.append(column_name)

# --- Missing values ---------------------------------------------------------
# Keep only rows that are complete in every predictor and in the target.
processed_df = processed_df.dropna(subset=[*features, target])

# --- Chronological split ----------------------------------------------------
# Rows with year <= 2020 are used for fitting; rows from later years are held
# out for evaluation.
in_train_period = processed_df['year'] <= 2020
in_test_period = processed_df['year'] > 2020
train_df = processed_df[in_train_period]
test_df = processed_df[in_test_period]

# Design matrices and target vectors for each partition
X_train, y_train = train_df[features], train_df[target]
X_test, y_test = test_df[features], test_df[target]

# --- Model ------------------------------------------------------------------
# 100 trees with a fixed seed so repeated runs give the same forest.
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# --- Evaluation on the held-out years ---------------------------------------
y_pred = rf_model.predict(X_test)
mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R^2 Score: {r2}")

Mean Squared Error: 475.18480204598814
R^2 Score: 0.9217680048739663


In [4]:
# Importance score of each predictor, as reported by the fitted forest.
# The scores are positionally aligned with the `features` list.
feature_importances = rf_model.feature_importances_

# Pair every score with its column name, then rank from most to least
# important; the original positional index is kept so each row can be traced
# back to its position in `features`.
importance_table = pd.DataFrame({'Feature': features, 'Importance': feature_importances})
feature_importance_df = importance_table.sort_values(by='Importance', ascending=False)

print(feature_importance_df)

                     Feature  Importance
22               EBITDA_lag1    0.927436
0   commonSharesOutstanding_    0.028308
2        capitalExpenditure_    0.005879
14                       mva    0.004161
12             revenueGrowth    0.004019
9          debtToTotalAssets    0.003804
6               totalAssets_    0.003741
5        shareholdersEquity_    0.002462
1        commonSharesTraded_    0.002274
3                 netIncome_    0.001886
7                       EPS_    0.001753
15              ebitdaGrowth    0.001743
20                m2Velocity    0.001369
18                      m2SA    0.001331
19             m2m1GrowthGap    0.001250
17                 realGDPSA    0.001129
16                 salePrice    0.001025
13           netIncomeGrowth    0.000966
4           operatingIncome_    0.000961
10                 niToAsset    0.000916
11                       ROA    0.000885
24                   quarter    0.000880
8           costOfGoodsSold_    0.000814
21              