<a href="https://colab.research.google.com/github/Ray7788/FT5005-Group6/blob/main/Cleaned%20Data/classicML.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [24]:
from google.colab import drive
drive.mount('/content/drive')
%cd /content/drive/MyDrive/Colab\ Notebooks/FT5005/Cleaned\ Data/

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive/Colab Notebooks/FT5005/Cleaned Data


In [25]:
import pandas as pd
import numpy as np
import os
import sys
import argparse
import warnings

warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=UserWarning)

In [26]:
processed_df = pd.read_csv("EBITDA_new.csv")
processed_df.head(3)


# Set up prediction target, either EBITDA or revenue_
target = 'EBITDA'
# target = 'revenue_' # must include "_"

# Implement LightGBM

In [27]:
from sklearn.metrics import mean_squared_error, r2_score
import lightgbm as lgb

# Select features and target variable
# features exclude:
features = [col for col in processed_df.columns if col not in [ target, 'fiscalQuarter']
]

# Remove missing values
processed_df = processed_df.dropna(subset=features + [target])

# Split the dataset based on year

train_df = processed_df[processed_df['year'] <= 2020]
test_df = processed_df[processed_df['year'] > 2020]

# Update train and test sets
X_train = train_df[features]
y_train = train_df[target]
X_test = test_df[features]
y_test = test_df[target]
# Create LightGBM datasets
train_data = lgb.Dataset(X_train, label=y_train)
test_data = lgb.Dataset(X_test, label=y_test, reference=train_data)

# Define parameters
params = {
    'objective': 'regression',
    'metric': 'rmse',
    'boosting_type': 'gbdt',
    'learning_rate': 0.1,
    'num_leaves': 31,
    'max_depth': -1,
    'verbose': -1
}

# Train the model
lgb_model = lgb.train(params, train_data, valid_sets=[train_data, test_data], num_boost_round=100)

# Make predictions
y_pred = lgb_model.predict(X_test, num_iteration=lgb_model.best_iteration)

# Evaluate the model
rmse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"RMSE: {rmse}")
print(f"R2 Score: {r2}")

RMSE: 208.7180942776796
R2 Score: 0.9676908431145301


# Implement XGBoost

In [28]:
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score


xgb_model = XGBRegressor(
    objective='reg:squarederror',
    learning_rate=0.1,
    max_depth=6,
    n_estimators=1000,
    early_stopping_rounds=50,
    verbosity=1
)

xgb_model.fit(
    X_train, y_train,
    eval_set=[(X_train, y_train), (X_test, y_test)],

)

# prediction
y_pred_xgb = xgb_model.predict(X_test)

# evaluation
rmse_xgb = mean_squared_error(y_test, y_pred_xgb)
r2_xgb = r2_score(y_test, y_pred_xgb)

print(f"XGBoost RMSE: {rmse_xgb}")
print(f"XGBoost R2 Score: {r2_xgb}")

[0]	validation_0-rmse:68.17862	validation_1-rmse:72.97890
[1]	validation_0-rmse:61.60848	validation_1-rmse:66.16905
[2]	validation_0-rmse:55.69505	validation_1-rmse:60.28874
[3]	validation_0-rmse:50.34266	validation_1-rmse:54.88415
[4]	validation_0-rmse:45.51241	validation_1-rmse:50.02833
[5]	validation_0-rmse:41.15149	validation_1-rmse:45.67956
[6]	validation_0-rmse:37.21607	validation_1-rmse:41.90996
[7]	validation_0-rmse:33.66935	validation_1-rmse:38.46768
[8]	validation_0-rmse:30.47401	validation_1-rmse:35.41859
[9]	validation_0-rmse:27.59320	validation_1-rmse:32.67406
[10]	validation_0-rmse:24.98570	validation_1-rmse:30.33502
[11]	validation_0-rmse:22.62754	validation_1-rmse:28.11714
[12]	validation_0-rmse:20.51025	validation_1-rmse:26.17450
[13]	validation_0-rmse:18.59227	validation_1-rmse:24.52615
[14]	validation_0-rmse:16.85720	validation_1-rmse:23.13984
[15]	validation_0-rmse:15.29023	validation_1-rmse:21.86740
[16]	validation_0-rmse:13.87717	validation_1-rmse:20.78890
[17]	va

In [29]:
feature_importances = xgb_model.feature_importances_

# Map feature importances to feature names
feature_importance_df = pd.DataFrame({
    'Feature': features,
    'Importance': feature_importances
}).sort_values(by='Importance', ascending=False)

print(feature_importance_df)

                     Feature  Importance
25               EBITDA_lag1    0.798901
15                       mva    0.146762
1   commonSharesOutstanding_    0.023626
0                     ticker    0.006439
10         debtToTotalAssets    0.006131
7               totalAssets_    0.004494
13             revenueGrowth    0.002906
3        capitalExpenditure_    0.002802
16              ebitdaGrowth    0.001624
19                 realGDPSA    0.001347
18                 salePrice    0.000837
27                   quarter    0.000834
4                 netIncome_    0.000594
22                m2Velocity    0.000519
6        shareholdersEquity_    0.000399
14           netIncomeGrowth    0.000335
11                 niToAsset    0.000306
9           costOfGoodsSold_    0.000276
8                       EPS_    0.000229
12                       ROA    0.000145
21             m2m1GrowthGap    0.000143
5           operatingIncome_    0.000127
2        commonSharesTraded_    0.000105
23              

# Implement Random Forest model

In [30]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Initialize and train the Random Forest model
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Make predictions
y_pred = rf_model.predict(X_test)

# Evaluate model performance
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R^2 Score: {r2}")

Mean Squared Error: 257.67034461470075
R^2 Score: 0.9601131295410656


In [31]:
feature_importances = rf_model.feature_importances_

# Map feature importances to feature names
feature_importance_df = pd.DataFrame({
    'Feature': features,
    'Importance': feature_importances
}).sort_values(by='Importance', ascending=False)

print(feature_importance_df)

                     Feature  Importance
25               EBITDA_lag1    0.853513
15                       mva    0.081705
1   commonSharesOutstanding_    0.022946
7               totalAssets_    0.007860
10         debtToTotalAssets    0.006403
3        capitalExpenditure_    0.004497
13             revenueGrowth    0.003244
5           operatingIncome_    0.002019
16              ebitdaGrowth    0.001534
4                 netIncome_    0.001524
6        shareholdersEquity_    0.001440
27                   quarter    0.001347
12                       ROA    0.001168
11                 niToAsset    0.001130
8                       EPS_    0.001087
2        commonSharesTraded_    0.001035
0                     ticker    0.000950
9           costOfGoodsSold_    0.000935
24                  revenue_    0.000854
17               companyName    0.000784
19                 realGDPSA    0.000707
22                m2Velocity    0.000664
14           netIncomeGrowth    0.000657
21             m