<a href="https://colab.research.google.com/github/Ray7788/FT5005-Group6/blob/main/Cleaned%20Data/classicML_revenue.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')
%cd /content/drive/MyDrive/Colab\ Notebooks/FT5005/Cleaned\ Data/

Mounted at /content/drive
/content/drive/MyDrive/Colab Notebooks/FT5005/Cleaned Data


In [2]:
import pandas as pd
import numpy as np
import os
import sys
import argparse
import warnings

warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=UserWarning)

In [3]:
processed_df = pd.read_csv("EBITDA_new.csv")
processed_df.head(3)


# Set up prediction target, either EBITDA or revenue_
# target = 'EBITDA'
target = 'revenue_' # must include "_"

# Implement LightGBM

In [4]:
from sklearn.metrics import mean_squared_error, r2_score
import lightgbm as lgb

# Select features and target variable
# features exclude:
features = [col for col in processed_df.columns if col not in [ target, 'fiscalQuarter']
]

# Remove missing values
processed_df = processed_df.dropna(subset=features + [target])

# Split the dataset based on year

train_df = processed_df[processed_df['year'] <= 2020]
test_df = processed_df[processed_df['year'] > 2020]

# Update train and test sets
X_train = train_df[features]
y_train = train_df[target]
X_test = test_df[features]
y_test = test_df[target]
# Create LightGBM datasets
train_data = lgb.Dataset(X_train, label=y_train)
test_data = lgb.Dataset(X_test, label=y_test, reference=train_data)

# Define parameters
params = {
    'objective': 'regression',
    'metric': 'rmse',
    'boosting_type': 'gbdt',
    'learning_rate': 0.1,
    'num_leaves': 31,
    'max_depth': -1,
    'verbose': -1
}

# Train the model
lgb_model = lgb.train(params, train_data, valid_sets=[train_data, test_data], num_boost_round=100)

# Make predictions
y_pred = lgb_model.predict(X_test, num_iteration=lgb_model.best_iteration)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print(f"RMSE: {rmse}")
print(f"R2 Score: {r2}")

RMSE: 16.964519738548105
R2 Score: 0.9989434129086335


# Implement XGBoost

In [5]:
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score


xgb_model = XGBRegressor(
    objective='reg:squarederror',
    learning_rate=0.1,
    max_depth=6,
    n_estimators=1000,
    early_stopping_rounds=50,
    verbosity=1
)

xgb_model.fit(
    X_train, y_train,
    eval_set=[(X_train, y_train), (X_test, y_test)],

)

# prediction
y_pred_xgb = xgb_model.predict(X_test)

# evaluation
mse_xgb = mean_squared_error(y_test, y_pred_xgb)
rmse_xgb = np.sqrt(mse_xgb)
r2_xgb = r2_score(y_test, y_pred_xgb)

print(f"XGBoost RMSE: {rmse_xgb}")
print(f"XGBoost R2 Score: {r2_xgb}")

[0]	validation_0-rmse:522.05614	validation_1-rmse:474.02140
[1]	validation_0-rmse:471.01964	validation_1-rmse:430.40689
[2]	validation_0-rmse:424.99492	validation_1-rmse:390.73989
[3]	validation_0-rmse:383.50622	validation_1-rmse:355.23979
[4]	validation_0-rmse:346.08141	validation_1-rmse:322.76102
[5]	validation_0-rmse:312.33144	validation_1-rmse:293.44804
[6]	validation_0-rmse:281.88070	validation_1-rmse:264.08752
[7]	validation_0-rmse:254.39569	validation_1-rmse:240.19523
[8]	validation_0-rmse:229.59904	validation_1-rmse:218.21323
[9]	validation_0-rmse:207.24266	validation_1-rmse:198.66691
[10]	validation_0-rmse:187.08700	validation_1-rmse:179.26692
[11]	validation_0-rmse:168.84879	validation_1-rmse:163.50485
[12]	validation_0-rmse:152.44854	validation_1-rmse:147.73559
[13]	validation_0-rmse:137.62035	validation_1-rmse:134.83424
[14]	validation_0-rmse:124.22315	validation_1-rmse:123.02745
[15]	validation_0-rmse:112.13065	validation_1-rmse:112.47276
[16]	validation_0-rmse:101.24018	v

In [6]:
feature_importances = xgb_model.feature_importances_

# Map feature importances to feature names
feature_importance_df = pd.DataFrame({
    'Feature': features,
    'Importance': feature_importances
}).sort_values(by='Importance', ascending=False)

print(feature_importance_df)

                     Feature    Importance
6        shareholdersEquity_  9.853559e-01
0                     ticker  1.444983e-02
7               totalAssets_  7.211386e-05
5           operatingIncome_  6.163237e-05
1   commonSharesOutstanding_  4.225534e-05
21                      m2SA  5.403246e-06
10         debtToTotalAssets  5.387793e-06
4                 netIncome_  9.489915e-07
23                m2Velocity  8.140358e-07
8                       EPS_  6.924125e-07
19                 salePrice  6.347980e-07
3        capitalExpenditure_  5.633736e-07
9           costOfGoodsSold_  5.048854e-07
16              ebitdaGrowth  4.402974e-07
13             revenueGrowth  4.211877e-07
25               EBITDA_lag1  3.899926e-07
2        commonSharesTraded_  3.359559e-07
14           netIncomeGrowth  3.097017e-07
24                 primeRate  3.093878e-07
18                    EBITDA  2.820607e-07
22             m2m1GrowthGap  2.081360e-07
20                 realGDPSA  2.019953e-07
27         

# Implement Random Forest model

In [7]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Initialize and train the Random Forest model
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Make predictions
y_pred = rf_model.predict(X_test)

# Evaluate model performance
mse_rf = mean_squared_error(y_test, y_pred)
rmse_rf = np.sqrt(mse_rf)
r2_rf = r2_score(y_test, y_pred)

print(f"Random Forest RMSE: {rmse_rf}")
print(f"R^2 Score: {r2_rf}")

Random Forest RMSE: 25.969491084947382
R^2 Score: 0.9975240091255737


In [8]:
feature_importances = rf_model.feature_importances_

# Map feature importances to feature names
feature_importance_df = pd.DataFrame({
    'Feature': features,
    'Importance': feature_importances
}).sort_values(by='Importance', ascending=False)

print(feature_importance_df)

                     Feature  Importance
6        shareholdersEquity_    0.991350
7               totalAssets_    0.003704
0                     ticker    0.001641
17               companyName    0.000859
25               EBITDA_lag1    0.000699
9           costOfGoodsSold_    0.000610
10         debtToTotalAssets    0.000246
18                    EBITDA    0.000162
8                       EPS_    0.000147
21                      m2SA    0.000083
20                 realGDPSA    0.000060
5           operatingIncome_    0.000056
1   commonSharesOutstanding_    0.000056
15                       mva    0.000043
22             m2m1GrowthGap    0.000037
19                 salePrice    0.000031
26                      year    0.000031
16              ebitdaGrowth    0.000030
23                m2Velocity    0.000026
12                       ROA    0.000024
11                 niToAsset    0.000020
3        capitalExpenditure_    0.000018
24                 primeRate    0.000014
14           net