In [8]:
import pandas as pd
import numpy as np 
import warnings
warnings.filterwarnings("ignore")

In [9]:
df = pd.read_csv("EBITDA_new.csv")
df

Unnamed: 0,ticker,fiscalQuarter,commonSharesOutstanding_,commonSharesTraded_,capitalExpenditure_,netIncome_,operatingIncome_,shareholdersEquity_,totalAssets_,EPS_,...,salePrice,realGDPSA,m2SA,m2m1GrowthGap,m2Velocity,primeRate,revenue_,EBITDA_lag1,year,quarter
0,0,2013Q1,-0.000933,12.247579,0.000000,-0.073272,-0.095742,-0.003005,0.00000,0.00,...,13.564960,-0.867729,9.261987,0.172473,1.581,3.250000,-0.00300,,2013,1
1,0,2013Q2,-0.000933,12.247579,0.000000,-0.073210,-0.095754,-0.003005,0.00000,0.00,...,13.651304,-0.849252,9.272702,-0.640550,1.572,3.250000,-0.00300,50.369,2013,2
2,0,2013Q3,-0.000933,12.247579,0.000000,-0.073519,-0.095692,-0.003005,0.00000,0.00,...,13.650265,-0.790315,9.286502,-0.328302,1.571,3.250000,-0.00300,42.617,2013,3
3,0,2013Q4,-0.000933,12.247579,0.000000,-0.072407,-0.095729,0.000000,0.00000,0.00,...,13.978878,-0.729457,9.305890,-1.494308,1.563,3.250000,0.00000,46.147,2013,4
4,0,2014Q1,-0.000933,12.247579,0.000000,-0.073519,-0.095692,-0.006018,0.00000,0.00,...,13.796973,-0.753759,9.321372,-1.579023,1.539,3.250000,-0.00600,67.587,2014,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
534,11,2022Q4,1.116244,14.488942,3.970292,2.422222,2.417428,,4848.35925,1.12,...,14.303885,0.925248,9.968103,1.365277,1.253,6.819672,-904.00000,137.428,2022,4
535,11,2023Q1,1.077239,14.488942,2.564949,2.422222,2.417428,,4848.35925,0.81,...,14.193683,0.985434,9.956538,1.945551,1.288,7.693548,-943.29475,177.311,2023,1
536,11,2023Q2,1.037409,14.488942,3.367296,2.422222,2.417428,,4848.35925,1.25,...,14.029054,1.038587,9.941376,2.367239,1.322,8.158730,-943.29475,95.775,2023,2
537,11,2023Q3,1.009074,14.488942,3.761200,2.422222,2.417428,,4848.35925,1.49,...,14.013110,1.132989,9.940176,1.509829,1.348,8.432540,-943.29475,149.372,2023,3


In [None]:
df.dropna(inplace=True)
df.drop

In [11]:
train = df[df['year'] < 2021]
test = df[df['year'] >= 2021]

In [None]:
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_percentage_error

# use lag1 as predicted value
y_true_train = train["EBITDA"]
y_pred_train = train["EBITDA_lag1"]

y_true_test = test["EBITDA"]
y_pred_test = test["EBITDA_lag1"]

def calculatePerformance(y_true, y_pred):
    # Compute RMSE, R-Squared, and MPAE for each target column
    rmse = mean_squared_error(y_true, y_pred)
    r_squared = r2_score(y_true, y_pred)
    mpae = mean_absolute_percentage_error(y_true, y_pred)

    performance_metrics = {'RMSE': rmse, 'R-Squared': r_squared, 'MPAE': mpae}
    # Convert results to DataFrame for better visualization
    performance_df = pd.DataFrame.from_dict(performance_metrics, orient='index')
    return performance_df

performan_train = calculatePerformance(y_true_train, y_pred_train)
performan_test = calculatePerformance(y_true_test, y_pred_test)

print("performance on training set\n", performan_train)
print("\n")
print("performance on testing set\n", performan_test)

performance on training set
                     0
RMSE       389.118776
R-Squared    0.931669
MPAE       116.172742


performance on testing set
                     0
RMSE       300.531013
R-Squared    0.953478
MPAE         0.889287


In [17]:
# OLS model
import statsmodels.api as sm

# drop non-numerical columns
train = train.drop(columns=['fiscalQuarter'])
test = test.drop(columns=['fiscalQuarter'])

# split X and y
X_train = train.drop(columns=['EBITDA']) 
y_train = train['EBITDA'] 

X_test = test.drop(columns=['EBITDA'])  
y_test = test['EBITDA']  

# add bias
X_train = sm.add_constant(X_train)  
X_test = sm.add_constant(X_test)  

# build OLS
ols_model = sm.OLS(y_train, X_train).fit() 

# predtction
y_pred_train_ols = ols_model.predict(X_train) 
y_pred_test_ols = ols_model.predict(X_test) 

performan_train_ols = calculatePerformance(y_true_train, y_pred_train_ols)
performan_test_ols = calculatePerformance(y_true_test, y_pred_test_ols)

print("OLS performance on training set\n", performan_train_ols)
print("\n")
print("OLS performance on testing set\n", performan_test_ols)

OLS performance on training set
                     0
RMSE       353.524561
R-Squared    0.937919
MPAE       122.731157


OLS performance on testing set
                     0
RMSE       372.466513
R-Squared    0.942343
MPAE        52.276606
