In [1]:
import numpy as np
import statistics as st
import pandas as pd 
import statsmodels.api as sm
from scipy import stats

In [12]:
import pandas as pd
import statsmodels.api as sm
from statsmodels.formula.api import ols

# Load data from CSV files for banks and GDP
df_TCB = pd.read_csv('financial_data_TCB_filtered.csv', encoding='ISO-8859-1')
df_VCB = pd.read_csv('financial_data_VCB_filtered.csv', encoding='ISO-8859-1')
df_MBB = pd.read_csv('financial_data_MBB_filtered.csv', encoding='ISO-8859-1')
df_BID = pd.read_csv('financial_data_BID_filtered.csv', encoding='ISO-8859-1')
df_GDP = pd.read_csv('gdp_quarterly_2013_2023.csv', encoding='ISO-8859-1')

# Selected top 5 features with high correlation to "total assets"
selected_features = [
    'year', 'quarter',  'interestMargin', 'badDebtPercentage', 'loanOnEarnAsset', 'operationProfit'
]

# Helper function to filter and rename columns for each bank
def filter_and_rename(data, bank_name):
    available_features = [col for col in selected_features if col in data.columns]
    filtered_data = data[['year', 'quarter'] + available_features].copy()
    filtered_data.columns = ['year', 'quarter'] + [f"{bank_name}_{col}" for col in available_features]
    return filtered_data

# Apply the filter function to each bank's dataset
vcb_filtered = filter_and_rename(df_VCB, 'VCB')
tcb_filtered = filter_and_rename(df_TCB, 'TCB')
mbb_filtered = filter_and_rename(df_MBB, 'MBB')
bid_filtered = filter_and_rename(df_BID, 'BID')

# Prepare GDP data by extracting the year and quarter from the Date column
df_GDP['year'] = pd.to_datetime(df_GDP['Date']).dt.year
df_GDP['quarter'] = pd.to_datetime(df_GDP['Date']).dt.quarter
gdp_all_quarters = df_GDP[(df_GDP['year'] >= 2013) & (df_GDP['year'] <= 2023)]

# Merge the bank data with GDP data on 'year' and 'quarter'
merged_data = gdp_all_quarters[['year', 'quarter', 'Quarterly_GDP']].merge(vcb_filtered, on=['year', 'quarter'], how='left') \
                                                                  .merge(tcb_filtered, on=['year', 'quarter'], how='left') \
                                                                  .merge(mbb_filtered, on=['year', 'quarter'], how='left') \
                                                                  .merge(bid_filtered, on=['year', 'quarter'], how='left')

# Drop any rows with missing data after merging
merged_data.dropna(inplace=True)

# Define the independent variables (predictors) and the formula
independent_vars = merged_data.columns.difference(['year', 'quarter', 'Quarterly_GDP']).tolist()
formula = 'Quarterly_GDP ~ ' + ' + '.join(independent_vars)

# Fit the multivariate linear regression model using OLS (formula-based)
model = ols(formula, data=merged_data).fit()

# Display the model summary to check coefficients and p-values
print("Model Summary:")
print(model.summary())

# Check p-values to determine statistical significance
significant_features = model.pvalues[model.pvalues < 0.05].index.tolist()
print("\nSignificant features with p-value < 0.05:", significant_features)


Model Summary:
                            OLS Regression Results                            
Dep. Variable:          Quarterly_GDP   R-squared:                       0.994
Model:                            OLS   Adj. R-squared:                  0.989
Method:                 Least Squares   F-statistic:                     216.5
Date:                Thu, 14 Nov 2024   Prob (F-statistic):           5.52e-23
Time:                        10:26:31   Log-Likelihood:                -522.19
No. Observations:                  44   AIC:                             1082.
Df Residuals:                      25   BIC:                             1116.
Df Model:                          18                                         
Covariance Type:            nonrobust                                         
                            coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------
Intercept      

In [11]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error
import statsmodels.api as sm
import numpy as np

# Load data from CSV files for banks and GDP
df_TCB = pd.read_csv('financial_data_TCB_filtered.csv', encoding='ISO-8859-1')
df_VCB = pd.read_csv('financial_data_VCB_filtered.csv', encoding='ISO-8859-1')
df_MBB = pd.read_csv('financial_data_MBB_filtered.csv', encoding='ISO-8859-1')
df_BID = pd.read_csv('financial_data_BID_filtered.csv', encoding='ISO-8859-1')
df_GDP = pd.read_csv('gdp_quarterly_2013_2023.csv', encoding='ISO-8859-1')

# Selected top 5 features with high correlation to "total assets"
selected_features = [
    'year', 'quarter', 'interestMargin', 'badDebtPercentage', 'loanOnEarnAsset', 'operationProfit','equityOnTotalAsset'
]

# Helper function to filter and rename columns for each bank
def filter_and_rename(data, bank_name):
    available_features = [col for col in selected_features if col in data.columns]
    filtered_data = data[['year', 'quarter'] + available_features].copy()
    filtered_data.columns = ['year', 'quarter'] + [f"{bank_name}_{col}" for col in available_features]
    return filtered_data

# Apply the filter function to each bank's dataset
vcb_filtered = filter_and_rename(df_VCB, 'VCB')
tcb_filtered = filter_and_rename(df_TCB, 'TCB')
mbb_filtered = filter_and_rename(df_MBB, 'MBB')
bid_filtered = filter_and_rename(df_BID, 'BID')

# Prepare GDP data by extracting the year and quarter from the Date column
df_GDP['year'] = pd.to_datetime(df_GDP['Date']).dt.year
df_GDP['quarter'] = pd.to_datetime(df_GDP['Date']).dt.quarter
gdp_all_quarters = df_GDP[(df_GDP['year'] >= 2013) & (df_GDP['year'] <= 2023)]

# Merge the bank data with GDP data on 'year' and 'quarter'
merged_data = gdp_all_quarters[['year', 'quarter', 'Quarterly_GDP']].merge(vcb_filtered, on=['year', 'quarter'], how='left') \
                                                                  .merge(tcb_filtered, on=['year', 'quarter'], how='left') \
                                                                  .merge(mbb_filtered, on=['year', 'quarter'], how='left') \
                                                                  .merge(bid_filtered, on=['year', 'quarter'], how='left')

# Drop any rows with missing data after merging
merged_data.dropna(inplace=True)

# Define X and y
X = merged_data.drop(columns=['year', 'quarter', 'Quarterly_GDP'])  # Independent variables
y = merged_data['Quarterly_GDP']  # Dependent variable: GDP

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the model on the training set
model = sm.OLS(y_train, X_train).fit()

# Predict on the test set
y_pred_test = model.predict(X_test)

# Calculate RMSE on the test set
rmse_test = np.sqrt(mean_squared_error(y_test, y_pred_test))
print("RMSE on test set:", rmse_test)

# Calculate MAPE on the test set
mape_test = mean_absolute_percentage_error(y_test, y_pred_test)
print("MAPE on test set:", mape_test)


RMSE on test set: 92963.7204458805
MAPE on test set: 0.042734002129518306


RMSE on test set: 92963.7204458805
MAPE on test set: 0.042734002129518306
