In [1]:
import numpy as np
import statistics as st
import pandas as pd 

In [11]:
import pandas as pd
import statsmodels.api as sm
from scipy import stats

In [99]:
import pandas as pd
import statsmodels.api as sm

# Load data from CSV files for banks and GDP
df_TCB = pd.read_csv('TCB_adjusted_data.csv', encoding='ISO-8859-1')
df_VCB = pd.read_csv('VCB_adjusted_data.csv', encoding='ISO-8859-1')
df_MBB = pd.read_csv('MBB_adjusted_data.csv', encoding='ISO-8859-1')
df_BID = pd.read_csv('BID_adjusted_data.csv', encoding='ISO-8859-1')
df_GDP = pd.read_csv('gdp_quarterly_adjusted.csv', encoding='ISO-8859-1')

# Selected features with potential impact on GDP
selected_features = [
    'year', 'roe', 'roa', 'interestMargin', 'badDebtPercentage', 'creditGrowth', 
    'operationProfit', 'yearOperationProfitGrowth', 'postTaxProfit', 'provisionOnBadDebt', 
    'equityOnTotalAsset'
]

# Helper function to filter and rename columns for each bank
def filter_and_rename(data, bank_name):
    # Retain only columns present in the dataset
    available_features = [col for col in selected_features if col in data.columns]
    filtered_data = data[available_features].copy()
    filtered_data.columns = [f"{bank_name}_{col}" if col != 'year' else col for col in filtered_data.columns]
    return filtered_data

# Apply the filter function to each bank's dataset
vcb_filtered = filter_and_rename(df_VCB, 'VCB')
tcb_filtered = filter_and_rename(df_TCB, 'TCB')
mbb_filtered = filter_and_rename(df_MBB, 'MBB')
bid_filtered = filter_and_rename(df_BID, 'BID')

# Prepare GDP data by extracting the year from the Date column
df_GDP['year'] = pd.to_datetime(df_GDP['Date']).dt.year
gdp_q4 = df_GDP[(df_GDP['year'] >= 2013) & (df_GDP['year'] <= 2023)]

# Merge the bank data with GDP data on 'year'
merged_data = gdp_q4[['year', 'Quarterly_GDP']].merge(vcb_filtered, on='year') \
                                               .merge(tcb_filtered, on='year') \
                                               .merge(mbb_filtered, on='year') \
                                               .merge(bid_filtered, on='year')

# Define the independent variables (X) and dependent variable (y)
X = merged_data.drop(columns=['year', 'Quarterly_GDP'])  # Independent variables
y = merged_data['Quarterly_GDP']  # Dependent variable: Quarterly GDP growth

# Add a constant to the independent variables for the linear regression model
X = sm.add_constant(X)

# Fit the multivariate linear regression model
model = sm.OLS(y, X).fit()

# Print the model summary to show coefficients and p-values
print(model.summary())

# Extract and print significant features based on p-value < 0.05
significant_features = model.pvalues[model.pvalues < 0.05].index.tolist()
print("Significant features with p-value < 0.05:", significant_features)


                            OLS Regression Results                            
Dep. Variable:          Quarterly_GDP   R-squared:                       0.987
Model:                            OLS   Adj. R-squared:                  0.987
Method:                 Least Squares   F-statistic:                 2.854e+04
Date:                Wed, 06 Nov 2024   Prob (F-statistic):               0.00
Time:                        19:25:07   Log-Likelihood:            -1.3066e+05
No. Observations:               10752   AIC:                         2.614e+05
Df Residuals:                   10723   BIC:                         2.616e+05
Df Model:                          28                                         
Covariance Type:            nonrobust                                         
                             coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------------
const                   1.45

In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error
import statsmodels.api as sm
import numpy as np

# Load the datasets
df_TCB = pd.read_csv('TCB_adjusted_data.csv', encoding='ISO-8859-1')
df_VCB = pd.read_csv('VCB_adjusted_data.csv', encoding='ISO-8859-1')
df_MBB = pd.read_csv('MBB_adjusted_data.csv', encoding='ISO-8859-1')
df_BID = pd.read_csv('BID_adjusted_data.csv', encoding='ISO-8859-1')
df_GDP = pd.read_csv('gdp_quarterly_adjusted.csv', encoding='ISO-8859-1')

# Selected features with potential impact on GDP
selected_features = [
    'year', 'roe', 'roa', 'interestMargin', 'badDebtPercentage', 'creditGrowth', 
    'operationProfit', 'yearOperationProfitGrowth', 'postTaxProfit', 'provisionOnBadDebt', 
    'equityOnTotalAsset'
]

# Helper function to filter and rename columns for each bank
def filter_and_rename(data, bank_name):
    available_features = [col for col in selected_features if col in data.columns]
    filtered_data = data[available_features].copy()
    filtered_data.columns = [f"{bank_name}_{col}" if col != 'year' else col for col in filtered_data.columns]
    return filtered_data

# Apply the filter function to each bank's dataset
vcb_filtered = filter_and_rename(df_VCB, 'VCB')
tcb_filtered = filter_and_rename(df_TCB, 'TCB')
mbb_filtered = filter_and_rename(df_MBB, 'MBB')
bid_filtered = filter_and_rename(df_BID, 'BID')

# Prepare GDP data by extracting the year from the Date column
df_GDP['year'] = pd.to_datetime(df_GDP['Date']).dt.year
gdp_q4 = df_GDP[(df_GDP['year'] >= 2013) & (df_GDP['year'] <= 2023)]

# Merge the bank data with GDP data on 'year'
merged_data = gdp_q4[['year', 'Quarterly_GDP']].merge(vcb_filtered, on='year') \
                                               .merge(tcb_filtered, on='year') \
                                               .merge(mbb_filtered, on='year') \
                                               .merge(bid_filtered, on='year')

# Define X and y
X = merged_data.drop(columns=['year', 'Quarterly_GDP'])  # Independent variables
y = merged_data['Quarterly_GDP']  # Dependent variable: GDP growth

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the model on the training set
model = sm.OLS(y_train, X_train).fit()

# Predict on the test set
y_pred_test = model.predict(X_test)

# Calculate RMSE on the test set
rmse_test = np.sqrt(mean_squared_error(y_test, y_pred_test))
print("RMSE on test set:", rmse_test)

# Calculate MAPE on the test set
mape_test = mean_absolute_percentage_error(y_test, y_pred_test)
print("MAPE on test set:", mape_test)


RMSE on test set: 48236.88008544685
MAPE on test set: 0.021437994335840784
