In [1]:
import numpy as np
import statistics as st
import pandas as pd 
import statsmodels.api as sm
from scipy import stats

In [100]:
import pandas as pd
import statsmodels.api as sm

# Load data from CSV files for banks and GDP
df_TCB = pd.read_csv('financial_data_TCB_filtered.csv', encoding='ISO-8859-1')
df_VCB = pd.read_csv('financial_data_VCB_filtered.csv', encoding='ISO-8859-1')
df_MBB = pd.read_csv('financial_data_MBB_filtered.csv', encoding='ISO-8859-1')
df_BID = pd.read_csv('financial_data_BID_filtered.csv', encoding='ISO-8859-1')
df_GDP = pd.read_csv('GDP_VIetnam_Quarterl_MoiNhat.csv', encoding='ISO-8859-1')

# Selected features with potential impact on GDP
selected_features = [
  'year', 'roe', 'roa', 'interestMargin','postTaxProfit','operationIncome','operationProfit','quarterOperationProfitGrowth','yearOperationProfitGrowth','provisionOnBadDebt'
  'creditGrowth','interestIncome','nonInterestIncome','earningAssetsToTotalAssets'

]

# Helper function to filter and rename columns for each bank
def filter_and_rename(data, bank_name):
    # Retain only columns present in the dataset
    available_features = [col for col in selected_features if col in data.columns]
    filtered_data = data[available_features].copy()
    filtered_data.columns = [f"{bank_name}_{col}" if col != 'year' else col for col in filtered_data.columns]
    return filtered_data

# Apply the filter function to each bank's dataset
vcb_filtered = filter_and_rename(df_VCB, 'VCB')
tcb_filtered = filter_and_rename(df_TCB, 'TCB')
mbb_filtered = filter_and_rename(df_MBB, 'MBB')
bid_filtered = filter_and_rename(df_BID, 'BID')

# Prepare GDP data by extracting the year from the Date column
df_GDP['year'] = pd.to_datetime(df_GDP['Date']).dt.year
gdp_q4 = df_GDP[(df_GDP['year'] >= 2013) & (df_GDP['year'] <= 2023)]

# Merge the bank data with GDP data on 'year'
merged_data = gdp_q4[['year', 'Quarterly_GDP']].merge(vcb_filtered, on='year') \
                                               .merge(tcb_filtered, on='year') \
                                               .merge(mbb_filtered, on='year') \
                                               .merge(bid_filtered, on='year')

# Define the independent variables (X) and dependent variable (y)
X = merged_data.drop(columns=['year', 'Quarterly_GDP'])  # Independent variables
y = merged_data['Quarterly_GDP']  # Dependent variable: Quarterly GDP growth

# Add a constant to the independent variables for the linear regression model
X = sm.add_constant(X)

# Fit the multivariate linear regression model
model = sm.OLS(y, X).fit()

# Print the model summary to show coefficients and p-values
print(model.summary())

# Extract and print significant features based on p-value < 0.05
significant_features = model.pvalues[model.pvalues < 0.05].index.tolist()
print("Significant features with p-value < 0.05:", significant_features)

                            OLS Regression Results                            
Dep. Variable:          Quarterly_GDP   R-squared:                       0.818
Model:                            OLS   Adj. R-squared:                  0.818
Method:                 Least Squares   F-statistic:                     1508.
Date:                Mon, 11 Nov 2024   Prob (F-statistic):               0.00
Time:                        21:16:17   Log-Likelihood:            -1.4903e+05
No. Observations:               10752   AIC:                         2.981e+05
Df Residuals:                   10719   BIC:                         2.984e+05
Df Model:                          32                                         
Covariance Type:            nonrobust                                         
                                       coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------------------------
const   

In [85]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error
import statsmodels.api as sm
import numpy as np

df_TCB = pd.read_csv('financial_data_TCB_filtered.csv', encoding='ISO-8859-1')
df_VCB = pd.read_csv('financial_data_VCB_filtered.csv', encoding='ISO-8859-1')
df_MBB = pd.read_csv('financial_data_MBB_filtered.csv', encoding='ISO-8859-1')
df_BID = pd.read_csv('financial_data_BID_filtered.csv', encoding='ISO-8859-1')
df_GDP = pd.read_csv('GDP_VIetnam_Quarterl_MoiNhat.csv', encoding='ISO-8859-1')

# Selected features with potential impact on GDP
selected_features = [
    'year', 'roe', 'roa', 'operationIncome','operationProfit', 'provisionOnBadDebt', 'equityOnTotalAsset',
  'postTaxProfit', 'interestMargin','quarterOperationProfitGrowth','creditGrowth','loanOnAsset','badDebtPercentage'
]

# Helper function to filter and rename columns for each bank
def filter_and_rename(data, bank_name):
    available_features = [col for col in selected_features if col in data.columns]
    filtered_data = data[available_features].copy()
    filtered_data.columns = [f"{bank_name}_{col}" if col != 'year' else col for col in filtered_data.columns]
    return filtered_data

# Apply the filter function to each bank's dataset
vcb_filtered = filter_and_rename(df_VCB, 'VCB')
tcb_filtered = filter_and_rename(df_TCB, 'TCB')
mbb_filtered = filter_and_rename(df_MBB, 'MBB')
bid_filtered = filter_and_rename(df_BID, 'BID')

# Prepare GDP data by extracting the year from the Date column
df_GDP['year'] = pd.to_datetime(df_GDP['Date']).dt.year
gdp_q4 = df_GDP[(df_GDP['year'] >= 2013) & (df_GDP['year'] <= 2023)]

# Merge the bank data with GDP data on 'year'
merged_data = gdp_q4[['year', 'Quarterly_GDP']].merge(vcb_filtered, on='year') \
                                               .merge(tcb_filtered, on='year') \
                                               .merge(mbb_filtered, on='year') \
                                               .merge(bid_filtered, on='year')

# Define X and y
X = merged_data.drop(columns=['year', 'Quarterly_GDP'])  # Independent variables
y = merged_data['Quarterly_GDP']  # Dependent variable: GDP growth

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the model on the training set
model = sm.OLS(y_train, X_train).fit()

# Predict on the test set
y_pred_test = model.predict(X_test)

# Calculate RMSE on the test set
rmse_test = np.sqrt(mean_squared_error(y_test, y_pred_test))
print("RMSE on test set:", rmse_test)

# Calculate MAPE on the test set
mape_test = mean_absolute_percentage_error(y_test, y_pred_test)
print("MAPE on test set:", mape_test)


RMSE on test set: 254334.52045042964
MAPE on test set: 0.1474272830221592


In [76]:
import pandas as pd
import statsmodels.api as sm

# Load data for banks and GDP
df_TCB = pd.read_csv('financial_data_TCB_filtered.csv', encoding='ISO-8859-1')
df_VCB = pd.read_csv('financial_data_VCB_filtered.csv', encoding='ISO-8859-1')
df_MBB = pd.read_csv('financial_data_MBB_filtered.csv', encoding='ISO-8859-1')
df_BID = pd.read_csv('financial_data_BID_filtered.csv', encoding='ISO-8859-1')
df_GDP = pd.read_csv('GDP_VIetnam_Quarterl_MoiNhat.csv', encoding='ISO-8859-1')

# Define selected features with high relevance to Total Assets
selected_features = [
    'year','loanOnAsset', 'equityOnTotalAsset', 'operationIncome', 
    'operationProfit', 'roe', 'roa', 'creditGrowth'
]

# Helper function to filter and rename columns for each bank
def filter_and_rename(data, bank_name):
    available_features = [col for col in selected_features if col in data.columns]
    filtered_data = data[available_features].copy()
    filtered_data.columns = [f"{bank_name}_{col}" if col != 'year' else col for col in filtered_data.columns]
    return filtered_data

# Apply the filter function to each bank's dataset
vcb_filtered = filter_and_rename(df_VCB, 'VCB')
tcb_filtered = filter_and_rename(df_TCB, 'TCB')
mbb_filtered = filter_and_rename(df_MBB, 'MBB')
bid_filtered = filter_and_rename(df_BID, 'BID')

# Prepare GDP data by extracting the year from the Date column
df_GDP['year'] = pd.to_datetime(df_GDP['Date']).dt.year
gdp_q4 = df_GDP[(df_GDP['year'] >= 2013) & (df_GDP['year'] <= 2023)]

# Merge the bank data with GDP data on 'year'
merged_data = gdp_q4[['year', 'Quarterly_GDP']].merge(vcb_filtered, on='year') \
                                               .merge(tcb_filtered, on='year') \
                                               .merge(mbb_filtered, on='year') \
                                               .merge(bid_filtered, on='year')

# Define the independent variables (X) and dependent variable (y)
X = merged_data.drop(columns=['year', 'Quarterly_GDP'])  # Independent variables
y = merged_data['Quarterly_GDP']  # Dependent variable: Quarterly GDP growth

# Add a constant to the independent variables for the linear regression model
X = sm.add_constant(X)

# Fit the multivariate linear regression model
model = sm.OLS(y, X).fit()

# Print the model summary to show coefficients and p-values
print(model.summary())

# Extract and print significant features based on p-value < 0.05
significant_features = model.pvalues[model.pvalues < 0.05].index.tolist()
print("Significant features with p-value < 0.05:", significant_features)


                            OLS Regression Results                            
Dep. Variable:          Quarterly_GDP   R-squared:                       0.817
Model:                            OLS   Adj. R-squared:                  0.817
Method:                 Least Squares   F-statistic:                     1714.
Date:                Mon, 11 Nov 2024   Prob (F-statistic):               0.00
Time:                        20:36:14   Log-Likelihood:            -1.4905e+05
No. Observations:               10752   AIC:                         2.982e+05
Df Residuals:                   10723   BIC:                         2.984e+05
Df Model:                          28                                         
Covariance Type:            nonrobust                                         
                             coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------------
const                  -3.55

In [87]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error
import statsmodels.api as sm
import numpy as np

df_TCB = pd.read_csv('financial_data_TCB_filtered.csv', encoding='ISO-8859-1')
df_VCB = pd.read_csv('financial_data_VCB_filtered.csv', encoding='ISO-8859-1')
df_MBB = pd.read_csv('financial_data_MBB_filtered.csv', encoding='ISO-8859-1')
df_BID = pd.read_csv('financial_data_BID_filtered.csv', encoding='ISO-8859-1')
df_GDP = pd.read_csv('GDP_VIetnam_Quarterl_MoiNhat.csv', encoding='ISO-8859-1')

# Selected features with potential impact on GDP
selected_features = [
    'year','loanOnAsset', 'equityOnTotalAsset', 'operationIncome', 
    'operationProfit', 'roe', 'roa', 'creditGrowth'
]

# Helper function to filter and rename columns for each bank
def filter_and_rename(data, bank_name):
    available_features = [col for col in selected_features if col in data.columns]
    filtered_data = data[available_features].copy()
    filtered_data.columns = [f"{bank_name}_{col}" if col != 'year' else col for col in filtered_data.columns]
    return filtered_data

# Apply the filter function to each bank's dataset
vcb_filtered = filter_and_rename(df_VCB, 'VCB')
tcb_filtered = filter_and_rename(df_TCB, 'TCB')
mbb_filtered = filter_and_rename(df_MBB, 'MBB')
bid_filtered = filter_and_rename(df_BID, 'BID')

# Prepare GDP data by extracting the year from the Date column
df_GDP['year'] = pd.to_datetime(df_GDP['Date']).dt.year
gdp_q4 = df_GDP[(df_GDP['year'] >= 2013) & (df_GDP['year'] <= 2023)]

# Merge the bank data with GDP data on 'year'
merged_data = gdp_q4[['year', 'Quarterly_GDP']].merge(vcb_filtered, on='year') \
                                               .merge(tcb_filtered, on='year') \
                                               .merge(mbb_filtered, on='year') \
                                               .merge(bid_filtered, on='year')

# Define X and y
X = merged_data.drop(columns=['year', 'Quarterly_GDP'])  # Independent variables
y = merged_data['Quarterly_GDP']  # Dependent variable: GDP growth

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the model on the training set
model = sm.OLS(y_train, X_train).fit()

# Predict on the test set
y_pred_test = model.predict(X_test)

# Calculate RMSE on the test set
rmse_test = np.sqrt(mean_squared_error(y_test, y_pred_test))
print("RMSE on test set:", rmse_test)

# Calculate MAPE on the test set
mape_test = mean_absolute_percentage_error(y_test, y_pred_test)
print("MAPE on test set:", mape_test)


RMSE on test set: 255760.80553602424
MAPE on test set: 0.14825598500580106


In [89]:
import pandas as pd
import statsmodels.api as sm

# Load data for banks and GDP
df_TCB = pd.read_csv('financial_data_TCB_filtered.csv', encoding='ISO-8859-1')
df_VCB = pd.read_csv('financial_data_VCB_filtered.csv', encoding='ISO-8859-1')
df_MBB = pd.read_csv('financial_data_MBB_filtered.csv', encoding='ISO-8859-1')
df_BID = pd.read_csv('financial_data_BID_filtered.csv', encoding='ISO-8859-1')
df_GDP = pd.read_csv('GDP_VIetnam_Quarterl_MoiNhat.csv', encoding='ISO-8859-1')

# Define selected features with high relevance to Total Assets
selected_features = [
    'year', 'loanOnAsset', 'equityOnTotalAsset', 'operationIncome', 
    'operationProfit', 'roe', 'roa', 'creditGrowth'
]

# Helper function to filter and rename columns for each bank
def filter_and_rename(data, bank_name):
    available_features = [col for col in selected_features if col in data.columns]
    filtered_data = data[available_features].copy()
    filtered_data.columns = [f"{bank_name}_{col}" if col != 'year' else col for col in filtered_data.columns]
    return filtered_data

# Apply the filter function to each bank's dataset
vcb_filtered = filter_and_rename(df_VCB, 'VCB')
tcb_filtered = filter_and_rename(df_TCB, 'TCB')
mbb_filtered = filter_and_rename(df_MBB, 'MBB')
bid_filtered = filter_and_rename(df_BID, 'BID')

# Prepare GDP data by extracting the year from the Date column
df_GDP['year'] = pd.to_datetime(df_GDP['Date']).dt.year
gdp_q4 = df_GDP[(df_GDP['year'] >= 2013) & (df_GDP['year'] <= 2023)]

# Merge the bank data with GDP data on 'year'
merged_data = gdp_q4[['year', 'Quarterly_GDP']].merge(vcb_filtered, on='year') \
                                               .merge(tcb_filtered, on='year') \
                                               .merge(mbb_filtered, on='year') \
                                               .merge(bid_filtered, on='year')

# Define the independent variables (X) and dependent variable (y)
X = merged_data.drop(columns=['year', 'Quarterly_GDP'])  # Independent variables
y = merged_data['Quarterly_GDP']  # Dependent variable: Quarterly GDP growth

# Add a constant to the independent variables for the linear regression model
X = sm.add_constant(X)

# Fit the multivariate linear regression model
model = sm.OLS(y, X).fit()

# Print the model summary to show coefficients and p-values
print(model.summary())

# Extract and print significant features based on p-value < 0.05
significant_features = model.pvalues[model.pvalues < 0.05].index.tolist()
print("Significant features with p-value < 0.05:", significant_features)

# Check if Total Assets variables (from all banks) significantly influence GDP by reviewing p-values
# If p-value < 0.05, the variable is statistically significant in explaining GDP
total_assets_p_values = model.pvalues
print("\nP-values for Total Assets variables:")
print(total_assets_p_values[total_assets_p_values < 0.05])


                            OLS Regression Results                            
Dep. Variable:          Quarterly_GDP   R-squared:                       0.817
Model:                            OLS   Adj. R-squared:                  0.817
Method:                 Least Squares   F-statistic:                     1714.
Date:                Mon, 11 Nov 2024   Prob (F-statistic):               0.00
Time:                        20:41:47   Log-Likelihood:            -1.4905e+05
No. Observations:               10752   AIC:                         2.982e+05
Df Residuals:                   10723   BIC:                         2.984e+05
Df Model:                          28                                         
Covariance Type:            nonrobust                                         
                             coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------------
const                  -3.55

In [90]:
import pandas as pd
import statsmodels.api as sm

# Load data for banks and GDP
df_TCB = pd.read_csv('financial_data_TCB_filtered.csv', encoding='ISO-8859-1')
df_VCB = pd.read_csv('financial_data_VCB_filtered.csv', encoding='ISO-8859-1')
df_MBB = pd.read_csv('financial_data_MBB_filtered.csv', encoding='ISO-8859-1')
df_BID = pd.read_csv('financial_data_BID_filtered.csv', encoding='ISO-8859-1')
df_GDP = pd.read_csv('GDP_VIetnam_Quarterl_MoiNhat.csv', encoding='ISO-8859-1')

# Define selected features related to Total Assets
selected_features = [
    'year', 'loanOnAsset', 'equityOnTotalAsset', 'operationIncome', 
    'operationProfit', 'creditGrowth'
]

# Helper function to filter and rename columns for each bank
def filter_and_rename(data, bank_name):
    available_features = [col for col in selected_features if col in data.columns]
    filtered_data = data[available_features].copy()
    filtered_data.columns = [f"{bank_name}_{col}" if col != 'year' else col for col in filtered_data.columns]
    return filtered_data

# Apply the filter function to each bank's dataset
vcb_filtered = filter_and_rename(df_VCB, 'VCB')
tcb_filtered = filter_and_rename(df_TCB, 'TCB')
mbb_filtered = filter_and_rename(df_MBB, 'MBB')
bid_filtered = filter_and_rename(df_BID, 'BID')

# Prepare GDP data by extracting the year from the Date column
df_GDP['year'] = pd.to_datetime(df_GDP['Date']).dt.year
gdp_q4 = df_GDP[(df_GDP['year'] >= 2013) & (df_GDP['year'] <= 2023)]

# Merge the bank data with GDP data on 'year'
merged_data = gdp_q4[['year', 'Quarterly_GDP']].merge(vcb_filtered, on='year') \
                                               .merge(tcb_filtered, on='year') \
                                               .merge(mbb_filtered, on='year') \
                                               .merge(bid_filtered, on='year')

# Define the independent variables (X) and dependent variable (y)
X = merged_data.drop(columns=['year', 'Quarterly_GDP'])  # Independent variables
y = merged_data['Quarterly_GDP']  # Dependent variable: Quarterly GDP growth

# Add a constant to the independent variables for the linear regression model
X = sm.add_constant(X)

# Fit the multivariate linear regression model
model = sm.OLS(y, X).fit()

# Print the model summary to show coefficients and p-values
print(model.summary())

# Extract and print significant features based on p-value < 0.05
significant_features = model.pvalues[model.pvalues < 0.05].index.tolist()
print("Significant features with p-value < 0.05:", significant_features)


                            OLS Regression Results                            
Dep. Variable:          Quarterly_GDP   R-squared:                       0.814
Model:                            OLS   Adj. R-squared:                  0.814
Method:                 Least Squares   F-statistic:                     2348.
Date:                Mon, 11 Nov 2024   Prob (F-statistic):               0.00
Time:                        20:41:50   Log-Likelihood:            -1.4915e+05
No. Observations:               10752   AIC:                         2.983e+05
Df Residuals:                   10731   BIC:                         2.985e+05
Df Model:                          20                                         
Covariance Type:            nonrobust                                         
                             coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------------
const                   5.40

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error, r2_score

# Load data for banks and GDP
df_TCB = pd.read_csv('financial_data_TCB_filtered.csv', encoding='ISO-8859-1')
df_VCB = pd.read_csv('financial_data_VCB_filtered.csv', encoding='ISO-8859-1')
df_MBB = pd.read_csv('financial_data_MBB_filtered.csv', encoding='ISO-8859-1')
df_BID = pd.read_csv('financial_data_BID_filtered.csv', encoding='ISO-8859-1')
df_GDP = pd.read_csv('gdp_quarterly_2013_2023.csv', encoding='ISO-8859-1')

# Define selected features related to Total Assets
selected_features = [
    'year', 'loanOnAsset', 'equityOnTotalAsset', 'operationIncome', 
    'operationProfit', 'creditGrowth'
]

# Helper function to filter and rename columns for each bank
def filter_and_rename(data, bank_name):
    available_features = [col for col in selected_features if col in data.columns]
    filtered_data = data[available_features].copy()
    filtered_data.columns = [f"{bank_name}_{col}" if col != 'year' else col for col in filtered_data.columns]
    return filtered_data

# Apply the filter function to each bank's dataset
vcb_filtered = filter_and_rename(df_VCB, 'VCB')
tcb_filtered = filter_and_rename(df_TCB, 'TCB')
mbb_filtered = filter_and_rename(df_MBB, 'MBB')
bid_filtered = filter_and_rename(df_BID, 'BID')

# Prepare GDP data by extracting the year from the Date column
df_GDP['year'] = pd.to_datetime(df_GDP['Date']).dt.year
gdp_q4 = df_GDP[(df_GDP['year'] >= 2013) & (df_GDP['year'] <= 2023)]

# Merge the bank data with GDP data on 'year'
merged_data = gdp_q4[['year', 'Quarterly_GDP']].merge(vcb_filtered, on='year') \
                                               .merge(tcb_filtered, on='year') \
                                               .merge(mbb_filtered, on='year') \
                                               .merge(bid_filtered, on='year')

# Define the independent variables (X) and dependent variable (y)
X = merged_data.drop(columns=['year', 'Quarterly_GDP'])  # Independent variables
y = merged_data['Quarterly_GDP']  # Dependent variable: Quarterly GDP growth

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the Random Forest model
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Predict on the test set
y_pred = rf_model.predict(X_test)

# Evaluate the model
rmse = mean_squared_error(y_test, y_pred, squared=False)
mape = mean_absolute_percentage_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"RMSE: {rmse}")
print(f"MAPE: {mape}")
print(f"R-squared: {r2}")

# Feature importance
feature_importances = pd.Series(rf_model.feature_importances_, index=X.columns)
feature_importances = feature_importances.sort_values(ascending=False)
print("\nFeature Importances:")
print(feature_importances)


RMSE: 189780.66379427782
MAPE: 0.08668191855185013
R-squared: 0.8023084633961602

Feature Importances:
MBB_operationIncome       0.761703
VCB_operationProfit       0.069784
MBB_operationProfit       0.041792
TCB_operationProfit       0.031569
VCB_operationIncome       0.027899
TCB_operationIncome       0.026298
BID_equityOnTotalAsset    0.004904
VCB_creditGrowth          0.004700
VCB_loanOnAsset           0.003995
TCB_equityOnTotalAsset    0.003845
MBB_creditGrowth          0.003146
BID_loanOnAsset           0.002623
TCB_creditGrowth          0.002439
BID_operationProfit       0.002303
MBB_equityOnTotalAsset    0.002253
BID_operationIncome       0.002193
BID_creditGrowth          0.002164
TCB_loanOnAsset           0.002140
MBB_loanOnAsset           0.002135
VCB_equityOnTotalAsset    0.002116
dtype: float64


