In [None]:
import pandas as pd
import statsmodels.api as sm
import numpy as np
from sklearn.preprocessing import StandardScaler
import os
from datetime import datetime
from functools import reduce
from scipy.stats import shapiro
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.stats.diagnostic import het_breuschpagan
import matplotlib.pyplot as plt

In [None]:
# Mount Google Drive at /content/drive; the data folder used by the next cell
# lives under /content/drive/MyDrive. force_remount=True is passed so the
# mount is redone even when the cell is re-run in the same session.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
# Path to the data folder
data_folder = '/content/drive/MyDrive/Capstone Data/Final Data'
if not os.path.exists(data_folder):
    # Listing the Drive root is only a debugging aid; guard it so an unmounted
    # Drive still produces the intended message instead of failing inside
    # os.listdir.
    drive_root = '/content/drive/MyDrive'
    drive_contents = os.listdir(drive_root) if os.path.isdir(drive_root) else 'Drive is not mounted'
    raise FileNotFoundError(f"Data folder not found: {data_folder}. Contents: {drive_contents}")
print(f"Files in {data_folder}: {os.listdir(data_folder)}")

# Predictor files, one wide-format CSV per variable.
files = [
    'Annual Labor Force Survey.csv',
    'clean_import_of_goods_and_services_as_a_percentage_of_GDP (1).csv',
    'clean_inflation_rate (2).csv',
    'clean_labor_force_participation_rate_modeled.csv',
    'Cleaned_GDP_Per_Capita_Data_2010_2023 - Cleaned_GDP_Per_Capita_Data_2010_2023.csv',
    'export_of_goods_and_services___GDP (1).csv',
    'Filtered_FDI_2010_2023.csv',
    'filtered_gdp_usd (1).csv',
    'Filtered_Real_Interest_Rates_Final.csv',
    'Filtered_Unemployment_Rate.csv',
    'population_growth_cleaned.csv',
    'tax_to_gdp_2009_2023.csv',
    'filtered_corruption_score(0-100).csv',
    'Average Wage Data  - Average Wage Data .csv'
]

# Target variable file
target_file = 'Public_Sector_Debt.csv'

# Rename predictors.
# Each key must equal the file name up to its first '.', because that is the
# column name the loader produces and looks up here. That means download
# suffixes such as " (1)" are part of the key, the ".csv" extension is not,
# and the trailing space in the wage file's stem is significant.
rename_dict = {
    'Annual Labor Force Survey': 'Labor_Force_Survey',
    'clean_import_of_goods_and_services_as_a_percentage_of_GDP (1)': 'Imports_to_GDP',
    'clean_inflation_rate (2)': 'Inflation_Rate',
    'clean_labor_force_participation_rate_modeled': 'Labor_Force_Participation',
    'Cleaned_GDP_Per_Capita_Data_2010_2023 - Cleaned_GDP_Per_Capita_Data_2010_2023': 'GDP_Per_Capita',
    'export_of_goods_and_services___GDP (1)': 'Exports_to_GDP',
    'Filtered_FDI_2010_2023': 'FDI',
    'filtered_gdp_usd (1)': 'GDP_USD',
    'Filtered_Real_Interest_Rates_Final': 'Real_Interest_Rates',
    'Filtered_Unemployment_Rate': 'Unemployment_Rate',
    'population_growth_cleaned': 'Population_Growth',
    'tax_to_gdp_2009_2023': 'Tax_to_GDP',
    'filtered_corruption_score(0-100)': 'Corruption',
    'Average Wage Data  - Average Wage Data ': 'Wage',
}

# Surface any predictor that would keep its raw file-stem column name, so a
# key typo is visible here rather than only in the regression summary.
unmapped_files = [name for name in files if name.split('.')[0] not in rename_dict]
if unmapped_files:
    print(f"Warning: no rename_dict entry for: {unmapped_files}")

Files in /content/drive/MyDrive/Capstone Data/Final Data: ['tax_to_gdp_2009_2023.csv', 'population_growth_cleaned.csv', 'Filtered_Unemployment_Rate.csv', 'Filtered_Real_Interest_Rates_Final.csv', 'Filtered_FDI_2010_2023.csv', 'export_of_goods_and_services___GDP (1).csv', 'Cleaned_GDP_Per_Capita_Data_2010_2023 - Cleaned_GDP_Per_Capita_Data_2010_2023.csv', 'clean_labor_force_participation_rate_modeled.csv', 'clean_inflation_rate (2).csv', 'clean_import_of_goods_and_services_as_a_percentage_of_GDP (1).csv', 'Annual Labor Force Survey.csv', 'Tax_Revenue_2010_2023_updated - Tax_Revenue_2010_2023_updated.csv', 'filtered_urban_population(% of total polpulation) - filtered_urban_population(% of total polpulation).csv', 'Average Wage Data  - Average Wage Data .csv', 'filtered_gdp_usd (1).csv', 'WUI_Annual_Averages_2009_2023_Pivot (1).gsheet', 'CPI.csv', 'filtered_corruption_score(0-100).csv', '.ipynb_checkpoints', 'Public_Sector_Debt.csv']


In [None]:
# Function to load and preprocess data
def load_and_preprocess(file_path):
    """Load one wide-format CSV from ``data_folder`` and reshape it to long format.

    The file must contain a ``CCode`` column and at least one column whose
    header consists only of digits (treated as a year). Every year column is
    melted into a single value column.

    Parameters
    ----------
    file_path : str
        File name relative to the module-level ``data_folder``.

    Returns
    -------
    pandas.DataFrame or None
        Frame with columns ``CCode`` (str), ``Year`` (int) and one value
        column. The value column is named after the text before the first
        ``'.'`` in ``file_path``, or after its ``rename_dict`` entry when one
        exists. ``None`` is returned, and the error is printed, if any step
        fails; callers are expected to check for ``None``.
    """
    try:
        # Column name for the melted values; also the rename_dict lookup key.
        value_col = file_path.split('.')[0]

        df = pd.read_csv(os.path.join(data_folder, file_path))
        if 'CCode' not in df.columns:
            raise ValueError("required column 'CCode' is missing")
        df['CCode'] = df['CCode'].astype(str)

        # Only all-digit headers are year columns; this also excludes the
        # descriptive columns such as 'Country' / 'Country Name'.
        year_cols = [col for col in df.columns if col.isdigit()]
        if not year_cols:
            # Report the problem instead of melting a frame with no value
            # columns and handing the result to the downstream inner merge.
            raise ValueError("no year columns (all-digit headers) found")

        df_melt = pd.melt(df, id_vars=['CCode'], value_vars=year_cols,
                          var_name='Year', value_name=value_col)
        df_melt['Year'] = df_melt['Year'].astype(int)

        # Non-numeric columns may hold numbers with thousands separators;
        # anything that still cannot be parsed becomes NaN.
        if not pd.api.types.is_numeric_dtype(df_melt[value_col]):
            cleaned = df_melt[value_col].astype(str).str.replace(',', '')
            df_melt[value_col] = pd.to_numeric(cleaned, errors='coerce')

        if value_col in rename_dict:
            df_melt = df_melt.rename(columns={value_col: rename_dict[value_col]})
        return df_melt
    except Exception as e:
        # Deliberate best-effort: one unreadable file must not abort the
        # whole load loop, so report it and let the caller decide.
        print(f"Error processing {file_path}: {type(e).__name__} - {str(e)}")
        return None

# --- Target variable ---------------------------------------------------------
target_df = load_and_preprocess(target_file)
if target_df is None:
    raise ValueError("Failed to load target file")
target_df = target_df.rename(columns={target_file.split('.')[0]: 'Public_Sector_Debt'})

# --- Predictors: keep every file that loads, report the ones that do not -----
predictor_dfs = []
for file in files:
    df = load_and_preprocess(file)
    if df is None:
        print(f"Skipping {file} due to loading error")
        continue
    predictor_dfs.append(df)

# --- Merge: inner-join each predictor onto the target by country and year ----
merged_df = target_df
for predictor_df in predictor_dfs:
    merged_df = merged_df.merge(predictor_df, on=['CCode', 'Year'], how='inner')

# --- Missing values: report, then fill each numeric column with its mean -----
print("Missing values before processing:")
print(merged_df.isna().sum())
initial_rows = merged_df.shape[0]
column_means = merged_df.mean(numeric_only=True)
merged_df = merged_df.fillna(column_means)
# fillna never removes rows, so the retained percentage below reflects the
# imputation step only, not rows lost in the inner joins above.
rows_after = merged_df.shape[0]
print(f"Shape of merged dataset after imputation: {merged_df.shape}")
print(f"Rows retained: {rows_after} ({(rows_after / initial_rows) * 100:.2f}%)")

Missing values before processing:
CCode                                      0
Year                                       0
Public_Sector_Debt                         0
Labor_Force_Survey                         0
Imports_to_GDP                             0
clean_inflation_rate (2)                   0
Labor_Force_Participation                  0
GDP_Per_Capita                             0
export_of_goods_and_services___GDP (1)     0
FDI                                        0
filtered_gdp_usd (1)                       0
Real_Interest_Rates                        0
Unemployment_Rate                          0
Population_Growth                          0
Tax_to_GDP                                 0
filtered_corruption_score(0-100)           0
Average Wage Data  - Average Wage Data     0
dtype: int64
Shape of merged dataset after imputation: (518, 17)
Rows retained: 518 (100.00%)


In [None]:
# First attempt: plain OLS on every merged column, with no dummies and no
# transformations.
import statsmodels.api as sm

# Everything except the identifiers and the target is treated as a predictor.
target_variable = 'Public_Sector_Debt'
non_predictor_columns = {'CCode', 'Year', target_variable}
predictors = [col for col in merged_df.columns if col not in non_predictor_columns]

X = merged_df[predictors]
y = merged_df[target_variable]

# Add the intercept column, fit, and show the regression table.
X_with_const = sm.add_constant(X)
model = sm.OLS(y, X_with_const).fit()
print(model.summary())


                            OLS Regression Results                            
Dep. Variable:     Public_Sector_Debt   R-squared:                       0.442
Model:                            OLS   Adj. R-squared:                  0.426
Method:                 Least Squares   F-statistic:                     28.42
Date:                Mon, 28 Apr 2025   Prob (F-statistic):           6.40e-55
Time:                        17:04:23   Log-Likelihood:                -2532.2
No. Observations:                 518   AIC:                             5094.
Df Residuals:                     503   BIC:                             5158.
Df Model:                          14                                         
Covariance Type:            nonrobust                                         
                                              coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------------------------

In [None]:
# Country codes excluded by hand from the rest of the analysis.
countries_to_drop = [
    'COL', 'MEX', 'TUR', 'ESP', 'PRT', 'LVA', 'CRI', 'SVN',
    'GRC', 'CZE', 'SVK', 'LTU', 'HUN', 'EST', 'POL', 'CHL',
]

# Keep only rows whose country code is not on the exclusion list; copy so the
# result is an independent frame rather than a view of the unfiltered data.
is_kept = ~merged_df['CCode'].isin(countries_to_drop)
merged_df = merged_df.loc[is_kept].copy()

print(f"Remaining countries after manual drop: {merged_df['CCode'].nunique()}")

Remaining countries after manual drop: 21


In [None]:
# Region-adjusted model: log1p(debt) regressed on standardized predictors plus
# region dummies, with HC3 heteroskedasticity-robust standard errors.

# This cell needs a 'Region' column on merged_df. Fail with an actionable
# message if it is missing (same exception type dropna would raise).
if 'Region' not in merged_df.columns:
    raise KeyError(
        "merged_df has no 'Region' column; add the country-to-region mapping "
        "to merged_df before running this cell"
    )

# Optionally: Drop rows where Region is NaN (unknown countries)
merged_df = merged_df.dropna(subset=['Region'])

# Create Region Dummies.
# dtype=float matters: the default dummy dtype is not float64/int64, so the
# numeric filter used for the predictors would silently discard the dummies
# and the fitted model would contain no region effect at all.
# The prefix keeps every column name a string and makes the summary readable.
region_dummies = pd.get_dummies(
    merged_df['Region'], prefix='Region', drop_first=True, dtype=float
)

# Prepare X matrix: numeric predictors first, then append the dummies so the
# dtype filter can never remove them.
X = merged_df.drop(['CCode', 'Year', 'Public_Sector_Debt', 'Region'], axis=1)
X = X.select_dtypes(include=['float64', 'int64'])
X = pd.concat([X, region_dummies], axis=1)

# Prepare y
y = np.log1p(merged_df['Public_Sector_Debt'])  # log transform if you're still using log1p

# Scale X (import repeated here so the cell also runs on its own)
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
# has_constant='add' guarantees exactly one leading intercept column, which
# the labelled design matrix below relies on.
X_scaled = sm.add_constant(X_scaled, has_constant='add')

# StandardScaler returns a bare ndarray; rebuild a labelled frame aligned with
# y so the summary shows variable names instead of x1, x2, ...
X_design = pd.DataFrame(X_scaled, columns=['const'] + list(X.columns), index=X.index)

# Fit model
model_with_regions = sm.OLS(y, X_design).fit(cov_type='HC3')

# Print results
print(model_with_regions.summary())


                            OLS Regression Results                            
Dep. Variable:     Public_Sector_Debt   R-squared:                       0.454
Model:                            OLS   Adj. R-squared:                  0.439
Method:                 Least Squares   F-statistic:                     45.01
Date:                Mon, 28 Apr 2025   Prob (F-statistic):           2.24e-79
Time:                        17:10:18   Log-Likelihood:                -373.64
No. Observations:                 518   AIC:                             777.3
Df Residuals:                     503   BIC:                             841.0
Df Model:                          14                                         
Covariance Type:                  HC3                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const          3.9185      0.023    173.201      0.0