In [None]:
# Mount Google Drive into the Colab filesystem; the data paths used by this
# notebook all live under this mount point.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
# Setting up Google Colab
# Basic imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# ML libraries
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor, GradientBoostingClassifier
from sklearn.metrics import mean_squared_error, classification_report, confusion_matrix

# Set plot style
# Use seaborn's "whitegrid" style for any plots drawn later in the notebook.
sns.set(style="whitegrid")

# Data Upload

In [None]:
# Datasets
# Source CSV locations on the mounted Drive, one entry per dataset.
csv_paths = {
    "debt": "/content/drive/MyDrive/Capstone Data/Final Data/Public_Sector_Debt.csv",
    "gdp": "/content/drive/MyDrive/Capstone Data/YOY/Cleaned_National_GDP_Yoy.csv",
    "pop": "/content/drive/MyDrive/Capstone Data/YOY/Cleaned_Pop_Growth.csv",
    "unemp": "/content/drive/MyDrive/Capstone Data/YOY/Cleaned_Unemployment_Yoy.csv",
    "stab": "/content/drive/MyDrive/Cleaned_Data/cleaned_political_stability.csv",
    "corr": "/content/drive/MyDrive/Capstone Data/Final Data/filtered_corruption_score(0-100).csv",
}

# Read every file into its own DataFrame, in the order listed above.
debt_df = pd.read_csv(csv_paths["debt"])
gdp_df = pd.read_csv(csv_paths["gdp"])
pop_df = pd.read_csv(csv_paths["pop"])
unemp_df = pd.read_csv(csv_paths["unemp"])
stab_df = pd.read_csv(csv_paths["stab"])
corr_df = pd.read_csv(csv_paths["corr"])

# Reshape the Data

In [None]:
import pandas as pd


def _load_country_year(csv_path, value_name):
    """Read a wide Country-by-year CSV and return it in long format.

    Keeps the "Country" column plus every column whose name consists only of
    digits (the year columns), then melts those columns into one row per
    (Country, Year) pair.

    Parameters
    ----------
    csv_path : str
        Location of the CSV file to read.
    value_name : str
        Name given to the melted value column.

    Returns
    -------
    pandas.DataFrame
        Columns "Country", "Year" (cast to int) and `value_name`.
    """
    wide = pd.read_csv(csv_path)
    year_cols = [col for col in wide.columns if col.isdigit()]
    long = wide.loc[:, ["Country"] + year_cols].melt(
        id_vars=["Country"], var_name="Year", value_name=value_name
    )
    # Year labels come from column headers (strings); cast so the merge keys
    # have the same integer dtype in every dataset.
    long["Year"] = long["Year"].astype(int)
    return long


# Load and reshape each indicator into (Country, Year, value) rows.
debt_df = _load_country_year(
    "/content/drive/MyDrive/Capstone Data/Final Data/Public_Sector_Debt.csv",
    "Public_Sector_Debt",
)
gdp_df = _load_country_year(
    "/content/drive/MyDrive/Capstone Data/YOY/Cleaned_National_GDP_Yoy.csv",
    "GDP_Growth",
)
pop_df = _load_country_year(
    "/content/drive/MyDrive/Capstone Data/YOY/Cleaned_Pop_Growth.csv",
    "Population_Growth",
)
# Unemployment must be read from its own file. Reading the political
# stability CSV here would make the "Unemployment" and "Political_Stability"
# columns identical and give the model a duplicated feature.
unemp_df = _load_country_year(
    "/content/drive/MyDrive/Capstone Data/YOY/Cleaned_Unemployment_Yoy.csv",
    "Unemployment",
)
stab_df = _load_country_year(
    "/content/drive/MyDrive/Cleaned_Data/cleaned_political_stability.csv",
    "Political_Stability",
)
corr_df = _load_country_year(
    "/content/drive/MyDrive/Capstone Data/Final Data/filtered_corruption_score(0-100).csv",
    "Corruption",
)

# Merge all datasets on Country and Year. No `how` is given, so pandas uses
# its default inner join and only pairs present in every dataset remain.
merged_df = (
    debt_df.merge(gdp_df, on=["Country", "Year"])
    .merge(pop_df, on=["Country", "Year"])
    .merge(unemp_df, on=["Country", "Year"])
    .merge(stab_df, on=["Country", "Year"])
    .merge(corr_df, on=["Country", "Year"])
)

# Drop rows with any missing values
merged_df = merged_df.dropna()

# Preview the merged dataset
print(merged_df.head())

    Country  Year  Public_Sector_Debt  GDP_Growth  Population_Growth  \
0   Austria  2010           59.740463   -0.022556           0.002404   
1   Belgium  2010           85.766646   -0.006186           0.009136   
2    Canada  2010           44.071359    0.182482           0.011114   
3     Chile  2010            8.607327    0.261628           0.010046   
4  Colombia  2010           29.518780    0.232759           0.011360   

   Unemployment  Political_Stability  Corruption  
0      1.152648             1.152648          79  
1      0.810646             0.810646          71  
2      0.936318             0.936318          89  
3      0.679605             0.679605          72  
4     -1.537185            -1.537185          35  


In [None]:
# Merge all by Country and Year, then drop rows with any missing value.
# This cell reassigns merged_df, so it applies the same dropna() as the
# reshape cell; without it the modelling cell would receive uncleaned rows.
merged_df = (
    debt_df.merge(gdp_df, on=["Country", "Year"])
    .merge(pop_df, on=["Country", "Year"])
    .merge(unemp_df, on=["Country", "Year"])
    .merge(stab_df, on=["Country", "Year"])
    .merge(corr_df, on=["Country", "Year"])
    .dropna()
)

In [None]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Define features and target variable
features = ["GDP_Growth", "Population_Growth", "Unemployment", "Political_Stability", "Corruption"]
target = "Public_Sector_Debt"

# One seed shared by the split and the model so a re-run reproduces the same
# metrics and importances.
SEED = 42

X = merged_df[features]
y = merged_df[target]

# Split data into training and testing sets (20% held out for testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=SEED)

# Initialize and train the Gradient Boosting model.
# random_state is set explicitly: without it, the feature permutation used to
# break ties between equally good splits is unseeded, so results may vary
# from run to run.
model = GradientBoostingRegressor(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=3,
    random_state=SEED,
)
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")

# Feature importance, printed in the same order as `features`
importances = model.feature_importances_
for feature, importance in zip(features, importances):
    print(f"{feature}: {importance}")

Mean Squared Error: 695.1430664229146
GDP_Growth: 0.09405154870950495
Population_Growth: 0.4089600863731905
Unemployment: 0.22945340177197435
Political_Stability: 0.08612428968983164
Corruption: 0.18141067345549855
