In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd

# Load the Public Debt (Target) and Export datasets
df_debt = pd.read_csv('/content/drive/MyDrive/Capstone Data/2022-2023/Cleaned_Public_Debt_2022-2023.csv')
df_export = pd.read_csv('/content/drive/MyDrive/Capstone Data/2022-2023/Cleaned_Export_2022-2023.csv')

In [3]:
df_export.head()

Unnamed: 0,Country,Ccode,2022,2023
0,Australia,AUS,0.257272,0.072743
1,Austria,AUT,0.087363,0.040871
2,Belgium,BEL,0.053928,-0.040118
3,Canada,CAN,0.167388,-0.022728
4,Switzerland,CHE,0.085887,0.05866


In [4]:
import pandas as pd
from scipy.stats import pearsonr
import matplotlib.pyplot as plt

# List of European OECD countries (Geographic)
european_oecd_countries = [
    "Austria", "Belgium", "Czech Republic", "Denmark", "Estonia", "Finland", "France", "Germany",
    "Greece", "Hungary", "Iceland", "Ireland", "Italy", "Latvia", "Lithuania", "Luxembourg",
    "Netherlands", "Norway", "Poland", "Portugal", "Slovak Republic", "Slovenia", "Spain", "Sweden", "Switzerland", "United Kingdom"]

# List of non-European OECD countries
non_european_oecd_countries = [
    "Australia", "Canada", "Chile", "Colombia", "Israel", "Japan", "Korea", "Mexico", "New Zealand", "Turkey", "United States"]

In [5]:
import pandas as pd
from scipy.stats import pearsonr
import statsmodels.api as sm
import matplotlib.pyplot as plt

# Merge Datasets based on 'Country'
merged_df = pd.merge(df_debt, df_export, on='Country', how='inner')

# Convert wide format (years as columns) to long format
df_long_debt = df_debt.melt(id_vars=["Country"], var_name="Year", value_name="Public_Debt")
df_long_export = df_export.melt(id_vars=["Country"], var_name="Year", value_name="Exports")

# Merge the long format DataFrames
df_long = pd.merge(df_long_debt, df_long_export, on=['Country', 'Year'], how='inner')

# Convert 'Year' column to numeric, handling potential errors
df_long['Year'] = pd.to_numeric(df_long['Year'], errors='coerce')

# Convert 'Public_Debt' and 'Exports' to numeric
df_long['Public_Debt'] = pd.to_numeric(df_long['Public_Debt'], errors='coerce')
df_long['Exports'] = pd.to_numeric(df_long['Exports'], errors='coerce')

# Filter to 2022-2023
df_2022_2023 = df_long[df_long["Year"].between(2022, 2023)].dropna(subset=["Public_Debt", "Exports"]).copy()

# IQR function with country tracking (only for Exports)
def drop_outliers_iqr(df, column):
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower = Q1 - 1.5 * IQR
    upper = Q3 + 1.5 * IQR
    removed_countries = df[(df[column] < lower) | (df[column] > upper)]["Country"].unique().tolist()  # Track unique removed countries
    df_cleaned = df[(df[column] >= lower) & (df[column] <= upper)]
    return df_cleaned, removed_countries

# Drop outliers only for Exports and track removed countries
df_2022_2023_clean, removed_countries_exports = drop_outliers_iqr(df_2022_2023, "Exports")

# Split removed countries into European and non-European OECD countries
removed_european_countries = [country for country in removed_countries_exports if country in european_oecd_countries]
removed_non_european_countries = [country for country in removed_countries_exports if country in non_european_oecd_countries]

# Filter out European and non-European OECD countries separately
df_2022_2023_european = df_2022_2023_clean[df_2022_2023_clean["Country"].isin(european_oecd_countries)]
df_2022_2023_non_european = df_2022_2023_clean[df_2022_2023_clean["Country"].isin(non_european_oecd_countries)]

# Split European countries into positive and negative groups based on debt-to-exports ratio (average over 2022-2023)
df_2022_2023_european['Debt_to_Exports'] = df_2022_2023_european["Public_Debt"] / df_2022_2023_european["Exports"]
median_ratio = df_2022_2023_european.groupby('Country')['Debt_to_Exports'].mean().median()
european_ratios = df_2022_2023_european.groupby('Country')['Debt_to_Exports'].mean()
positive_european = european_ratios[european_ratios >= median_ratio].index.tolist()
negative_european = european_ratios[european_ratios < median_ratio].index.tolist()

# Filter European groups
df_positive_european = df_2022_2023_european[df_2022_2023_european["Country"].isin(positive_european)]
df_negative_european = df_2022_2023_european[df_2022_2023_european["Country"].isin(negative_european)]

# Compute correlation and regression for Positive European OECD countries
if len(df_positive_european) > 2:
    corr_positive, p_positive = pearsonr(df_positive_european["Public_Debt"], df_positive_european["Exports"])
    sig_positive = "Significant" if p_positive < 0.05 else "Not Significant"
    print(f"2022-2023 Correlation for Positive European OECD countries (High Debt-to-Exports) after removing outliers: {corr_positive:.3f}, p-value: {p_positive:.3f} ({sig_positive})")

    X_pos = df_positive_european["Exports"]
    y_pos = df_positive_european["Public_Debt"]
    X_pos = sm.add_constant(X_pos)
    model_pos = sm.OLS(y_pos, X_pos).fit()
    print("\nLinear Regression (Positive European OECD, 2022-2023):")
    print(f"Coefficient (Exports): {model_pos.params['Exports']:.3f}, p-value: {model_pos.pvalues['Exports']:.3f}")
    print(f"R-squared: {model_pos.rsquared:.3f}")
    print(model_pos.summary())
else:
    print(f"Not enough data for Positive European OECD countries. Sample size: {len(df_positive_european)}")

# Compute correlation and regression for Negative European OECD countries
if len(df_negative_european) > 2:
    corr_negative, p_negative = pearsonr(df_negative_european["Public_Debt"], df_negative_european["Exports"])
    sig_negative = "Significant" if p_negative < 0.05 else "Not Significant"
    print(f"2022-2023 Correlation for Negative European OECD countries (Low Debt-to-Exports) after removing outliers: {corr_negative:.3f}, p-value: {p_negative:.3f} ({sig_negative})")

    X_neg = df_negative_european["Exports"]
    y_neg = df_negative_european["Public_Debt"]
    X_neg = sm.add_constant(X_neg)
    model_neg = sm.OLS(y_neg, X_neg).fit()
    print("\nLinear Regression (Negative European OECD, 2022-2023):")
    print(f"Coefficient (Exports): {model_neg.params['Exports']:.3f}, p-value: {model_neg.pvalues['Exports']:.3f}")
    print(f"R-squared: {model_neg.rsquared:.3f}")
    print(model_neg.summary())
else:
    print(f"Not enough data for Negative European OECD countries. Sample size: {len(df_negative_european)}")

# Compute correlation and regression for non-European OECD countries
if len(df_2022_2023_non_european) > 2:
    corr_non_european, p_non_european = pearsonr(df_2022_2023_non_european["Public_Debt"], df_2022_2023_non_european["Exports"])
    sig_non_european = "Significant" if p_non_european < 0.05 else "Not Significant"
    print(f"2022-2023 Correlation for non-European OECD countries after removing outliers in Exports: {corr_non_european:.3f}, p-value: {p_non_european:.3f} ({sig_non_european})")

    X_non_eu = df_2022_2023_non_european["Exports"]
    y_non_eu = df_2022_2023_non_european["Public_Debt"]
    X_non_eu = sm.add_constant(X_non_eu)
    model_non_eu = sm.OLS(y_non_eu, X_non_eu).fit()
    print("\nLinear Regression (Non-European OECD, 2022-2023):")
    print(f"Coefficient (Exports): {model_non_eu.params['Exports']:.3f}, p-value: {model_non_eu.pvalues['Exports']:.3f}")
    print(f"R-squared: {model_non_eu.rsquared:.3f}")
    print(model_non_eu.summary())
else:
    print("Not enough data left for non-European OECD countries after outlier removal.")

# Calculate the overall correlation and regression for 2022-2023 (after removing outliers in Exports)
if len(df_2022_2023_clean) > 2:
    overall_corr, overall_p = pearsonr(df_2022_2023_clean["Public_Debt"], df_2022_2023_clean["Exports"])
    sig_overall = "Significant" if overall_p < 0.05 else "Not Significant"
    print(f"Overall Correlation between Public Sector Debt and Exports (2022-2023) after removing outliers in Exports: {overall_corr:.3f}, p-value: {overall_p:.3f} ({sig_overall})")

    X_overall = df_2022_2023_clean["Exports"]
    y_overall = df_2022_2023_clean["Public_Debt"]
    X_overall = sm.add_constant(X_overall)
    model_overall = sm.OLS(y_overall, X_overall).fit()
    print("\nOverall Linear Regression (2022-2023):")
    print(f"Coefficient (Exports): {model_overall.params['Exports']:.3f}, p-value: {model_overall.pvalues['Exports']:.3f}")
    print(f"R-squared: {model_overall.rsquared:.3f}")
    print(model_overall.summary())
else:
    print("Not enough data left for overall correlation after outlier removal.")

# Print the countries that were removed and the European groups
print("\nEuropean OECD countries removed due to outliers in Exports:", removed_european_countries if removed_european_countries else "None")
print("Non-European OECD countries removed due to outliers in Exports:", removed_non_european_countries if removed_non_european_countries else "None")
print("Positive European OECD countries (High Debt-to-Exports):", positive_european)
print("Negative European OECD countries (Low Debt-to-Exports):", negative_european)

2022-2023 Correlation for Positive European OECD countries (High Debt-to-Exports) after removing outliers: -0.557, p-value: 0.003 (Significant)

Linear Regression (Positive European OECD, 2022-2023):
Coefficient (Exports): -0.418, p-value: 0.003
R-squared: 0.310
                            OLS Regression Results                            
Dep. Variable:            Public_Debt   R-squared:                       0.310
Model:                            OLS   Adj. R-squared:                  0.282
Method:                 Least Squares   F-statistic:                     10.81
Date:                Sun, 13 Apr 2025   Prob (F-statistic):            0.00311
Time:                        20:36:17   Log-Likelihood:                 46.682
No. Observations:                  26   AIC:                            -89.36
Df Residuals:                      24   BIC:                            -86.85
Df Model:                           1                                         
Covariance Type:          

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_2022_2023_european['Debt_to_Exports'] = df_2022_2023_european["Public_Debt"] / df_2022_2023_european["Exports"]
  return hypotest_fun_in(*args, **kwds)


In [8]:
#Import the cleaned GDP YOY dataset
df_GDP = pd.read_csv('/content/drive/MyDrive/Capstone Data/2022-2023/Cleaned_National_GDP_2022-2023.csv')

In [9]:
df_GDP.head()

Unnamed: 0,Country,CCode,2022,2023
0,Australia,AUS,0.083333,0.023669
1,Austria,AUT,-0.016667,0.084746
2,Belgium,BEL,-0.008361,0.08769
3,Canada,CAN,0.074627,-0.009259
4,Switzerland,CHE,0.00615,0.081907


In [10]:
import pandas as pd
from scipy.stats import pearsonr
import statsmodels.api as sm
import matplotlib.pyplot as plt

# Merge Datasets based on 'Country'
merged_df = pd.merge(df_debt, df_GDP, on='Country', how='inner')

# Convert wide format (years as columns) to long format
df_long_debt = df_debt.melt(id_vars=["Country"], var_name="Year", value_name="Public_Debt")
df_long_GDP = df_GDP.melt(id_vars=["Country"], var_name="Year", value_name="GDP")

# Merge the long format DataFrames
df_long = pd.merge(df_long_debt, df_long_GDP, on=['Country', 'Year'], how='inner')

# Convert 'Year' column to numeric, handling potential errors
df_long['Year'] = pd.to_numeric(df_long['Year'], errors='coerce')

# Convert 'Public_Debt' and 'GDP' to numeric
df_long['Public_Debt'] = pd.to_numeric(df_long['Public_Debt'], errors='coerce')
df_long['GDP'] = pd.to_numeric(df_long['GDP'], errors='coerce')

# Filter to 2022-2023
df_2022_2023 = df_long[df_long["Year"].between(2022, 2023)].dropna(subset=["Public_Debt", "GDP"]).copy()

# IQR function with country tracking (only for GDP)
def drop_outliers_iqr(df, column):
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower = Q1 - 1.5 * IQR
    upper = Q3 + 1.5 * IQR
    removed_countries = df[(df[column] < lower) | (df[column] > upper)]["Country"].unique().tolist()  # Track unique removed countries
    df_cleaned = df[(df[column] >= lower) & (df[column] <= upper)]
    return df_cleaned, removed_countries

# Drop outliers only for GDP and track removed countries
df_2022_2023_clean, removed_countries_GDP = drop_outliers_iqr(df_2022_2023, "GDP")

# Split removed countries into European and non-European OECD countries
removed_european_countries = [country for country in removed_countries_GDP if country in european_oecd_countries]
removed_non_european_countries = [country for country in removed_countries_GDP if country in non_european_oecd_countries]

# Filter out European and non-European OECD countries separately
df_2022_2023_european = df_2022_2023_clean[df_2022_2023_clean["Country"].isin(european_oecd_countries)]
df_2022_2023_non_european = df_2022_2023_clean[df_2022_2023_clean["Country"].isin(non_european_oecd_countries)]

# Split European countries into positive and negative groups based on debt-to-GDP ratio (average over 2022-2023)
df_2022_2023_european['Debt_to_GDP'] = df_2022_2023_european["Public_Debt"] / df_2022_2023_european["GDP"]
median_ratio = df_2022_2023_european.groupby('Country')['Debt_to_GDP'].mean().median()
european_ratios = df_2022_2023_european.groupby('Country')['Debt_to_GDP'].mean()
positive_european = european_ratios[european_ratios >= median_ratio].index.tolist()
negative_european = european_ratios[european_ratios < median_ratio].index.tolist()

# Filter European groups
df_positive_european = df_2022_2023_european[df_2022_2023_european["Country"].isin(positive_european)]
df_negative_european = df_2022_2023_european[df_2022_2023_european["Country"].isin(negative_european)]

# Compute correlation and regression for Positive European OECD countries
if len(df_positive_european) > 2:
    corr_positive, p_positive = pearsonr(df_positive_european["Public_Debt"], df_positive_european["GDP"])
    sig_positive = "Significant" if p_positive < 0.05 else "Not Significant"
    print(f"2022-2023 Correlation for Positive European OECD countries (High Debt-to-GDP) after removing outliers: {corr_positive:.3f}, p-value: {p_positive:.3f} ({sig_positive})")

    X_pos = df_positive_european["GDP"]
    y_pos = df_positive_european["Public_Debt"]
    X_pos = sm.add_constant(X_pos)
    model_pos = sm.OLS(y_pos, X_pos).fit()
    print("\nLinear Regression (Positive European OECD, 2022-2023):")
    print(f"Coefficient (GDP): {model_pos.params['GDP']:.3f}, p-value: {model_pos.pvalues['GDP']:.3f}")
    print(f"R-squared: {model_pos.rsquared:.3f}")
    print(model_pos.summary())
else:
    print(f"Not enough data for Positive European OECD countries. Sample size: {len(df_positive_european)}")

# Compute correlation and regression for Negative European OECD countries
if len(df_negative_european) > 2:
    corr_negative, p_negative = pearsonr(df_negative_european["Public_Debt"], df_negative_european["GDP"])
    sig_negative = "Significant" if p_negative < 0.05 else "Not Significant"
    print(f"2022-2023 Correlation for Negative European OECD countries (Low Debt-to-GDP) after removing outliers: {corr_negative:.3f}, p-value: {p_negative:.3f} ({sig_negative})")

    X_neg = df_negative_european["GDP"]
    y_neg = df_negative_european["Public_Debt"]
    X_neg = sm.add_constant(X_neg)
    model_neg = sm.OLS(y_neg, X_neg).fit()
    print("\nLinear Regression (Negative European OECD, 2022-2023):")
    print(f"Coefficient (GDP): {model_neg.params['GDP']:.3f}, p-value: {model_neg.pvalues['GDP']:.3f}")
    print(f"R-squared: {model_neg.rsquared:.3f}")
    print(model_neg.summary())
else:
    print(f"Not enough data for Negative European OECD countries. Sample size: {len(df_negative_european)}")

# Compute correlation and regression for non-European OECD countries
if len(df_2022_2023_non_european) > 2:
    corr_non_european, p_non_european = pearsonr(df_2022_2023_non_european["Public_Debt"], df_2022_2023_non_european["GDP"])
    sig_non_european = "Significant" if p_non_european < 0.05 else "Not Significant"
    print(f"2022-2023 Correlation for non-European OECD countries after removing outliers in GDP: {corr_non_european:.3f}, p-value: {p_non_european:.3f} ({sig_non_european})")

    X_non_eu = df_2022_2023_non_european["GDP"]
    y_non_eu = df_2022_2023_non_european["Public_Debt"]
    X_non_eu = sm.add_constant(X_non_eu)
    model_non_eu = sm.OLS(y_non_eu, X_non_eu).fit()
    print("\nLinear Regression (Non-European OECD, 2022-2023):")
    print(f"Coefficient (GDP): {model_non_eu.params['GDP']:.3f}, p-value: {model_non_eu.pvalues['GDP']:.3f}")
    print(f"R-squared: {model_non_eu.rsquared:.3f}")
    print(model_non_eu.summary())
else:
    print("Not enough data left for non-European OECD countries after outlier removal.")

# Calculate the overall correlation and regression for 2022-2023 (after removing outliers in GDP)
if len(df_2022_2023_clean) > 2:
    overall_corr, overall_p = pearsonr(df_2022_2023_clean["Public_Debt"], df_2022_2023_clean["GDP"])
    sig_overall = "Significant" if overall_p < 0.05 else "Not Significant"
    print(f"Overall Correlation between Public Sector Debt and GDP (2022-2023) after removing outliers in GDP: {overall_corr:.3f}, p-value: {overall_p:.3f} ({sig_overall})")

    X_overall = df_2022_2023_clean["GDP"]
    y_overall = df_2022_2023_clean["Public_Debt"]
    X_overall = sm.add_constant(X_overall)
    model_overall = sm.OLS(y_overall, X_overall).fit()
    print("\nOverall Linear Regression (2022-2023):")
    print(f"Coefficient (GDP): {model_overall.params['GDP']:.3f}, p-value: {model_overall.pvalues['GDP']:.3f}")
    print(f"R-squared: {model_overall.rsquared:.3f}")
    print(model_overall.summary())
else:
    print("Not enough data left for overall correlation after outlier removal.")

# Print the countries that were removed and the European groups
print("\nEuropean OECD countries removed due to outliers in GDP:", removed_european_countries if removed_european_countries else "None")
print("Non-European OECD countries removed due to outliers in GDP:", removed_non_european_countries if removed_non_european_countries else "None")
print("Positive European OECD countries (High Debt-to-GDP):", positive_european)
print("Negative European OECD countries (Low Debt-to-GDP):", negative_european)

2022-2023 Correlation for Positive European OECD countries (High Debt-to-GDP) after removing outliers: 0.212, p-value: 0.299 (Not Significant)

Linear Regression (Positive European OECD, 2022-2023):
Coefficient (GDP): 0.161, p-value: 0.299
R-squared: 0.045
                            OLS Regression Results                            
Dep. Variable:            Public_Debt   R-squared:                       0.045
Model:                            OLS   Adj. R-squared:                  0.005
Method:                 Least Squares   F-statistic:                     1.128
Date:                Sun, 13 Apr 2025   Prob (F-statistic):              0.299
Time:                        20:39:15   Log-Likelihood:                 41.880
No. Observations:                  26   AIC:                            -79.76
Df Residuals:                      24   BIC:                            -77.24
Df Model:                           1                                         
Covariance Type:            nonr

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_2022_2023_european['Debt_to_GDP'] = df_2022_2023_european["Public_Debt"] / df_2022_2023_european["GDP"]
  return hypotest_fun_in(*args, **kwds)


In [11]:
df_import = pd.read_csv('/content/drive/MyDrive/Capstone Data/2022-2023/Cleaned_Import_2022-2023.csv')
df_import.head()

Unnamed: 0,Country,Ccode,2022,2023
0,Australia,AUS,0.207854,0.105261
1,Austria,AUT,0.112793,-0.081246
2,Belgium,BEL,0.092875,-0.123695
3,Canada,CAN,0.163181,0.003243
4,Switzerland,CHE,0.078947,-0.244278


In [12]:
import pandas as pd
from scipy.stats import pearsonr
import statsmodels.api as sm
import matplotlib.pyplot as plt

# Merge Datasets based on 'Country'
merged_df = pd.merge(df_debt, df_import, on='Country', how='inner')

# Convert wide format (years as columns) to long format
df_long_debt = df_debt.melt(id_vars=["Country"], var_name="Year", value_name="Public_Debt")
df_long_import = df_import.melt(id_vars=["Country"], var_name="Year", value_name="Imports")

# Merge the long format DataFrames
df_long = pd.merge(df_long_debt, df_long_import, on=['Country', 'Year'], how='inner')

# Convert 'Year' column to numeric, handling potential errors
df_long['Year'] = pd.to_numeric(df_long['Year'], errors='coerce')

# Convert 'Public_Debt' and 'Imports' to numeric
df_long['Public_Debt'] = pd.to_numeric(df_long['Public_Debt'], errors='coerce')
df_long['Imports'] = pd.to_numeric(df_long['Imports'], errors='coerce')

# Filter to 2022-2023
df_2022_2023 = df_long[df_long["Year"].between(2022, 2023)].dropna(subset=["Public_Debt", "Imports"]).copy()

# IQR function with country tracking (only for Imports)
def drop_outliers_iqr(df, column):
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower = Q1 - 1.5 * IQR
    upper = Q3 + 1.5 * IQR
    removed_countries = df[(df[column] < lower) | (df[column] > upper)]["Country"].unique().tolist()  # Track unique removed countries
    df_cleaned = df[(df[column] >= lower) & (df[column] <= upper)]
    return df_cleaned, removed_countries

# Drop outliers only for Imports and track removed countries
df_2022_2023_clean, removed_countries_imports = drop_outliers_iqr(df_2022_2023, "Imports")

# Split removed countries into European and non-European OECD countries
removed_european_countries = [country for country in removed_countries_imports if country in european_oecd_countries]
removed_non_european_countries = [country for country in removed_countries_imports if country in non_european_oecd_countries]

# Filter out European and non-European OECD countries separately
df_2022_2023_european = df_2022_2023_clean[df_2022_2023_clean["Country"].isin(european_oecd_countries)]
df_2022_2023_non_european = df_2022_2023_clean[df_2022_2023_clean["Country"].isin(non_european_oecd_countries)]

# Split European countries into positive and negative groups based on debt-to-imports ratio (average over 2022-2023)
df_2022_2023_european['Debt_to_Imports'] = df_2022_2023_european["Public_Debt"] / df_2022_2023_european["Imports"]
median_ratio = df_2022_2023_european.groupby('Country')['Debt_to_Imports'].mean().median()
european_ratios = df_2022_2023_european.groupby('Country')['Debt_to_Imports'].mean()
positive_european = european_ratios[european_ratios >= median_ratio].index.tolist()
negative_european = european_ratios[european_ratios < median_ratio].index.tolist()

# Filter European groups
df_positive_european = df_2022_2023_european[df_2022_2023_european["Country"].isin(positive_european)]
df_negative_european = df_2022_2023_european[df_2022_2023_european["Country"].isin(negative_european)]

# Compute correlation and regression for Positive European OECD countries
if len(df_positive_european) > 2:
    corr_positive, p_positive = pearsonr(df_positive_european["Public_Debt"], df_positive_european["Imports"])
    sig_positive = "Significant" if p_positive < 0.05 else "Not Significant"
    print(f"2022-2023 Correlation for Positive European OECD countries (High Debt-to-Imports) after removing outliers: {corr_positive:.3f}, p-value: {p_positive:.3f} ({sig_positive})")

    X_pos = df_positive_european["Imports"]
    y_pos = df_positive_european["Public_Debt"]
    X_pos = sm.add_constant(X_pos)
    model_pos = sm.OLS(y_pos, X_pos).fit()
    print("\nLinear Regression (Positive European OECD, 2022-2023):")
    print(f"Coefficient (Imports): {model_pos.params['Imports']:.3f}, p-value: {model_pos.pvalues['Imports']:.3f}")
    print(f"R-squared: {model_pos.rsquared:.3f}")
    print(model_pos.summary())
else:
    print(f"Not enough data for Positive European OECD countries. Sample size: {len(df_positive_european)}")

# Compute correlation and regression for Negative European OECD countries
if len(df_negative_european) > 2:
    corr_negative, p_negative = pearsonr(df_negative_european["Public_Debt"], df_negative_european["Imports"])
    sig_negative = "Significant" if p_negative < 0.05 else "Not Significant"
    print(f"2022-2023 Correlation for Negative European OECD countries (Low Debt-to-Imports) after removing outliers: {corr_negative:.3f}, p-value: {p_negative:.3f} ({sig_negative})")

    X_neg = df_negative_european["Imports"]
    y_neg = df_negative_european["Public_Debt"]
    X_neg = sm.add_constant(X_neg)
    model_neg = sm.OLS(y_neg, X_neg).fit()
    print("\nLinear Regression (Negative European OECD, 2022-2023):")
    print(f"Coefficient (Imports): {model_neg.params['Imports']:.3f}, p-value: {model_neg.pvalues['Imports']:.3f}")
    print(f"R-squared: {model_neg.rsquared:.3f}")
    print(model_neg.summary())
else:
    print(f"Not enough data for Negative European OECD countries. Sample size: {len(df_negative_european)}")

# Compute correlation and regression for non-European OECD countries
if len(df_2022_2023_non_european) > 2:
    corr_non_european, p_non_european = pearsonr(df_2022_2023_non_european["Public_Debt"], df_2022_2023_non_european["Imports"])
    sig_non_european = "Significant" if p_non_european < 0.05 else "Not Significant"
    print(f"2022-2023 Correlation for non-European OECD countries after removing outliers in Imports: {corr_non_european:.3f}, p-value: {p_non_european:.3f} ({sig_non_european})")

    X_non_eu = df_2022_2023_non_european["Imports"]
    y_non_eu = df_2022_2023_non_european["Public_Debt"]
    X_non_eu = sm.add_constant(X_non_eu)
    model_non_eu = sm.OLS(y_non_eu, X_non_eu).fit()
    print("\nLinear Regression (Non-European OECD, 2022-2023):")
    print(f"Coefficient (Imports): {model_non_eu.params['Imports']:.3f}, p-value: {model_non_eu.pvalues['Imports']:.3f}")
    print(f"R-squared: {model_non_eu.rsquared:.3f}")
    print(model_non_eu.summary())
else:
    print("Not enough data left for non-European OECD countries after outlier removal.")

# Calculate the overall correlation and regression for 2022-2023 (after removing outliers in Imports)
if len(df_2022_2023_clean) > 2:
    overall_corr, overall_p = pearsonr(df_2022_2023_clean["Public_Debt"], df_2022_2023_clean["Imports"])
    sig_overall = "Significant" if overall_p < 0.05 else "Not Significant"
    print(f"Overall Correlation between Public Sector Debt and Imports (2022-2023) after removing outliers in Imports: {overall_corr:.3f}, p-value: {overall_p:.3f} ({sig_overall})")

    X_overall = df_2022_2023_clean["Imports"]
    y_overall = df_2022_2023_clean["Public_Debt"]
    X_overall = sm.add_constant(X_overall)
    model_overall = sm.OLS(y_overall, X_overall).fit()
    print("\nOverall Linear Regression (2022-2023):")
    print(f"Coefficient (Imports): {model_overall.params['Imports']:.3f}, p-value: {model_overall.pvalues['Imports']:.3f}")
    print(f"R-squared: {model_overall.rsquared:.3f}")
    print(model_overall.summary())
else:
    print("Not enough data left for overall correlation after outlier removal.")

# Print the countries that were removed and the European groups
print("\nEuropean OECD countries removed due to outliers in Imports:", removed_european_countries if removed_european_countries else "None")
print("Non-European OECD countries removed due to outliers in Imports:", removed_non_european_countries if removed_non_european_countries else "None")
print("Positive European OECD countries (High Debt-to-Imports):", positive_european)
print("Negative European OECD countries (Low Debt-to-Imports):", negative_european)

2022-2023 Correlation for Positive European OECD countries (High Debt-to-Imports) after removing outliers: -0.035, p-value: 0.865 (Not Significant)

Linear Regression (Positive European OECD, 2022-2023):
Coefficient (Imports): -0.011, p-value: 0.865
R-squared: 0.001
                            OLS Regression Results                            
Dep. Variable:            Public_Debt   R-squared:                       0.001
Model:                            OLS   Adj. R-squared:                 -0.040
Method:                 Least Squares   F-statistic:                   0.02959
Date:                Sun, 13 Apr 2025   Prob (F-statistic):              0.865
Time:                        20:41:14   Log-Likelihood:                 44.271
No. Observations:                  26   AIC:                            -84.54
Df Residuals:                      24   BIC:                            -82.03
Df Model:                           1                                         
Covariance Type:      

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_2022_2023_european['Debt_to_Imports'] = df_2022_2023_european["Public_Debt"] / df_2022_2023_european["Imports"]


In [13]:
df_pop = pd.read_csv('/content/drive/MyDrive/Capstone Data/2022-2023/Cleaned_Pop_Growth_2022-2023.csv')
df_pop.head()

Unnamed: 0,Country,CCode,2022,2023
0,Australia,AUS,0.012727,0.024475
1,Austria,AUT,0.009563,0.009895
2,Belgium,BEL,0.008082,0.009137
3,Canada,CAN,0.018119,0.029323
4,Switzerland,CHE,0.008299,0.012568


In [15]:
import pandas as pd
from scipy.stats import pearsonr
import statsmodels.api as sm
import matplotlib.pyplot as plt

# Merge Datasets based on 'Country'
merged_df = pd.merge(df_debt, df_pop, on='Country', how='inner')

# Convert wide format (years as columns) to long format
df_long_debt = df_debt.melt(id_vars=["Country"], var_name="Year", value_name="Public_Debt")
df_long_pop = df_pop.melt(id_vars=["Country"], var_name="Year", value_name="Population")

# Merge the long format DataFrames
df_long = pd.merge(df_long_debt, df_long_pop, on=['Country', 'Year'], how='inner')

# Convert 'Year' column to numeric, handling potential errors
df_long['Year'] = pd.to_numeric(df_long['Year'], errors='coerce')

# Convert 'Public_Debt' and 'Population' to numeric
df_long['Public_Debt'] = pd.to_numeric(df_long['Public_Debt'], errors='coerce')
df_long['Population'] = pd.to_numeric(df_long['Population'], errors='coerce')

# Filter to 2022-2023
df_2022_2023 = df_long[df_long["Year"].between(2022, 2023)].dropna(subset=["Public_Debt", "Population"]).copy()

# IQR function with country tracking (only for Population)
def drop_outliers_iqr(df, column):
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower = Q1 - 1.5 * IQR
    upper = Q3 + 1.5 * IQR
    removed_countries = df[(df[column] < lower) | (df[column] > upper)]["Country"].unique().tolist()  # Track unique removed countries
    df_cleaned = df[(df[column] >= lower) & (df[column] <= upper)]
    return df_cleaned, removed_countries

# Drop outliers only for Population and track removed countries
df_2022_2023_clean, removed_countries_pop = drop_outliers_iqr(df_2022_2023, "Population")

# Split removed countries into European and non-European OECD countries
removed_european_countries = [country for country in removed_countries_pop if country in european_oecd_countries]
removed_non_european_countries = [country for country in removed_countries_pop if country in non_european_oecd_countries]

# Filter out European and non-European OECD countries separately
df_2022_2023_european = df_2022_2023_clean[df_2022_2023_clean["Country"].isin(european_oecd_countries)]
df_2022_2023_non_european = df_2022_2023_clean[df_2022_2023_clean["Country"].isin(non_european_oecd_countries)]

# Split European countries into positive and negative groups based on debt-to-population ratio (average over 2022-2023)
df_2022_2023_european['Debt_to_Population'] = df_2022_2023_european["Public_Debt"] / df_2022_2023_european["Population"]
median_ratio = df_2022_2023_european.groupby('Country')['Debt_to_Population'].mean().median()
european_ratios = df_2022_2023_european.groupby('Country')['Debt_to_Population'].mean()
positive_european = european_ratios[european_ratios >= median_ratio].index.tolist()
negative_european = european_ratios[european_ratios < median_ratio].index.tolist()

# Filter European groups
df_positive_european = df_2022_2023_european[df_2022_2023_european["Country"].isin(positive_european)]
df_negative_european = df_2022_2023_european[df_2022_2023_european["Country"].isin(negative_european)]

# Compute correlation and regression for Positive European OECD countries
if len(df_positive_european) > 2:
    corr_positive, p_positive = pearsonr(df_positive_european["Public_Debt"], df_positive_european["Population"])
    sig_positive = "Significant" if p_positive < 0.05 else "Not Significant"
    print(f"2022-2023 Correlation for Positive European OECD countries (High Debt-to-Population) after removing outliers: {corr_positive:.3f}, p-value: {p_positive:.3f} ({sig_positive})")

    X_pos = df_positive_european["Population"]
    y_pos = df_positive_european["Public_Debt"]
    X_pos = sm.add_constant(X_pos)
    model_pos = sm.OLS(y_pos, X_pos).fit()
    print("\nLinear Regression (Positive European OECD, 2022-2023):")
    print(f"Coefficient (Population): {model_pos.params['Population']:.3f}, p-value: {model_pos.pvalues['Population']:.3f}")
    print(f"R-squared: {model_pos.rsquared:.3f}")
    print(model_pos.summary())
else:
    print(f"Not enough data for Positive European OECD countries. Sample size: {len(df_positive_european)}")

# Compute correlation and regression for Negative European OECD countries
if len(df_negative_european) > 2:
    corr_negative, p_negative = pearsonr(df_negative_european["Public_Debt"], df_negative_european["Population"])
    sig_negative = "Significant" if p_negative < 0.05 else "Not Significant"
    print(f"2022-2023 Correlation for Negative European OECD countries (Low Debt-to-Population) after removing outliers: {corr_negative:.3f}, p-value: {p_negative:.3f} ({sig_negative})")

    X_neg = df_negative_european["Population"]
    y_neg = df_negative_european["Public_Debt"]
    X_neg = sm.add_constant(X_neg)
    model_neg = sm.OLS(y_neg, X_neg).fit()
    print("\nLinear Regression (Negative European OECD, 2022-2023):")
    print(f"Coefficient (Population): {model_neg.params['Population']:.3f}, p-value: {model_neg.pvalues['Population']:.3f}")
    print(f"R-squared: {model_neg.rsquared:.3f}")
    print(model_neg.summary())
else:
    print(f"Not enough data for Negative European OECD countries. Sample size: {len(df_negative_european)}")

# Compute correlation and regression for non-European OECD countries
if len(df_2022_2023_non_european) > 2:
    corr_non_european, p_non_european = pearsonr(df_2022_2023_non_european["Public_Debt"], df_2022_2023_non_european["Population"])
    sig_non_european = "Significant" if p_non_european < 0.05 else "Not Significant"
    print(f"2022-2023 Correlation for non-European OECD countries after removing outliers in Population: {corr_non_european:.3f}, p-value: {p_non_european:.3f} ({sig_non_european})")

    X_non_eu = df_2022_2023_non_european["Population"]
    y_non_eu = df_2022_2023_non_european["Public_Debt"]
    X_non_eu = sm.add_constant(X_non_eu)
    model_non_eu = sm.OLS(y_non_eu, X_non_eu).fit()
    print("\nLinear Regression (Non-European OECD, 2022-2023):")
    print(f"Coefficient (Population): {model_non_eu.params['Population']:.3f}, p-value: {model_non_eu.pvalues['Population']:.3f}")
    print(f"R-squared: {model_non_eu.rsquared:.3f}")
    print(model_non_eu.summary())
else:
    print("Not enough data left for non-European OECD countries after outlier removal.")

# Calculate the overall correlation and regression for 2022-2023 (after removing outliers in Population)
if len(df_2022_2023_clean) > 2:
    overall_corr, overall_p = pearsonr(df_2022_2023_clean["Public_Debt"], df_2022_2023_clean["Population"])
    sig_overall = "Significant" if overall_p < 0.05 else "Not Significant"
    print(f"Overall Correlation between Public Sector Debt and Population (2022-2023) after removing outliers in Population: {overall_corr:.3f}, p-value: {overall_p:.3f} ({sig_overall})")

    X_overall = df_2022_2023_clean["Population"]
    y_overall = df_2022_2023_clean["Public_Debt"]
    X_overall = sm.add_constant(X_overall)
    model_overall = sm.OLS(y_overall, X_overall).fit()
    print("\nOverall Linear Regression (2022-2023):")
    print(f"Coefficient (Population): {model_overall.params['Population']:.3f}, p-value: {model_overall.pvalues['Population']:.3f}")
    print(f"R-squared: {model_overall.rsquared:.3f}")
    print(model_overall.summary())
else:
    print("Not enough data left for overall correlation after outlier removal.")

# Print the countries that were removed and the European groups
print("\nEuropean OECD countries removed due to outliers in Population:", removed_european_countries if removed_european_countries else "None")
print("Non-European OECD countries removed due to outliers in Population:", removed_non_european_countries if removed_non_european_countries else "None")
print("Positive European OECD countries (High Debt-to-Population):", positive_european)
print("Negative European OECD countries (Low Debt-to-Population):", negative_european)

2022-2023 Correlation for Positive European OECD countries (High Debt-to-Population) after removing outliers: 0.771, p-value: 0.000 (Significant)

Linear Regression (Positive European OECD, 2022-2023):
Coefficient (Population): 3.357, p-value: 0.000
R-squared: 0.595
                            OLS Regression Results                            
Dep. Variable:            Public_Debt   R-squared:                       0.595
Model:                            OLS   Adj. R-squared:                  0.578
Method:                 Least Squares   F-statistic:                     35.26
Date:                Sun, 13 Apr 2025   Prob (F-statistic):           3.97e-06
Time:                        20:45:40   Log-Likelihood:                 58.898
No. Observations:                  26   AIC:                            -113.8
Df Residuals:                      24   BIC:                            -111.3
Df Model:                           1                                         
Covariance Type:      

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_2022_2023_european['Debt_to_Population'] = df_2022_2023_european["Public_Debt"] / df_2022_2023_european["Population"]
  return hypotest_fun_in(*args, **kwds)


In [16]:
df_unemployment = pd.read_csv('/content/drive/MyDrive/Capstone Data/2022-2023/Cleaned_Unemployment_2022-2023.csv')

In [17]:
df_unemployment.head()

Unnamed: 0,Country,Ccode,2022,2023
0,Australia,AUS,-0.276583,-0.009187
1,Austria,AUT,-0.227125,0.05028
2,Belgium,BEL,-0.112708,-0.008996
3,Canada,CAN,-0.298525,0.016288
4,Switzerland,CHE,-0.156366,-0.05814


In [18]:
import pandas as pd
from scipy.stats import pearsonr
import statsmodels.api as sm
import matplotlib.pyplot as plt

# Merge Datasets based on 'Country'
merged_df = pd.merge(df_debt, df_unemployment, on='Country', how='inner')

# Convert wide format (years as columns) to long format
df_long_debt = df_debt.melt(id_vars=["Country"], var_name="Year", value_name="Public_Debt")
df_long_unemployment = df_unemployment.melt(id_vars=["Country"], var_name="Year", value_name="Unemployment")

# Merge the long format DataFrames
df_long = pd.merge(df_long_debt, df_long_unemployment, on=['Country', 'Year'], how='inner')

# Convert 'Year' column to numeric, handling potential errors
df_long['Year'] = pd.to_numeric(df_long['Year'], errors='coerce')

# Convert 'Public_Debt' and 'Unemployment' to numeric
df_long['Public_Debt'] = pd.to_numeric(df_long['Public_Debt'], errors='coerce')
df_long['Unemployment'] = pd.to_numeric(df_long['Unemployment'], errors='coerce')

# Filter to 2022-2023
df_2022_2023 = df_long[df_long["Year"].between(2022, 2023)].dropna(subset=["Public_Debt", "Unemployment"]).copy()

# IQR function with country tracking (only for Unemployment)
def drop_outliers_iqr(df, column):
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower = Q1 - 1.5 * IQR
    upper = Q3 + 1.5 * IQR
    removed_countries = df[(df[column] < lower) | (df[column] > upper)]["Country"].unique().tolist()  # Track unique removed countries
    df_cleaned = df[(df[column] >= lower) & (df[column] <= upper)]
    return df_cleaned, removed_countries

# Drop outliers only for Unemployment and track removed countries
df_2022_2023_clean, removed_countries_unemployment = drop_outliers_iqr(df_2022_2023, "Unemployment")

# Split removed countries into European and non-European OECD countries
removed_european_countries = [country for country in removed_countries_unemployment if country in european_oecd_countries]
removed_non_european_countries = [country for country in removed_countries_unemployment if country in non_european_oecd_countries]

# Filter out European and non-European OECD countries separately
df_2022_2023_european = df_2022_2023_clean[df_2022_2023_clean["Country"].isin(european_oecd_countries)]
df_2022_2023_non_european = df_2022_2023_clean[df_2022_2023_clean["Country"].isin(non_european_oecd_countries)]

# Split European countries into positive and negative groups based on debt-to-unemployment ratio (average over 2022-2023)
df_2022_2023_european['Debt_to_Unemployment'] = df_2022_2023_european["Public_Debt"] / df_2022_2023_european["Unemployment"]
median_ratio = df_2022_2023_european.groupby('Country')['Debt_to_Unemployment'].mean().median()
european_ratios = df_2022_2023_european.groupby('Country')['Debt_to_Unemployment'].mean()
positive_european = european_ratios[european_ratios >= median_ratio].index.tolist()
negative_european = european_ratios[european_ratios < median_ratio].index.tolist()

# Filter European groups
df_positive_european = df_2022_2023_european[df_2022_2023_european["Country"].isin(positive_european)]
df_negative_european = df_2022_2023_european[df_2022_2023_european["Country"].isin(negative_european)]

# Compute correlation and regression for Positive European OECD countries
if len(df_positive_european) > 2:
    corr_positive, p_positive = pearsonr(df_positive_european["Public_Debt"], df_positive_european["Unemployment"])
    sig_positive = "Significant" if p_positive < 0.05 else "Not Significant"
    print(f"2022-2023 Correlation for Positive European OECD countries (High Debt-to-Unemployment) after removing outliers: {corr_positive:.3f}, p-value: {p_positive:.3f} ({sig_positive})")

    X_pos = df_positive_european["Unemployment"]
    y_pos = df_positive_european["Public_Debt"]
    X_pos = sm.add_constant(X_pos)
    model_pos = sm.OLS(y_pos, X_pos).fit()
    print("\nLinear Regression (Positive European OECD, 2022-2023):")
    print(f"Coefficient (Unemployment): {model_pos.params['Unemployment']:.3f}, p-value: {model_pos.pvalues['Unemployment']:.3f}")
    print(f"R-squared: {model_pos.rsquared:.3f}")
    print(model_pos.summary())
else:
    print(f"Not enough data for Positive European OECD countries. Sample size: {len(df_positive_european)}")

# Compute correlation and regression for Negative European OECD countries
if len(df_negative_european) > 2:
    corr_negative, p_negative = pearsonr(df_negative_european["Public_Debt"], df_negative_european["Unemployment"])
    sig_negative = "Significant" if p_negative < 0.05 else "Not Significant"
    print(f"2022-2023 Correlation for Negative European OECD countries (Low Debt-to-Unemployment) after removing outliers: {corr_negative:.3f}, p-value: {p_negative:.3f} ({sig_negative})")

    X_neg = df_negative_european["Unemployment"]
    y_neg = df_negative_european["Public_Debt"]
    X_neg = sm.add_constant(X_neg)
    model_neg = sm.OLS(y_neg, X_neg).fit()
    print("\nLinear Regression (Negative European OECD, 2022-2023):")
    print(f"Coefficient (Unemployment): {model_neg.params['Unemployment']:.3f}, p-value: {model_neg.pvalues['Unemployment']:.3f}")
    print(f"R-squared: {model_neg.rsquared:.3f}")
    print(model_neg.summary())
else:
    print(f"Not enough data for Negative European OECD countries. Sample size: {len(df_negative_european)}")

# Compute correlation and regression for non-European OECD countries
if len(df_2022_2023_non_european) > 2:
    corr_non_european, p_non_european = pearsonr(df_2022_2023_non_european["Public_Debt"], df_2022_2023_non_european["Unemployment"])
    sig_non_european = "Significant" if p_non_european < 0.05 else "Not Significant"
    print(f"2022-2023 Correlation for non-European OECD countries after removing outliers in Unemployment: {corr_non_european:.3f}, p-value: {p_non_european:.3f} ({sig_non_european})")

    X_non_eu = df_2022_2023_non_european["Unemployment"]
    y_non_eu = df_2022_2023_non_european["Public_Debt"]
    X_non_eu = sm.add_constant(X_non_eu)
    model_non_eu = sm.OLS(y_non_eu, X_non_eu).fit()
    print("\nLinear Regression (Non-European OECD, 2022-2023):")
    print(f"Coefficient (Unemployment): {model_non_eu.params['Unemployment']:.3f}, p-value: {model_non_eu.pvalues['Unemployment']:.3f}")
    print(f"R-squared: {model_non_eu.rsquared:.3f}")
    print(model_non_eu.summary())
else:
    print("Not enough data left for non-European OECD countries after outlier removal.")

# Calculate the overall correlation and regression for 2022-2023 (after removing outliers in Unemployment)
if len(df_2022_2023_clean) > 2:
    overall_corr, overall_p = pearsonr(df_2022_2023_clean["Public_Debt"], df_2022_2023_clean["Unemployment"])
    sig_overall = "Significant" if overall_p < 0.05 else "Not Significant"
    print(f"Overall Correlation between Public Sector Debt and Unemployment (2022-2023) after removing outliers in Unemployment: {overall_corr:.3f}, p-value: {overall_p:.3f} ({sig_overall})")

    X_overall = df_2022_2023_clean["Unemployment"]
    y_overall = df_2022_2023_clean["Public_Debt"]
    X_overall = sm.add_constant(X_overall)
    model_overall = sm.OLS(y_overall, X_overall).fit()
    print("\nOverall Linear Regression (2022-2023):")
    print(f"Coefficient (Unemployment): {model_overall.params['Unemployment']:.3f}, p-value: {model_overall.pvalues['Unemployment']:.3f}")
    print(f"R-squared: {model_overall.rsquared:.3f}")
    print(model_overall.summary())
else:
    print("Not enough data left for overall correlation after outlier removal.")

# Print the countries that were removed and the European groups
print("\nEuropean OECD countries removed due to outliers in Unemployment:", removed_european_countries if removed_european_countries else "None")
print("Non-European OECD countries removed due to outliers in Unemployment:", removed_non_european_countries if removed_non_european_countries else "None")
print("Positive European OECD countries (High Debt-to-Unemployment):", positive_european)
print("Negative European OECD countries (Low Debt-to-Unemployment):", negative_european)

2022-2023 Correlation for Positive European OECD countries (High Debt-to-Unemployment) after removing outliers: 0.632, p-value: 0.001 (Significant)

Linear Regression (Positive European OECD, 2022-2023):
Coefficient (Unemployment): 0.295, p-value: 0.001
R-squared: 0.400
                            OLS Regression Results                            
Dep. Variable:            Public_Debt   R-squared:                       0.400
Model:                            OLS   Adj. R-squared:                  0.375
Method:                 Least Squares   F-statistic:                     15.99
Date:                Sun, 13 Apr 2025   Prob (F-statistic):           0.000529
Time:                        20:48:11   Log-Likelihood:                 44.427
No. Observations:                  26   AIC:                            -84.85
Df Residuals:                      24   BIC:                            -82.34
Df Model:                           1                                         
Covariance Type:  

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_2022_2023_european['Debt_to_Unemployment'] = df_2022_2023_european["Public_Debt"] / df_2022_2023_european["Unemployment"]


In [19]:
df_labor = pd.read_csv('/content/drive/MyDrive/Capstone Data/2022-2023/Cleaned_Labor_Force_2022-2023.csv')
df_labor.head()

Unnamed: 0,Country,CCode,2022,2023
0,Australia,AUS,0.01972,0.030828
1,Austria,AUT,0.015725,0.013627
2,Belgium,BEL,0.021273,0.008762
3,Canada,CAN,0.019545,0.035632
4,Chile,CHL,0.054545,0.029672


In [22]:
import pandas as pd
from scipy.stats import pearsonr
import statsmodels.api as sm
import matplotlib.pyplot as plt

# Merge Datasets based on 'Country'
merged_df = pd.merge(df_debt, df_labor, on='Country', how='inner')

# Convert wide format (years as columns) to long format
df_long_debt = df_debt.melt(id_vars=["Country"], var_name="Year", value_name="Public_Debt")
df_long_labor = df_labor.melt(id_vars=["Country"], var_name="Year", value_name="Labor_Force")

# Merge the long format DataFrames
df_long = pd.merge(df_long_debt, df_long_labor, on=['Country', 'Year'], how='inner')

# Convert 'Year' column to numeric, handling potential errors
df_long['Year'] = pd.to_numeric(df_long['Year'], errors='coerce')

# Convert 'Public_Debt' and 'Labor_Force' to numeric
df_long['Public_Debt'] = pd.to_numeric(df_long['Public_Debt'], errors='coerce')
df_long['Labor_Force'] = pd.to_numeric(df_long['Labor_Force'], errors='coerce')

# Filter to 2022-2023
df_2022_2023 = df_long[df_long["Year"].between(2022, 2023)].dropna(subset=["Public_Debt", "Labor_Force"]).copy()

# IQR function with country tracking (only for Labor_Force)
def drop_outliers_iqr(df, column):
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower = Q1 - 1.5 * IQR
    upper = Q3 + 1.5 * IQR
    removed_countries = df[(df[column] < lower) | (df[column] > upper)]["Country"].unique().tolist()  # Track unique removed countries
    df_cleaned = df[(df[column] >= lower) & (df[column] <= upper)]
    return df_cleaned, removed_countries

# Drop outliers only for Labor_Force and track removed countries
df_2022_2023_clean, removed_countries_labor = drop_outliers_iqr(df_2022_2023, "Labor_Force")

# Split removed countries into European and non-European OECD countries
removed_european_countries = [country for country in removed_countries_labor if country in european_oecd_countries]
removed_non_european_countries = [country for country in removed_countries_labor if country in non_european_oecd_countries]

# Filter out European and non-European OECD countries separately
df_2022_2023_european = df_2022_2023_clean[df_2022_2023_clean["Country"].isin(european_oecd_countries)]
df_2022_2023_non_european = df_2022_2023_clean[df_2022_2023_clean["Country"].isin(non_european_oecd_countries)]

# Split European countries into positive and negative groups based on debt-to-labor-force ratio (average over 2022-2023)
df_2022_2023_european['Debt_to_Labor_Force'] = df_2022_2023_european["Public_Debt"] / df_2022_2023_european["Labor_Force"]
median_ratio = df_2022_2023_european.groupby('Country')['Debt_to_Labor_Force'].mean().median()
european_ratios = df_2022_2023_european.groupby('Country')['Debt_to_Labor_Force'].mean()
positive_european = european_ratios[european_ratios >= median_ratio].index.tolist()
negative_european = european_ratios[european_ratios < median_ratio].index.tolist()

# Filter European groups
df_positive_european = df_2022_2023_european[df_2022_2023_european["Country"].isin(positive_european)]
df_negative_european = df_2022_2023_european[df_2022_2023_european["Country"].isin(negative_european)]

# Compute correlation and regression for Positive European OECD countries
if len(df_positive_european) > 2:
    corr_positive, p_positive = pearsonr(df_positive_european["Public_Debt"], df_positive_european["Labor_Force"])
    sig_positive = "Significant" if p_positive < 0.05 else "Not Significant"
    print(f"2022-2023 Correlation for Positive European OECD countries (High Debt-to-Labor-Force) after removing outliers: {corr_positive:.3f}, p-value: {p_positive:.3f} ({sig_positive})")

    X_pos = df_positive_european["Labor_Force"]
    y_pos = df_positive_european["Public_Debt"]
    X_pos = sm.add_constant(X_pos)
    model_pos = sm.OLS(y_pos, X_pos).fit()
    print("\nLinear Regression (Positive European OECD, 2022-2023):")
    print(f"Coefficient (Labor_Force): {model_pos.params['Labor_Force']:.3f}, p-value: {model_pos.pvalues['Labor_Force']:.3f}")
    print(f"R-squared: {model_pos.rsquared:.3f}")
    print(model_pos.summary())
else:
    print(f"Not enough data for Positive European OECD countries. Sample size: {len(df_positive_european)}")

# Compute correlation and regression for Negative European OECD countries
if len(df_negative_european) > 2:
    corr_negative, p_negative = pearsonr(df_negative_european["Public_Debt"], df_negative_european["Labor_Force"])
    sig_negative = "Significant" if p_negative < 0.05 else "Not Significant"
    print(f"2022-2023 Correlation for Negative European OECD countries (Low Debt-to-Labor-Force) after removing outliers: {corr_negative:.3f}, p-value: {p_negative:.3f} ({sig_negative})")

    X_neg = df_negative_european["Labor_Force"]
    y_neg = df_negative_european["Public_Debt"]
    X_neg = sm.add_constant(X_neg)
    model_neg = sm.OLS(y_neg, X_neg).fit()
    print("\nLinear Regression (Negative European OECD, 2022-2023):")
    print(f"Coefficient (Labor_Force): {model_neg.params['Labor_Force']:.3f}, p-value: {model_neg.pvalues['Labor_Force']:.3f}")
    print(f"R-squared: {model_neg.rsquared:.3f}")
    print(model_neg.summary())
else:
    print(f"Not enough data for Negative European OECD countries. Sample size: {len(df_negative_european)}")

# Compute correlation and regression for non-European OECD countries
if len(df_2022_2023_non_european) > 2:
    corr_non_european, p_non_european = pearsonr(df_2022_2023_non_european["Public_Debt"], df_2022_2023_non_european["Labor_Force"])
    sig_non_european = "Significant" if p_non_european < 0.05 else "Not Significant"
    print(f"2022-2023 Correlation for non-European OECD countries after removing outliers in Labor Force: {corr_non_european:.3f}, p-value: {p_non_european:.3f} ({sig_non_european})")

    X_non_eu = df_2022_2023_non_european["Labor_Force"]
    y_non_eu = df_2022_2023_non_european["Public_Debt"]
    X_non_eu = sm.add_constant(X_non_eu)
    model_non_eu = sm.OLS(y_non_eu, X_non_eu).fit()
    print("\nLinear Regression (Non-European OECD, 2022-2023):")
    print(f"Coefficient (Labor_Force): {model_non_eu.params['Labor_Force']:.3f}, p-value: {model_non_eu.pvalues['Labor_Force']:.3f}")
    print(f"R-squared: {model_non_eu.rsquared:.3f}")
    print(model_non_eu.summary())
else:
    print("Not enough data left for non-European OECD countries after outlier removal.")

# Calculate the overall correlation and regression for 2022-2023 (after removing outliers in Labor Force)
if len(df_2022_2023_clean) > 2:
    overall_corr, overall_p = pearsonr(df_2022_2023_clean["Public_Debt"], df_2022_2023_clean["Labor_Force"])
    sig_overall = "Significant" if overall_p < 0.05 else "Not Significant"
    print(f"Overall Correlation between Public Sector Debt and Labor Force (2022-2023) after removing outliers in Labor Force: {overall_corr:.3f}, p-value: {overall_p:.3f} ({sig_overall})")

    X_overall = df_2022_2023_clean["Labor_Force"]
    y_overall = df_2022_2023_clean["Public_Debt"]
    X_overall = sm.add_constant(X_overall)
    model_overall = sm.OLS(y_overall, X_overall).fit()
    print("\nOverall Linear Regression (2022-2023):")
    print(f"Coefficient (Labor_Force): {model_overall.params['Labor_Force']:.3f}, p-value: {model_overall.pvalues['Labor_Force']:.3f}")
    print(f"R-squared: {model_overall.rsquared:.3f}")
    print(model_overall.summary())
else:
    print("Not enough data left for overall correlation after outlier removal.")

# Print the countries that were removed and the European groups
print("\nEuropean OECD countries removed due to outliers in Labor Force:", removed_european_countries if removed_european_countries else "None")
print("Non-European OECD countries removed due to outliers in Labor Force:", removed_non_european_countries if removed_non_european_countries else "None")
print("Positive European OECD countries (High Debt-to-Labor-Force):", positive_european)
print("Negative European OECD countries (Low Debt-to-Labor-Force):", negative_european)

2022-2023 Correlation for Positive European OECD countries (High Debt-to-Labor-Force) after removing outliers: 0.202, p-value: 0.356 (Not Significant)

Linear Regression (Positive European OECD, 2022-2023):
Coefficient (Labor_Force): 0.612, p-value: 0.356
R-squared: 0.041
                            OLS Regression Results                            
Dep. Variable:            Public_Debt   R-squared:                       0.041
Model:                            OLS   Adj. R-squared:                 -0.005
Method:                 Least Squares   F-statistic:                    0.8926
Date:                Sun, 13 Apr 2025   Prob (F-statistic):              0.356
Time:                        20:53:59   Log-Likelihood:                 48.472
No. Observations:                  23   AIC:                            -92.94
Df Residuals:                      21   BIC:                            -90.67
Df Model:                           1                                         
Covariance Type:

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_2022_2023_european['Debt_to_Labor_Force'] = df_2022_2023_european["Public_Debt"] / df_2022_2023_european["Labor_Force"]
  return hypotest_fun_in(*args, **kwds)


In [20]:
df_act = pd.read_csv('/content/drive/MyDrive/Capstone Data/2022-2023/Cleaned_current_account_2022-2023.csv')
df_act.head()

Unnamed: 0,Country,CCode,2022,2023
0,Australia,AUS,-0.707334,-0.692022
1,Austria,AUT,-1.474437,-2.734083
2,Belgium,BEL,-1.629706,-0.359518
3,Canada,CAN,-30.715244,1.05074
4,Switzerland,CHE,0.411052,-0.249436


In [21]:
import pandas as pd
from scipy.stats import pearsonr
import statsmodels.api as sm
import matplotlib.pyplot as plt

# Merge Datasets based on 'Country'
merged_df = pd.merge(df_debt, df_act, on='Country', how='inner')

# Convert wide format (years as columns) to long format
df_long_debt = df_debt.melt(id_vars=["Country"], var_name="Year", value_name="Public_Debt")
df_long_act = df_act.melt(id_vars=["Country"], var_name="Year", value_name="Current_Account")

# Merge the long format DataFrames
df_long = pd.merge(df_long_debt, df_long_act, on=['Country', 'Year'], how='inner')

# Convert 'Year' column to numeric, handling potential errors
df_long['Year'] = pd.to_numeric(df_long['Year'], errors='coerce')

# Convert 'Public_Debt' and 'Current_Account' to numeric
df_long['Public_Debt'] = pd.to_numeric(df_long['Public_Debt'], errors='coerce')
df_long['Current_Account'] = pd.to_numeric(df_long['Current_Account'], errors='coerce')

# Filter to 2022-2023
df_2022_2023 = df_long[df_long["Year"].between(2022, 2023)].dropna(subset=["Public_Debt", "Current_Account"]).copy()

# IQR function with country tracking (only for Current_Account)
def drop_outliers_iqr(df, column):
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower = Q1 - 1.5 * IQR
    upper = Q3 + 1.5 * IQR
    removed_countries = df[(df[column] < lower) | (df[column] > upper)]["Country"].unique().tolist()  # Track unique removed countries
    df_cleaned = df[(df[column] >= lower) & (df[column] <= upper)]
    return df_cleaned, removed_countries

# Drop outliers only for Current_Account and track removed countries
df_2022_2023_clean, removed_countries_act = drop_outliers_iqr(df_2022_2023, "Current_Account")

# Split removed countries into European and non-European OECD countries
removed_european_countries = [country for country in removed_countries_act if country in european_oecd_countries]
removed_non_european_countries = [country for country in removed_countries_act if country in non_european_oecd_countries]

# Filter out European and non-European OECD countries separately
df_2022_2023_european = df_2022_2023_clean[df_2022_2023_clean["Country"].isin(european_oecd_countries)]
df_2022_2023_non_european = df_2022_2023_clean[df_2022_2023_clean["Country"].isin(non_european_oecd_countries)]

# Split European countries into positive and negative groups based on debt-to-current-account ratio (average over 2022-2023)
df_2022_2023_european['Debt_to_Current_Account'] = df_2022_2023_european["Public_Debt"] / df_2022_2023_european["Current_Account"].abs()  # Use absolute value due to potential negatives
median_ratio = df_2022_2023_european.groupby('Country')['Debt_to_Current_Account'].mean().median()
european_ratios = df_2022_2023_european.groupby('Country')['Debt_to_Current_Account'].mean()
positive_european = european_ratios[european_ratios >= median_ratio].index.tolist()
negative_european = european_ratios[european_ratios < median_ratio].index.tolist()

# Filter European groups
df_positive_european = df_2022_2023_european[df_2022_2023_european["Country"].isin(positive_european)]
df_negative_european = df_2022_2023_european[df_2022_2023_european["Country"].isin(negative_european)]

# Compute correlation and regression for Positive European OECD countries
if len(df_positive_european) > 2:
    corr_positive, p_positive = pearsonr(df_positive_european["Public_Debt"], df_positive_european["Current_Account"])
    sig_positive = "Significant" if p_positive < 0.05 else "Not Significant"
    print(f"2022-2023 Correlation for Positive European OECD countries (High Debt-to-Current-Account) after removing outliers: {corr_positive:.3f}, p-value: {p_positive:.3f} ({sig_positive})")

    X_pos = df_positive_european["Current_Account"]
    y_pos = df_positive_european["Public_Debt"]
    X_pos = sm.add_constant(X_pos)
    model_pos = sm.OLS(y_pos, X_pos).fit()
    print("\nLinear Regression (Positive European OECD, 2022-2023):")
    print(f"Coefficient (Current_Account): {model_pos.params['Current_Account']:.3f}, p-value: {model_pos.pvalues['Current_Account']:.3f}")
    print(f"R-squared: {model_pos.rsquared:.3f}")
    print(model_pos.summary())
else:
    print(f"Not enough data for Positive European OECD countries. Sample size: {len(df_positive_european)}")

# Compute correlation and regression for Negative European OECD countries
if len(df_negative_european) > 2:
    corr_negative, p_negative = pearsonr(df_negative_european["Public_Debt"], df_negative_european["Current_Account"])
    sig_negative = "Significant" if p_negative < 0.05 else "Not Significant"
    print(f"2022-2023 Correlation for Negative European OECD countries (Low Debt-to-Current-Account) after removing outliers: {corr_negative:.3f}, p-value: {p_negative:.3f} ({sig_negative})")

    X_neg = df_negative_european["Current_Account"]
    y_neg = df_negative_european["Public_Debt"]
    X_neg = sm.add_constant(X_neg)
    model_neg = sm.OLS(y_neg, X_neg).fit()
    print("\nLinear Regression (Negative European OECD, 2022-2023):")
    print(f"Coefficient (Current_Account): {model_neg.params['Current_Account']:.3f}, p-value: {model_neg.pvalues['Current_Account']:.3f}")
    print(f"R-squared: {model_neg.rsquared:.3f}")
    print(model_neg.summary())
else:
    print(f"Not enough data for Negative European OECD countries. Sample size: {len(df_negative_european)}")

# Compute correlation and regression for non-European OECD countries
if len(df_2022_2023_non_european) > 2:
    corr_non_european, p_non_european = pearsonr(df_2022_2023_non_european["Public_Debt"], df_2022_2023_non_european["Current_Account"])
    sig_non_european = "Significant" if p_non_european < 0.05 else "Not Significant"
    print(f"2022-2023 Correlation for non-European OECD countries after removing outliers in Current Account: {corr_non_european:.3f}, p-value: {p_non_european:.3f} ({sig_non_european})")

    X_non_eu = df_2022_2023_non_european["Current_Account"]
    y_non_eu = df_2022_2023_non_european["Public_Debt"]
    X_non_eu = sm.add_constant(X_non_eu)
    model_non_eu = sm.OLS(y_non_eu, X_non_eu).fit()
    print("\nLinear Regression (Non-European OECD, 2022-2023):")
    print(f"Coefficient (Current_Account): {model_non_eu.params['Current_Account']:.3f}, p-value: {model_non_eu.pvalues['Current_Account']:.3f}")
    print(f"R-squared: {model_non_eu.rsquared:.3f}")
    print(model_non_eu.summary())
else:
    print("Not enough data left for non-European OECD countries after outlier removal.")

# Calculate the overall correlation and regression for 2022-2023 (after removing outliers in Current Account)
if len(df_2022_2023_clean) > 2:
    overall_corr, overall_p = pearsonr(df_2022_2023_clean["Public_Debt"], df_2022_2023_clean["Current_Account"])
    sig_overall = "Significant" if overall_p < 0.05 else "Not Significant"
    print(f"Overall Correlation between Public Sector Debt and Current Account (2022-2023) after removing outliers in Current Account: {overall_corr:.3f}, p-value: {overall_p:.3f} ({sig_overall})")

    X_overall = df_2022_2023_clean["Current_Account"]
    y_overall = df_2022_2023_clean["Public_Debt"]
    X_overall = sm.add_constant(X_overall)
    model_overall = sm.OLS(y_overall, X_overall).fit()
    print("\nOverall Linear Regression (2022-2023):")
    print(f"Coefficient (Current_Account): {model_overall.params['Current_Account']:.3f}, p-value: {model_overall.pvalues['Current_Account']:.3f}")
    print(f"R-squared: {model_overall.rsquared:.3f}")
    print(model_overall.summary())
else:
    print("Not enough data left for overall correlation after outlier removal.")

# Print the countries that were removed and the European groups
print("\nEuropean OECD countries removed due to outliers in Current Account:", removed_european_countries if removed_european_countries else "None")
print("Non-European OECD countries removed due to outliers in Current Account:", removed_non_european_countries if removed_non_european_countries else "None")
print("Positive European OECD countries (High Debt-to-Current-Account):", positive_european)
print("Negative European OECD countries (Low Debt-to-Current-Account):", negative_european)

2022-2023 Correlation for Positive European OECD countries (High Debt-to-Current-Account) after removing outliers: 0.003, p-value: 0.991 (Not Significant)

Linear Regression (Positive European OECD, 2022-2023):
Coefficient (Current_Account): 0.000, p-value: 0.991
R-squared: 0.000
                            OLS Regression Results                            
Dep. Variable:            Public_Debt   R-squared:                       0.000
Model:                            OLS   Adj. R-squared:                 -0.056
Method:                 Least Squares   F-statistic:                 0.0001169
Date:                Sun, 13 Apr 2025   Prob (F-statistic):              0.991
Time:                        20:51:52   Log-Likelihood:                 31.714
No. Observations:                  20   AIC:                            -59.43
Df Residuals:                      18   BIC:                            -57.44
Df Model:                           1                                         
Covarian

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_2022_2023_european['Debt_to_Current_Account'] = df_2022_2023_european["Public_Debt"] / df_2022_2023_european["Current_Account"].abs()  # Use absolute value due to potential negatives
  return hypotest_fun_in(*args, **kwds)


In [23]:
df_inflation = pd.read_csv('/content/drive/MyDrive/Capstone Data/2022-2023/clean_inflation_rate_2022-2023.csv')
df_inflation.head()

Unnamed: 0,Country,Ccode,2022,2023
0,Australia,AUS,0.065941,0.05597
1,Austria,AUT,0.085469,0.078141
2,Belgium,BEL,0.095975,0.04049
3,Canada,CAN,0.068028,0.03879
4,Switzerland,CHE,0.02835,0.021354


In [24]:
import pandas as pd
from scipy.stats import pearsonr
import statsmodels.api as sm
import matplotlib.pyplot as plt

# Merge Datasets based on 'Country'
merged_df = pd.merge(df_debt, df_inflation, on='Country', how='inner')

# Convert wide format (years as columns) to long format
df_long_debt = df_debt.melt(id_vars=["Country"], var_name="Year", value_name="Public_Debt")
df_long_inflation = df_inflation.melt(id_vars=["Country"], var_name="Year", value_name="Inflation")

# Merge the long format DataFrames
df_long = pd.merge(df_long_debt, df_long_inflation, on=['Country', 'Year'], how='inner')

# Convert 'Year' column to numeric, handling potential errors
df_long['Year'] = pd.to_numeric(df_long['Year'], errors='coerce')

# Convert 'Public_Debt' and 'Inflation' to numeric
df_long['Public_Debt'] = pd.to_numeric(df_long['Public_Debt'], errors='coerce')
df_long['Inflation'] = pd.to_numeric(df_long['Inflation'], errors='coerce')

# Filter to 2022-2023
df_2022_2023 = df_long[df_long["Year"].between(2022, 2023)].dropna(subset=["Public_Debt", "Inflation"]).copy()

# IQR function with country tracking (only for Inflation)
def drop_outliers_iqr(df, column):
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower = Q1 - 1.5 * IQR
    upper = Q3 + 1.5 * IQR
    removed_countries = df[(df[column] < lower) | (df[column] > upper)]["Country"].unique().tolist()  # Track unique removed countries
    df_cleaned = df[(df[column] >= lower) & (df[column] <= upper)]
    return df_cleaned, removed_countries

# Drop outliers only for Inflation and track removed countries
df_2022_2023_clean, removed_countries_inflation = drop_outliers_iqr(df_2022_2023, "Inflation")

# Split removed countries into European and non-European OECD countries
removed_european_countries = [country for country in removed_countries_inflation if country in european_oecd_countries]
removed_non_european_countries = [country for country in removed_countries_inflation if country in non_european_oecd_countries]

# Filter out European and non-European OECD countries separately
df_2022_2023_european = df_2022_2023_clean[df_2022_2023_clean["Country"].isin(european_oecd_countries)]
df_2022_2023_non_european = df_2022_2023_clean[df_2022_2023_clean["Country"].isin(non_european_oecd_countries)]

# Split European countries into positive and negative groups based on debt-to-inflation ratio (average over 2022-2023)
df_2022_2023_european['Debt_to_Inflation'] = df_2022_2023_european["Public_Debt"] / df_2022_2023_european["Inflation"].abs()  # Use absolute value due to potential negatives
median_ratio = df_2022_2023_european.groupby('Country')['Debt_to_Inflation'].mean().median()
european_ratios = df_2022_2023_european.groupby('Country')['Debt_to_Inflation'].mean()
positive_european = european_ratios[european_ratios >= median_ratio].index.tolist()
negative_european = european_ratios[european_ratios < median_ratio].index.tolist()

# Filter European groups
df_positive_european = df_2022_2023_european[df_2022_2023_european["Country"].isin(positive_european)]
df_negative_european = df_2022_2023_european[df_2022_2023_european["Country"].isin(negative_european)]

# Compute correlation and regression for Positive European OECD countries
if len(df_positive_european) > 2:
    corr_positive, p_positive = pearsonr(df_positive_european["Public_Debt"], df_positive_european["Inflation"])
    sig_positive = "Significant" if p_positive < 0.05 else "Not Significant"
    print(f"2022-2023 Correlation for Positive European OECD countries (High Debt-to-Inflation) after removing outliers: {corr_positive:.3f}, p-value: {p_positive:.3f} ({sig_positive})")

    X_pos = df_positive_european["Inflation"]
    y_pos = df_positive_european["Public_Debt"]
    X_pos = sm.add_constant(X_pos)
    model_pos = sm.OLS(y_pos, X_pos).fit()
    print("\nLinear Regression (Positive European OECD, 2022-2023):")
    print(f"Coefficient (Inflation): {model_pos.params['Inflation']:.3f}, p-value: {model_pos.pvalues['Inflation']:.3f}")
    print(f"R-squared: {model_pos.rsquared:.3f}")
    print(model_pos.summary())
else:
    print(f"Not enough data for Positive European OECD countries. Sample size: {len(df_positive_european)}")

# Compute correlation and regression for Negative European OECD countries
if len(df_negative_european) > 2:
    corr_negative, p_negative = pearsonr(df_negative_european["Public_Debt"], df_negative_european["Inflation"])
    sig_negative = "Significant" if p_negative < 0.05 else "Not Significant"
    print(f"2022-2023 Correlation for Negative European OECD countries (Low Debt-to-Inflation) after removing outliers: {corr_negative:.3f}, p-value: {p_negative:.3f} ({sig_negative})")

    X_neg = df_negative_european["Inflation"]
    y_neg = df_negative_european["Public_Debt"]
    X_neg = sm.add_constant(X_neg)
    model_neg = sm.OLS(y_neg, X_neg).fit()
    print("\nLinear Regression (Negative European OECD, 2022-2023):")
    print(f"Coefficient (Inflation): {model_neg.params['Inflation']:.3f}, p-value: {model_neg.pvalues['Inflation']:.3f}")
    print(f"R-squared: {model_neg.rsquared:.3f}")
    print(model_neg.summary())
else:
    print(f"Not enough data for Negative European OECD countries. Sample size: {len(df_negative_european)}")

# Compute correlation and regression for non-European OECD countries
if len(df_2022_2023_non_european) > 2:
    corr_non_european, p_non_european = pearsonr(df_2022_2023_non_european["Public_Debt"], df_2022_2023_non_european["Inflation"])
    sig_non_european = "Significant" if p_non_european < 0.05 else "Not Significant"
    print(f"2022-2023 Correlation for non-European OECD countries after removing outliers in Inflation: {corr_non_european:.3f}, p-value: {p_non_european:.3f} ({sig_non_european})")

    X_non_eu = df_2022_2023_non_european["Inflation"]
    y_non_eu = df_2022_2023_non_european["Public_Debt"]
    X_non_eu = sm.add_constant(X_non_eu)
    model_non_eu = sm.OLS(y_non_eu, X_non_eu).fit()
    print("\nLinear Regression (Non-European OECD, 2022-2023):")
    print(f"Coefficient (Inflation): {model_non_eu.params['Inflation']:.3f}, p-value: {model_non_eu.pvalues['Inflation']:.3f}")
    print(f"R-squared: {model_non_eu.rsquared:.3f}")
    print(model_non_eu.summary())
else:
    print("Not enough data left for non-European OECD countries after outlier removal.")

# Calculate the overall correlation and regression for 2022-2023 (after removing outliers in Inflation)
if len(df_2022_2023_clean) > 2:
    overall_corr, overall_p = pearsonr(df_2022_2023_clean["Public_Debt"], df_2022_2023_clean["Inflation"])
    sig_overall = "Significant" if overall_p < 0.05 else "Not Significant"
    print(f"Overall Correlation between Public Sector Debt and Inflation (2022-2023) after removing outliers in Inflation: {overall_corr:.3f}, p-value: {overall_p:.3f} ({sig_overall})")

    X_overall = df_2022_2023_clean["Inflation"]
    y_overall = df_2022_2023_clean["Public_Debt"]
    X_overall = sm.add_constant(X_overall)
    model_overall = sm.OLS(y_overall, X_overall).fit()
    print("\nOverall Linear Regression (2022-2023):")
    print(f"Coefficient (Inflation): {model_overall.params['Inflation']:.3f}, p-value: {model_overall.pvalues['Inflation']:.3f}")
    print(f"R-squared: {model_overall.rsquared:.3f}")
    print(model_overall.summary())
else:
    print("Not enough data left for overall correlation after outlier removal.")

# Print the countries that were removed and the European groups
print("\nEuropean OECD countries removed due to outliers in Inflation:", removed_european_countries if removed_european_countries else "None")
print("Non-European OECD countries removed due to outliers in Inflation:", removed_non_european_countries if removed_non_european_countries else "None")
print("Positive European OECD countries (High Debt-to-Inflation):", positive_european)
print("Negative European OECD countries (Low Debt-to-Inflation):", negative_european)

2022-2023 Correlation for Positive European OECD countries (High Debt-to-Inflation) after removing outliers: -0.644, p-value: 0.001 (Significant)

Linear Regression (Positive European OECD, 2022-2023):
Coefficient (Inflation): -0.605, p-value: 0.001
R-squared: 0.415
                            OLS Regression Results                            
Dep. Variable:            Public_Debt   R-squared:                       0.415
Model:                            OLS   Adj. R-squared:                  0.386
Method:                 Least Squares   F-statistic:                     14.21
Date:                Sun, 13 Apr 2025   Prob (F-statistic):            0.00121
Time:                        20:56:32   Log-Likelihood:                 50.352
No. Observations:                  22   AIC:                            -96.70
Df Residuals:                      20   BIC:                            -94.52
Df Model:                           1                                         
Covariance Type:      

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_2022_2023_european['Debt_to_Inflation'] = df_2022_2023_european["Public_Debt"] / df_2022_2023_european["Inflation"].abs()  # Use absolute value due to potential negatives
  return hypotest_fun_in(*args, **kwds)
