In [None]:
# Mount Google Drive at /content/drive; the CSV paths read in later cells live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
import numpy as np
from scipy.stats import pearsonr
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt

# Read the wide-format inputs: a 'Country' column plus one column per year 2010-2023.
wage_data = pd.read_csv('/content/drive/MyDrive/Capstone Data/YOY/Average_Wage_Data_YoY.csv')
debt_df = pd.read_csv('/content/drive/MyDrive/Capstone Data/YOY/Cleaned_Public_Debt_Yoy.csv')

# Country groupings used as the "Euro" / "Non-Euro" regions below.
# NOTE(review): euro_countries also lists European countries that do not use the euro
# (e.g. Denmark, Norway, Switzerland, United Kingdom), so the split is effectively
# Europe vs. rest of the sample -- confirm the intended grouping.
euro_countries = [
    "Austria", "Belgium", "Czech Republic", "Denmark", "Estonia", "Finland",
    "France", "Germany", "Greece", "Hungary", "Iceland", "Ireland", "Italy",
    "Latvia", "Lithuania", "Luxembourg", "Netherlands", "Norway", "Poland",
    "Portugal", "Slovak Republic", "Slovenia", "Spain", "Sweden", "Switzerland",
    "United Kingdom",
]
non_euro_countries = [
    "Australia", "Canada", "Chile", "Colombia", "Israel", "Japan", "Korea",
    "Mexico", "New Zealand", "Turkey", "United States",
]
all_countries = [*euro_countries, *non_euro_countries]

# Rescale the wage year-over-year figures from percent to fractions.
for year in (str(y) for y in range(2010, 2024)):
    wage_data[year] = wage_data[year].div(100)

# Reshape both tables to long form (Country, Year, value) with integer years.
wage_melted = wage_data.melt(
    id_vars=['Country'],
    value_vars=list(map(str, range(2010, 2024))),
    var_name='Year',
    value_name='Wage_YoY',
).astype({'Year': int})
debt_melted = debt_df.melt(
    id_vars=['Country'],
    value_vars=list(map(str, range(2010, 2024))),
    var_name='Year',
    value_name='Debt',
).astype({'Year': int})

# Inner join: only (Country, Year) pairs present in both tables are kept.
merged_df = wage_melted.merge(debt_melted, on=['Country', 'Year'])

# Named analysis windows -> the calendar years each one covers.
periods = {
    '2010': [2010],
    '2011-2013': list(range(2011, 2014)),
    '2014-2019': list(range(2014, 2020)),
    '2020-2021': [2020, 2021],
    '2022-2023': [2022, 2023],
    'Overall': list(range(2010, 2024)),
}

def remove_outliers(df, columns):
    """Return the rows of ``df`` that lie inside the 1.5*IQR fences of each column.

    Columns are screened in the order given; the quartiles for each column are
    computed on the rows that survived the previous columns. The input frame is
    not modified. Rows holding NaN in a screened column fail the range test and
    are dropped as well.
    """
    kept = df.copy()
    for name in columns:
        first_quartile = kept[name].quantile(0.25)
        third_quartile = kept[name].quantile(0.75)
        spread = third_quartile - first_quartile
        within_fences = kept[name].between(
            first_quartile - 1.5 * spread,
            third_quartile + 1.5 * spread,
        )
        kept = kept[within_fences]
    return kept

def analyze_correlation(df, period_name, years, region_name, countries):
    """Relate Debt to Wage_YoY for one period/region, print the fit and save a plot.

    Rows of ``df`` whose 'Year' is in ``years`` and whose 'Country' is in
    ``countries`` are IQR-filtered with ``remove_outliers``. The Pearson
    correlation and a linear regression of Wage_YoY on Debt are printed, and a
    scatter plot with the fitted line is saved as
    '<period_name>_<region_name>_plot_no_outliers.png'. When fewer than two rows
    remain (or a column is entirely null) a notice is printed instead.
    Returns None.
    """
    selected = df['Year'].isin(years) & df['Country'].isin(countries)
    sample = remove_outliers(df[selected], ['Debt', 'Wage_YoY'])
    debt, wage = sample['Debt'], sample['Wage_YoY']

    # Nothing meaningful can be fitted on fewer than two points.
    if len(sample) < 2 or debt.isnull().all() or wage.isnull().all():
        print(f"\n{period_name} ({region_name}): Insufficient data for analysis after outlier removal")
        return

    corr, p_value = pearsonr(debt, wage)

    # Single-feature regression: sklearn expects a 2-D design matrix.
    X = debt.values.reshape(-1, 1)
    y = wage.values
    reg = LinearRegression().fit(X, y)
    slope, intercept = reg.coef_[0], reg.intercept_
    r_squared = reg.score(X, y)

    print(
        f"\n{period_name} ({region_name}):",
        f"Pearson Correlation: {corr:.4f}, p-value: {p_value:.4f}",
        f"Linear Regression: y = {slope:.6f}x + {intercept:.6f}",
        f"R-squared: {r_squared:.4f}",
        sep="\n",
    )

    # Scatter of the filtered data with the fitted line, written to disk and closed.
    fig, ax = plt.subplots(figsize=(8, 6))
    ax.scatter(debt, wage, color='blue', label='Data')
    ax.plot(debt, reg.predict(X), color='red', label='Regression Line')
    ax.set_title(f'{period_name} - {region_name} Debt vs Wage YoY (Outliers Removed)')
    ax.set_xlabel('Public Sector Debt (% GDP)')
    ax.set_ylabel('Wage YoY Growth')
    ax.legend()
    fig.savefig(f'{period_name}_{region_name}_plot_no_outliers.png')
    plt.close(fig)

# Run the analysis for every period, visiting the regions in a fixed order.
for period_name, years in periods.items():
    for region_label, region_members in (
        ('Euro', euro_countries),
        ('Non-Euro', non_euro_countries),
        ('All Countries', all_countries),
    ):
        analyze_correlation(merged_df, period_name, years, region_label, region_members)


2010 (Euro):
Pearson Correlation: 0.3808, p-value: 0.0804
Linear Regression: y = 0.068673x + 0.032111
R-squared: 0.1450

2010 (Non-Euro):
Pearson Correlation: -0.2138, p-value: 0.5808
Linear Regression: y = -0.030262x + 0.027212
R-squared: 0.0457

2010 (All Countries):
Pearson Correlation: 0.2040, p-value: 0.2796
Linear Regression: y = 0.040888x + 0.030665
R-squared: 0.0416

2011-2013 (Euro):
Pearson Correlation: -0.2477, p-value: 0.0433
Linear Regression: y = -0.082510x + 0.043517
R-squared: 0.0613

2011-2013 (Non-Euro):
Pearson Correlation: 0.2756, p-value: 0.2145
Linear Regression: y = 0.137164x + 0.028845
R-squared: 0.0759

2011-2013 (All Countries):
Pearson Correlation: -0.1477, p-value: 0.1600
Linear Regression: y = -0.052409x + 0.040523
R-squared: 0.0218

2014-2019 (Euro):
Pearson Correlation: -0.1888, p-value: 0.0295
Linear Regression: y = -0.106915x + 0.033835
R-squared: 0.0356

2014-2019 (Non-Euro):
Pearson Correlation: 0.0771, p-value: 0.5947
Linear Regression: y = 0.022139

In [None]:
# Read the labor-force table: a 'Country' column plus one column per year 2010-2023.
# debt_df is reused from the wage cell, which must have been run first.
labor_data = pd.read_csv('/content/drive/MyDrive/Capstone Data/YOY/Cleaned_Labor_Force_Yoy.csv')

# Country groupings used as the "Euro" / "Non-Euro" regions below.
# NOTE(review): euro_countries also lists European countries that do not use the euro
# (e.g. Denmark, Norway, Switzerland, United Kingdom) -- confirm the intended grouping.
euro_countries = [
    "Austria", "Belgium", "Czech Republic", "Denmark", "Estonia", "Finland",
    "France", "Germany", "Greece", "Hungary", "Iceland", "Ireland", "Italy",
    "Latvia", "Lithuania", "Luxembourg", "Netherlands", "Norway", "Poland",
    "Portugal", "Slovak Republic", "Slovenia", "Spain", "Sweden", "Switzerland",
    "United Kingdom",
]
non_euro_countries = [
    "Australia", "Canada", "Chile", "Colombia", "Israel", "Japan", "Korea",
    "Mexico", "New Zealand", "Turkey", "United States",
]
all_countries = [*euro_countries, *non_euro_countries]

# Reshape both tables to long form (Country, Year, value) with integer years.
labor_melted = labor_data.melt(
    id_vars=['Country'],
    value_vars=list(map(str, range(2010, 2024))),
    var_name='Year',
    value_name='Labor_YoY',
).astype({'Year': int})
debt_melted = debt_df.melt(
    id_vars=['Country'],
    value_vars=list(map(str, range(2010, 2024))),
    var_name='Year',
    value_name='Debt',
).astype({'Year': int})

# Inner join: only (Country, Year) pairs present in both tables are kept.
merged_df = labor_melted.merge(debt_melted, on=['Country', 'Year'])

# Named analysis windows -> the calendar years each one covers.
periods = {
    '2010': [2010],
    '2011-2013': list(range(2011, 2014)),
    '2014-2019': list(range(2014, 2020)),
    '2020-2021': [2020, 2021],
    '2022-2023': [2022, 2023],
    'Overall': list(range(2010, 2024)),
}

def identify_outliers(df, columns):
    """Split ``df`` into in-range rows and IQR outliers, one column at a time.

    For each name in ``columns`` (in order) the 25th/75th percentiles are taken
    from the rows that survived the previous columns; rows outside
    ``[Q1 - 1.5*IQR, Q3 + 1.5*IQR]`` are recorded and dropped before the next
    column is examined. The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'Country', 'Year' and every column named in ``columns``.
    columns : iterable of str
        Numeric columns to screen.

    Returns
    -------
    (df_clean, outliers_df)
        ``df_clean`` holds the rows within bounds for every screened column.
        Rows whose value is NaN fail the range comparison, so they are dropped
        from ``df_clean`` without being listed as outliers.
        ``outliers_df`` holds 'Country', 'Year', the flagged value (in a column
        named after the screened variable) and 'Variable'; it is an empty
        DataFrame when nothing was flagged.
    """
    df_clean = df.copy()
    outliers = []

    for col in columns:
        values = df_clean[col]
        q1 = values.quantile(0.25)
        q3 = values.quantile(0.75)
        iqr = q3 - q1
        lower_bound = q1 - 1.5 * iqr
        upper_bound = q3 + 1.5 * iqr

        is_outlier = (values < lower_bound) | (values > upper_bound)
        if is_outlier.any():
            # .loc + .assign builds an independent frame; adding the column to a
            # chained slice (df[mask][cols]) raised SettingWithCopyWarning.
            outliers.append(
                df_clean.loc[is_outlier, ['Country', 'Year', col]].assign(Variable=col)
            )

        # Keep only in-range rows (NaN compares False and is therefore dropped).
        df_clean = df_clean[(values >= lower_bound) & (values <= upper_bound)]

    outliers_df = pd.concat(outliers, ignore_index=True) if outliers else pd.DataFrame()
    return df_clean, outliers_df

def analyze_correlation(df, period_name, years, region_name, countries):
    """Relate Debt to Labor_YoY for one period/region and print the fit.

    Rows of ``df`` whose 'Year' is in ``years`` and whose 'Country' is in
    ``countries`` are IQR-filtered with ``identify_outliers`` (the list of
    flagged rows is not used). The Pearson correlation and a linear regression
    of Labor_YoY on Debt are printed; when fewer than two rows remain (or a
    column is entirely null) a notice is printed instead. Returns None.
    """
    selected = df['Year'].isin(years) & df['Country'].isin(countries)
    sample, _ = identify_outliers(df[selected], ['Debt', 'Labor_YoY'])
    debt, labor = sample['Debt'], sample['Labor_YoY']

    # Nothing meaningful can be fitted on fewer than two points.
    if len(sample) < 2 or debt.isnull().all() or labor.isnull().all():
        print(f"\n{period_name} ({region_name}): Insufficient data for analysis after outlier removal")
        return

    corr, p_value = pearsonr(debt, labor)

    # Single-feature regression: sklearn expects a 2-D design matrix.
    X = debt.values.reshape(-1, 1)
    y = labor.values
    reg = LinearRegression().fit(X, y)
    slope, intercept = reg.coef_[0], reg.intercept_
    r_squared = reg.score(X, y)

    print(
        f"\n{period_name} ({region_name}):",
        f"Pearson Correlation: {corr:.4f}, p-value: {p_value:.4f}",
        f"Linear Regression: y = {slope:.6f}x + {intercept:.6f}",
        f"R-squared: {r_squared:.4f}",
        sep="\n",
    )
# Run the analysis for every period, visiting the regions in a fixed order.
for period_name, years in periods.items():
    for region_label, region_members in (
        ('Euro', euro_countries),
        ('Non-Euro', non_euro_countries),
        ('All Countries', all_countries),
    ):
        analyze_correlation(merged_df, period_name, years, region_label, region_members)


2010 (Euro):
Pearson Correlation: 0.1579, p-value: 0.4941
Linear Regression: y = 0.009896x + 0.000141
R-squared: 0.0249

2010 (Non-Euro):
Pearson Correlation: 0.2514, p-value: 0.5141
Linear Regression: y = 0.013859x + 0.010134
R-squared: 0.0632

2010 (All Countries):
Pearson Correlation: -0.1379, p-value: 0.4594
Linear Regression: y = -0.012800x + 0.004205
R-squared: 0.0190

2011-2013 (Euro):
Pearson Correlation: -0.2514, p-value: 0.0358
Linear Regression: y = -0.040821x + 0.004221
R-squared: 0.0632

2011-2013 (Non-Euro):
Pearson Correlation: -0.2711, p-value: 0.1803
Linear Regression: y = -0.042072x + 0.012971
R-squared: 0.0735

2011-2013 (All Countries):
Pearson Correlation: -0.2255, p-value: 0.0272
Linear Regression: y = -0.036044x + 0.006936
R-squared: 0.0508

2014-2019 (Euro):
Pearson Correlation: -0.3236, p-value: 0.0001
Linear Regression: y = -0.065617x + 0.004012
R-squared: 0.1047

2014-2019 (Non-Euro):
Pearson Correlation: -0.1159, p-value: 0.3864
Linear Regression: y = -0.01

In [None]:
# Read the exports table: a 'Country' column plus one column per year 2010-2023.
# debt_df is reused from the wage cell, which must have been run first.
exports_data = pd.read_csv('/content/drive/MyDrive/Capstone Data/YOY/Cleaned_Export_Yoy.csv')

# Reshape both tables to long form (Country, Year, value) with integer years.
exports_melted = exports_data.melt(
    id_vars=['Country'],
    value_vars=list(map(str, range(2010, 2024))),
    var_name='Year',
    value_name='Exports_YoY',
).astype({'Year': int})
debt_melted = debt_df.melt(
    id_vars=['Country'],
    value_vars=list(map(str, range(2010, 2024))),
    var_name='Year',
    value_name='Debt',
).astype({'Year': int})

# Inner join: only (Country, Year) pairs present in both tables are kept.
merged_df = exports_melted.merge(debt_melted, on=['Country', 'Year'])

# Named analysis windows -> the calendar years each one covers.
periods = {
    '2010': [2010],
    '2011-2013': list(range(2011, 2014)),
    '2014-2019': list(range(2014, 2020)),
    '2020-2021': [2020, 2021],
    '2022-2023': [2022, 2023],
    'Overall': list(range(2010, 2024)),
}

def identify_outliers(df, columns):
    """Split ``df`` into in-range rows and IQR outliers, one column at a time.

    For each name in ``columns`` (in order) the 25th/75th percentiles are taken
    from the rows that survived the previous columns; rows outside
    ``[Q1 - 1.5*IQR, Q3 + 1.5*IQR]`` are recorded and dropped before the next
    column is examined. The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'Country', 'Year' and every column named in ``columns``.
    columns : iterable of str
        Numeric columns to screen.

    Returns
    -------
    (df_clean, outliers_df)
        ``df_clean`` holds the rows within bounds for every screened column.
        Rows whose value is NaN fail the range comparison, so they are dropped
        from ``df_clean`` without being listed as outliers.
        ``outliers_df`` holds 'Country', 'Year', the flagged value (in a column
        named after the screened variable) and 'Variable'; it is an empty
        DataFrame when nothing was flagged.
    """
    df_clean = df.copy()
    outliers = []

    for col in columns:
        values = df_clean[col]
        q1 = values.quantile(0.25)
        q3 = values.quantile(0.75)
        iqr = q3 - q1
        lower_bound = q1 - 1.5 * iqr
        upper_bound = q3 + 1.5 * iqr

        is_outlier = (values < lower_bound) | (values > upper_bound)
        if is_outlier.any():
            # .loc + .assign builds an independent frame; adding the column to a
            # chained slice (df[mask][cols]) raised SettingWithCopyWarning.
            outliers.append(
                df_clean.loc[is_outlier, ['Country', 'Year', col]].assign(Variable=col)
            )

        # Keep only in-range rows (NaN compares False and is therefore dropped).
        df_clean = df_clean[(values >= lower_bound) & (values <= upper_bound)]

    outliers_df = pd.concat(outliers, ignore_index=True) if outliers else pd.DataFrame()
    return df_clean, outliers_df

def analyze_correlation(df, period_name, years, region_name, countries):
    """Relate Debt to Exports_YoY for one period/region, print the fit and save a plot.

    Rows of ``df`` whose 'Year' is in ``years`` and whose 'Country' is in
    ``countries`` are IQR-filtered with ``identify_outliers`` (the list of
    flagged rows is not used). The Pearson correlation and a linear regression
    of Exports_YoY on Debt are printed, and a scatter plot with the fitted line
    is saved as '<period_name>_<region_name>_exports_plot_no_outliers.png'.
    When fewer than two rows remain (or a column is entirely null) a notice is
    printed instead. Returns None.
    """
    selected = df['Year'].isin(years) & df['Country'].isin(countries)
    sample, _ = identify_outliers(df[selected], ['Debt', 'Exports_YoY'])
    debt, exports = sample['Debt'], sample['Exports_YoY']

    # Nothing meaningful can be fitted on fewer than two points.
    if len(sample) < 2 or debt.isnull().all() or exports.isnull().all():
        print(f"\n{period_name} ({region_name}): Insufficient data for analysis after outlier removal")
        return

    corr, p_value = pearsonr(debt, exports)

    # Single-feature regression: sklearn expects a 2-D design matrix.
    X = debt.values.reshape(-1, 1)
    y = exports.values
    reg = LinearRegression().fit(X, y)
    slope, intercept = reg.coef_[0], reg.intercept_
    r_squared = reg.score(X, y)

    print(
        f"\n{period_name} ({region_name}):",
        f"Pearson Correlation: {corr:.4f}, p-value: {p_value:.4f}",
        f"Linear Regression: y = {slope:.6f}x + {intercept:.6f}",
        f"R-squared: {r_squared:.4f}",
        sep="\n",
    )

    # Scatter of the filtered data with the fitted line, written to disk and closed.
    fig, ax = plt.subplots(figsize=(8, 6))
    ax.scatter(debt, exports, color='blue', label='Data')
    ax.plot(debt, reg.predict(X), color='red', label='Regression Line')
    ax.set_title(f'{period_name} - {region_name} Debt vs Exports YoY (Outliers Removed)')
    ax.set_xlabel('Public Sector Debt (% GDP)')
    ax.set_ylabel('Exports YoY Growth')
    ax.legend()
    fig.savefig(f'{period_name}_{region_name}_exports_plot_no_outliers.png')
    plt.close(fig)

# Run the analysis for every period, visiting the regions in a fixed order.
for period_name, years in periods.items():
    for region_label, region_members in (
        ('Euro', euro_countries),
        ('Non-Euro', non_euro_countries),
        ('All Countries', all_countries),
    ):
        analyze_correlation(merged_df, period_name, years, region_label, region_members)


2010 (Euro):
Pearson Correlation: -0.1538, p-value: 0.4731
Linear Regression: y = -0.082392x + 0.122876
R-squared: 0.0237

2010 (Non-Euro):
Pearson Correlation: 0.1783, p-value: 0.6462
Linear Regression: y = 0.093062x + 0.201602
R-squared: 0.0318

2010 (All Countries):
Pearson Correlation: -0.2210, p-value: 0.2164
Linear Regression: y = -0.144386x + 0.146959
R-squared: 0.0489

2011-2013 (Euro):
Pearson Correlation: -0.0811, p-value: 0.4983
Linear Regression: y = -0.127542x + 0.072940
R-squared: 0.0066

2011-2013 (Non-Euro):
Pearson Correlation: -0.0625, p-value: 0.7617
Linear Regression: y = -0.080993x + 0.056195
R-squared: 0.0039

2011-2013 (All Countries):
Pearson Correlation: 0.0011, p-value: 0.9916
Linear Regression: y = 0.001521x + 0.065270
R-squared: 0.0000

2014-2019 (Euro):
Pearson Correlation: -0.1793, p-value: 0.0315
Linear Regression: y = -0.380333x + 0.014850
R-squared: 0.0321

2014-2019 (Non-Euro):
Pearson Correlation: -0.3408, p-value: 0.0088
Linear Regression: y = -0.46

In [None]:
# Read the imports table: a 'Country' column plus one column per year 2010-2023.
# debt_df is reused from the wage cell, which must have been run first.
imports_data = pd.read_csv('/content/drive/MyDrive/Capstone Data/YOY/Cleaned_Import_Yoy.csv')

# Reshape both tables to long form (Country, Year, value) with integer years.
imports_melted = imports_data.melt(
    id_vars=['Country'],
    value_vars=list(map(str, range(2010, 2024))),
    var_name='Year',
    value_name='Imports_YoY',
).astype({'Year': int})
debt_melted = debt_df.melt(
    id_vars=['Country'],
    value_vars=list(map(str, range(2010, 2024))),
    var_name='Year',
    value_name='Debt',
).astype({'Year': int})

# Inner join: only (Country, Year) pairs present in both tables are kept.
merged_df = imports_melted.merge(debt_melted, on=['Country', 'Year'])

# Named analysis windows -> the calendar years each one covers.
periods = {
    '2010': [2010],
    '2011-2013': list(range(2011, 2014)),
    '2014-2019': list(range(2014, 2020)),
    '2020-2021': [2020, 2021],
    '2022-2023': [2022, 2023],
    'Overall': list(range(2010, 2024)),
}

def identify_outliers(df, columns):
    """Split ``df`` into in-range rows and IQR outliers, one column at a time.

    For each name in ``columns`` (in order) the 25th/75th percentiles are taken
    from the rows that survived the previous columns; rows outside
    ``[Q1 - 1.5*IQR, Q3 + 1.5*IQR]`` are recorded and dropped before the next
    column is examined. The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'Country', 'Year' and every column named in ``columns``.
    columns : iterable of str
        Numeric columns to screen.

    Returns
    -------
    (df_clean, outliers_df)
        ``df_clean`` holds the rows within bounds for every screened column.
        Rows whose value is NaN fail the range comparison, so they are dropped
        from ``df_clean`` without being listed as outliers.
        ``outliers_df`` holds 'Country', 'Year', the flagged value (in a column
        named after the screened variable) and 'Variable'; it is an empty
        DataFrame when nothing was flagged.
    """
    df_clean = df.copy()
    outliers = []

    for col in columns:
        values = df_clean[col]
        q1 = values.quantile(0.25)
        q3 = values.quantile(0.75)
        iqr = q3 - q1
        lower_bound = q1 - 1.5 * iqr
        upper_bound = q3 + 1.5 * iqr

        is_outlier = (values < lower_bound) | (values > upper_bound)
        if is_outlier.any():
            # .loc + .assign builds an independent frame; adding the column to a
            # chained slice (df[mask][cols]) raised SettingWithCopyWarning.
            outliers.append(
                df_clean.loc[is_outlier, ['Country', 'Year', col]].assign(Variable=col)
            )

        # Keep only in-range rows (NaN compares False and is therefore dropped).
        df_clean = df_clean[(values >= lower_bound) & (values <= upper_bound)]

    outliers_df = pd.concat(outliers, ignore_index=True) if outliers else pd.DataFrame()
    return df_clean, outliers_df

def analyze_correlation(df, period_name, years, region_name, countries):
    """Relate Debt to Imports_YoY for one period/region, print the fit and save a plot.

    Rows of ``df`` whose 'Year' is in ``years`` and whose 'Country' is in
    ``countries`` are IQR-filtered with ``identify_outliers`` (the list of
    flagged rows is not used). The Pearson correlation and a linear regression
    of Imports_YoY on Debt are printed, and a scatter plot with the fitted line
    is saved as '<period_name>_<region_name>_imports_plot_no_outliers.png'.
    When fewer than two rows remain (or a column is entirely null) a notice is
    printed instead. Returns None.
    """
    selected = df['Year'].isin(years) & df['Country'].isin(countries)
    sample, _ = identify_outliers(df[selected], ['Debt', 'Imports_YoY'])
    debt, imports = sample['Debt'], sample['Imports_YoY']

    # Nothing meaningful can be fitted on fewer than two points.
    if len(sample) < 2 or debt.isnull().all() or imports.isnull().all():
        print(f"\n{period_name} ({region_name}): Insufficient data for analysis after outlier removal")
        return

    corr, p_value = pearsonr(debt, imports)

    # Single-feature regression: sklearn expects a 2-D design matrix.
    X = debt.values.reshape(-1, 1)
    y = imports.values
    reg = LinearRegression().fit(X, y)
    slope, intercept = reg.coef_[0], reg.intercept_
    r_squared = reg.score(X, y)

    print(
        f"\n{period_name} ({region_name}):",
        f"Pearson Correlation: {corr:.4f}, p-value: {p_value:.4f}",
        f"Linear Regression: y = {slope:.6f}x + {intercept:.6f}",
        f"R-squared: {r_squared:.4f}",
        sep="\n",
    )

    # Scatter of the filtered data with the fitted line, written to disk and closed.
    fig, ax = plt.subplots(figsize=(8, 6))
    ax.scatter(debt, imports, color='blue', label='Data')
    ax.plot(debt, reg.predict(X), color='red', label='Regression Line')
    ax.set_title(f'{period_name} - {region_name} Debt vs Imports YoY (Outliers Removed)')
    ax.set_xlabel('Public Sector Debt (% GDP)')
    ax.set_ylabel('Imports YoY Growth')
    ax.legend()
    fig.savefig(f'{period_name}_{region_name}_imports_plot_no_outliers.png')
    plt.close(fig)

# Run the analysis for every period, visiting the regions in a fixed order.
for period_name, years in periods.items():
    for region_label, region_members in (
        ('Euro', euro_countries),
        ('Non-Euro', non_euro_countries),
        ('All Countries', all_countries),
    ):
        analyze_correlation(merged_df, period_name, years, region_label, region_members)


2010 (Euro):
Pearson Correlation: -0.0728, p-value: 0.7413
Linear Regression: y = -0.040241x + 0.126548
R-squared: 0.0053

2010 (Non-Euro):
Pearson Correlation: 0.1566, p-value: 0.6656
Linear Regression: y = 0.066235x + 0.239283
R-squared: 0.0245

2010 (All Countries):
Pearson Correlation: -0.3551, p-value: 0.0393
Linear Regression: y = -0.259927x + 0.165119
R-squared: 0.1261

2011-2013 (Euro):
Pearson Correlation: -0.2037, p-value: 0.0884
Linear Regression: y = -0.343620x + 0.065311
R-squared: 0.0415

2011-2013 (Non-Euro):
Pearson Correlation: 0.1341, p-value: 0.5047
Linear Regression: y = 0.217469x + 0.068907
R-squared: 0.0180

2011-2013 (All Countries):
Pearson Correlation: -0.0882, p-value: 0.3852
Linear Regression: y = -0.139667x + 0.065721
R-squared: 0.0078

2014-2019 (Euro):
Pearson Correlation: -0.1606, p-value: 0.0545
Linear Regression: y = -0.362071x + 0.014435
R-squared: 0.0258

2014-2019 (Non-Euro):
Pearson Correlation: -0.3583, p-value: 0.0053
Linear Regression: y = -0.49

In [None]:
# Read the GDP table: a 'Country' column plus one column per year 2010-2023.
# debt_df is reused from the wage cell, which must have been run first.
gdp_data = pd.read_csv('/content/drive/MyDrive/Capstone Data/YOY/Cleaned_National_GDP_Yoy.csv')

# Reshape both tables to long form (Country, Year, value) with integer years.
gdp_melted = gdp_data.melt(
    id_vars=['Country'],
    value_vars=list(map(str, range(2010, 2024))),
    var_name='Year',
    value_name='GDP',
).astype({'Year': int})
debt_melted = debt_df.melt(
    id_vars=['Country'],
    value_vars=list(map(str, range(2010, 2024))),
    var_name='Year',
    value_name='Debt',
).astype({'Year': int})

# Inner join: only (Country, Year) pairs present in both tables are kept.
merged_df = gdp_melted.merge(debt_melted, on=['Country', 'Year'])

# Named analysis windows -> the calendar years each one covers.
periods = {
    '2010': [2010],
    '2011-2013': list(range(2011, 2014)),
    '2014-2019': list(range(2014, 2020)),
    '2020-2021': [2020, 2021],
    '2022-2023': [2022, 2023],
    'Overall': list(range(2010, 2024)),
}

def identify_outliers(df, columns):
    """Split ``df`` into in-range rows and IQR outliers, one column at a time.

    For each name in ``columns`` (in order) the 25th/75th percentiles are taken
    from the rows that survived the previous columns; rows outside
    ``[Q1 - 1.5*IQR, Q3 + 1.5*IQR]`` are recorded and dropped before the next
    column is examined. The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'Country', 'Year' and every column named in ``columns``.
    columns : iterable of str
        Numeric columns to screen.

    Returns
    -------
    (df_clean, outliers_df)
        ``df_clean`` holds the rows within bounds for every screened column.
        Rows whose value is NaN fail the range comparison, so they are dropped
        from ``df_clean`` without being listed as outliers.
        ``outliers_df`` holds 'Country', 'Year', the flagged value (in a column
        named after the screened variable) and 'Variable'; it is an empty
        DataFrame when nothing was flagged.
    """
    df_clean = df.copy()
    outliers = []

    for col in columns:
        values = df_clean[col]
        q1 = values.quantile(0.25)
        q3 = values.quantile(0.75)
        iqr = q3 - q1
        lower_bound = q1 - 1.5 * iqr
        upper_bound = q3 + 1.5 * iqr

        is_outlier = (values < lower_bound) | (values > upper_bound)
        if is_outlier.any():
            # .loc + .assign builds an independent frame; adding the column to a
            # chained slice (df[mask][cols]) raised SettingWithCopyWarning.
            outliers.append(
                df_clean.loc[is_outlier, ['Country', 'Year', col]].assign(Variable=col)
            )

        # Keep only in-range rows (NaN compares False and is therefore dropped).
        df_clean = df_clean[(values >= lower_bound) & (values <= upper_bound)]

    outliers_df = pd.concat(outliers, ignore_index=True) if outliers else pd.DataFrame()
    return df_clean, outliers_df

def analyze_correlation(df, period_name, years, region_name, countries):
    """Relate Debt to GDP for one period/region and print the fit.

    Rows of ``df`` whose 'Year' is in ``years`` and whose 'Country' is in
    ``countries`` are IQR-filtered with ``identify_outliers`` (the list of
    flagged rows is not used). The Pearson correlation and a linear regression
    of GDP on Debt are printed; when fewer than two rows remain (or a column is
    entirely null) a notice is printed instead. Returns None.
    """
    selected = df['Year'].isin(years) & df['Country'].isin(countries)
    sample, _ = identify_outliers(df[selected], ['Debt', 'GDP'])
    debt, gdp = sample['Debt'], sample['GDP']

    # Nothing meaningful can be fitted on fewer than two points.
    if len(sample) < 2 or debt.isnull().all() or gdp.isnull().all():
        print(f"\n{period_name} ({region_name}): Insufficient data for analysis after outlier removal")
        return

    corr, p_value = pearsonr(debt, gdp)

    # Single-feature regression: sklearn expects a 2-D design matrix.
    X = debt.values.reshape(-1, 1)
    y = gdp.values
    reg = LinearRegression().fit(X, y)
    slope, intercept = reg.coef_[0], reg.intercept_
    r_squared = reg.score(X, y)

    print(
        f"\n{period_name} ({region_name}):",
        f"Pearson Correlation: {corr:.4f}, p-value: {p_value:.4f}",
        f"Linear Regression: y = {slope:.6f}x + {intercept:.6f}",
        f"R-squared: {r_squared:.4f}",
        sep="\n",
    )


# Run the analysis for every period, visiting the regions in a fixed order.
for period_name, years in periods.items():
    for region_label, region_members in (
        ('Euro', euro_countries),
        ('Non-Euro', non_euro_countries),
        ('All Countries', all_countries),
    ):
        analyze_correlation(merged_df, period_name, years, region_label, region_members)


2010 (Euro):
Pearson Correlation: -0.4798, p-value: 0.0238
Linear Regression: y = -0.188930x + 0.006557
R-squared: 0.2302

2010 (Non-Euro):
Pearson Correlation: 0.6281, p-value: 0.1309
Linear Regression: y = 0.200843x + 0.171129
R-squared: 0.3945

2010 (All Countries):
Pearson Correlation: -0.2262, p-value: 0.2211
Linear Regression: y = -0.187157x + 0.053456
R-squared: 0.0512

2011-2013 (Euro):
Pearson Correlation: -0.2871, p-value: 0.0168
Linear Regression: y = -0.347037x + 0.041486
R-squared: 0.0824

2011-2013 (Non-Euro):
Pearson Correlation: 0.0569, p-value: 0.8063
Linear Regression: y = 0.044015x + 0.062222
R-squared: 0.0032

2011-2013 (All Countries):
Pearson Correlation: -0.2998, p-value: 0.0043
Linear Regression: y = -0.339103x + 0.052829
R-squared: 0.0899

2014-2019 (Euro):
Pearson Correlation: -0.1610, p-value: 0.0642
Linear Regression: y = -0.268575x + 0.014512
R-squared: 0.0259

2014-2019 (Non-Euro):
Pearson Correlation: -0.3843, p-value: 0.0070
Linear Regression: y = -0.41

In [None]:
# NOTE(review): this cell repeats the preceding GDP cell verbatim (same file, same
# columns, same periods) -- confirm whether a different indicator was intended.
# Read the GDP table: a 'Country' column plus one column per year 2010-2023.
gdp_data = pd.read_csv('/content/drive/MyDrive/Capstone Data/YOY/Cleaned_National_GDP_Yoy.csv')

# Reshape both tables to long form (Country, Year, value) with integer years.
gdp_melted = gdp_data.melt(
    id_vars=['Country'],
    value_vars=list(map(str, range(2010, 2024))),
    var_name='Year',
    value_name='GDP',
).astype({'Year': int})
debt_melted = debt_df.melt(
    id_vars=['Country'],
    value_vars=list(map(str, range(2010, 2024))),
    var_name='Year',
    value_name='Debt',
).astype({'Year': int})

# Inner join: only (Country, Year) pairs present in both tables are kept.
merged_df = gdp_melted.merge(debt_melted, on=['Country', 'Year'])

# Named analysis windows -> the calendar years each one covers.
periods = {
    '2010': [2010],
    '2011-2013': list(range(2011, 2014)),
    '2014-2019': list(range(2014, 2020)),
    '2020-2021': [2020, 2021],
    '2022-2023': [2022, 2023],
    'Overall': list(range(2010, 2024)),
}

def identify_outliers(df, columns):
    """Split ``df`` into in-range rows and IQR outliers, one column at a time.

    For each name in ``columns`` (in order) the 25th/75th percentiles are taken
    from the rows that survived the previous columns; rows outside
    ``[Q1 - 1.5*IQR, Q3 + 1.5*IQR]`` are recorded and dropped before the next
    column is examined. The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'Country', 'Year' and every column named in ``columns``.
    columns : iterable of str
        Numeric columns to screen.

    Returns
    -------
    (df_clean, outliers_df)
        ``df_clean`` holds the rows within bounds for every screened column.
        Rows whose value is NaN fail the range comparison, so they are dropped
        from ``df_clean`` without being listed as outliers.
        ``outliers_df`` holds 'Country', 'Year', the flagged value (in a column
        named after the screened variable) and 'Variable'; it is an empty
        DataFrame when nothing was flagged.
    """
    df_clean = df.copy()
    outliers = []

    for col in columns:
        values = df_clean[col]
        q1 = values.quantile(0.25)
        q3 = values.quantile(0.75)
        iqr = q3 - q1
        lower_bound = q1 - 1.5 * iqr
        upper_bound = q3 + 1.5 * iqr

        is_outlier = (values < lower_bound) | (values > upper_bound)
        if is_outlier.any():
            # .loc + .assign builds an independent frame; adding the column to a
            # chained slice (df[mask][cols]) raised SettingWithCopyWarning.
            outliers.append(
                df_clean.loc[is_outlier, ['Country', 'Year', col]].assign(Variable=col)
            )

        # Keep only in-range rows (NaN compares False and is therefore dropped).
        df_clean = df_clean[(values >= lower_bound) & (values <= upper_bound)]

    outliers_df = pd.concat(outliers, ignore_index=True) if outliers else pd.DataFrame()
    return df_clean, outliers_df

def analyze_correlation(df, period_name, years, region_name, countries):
    """Relate Debt to GDP for one period/region and print the fit.

    Rows of ``df`` whose 'Year' is in ``years`` and whose 'Country' is in
    ``countries`` are IQR-filtered with ``identify_outliers`` (the list of
    flagged rows is not used). The Pearson correlation and a linear regression
    of GDP on Debt are printed; when fewer than two rows remain (or a column is
    entirely null) a notice is printed instead. Returns None.
    """
    selected = df['Year'].isin(years) & df['Country'].isin(countries)
    sample, _ = identify_outliers(df[selected], ['Debt', 'GDP'])
    debt, gdp = sample['Debt'], sample['GDP']

    # Nothing meaningful can be fitted on fewer than two points.
    if len(sample) < 2 or debt.isnull().all() or gdp.isnull().all():
        print(f"\n{period_name} ({region_name}): Insufficient data for analysis after outlier removal")
        return

    corr, p_value = pearsonr(debt, gdp)

    # Single-feature regression: sklearn expects a 2-D design matrix.
    X = debt.values.reshape(-1, 1)
    y = gdp.values
    reg = LinearRegression().fit(X, y)
    slope, intercept = reg.coef_[0], reg.intercept_
    r_squared = reg.score(X, y)

    print(
        f"\n{period_name} ({region_name}):",
        f"Pearson Correlation: {corr:.4f}, p-value: {p_value:.4f}",
        f"Linear Regression: y = {slope:.6f}x + {intercept:.6f}",
        f"R-squared: {r_squared:.4f}",
        sep="\n",
    )


# Run the analysis for every period, visiting the regions in a fixed order.
for period_name, years in periods.items():
    for region_label, region_members in (
        ('Euro', euro_countries),
        ('Non-Euro', non_euro_countries),
        ('All Countries', all_countries),
    ):
        analyze_correlation(merged_df, period_name, years, region_label, region_members)


2010 (Euro):
Pearson Correlation: -0.4798, p-value: 0.0238
Linear Regression: y = -0.188930x + 0.006557
R-squared: 0.2302

2010 (Non-Euro):
Pearson Correlation: 0.6281, p-value: 0.1309
Linear Regression: y = 0.200843x + 0.171129
R-squared: 0.3945

2010 (All Countries):
Pearson Correlation: -0.2262, p-value: 0.2211
Linear Regression: y = -0.187157x + 0.053456
R-squared: 0.0512

2011-2013 (Euro):
Pearson Correlation: -0.2871, p-value: 0.0168
Linear Regression: y = -0.347037x + 0.041486
R-squared: 0.0824

2011-2013 (Non-Euro):
Pearson Correlation: 0.0569, p-value: 0.8063
Linear Regression: y = 0.044015x + 0.062222
R-squared: 0.0032

2011-2013 (All Countries):
Pearson Correlation: -0.2998, p-value: 0.0043
Linear Regression: y = -0.339103x + 0.052829
R-squared: 0.0899

2014-2019 (Euro):
Pearson Correlation: -0.1610, p-value: 0.0642
Linear Regression: y = -0.268575x + 0.014512
R-squared: 0.0259

2014-2019 (Non-Euro):
Pearson Correlation: -0.3843, p-value: 0.0070
Linear Regression: y = -0.41

In [None]:
# Read the population-growth table: a 'Country' column plus one column per year 2010-2023.
# debt_df is reused from the wage cell, which must have been run first.
pop_data = pd.read_csv('/content/drive/MyDrive/Capstone Data/YOY/Cleaned_Pop_Growth.csv')

# Reshape both tables to long form (Country, Year, value) with integer years.
pop_melted = pop_data.melt(
    id_vars=['Country'],
    value_vars=list(map(str, range(2010, 2024))),
    var_name='Year',
    value_name='Population',
).astype({'Year': int})
debt_melted = debt_df.melt(
    id_vars=['Country'],
    value_vars=list(map(str, range(2010, 2024))),
    var_name='Year',
    value_name='Debt',
).astype({'Year': int})

# Inner join: only (Country, Year) pairs present in both tables are kept.
merged_df = pop_melted.merge(debt_melted, on=['Country', 'Year'])

# Named analysis windows -> the calendar years each one covers.
# Unlike the other cells, this one also includes a combined 2010-2013 window.
periods = {
    '2010': [2010],
    '2011-2013': list(range(2011, 2014)),
    '2014-2019': list(range(2014, 2020)),
    '2020-2021': [2020, 2021],
    '2022-2023': [2022, 2023],
    '2010-2013': list(range(2010, 2014)),
    'Overall': list(range(2010, 2024)),
}

def identify_outliers(df, columns):
    """Split ``df`` into in-range rows and IQR outliers, one column at a time.

    For each name in ``columns`` (in order) the 25th/75th percentiles are taken
    from the rows that survived the previous columns; rows outside
    ``[Q1 - 1.5*IQR, Q3 + 1.5*IQR]`` are recorded and dropped before the next
    column is examined. The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'Country', 'Year' and every column named in ``columns``.
    columns : iterable of str
        Numeric columns to screen.

    Returns
    -------
    (df_clean, outliers_df)
        ``df_clean`` holds the rows within bounds for every screened column.
        Rows whose value is NaN fail the range comparison, so they are dropped
        from ``df_clean`` without being listed as outliers.
        ``outliers_df`` holds 'Country', 'Year', the flagged value (in a column
        named after the screened variable) and 'Variable'; it is an empty
        DataFrame when nothing was flagged.
    """
    df_clean = df.copy()
    outliers = []

    for col in columns:
        values = df_clean[col]
        q1 = values.quantile(0.25)
        q3 = values.quantile(0.75)
        iqr = q3 - q1
        lower_bound = q1 - 1.5 * iqr
        upper_bound = q3 + 1.5 * iqr

        is_outlier = (values < lower_bound) | (values > upper_bound)
        if is_outlier.any():
            # .loc + .assign builds an independent frame; adding the column to a
            # chained slice (df[mask][cols]) raised SettingWithCopyWarning.
            outliers.append(
                df_clean.loc[is_outlier, ['Country', 'Year', col]].assign(Variable=col)
            )

        # Keep only in-range rows (NaN compares False and is therefore dropped).
        df_clean = df_clean[(values >= lower_bound) & (values <= upper_bound)]

    outliers_df = pd.concat(outliers, ignore_index=True) if outliers else pd.DataFrame()
    return df_clean, outliers_df

# Function to calculate Pearson correlation and linear regression
def analyze_correlation(df, period_name, years, region_name, countries, value_col='Population'):
    """Print the Pearson correlation and linear fit of ``value_col`` on Debt.

    Filters ``df`` to ``years`` and ``countries``, drops IQR outliers in
    'Debt' and ``value_col`` via ``identify_outliers``, then prints r, the
    p-value, the fitted line (x = Debt, y = ``value_col``) and R-squared.

    Args:
        df: Long frame with 'Country', 'Year', 'Debt' and ``value_col``.
        period_name: Label printed for the period.
        years: Years to keep.
        region_name: Label printed for the country group.
        countries: Countries to keep.
        value_col: Column compared against Debt. Defaults to 'Population',
            the column this cell hard-coded, so five-argument calls still work.

    Returns:
        None. Results (or a skip message) are printed.
    """
    period_df = df[(df['Year'].isin(years)) & (df['Country'].isin(countries))]

    # Identify and remove outliers (the outlier report itself is not used here)
    period_df_clean, _ = identify_outliers(period_df, ['Debt', value_col])

    # Check if data is sufficient
    if len(period_df_clean) < 2 or period_df_clean['Debt'].isnull().all() or period_df_clean[value_col].isnull().all():
        print(f"\n{period_name} ({region_name}): Insufficient data for analysis after outlier removal")
        return

    # Pearson's r is undefined when either series is constant (this happens
    # when IQR == 0, because the filter then keeps a single value); skip
    # instead of printing NaN statistics.
    if period_df_clean['Debt'].nunique() < 2 or period_df_clean[value_col].nunique() < 2:
        print(f"\n{period_name} ({region_name}): Correlation undefined - Debt or {value_col} is constant after outlier removal")
        return

    # Pearson correlation
    corr, p_value = pearsonr(period_df_clean['Debt'], period_df_clean[value_col])

    # Linear regression of value_col (y) on Debt (x)
    X = period_df_clean['Debt'].values.reshape(-1, 1)
    y = period_df_clean[value_col].values
    reg = LinearRegression().fit(X, y)
    slope = reg.coef_[0]
    intercept = reg.intercept_
    r_squared = reg.score(X, y)

    # Print results
    print(f"\n{period_name} ({region_name}):")
    print(f"Pearson Correlation: {corr:.4f}, p-value: {p_value:.4f}")
    print(f"Linear Regression: y = {slope:.6f}x + {intercept:.6f}")
    print(f"R-squared: {r_squared:.4f}")


# Run the analysis for every period, in the order Euro -> Non-Euro -> All Countries
for period_name, years in periods.items():
    for _region_label, _region_members in (
        ('Euro', euro_countries),
        ('Non-Euro', non_euro_countries),
        ('All Countries', all_countries),
    ):
        analyze_correlation(merged_df, period_name, years, _region_label, _region_members)


2010 (Euro):
Pearson Correlation: -0.2933, p-value: 0.1853
Linear Regression: y = -0.015158x + 0.004325
R-squared: 0.0860

2010 (Non-Euro):
Pearson Correlation: -0.0636, p-value: 0.8708
Linear Regression: y = -0.001243x + 0.011477
R-squared: 0.0040

2010 (All Countries):
Pearson Correlation: -0.0809, p-value: 0.6600
Linear Regression: y = -0.004072x + 0.005732
R-squared: 0.0065

2011-2013 (Euro):
Pearson Correlation: -0.0613, p-value: 0.6278
Linear Regression: y = -0.005638x + 0.002880
R-squared: 0.0038

2011-2013 (Non-Euro):
Pearson Correlation: 0.0763, p-value: 0.7232
Linear Regression: y = 0.004206x + 0.009918
R-squared: 0.0058

2011-2013 (All Countries):
Pearson Correlation: -0.0613, p-value: 0.5551
Linear Regression: y = -0.006186x + 0.005069
R-squared: 0.0038

2014-2019 (Euro):
Pearson Correlation: -0.2628, p-value: 0.0020
Linear Regression: y = -0.035847x + 0.002273
R-squared: 0.0691

2014-2019 (Non-Euro):
Pearson Correlation: 0.0340, p-value: 0.7980
Linear Regression: y = 0.00

In [None]:
unemployment_data = pd.read_csv('/content/drive/MyDrive/Capstone Data/YOY/Cleaned_Unemployment_Yoy.csv')

# Reshape both wide tables (one column per year, 2010-2023) into long
# Country/Year rows and cast the melted Year labels from str to int.
unemployment_melted = (
    unemployment_data
    .melt(id_vars=['Country'], value_vars=list(map(str, range(2010, 2024))),
          var_name='Year', value_name='Unemployment')
    .astype({'Year': int})
)
debt_melted = (
    debt_df
    .melt(id_vars=['Country'], value_vars=list(map(str, range(2010, 2024))),
          var_name='Year', value_name='Debt')
    .astype({'Year': int})
)

# Join unemployment and debt on (Country, Year); unmatched rows are dropped.
merged_df = unemployment_melted.merge(debt_melted, on=['Country', 'Year'])

# Year groupings analysed below ('2010-2013' overlaps '2010' and '2011-2013').
periods = {
    '2010': [2010],
    '2011-2013': list(range(2011, 2014)),
    '2014-2019': list(range(2014, 2020)),
    '2020-2021': [2020, 2021],
    '2022-2023': [2022, 2023],
    '2010-2013': list(range(2010, 2014)),
    'Overall': list(range(2010, 2024)),
}

# Function to identify and remove outliers using IQR method
def identify_outliers(df, columns):
    """Remove IQR outliers from ``df`` one column at a time and report them.

    For each name in ``columns`` (in order) the 1.5*IQR fences are computed on
    the rows that survived the previous columns, so the result depends on the
    column order. Rows whose value is NaN fail both fence comparisons: they
    are dropped from the cleaned frame but are not listed as outliers.

    Args:
        df: Frame with 'Country', 'Year' and every column in ``columns``.
        columns: Numeric column names to filter on.

    Returns:
        (df_clean, outliers_df): the surviving rows, and the flagged rows with
        'Country', 'Year', the offending value and a 'Variable' column naming
        the column that flagged them (an empty DataFrame when nothing is
        flagged). ``df`` itself is not modified.
    """
    df_clean = df.copy()
    outliers = []

    for col in columns:
        Q1 = df_clean[col].quantile(0.25)
        Q3 = df_clean[col].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR

        # Identify outliers. .loc + .assign builds a fresh frame, so tagging
        # the rows no longer writes into a chained-indexing result
        # (which raised SettingWithCopyWarning).
        is_outlier = (df_clean[col] < lower_bound) | (df_clean[col] > upper_bound)
        if is_outlier.any():
            outliers.append(
                df_clean.loc[is_outlier, ['Country', 'Year', col]].assign(Variable=col)
            )

        # Remove outliers for analysis (NaN rows fail both tests and go too)
        df_clean = df_clean[(df_clean[col] >= lower_bound) & (df_clean[col] <= upper_bound)]

    outliers_df = pd.concat(outliers, ignore_index=True) if outliers else pd.DataFrame()
    return df_clean, outliers_df

# Function to calculate Pearson correlation and linear regression
def analyze_correlation(df, period_name, years, region_name, countries, value_col='Unemployment'):
    """Print the Pearson correlation and linear fit of ``value_col`` on Debt.

    Filters ``df`` to ``years`` and ``countries``, drops IQR outliers in
    'Debt' and ``value_col`` via ``identify_outliers``, then prints r, the
    p-value, the fitted line (x = Debt, y = ``value_col``) and R-squared.

    Args:
        df: Long frame with 'Country', 'Year', 'Debt' and ``value_col``.
        period_name: Label printed for the period.
        years: Years to keep.
        region_name: Label printed for the country group.
        countries: Countries to keep.
        value_col: Column compared against Debt. Defaults to 'Unemployment',
            the column this cell hard-coded, so five-argument calls still work.

    Returns:
        None. Results (or a skip message) are printed.
    """
    period_df = df[(df['Year'].isin(years)) & (df['Country'].isin(countries))]

    # Identify and remove outliers (the outlier report itself is not used here)
    period_df_clean, _ = identify_outliers(period_df, ['Debt', value_col])

    # Check if data is sufficient
    if len(period_df_clean) < 2 or period_df_clean['Debt'].isnull().all() or period_df_clean[value_col].isnull().all():
        print(f"\n{period_name} ({region_name}): Insufficient data for analysis after outlier removal")
        return

    # Pearson's r is undefined when either series is constant (this happens
    # when IQR == 0, because the filter then keeps a single value); skip
    # instead of printing NaN statistics.
    if period_df_clean['Debt'].nunique() < 2 or period_df_clean[value_col].nunique() < 2:
        print(f"\n{period_name} ({region_name}): Correlation undefined - Debt or {value_col} is constant after outlier removal")
        return

    # Pearson correlation
    corr, p_value = pearsonr(period_df_clean['Debt'], period_df_clean[value_col])

    # Linear regression of value_col (y) on Debt (x)
    X = period_df_clean['Debt'].values.reshape(-1, 1)
    y = period_df_clean[value_col].values
    reg = LinearRegression().fit(X, y)
    slope = reg.coef_[0]
    intercept = reg.intercept_
    r_squared = reg.score(X, y)

    # Print results
    print(f"\n{period_name} ({region_name}):")
    print(f"Pearson Correlation: {corr:.4f}, p-value: {p_value:.4f}")
    print(f"Linear Regression: y = {slope:.6f}x + {intercept:.6f}")
    print(f"R-squared: {r_squared:.4f}")


# Run the analysis for every period, in the order Euro -> Non-Euro -> All Countries
for period_name, years in periods.items():
    for _region_label, _region_members in (
        ('Euro', euro_countries),
        ('Non-Euro', non_euro_countries),
        ('All Countries', all_countries),
    ):
        analyze_correlation(merged_df, period_name, years, _region_label, _region_members)


2010 (Euro):
Pearson Correlation: 0.1750, p-value: 0.4135
Linear Regression: y = 0.185838x + 0.102643
R-squared: 0.0306

2010 (Non-Euro):
Pearson Correlation: -0.1701, p-value: 0.6385
Linear Regression: y = -0.105725x + -0.055073
R-squared: 0.0289

2010 (All Countries):
Pearson Correlation: 0.2724, p-value: 0.1191
Linear Regression: y = 0.294746x + 0.051126
R-squared: 0.0742

2011-2013 (Euro):
Pearson Correlation: 0.4469, p-value: 0.0001
Linear Regression: y = 0.770892x + -0.027290
R-squared: 0.1998

2011-2013 (Non-Euro):
Pearson Correlation: 0.1727, p-value: 0.3890
Linear Regression: y = 0.165332x + -0.041909
R-squared: 0.0298

2011-2013 (All Countries):
Pearson Correlation: 0.3954, p-value: 0.0001
Linear Regression: y = 0.544715x + -0.032611
R-squared: 0.1564

2014-2019 (Euro):
Pearson Correlation: 0.2259, p-value: 0.0069
Linear Regression: y = 0.414001x + -0.080619
R-squared: 0.0510

2014-2019 (Non-Euro):
Pearson Correlation: 0.2673, p-value: 0.0425
Linear Regression: y = 0.253290x

In [None]:
acct_data = pd.read_csv('/content/drive/MyDrive/Capstone Data/YOY/Cleaned_current_account_yoy.csv')

# Reshape both wide tables (one column per year, 2010-2023) into long
# Country/Year rows and cast the melted Year labels from str to int.
acct_melted = (
    acct_data
    .melt(id_vars=['Country'], value_vars=list(map(str, range(2010, 2024))),
          var_name='Year', value_name='Account')
    .astype({'Year': int})
)
debt_melted = (
    debt_df
    .melt(id_vars=['Country'], value_vars=list(map(str, range(2010, 2024))),
          var_name='Year', value_name='Debt')
    .astype({'Year': int})
)

# Join current-account and debt on (Country, Year); unmatched rows are dropped.
merged_df = acct_melted.merge(debt_melted, on=['Country', 'Year'])

# Year groupings analysed below ('2010-2013' overlaps '2010' and '2011-2013').
periods = {
    '2010': [2010],
    '2011-2013': list(range(2011, 2014)),
    '2014-2019': list(range(2014, 2020)),
    '2020-2021': [2020, 2021],
    '2022-2023': [2022, 2023],
    '2010-2013': list(range(2010, 2014)),
    'Overall': list(range(2010, 2024)),
}

# Function to identify and remove outliers using IQR method
def identify_outliers(df, columns):
    """Remove IQR outliers from ``df`` one column at a time and report them.

    For each name in ``columns`` (in order) the 1.5*IQR fences are computed on
    the rows that survived the previous columns, so the result depends on the
    column order. Rows whose value is NaN fail both fence comparisons: they
    are dropped from the cleaned frame but are not listed as outliers.

    Args:
        df: Frame with 'Country', 'Year' and every column in ``columns``.
        columns: Numeric column names to filter on.

    Returns:
        (df_clean, outliers_df): the surviving rows, and the flagged rows with
        'Country', 'Year', the offending value and a 'Variable' column naming
        the column that flagged them (an empty DataFrame when nothing is
        flagged). ``df`` itself is not modified.
    """
    df_clean = df.copy()
    outliers = []

    for col in columns:
        Q1 = df_clean[col].quantile(0.25)
        Q3 = df_clean[col].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR

        # Identify outliers. .loc + .assign builds a fresh frame, so tagging
        # the rows no longer writes into a chained-indexing result
        # (which raised SettingWithCopyWarning).
        is_outlier = (df_clean[col] < lower_bound) | (df_clean[col] > upper_bound)
        if is_outlier.any():
            outliers.append(
                df_clean.loc[is_outlier, ['Country', 'Year', col]].assign(Variable=col)
            )

        # Remove outliers for analysis (NaN rows fail both tests and go too)
        df_clean = df_clean[(df_clean[col] >= lower_bound) & (df_clean[col] <= upper_bound)]

    outliers_df = pd.concat(outliers, ignore_index=True) if outliers else pd.DataFrame()
    return df_clean, outliers_df

# Function to calculate Pearson correlation and linear regression
def analyze_correlation(df, period_name, years, region_name, countries, value_col='Account'):
    """Print the Pearson correlation and linear fit of ``value_col`` on Debt.

    Filters ``df`` to ``years`` and ``countries``, drops IQR outliers in
    'Debt' and ``value_col`` via ``identify_outliers``, then prints r, the
    p-value, the fitted line (x = Debt, y = ``value_col``) and R-squared.

    Args:
        df: Long frame with 'Country', 'Year', 'Debt' and ``value_col``.
        period_name: Label printed for the period.
        years: Years to keep.
        region_name: Label printed for the country group.
        countries: Countries to keep.
        value_col: Column compared against Debt. Defaults to 'Account',
            the column this cell hard-coded, so five-argument calls still work.

    Returns:
        None. Results (or a skip message) are printed.
    """
    period_df = df[(df['Year'].isin(years)) & (df['Country'].isin(countries))]

    # Identify and remove outliers (the outlier report itself is not used here)
    period_df_clean, _ = identify_outliers(period_df, ['Debt', value_col])

    # Check if data is sufficient
    if len(period_df_clean) < 2 or period_df_clean['Debt'].isnull().all() or period_df_clean[value_col].isnull().all():
        print(f"\n{period_name} ({region_name}): Insufficient data for analysis after outlier removal")
        return

    # Pearson's r is undefined when either series is constant (this happens
    # when IQR == 0, because the filter then keeps a single value); skip
    # instead of printing NaN statistics.
    if period_df_clean['Debt'].nunique() < 2 or period_df_clean[value_col].nunique() < 2:
        print(f"\n{period_name} ({region_name}): Correlation undefined - Debt or {value_col} is constant after outlier removal")
        return

    # Pearson correlation
    corr, p_value = pearsonr(period_df_clean['Debt'], period_df_clean[value_col])

    # Linear regression of value_col (y) on Debt (x)
    X = period_df_clean['Debt'].values.reshape(-1, 1)
    y = period_df_clean[value_col].values
    reg = LinearRegression().fit(X, y)
    slope = reg.coef_[0]
    intercept = reg.intercept_
    r_squared = reg.score(X, y)

    # Print results
    print(f"\n{period_name} ({region_name}):")
    print(f"Pearson Correlation: {corr:.4f}, p-value: {p_value:.4f}")
    print(f"Linear Regression: y = {slope:.6f}x + {intercept:.6f}")
    print(f"R-squared: {r_squared:.4f}")


# Run the analysis for every period, in the order Euro -> Non-Euro -> All Countries
for period_name, years in periods.items():
    for _region_label, _region_members in (
        ('Euro', euro_countries),
        ('Non-Euro', non_euro_countries),
        ('All Countries', all_countries),
    ):
        analyze_correlation(merged_df, period_name, years, _region_label, _region_members)


2010 (Euro):
Pearson Correlation: -0.4529, p-value: 0.0263
Linear Regression: y = -2.576126x + 0.226067
R-squared: 0.2051

2010 (Non-Euro):
Pearson Correlation: -0.2913, p-value: 0.4469
Linear Regression: y = -0.764991x + 0.233624
R-squared: 0.0849

2010 (All Countries):
Pearson Correlation: -0.3681, p-value: 0.0382
Linear Regression: y = -1.980727x + 0.222750
R-squared: 0.1355

2011-2013 (Euro):
Pearson Correlation: -0.1384, p-value: 0.3001
Linear Regression: y = -1.197550x + -0.140230
R-squared: 0.0192

2011-2013 (Non-Euro):
Pearson Correlation: 0.2577, p-value: 0.2137
Linear Regression: y = 1.512077x + 0.005274
R-squared: 0.0664

2011-2013 (All Countries):
Pearson Correlation: -0.1003, p-value: 0.3699
Linear Regression: y = -0.609667x + -0.048768
R-squared: 0.0101

2014-2019 (Euro):
Pearson Correlation: -0.0560, p-value: 0.5268
Linear Regression: y = -0.813088x + -0.031961
R-squared: 0.0031

2014-2019 (Non-Euro):
Pearson Correlation: -0.0254, p-value: 0.8553
Linear Regression: y = 

In [None]:
inf_data = pd.read_csv('/content/drive/MyDrive/Capstone Data/YOY/clean_inflation_rate (2).csv')

# Reshape both wide tables (one column per year, 2010-2023) into long
# Country/Year rows and cast the melted Year labels from str to int.
inf_melted = (
    inf_data
    .melt(id_vars=['Country'], value_vars=list(map(str, range(2010, 2024))),
          var_name='Year', value_name='Inflation')
    .astype({'Year': int})
)
debt_melted = (
    debt_df
    .melt(id_vars=['Country'], value_vars=list(map(str, range(2010, 2024))),
          var_name='Year', value_name='Debt')
    .astype({'Year': int})
)

# Join inflation and debt on (Country, Year); unmatched rows are dropped.
merged_df = inf_melted.merge(debt_melted, on=['Country', 'Year'])

# Year groupings analysed below ('2010-2013' overlaps '2010' and '2011-2013').
periods = {
    '2010': [2010],
    '2011-2013': list(range(2011, 2014)),
    '2014-2019': list(range(2014, 2020)),
    '2020-2021': [2020, 2021],
    '2022-2023': [2022, 2023],
    '2010-2013': list(range(2010, 2014)),
    'Overall': list(range(2010, 2024)),
}

# Function to identify and remove outliers using IQR method
def identify_outliers(df, columns):
    """Remove IQR outliers from ``df`` one column at a time and report them.

    For each name in ``columns`` (in order) the 1.5*IQR fences are computed on
    the rows that survived the previous columns, so the result depends on the
    column order. Rows whose value is NaN fail both fence comparisons: they
    are dropped from the cleaned frame but are not listed as outliers.

    Args:
        df: Frame with 'Country', 'Year' and every column in ``columns``.
        columns: Numeric column names to filter on.

    Returns:
        (df_clean, outliers_df): the surviving rows, and the flagged rows with
        'Country', 'Year', the offending value and a 'Variable' column naming
        the column that flagged them (an empty DataFrame when nothing is
        flagged). ``df`` itself is not modified.
    """
    df_clean = df.copy()
    outliers = []

    for col in columns:
        Q1 = df_clean[col].quantile(0.25)
        Q3 = df_clean[col].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR

        # Identify outliers. .loc + .assign builds a fresh frame, so tagging
        # the rows no longer writes into a chained-indexing result
        # (which raised SettingWithCopyWarning).
        is_outlier = (df_clean[col] < lower_bound) | (df_clean[col] > upper_bound)
        if is_outlier.any():
            outliers.append(
                df_clean.loc[is_outlier, ['Country', 'Year', col]].assign(Variable=col)
            )

        # Remove outliers for analysis (NaN rows fail both tests and go too)
        df_clean = df_clean[(df_clean[col] >= lower_bound) & (df_clean[col] <= upper_bound)]

    outliers_df = pd.concat(outliers, ignore_index=True) if outliers else pd.DataFrame()
    return df_clean, outliers_df

# Function to calculate Pearson correlation and linear regression
def analyze_correlation(df, period_name, years, region_name, countries, value_col='Inflation'):
    """Print the Pearson correlation and linear fit of ``value_col`` on Debt.

    Filters ``df`` to ``years`` and ``countries``, drops IQR outliers in
    'Debt' and ``value_col`` via ``identify_outliers``, then prints r, the
    p-value, the fitted line (x = Debt, y = ``value_col``) and R-squared.

    Args:
        df: Long frame with 'Country', 'Year', 'Debt' and ``value_col``.
        period_name: Label printed for the period.
        years: Years to keep.
        region_name: Label printed for the country group.
        countries: Countries to keep.
        value_col: Column compared against Debt. Defaults to 'Inflation',
            the column this cell hard-coded, so five-argument calls still work.

    Returns:
        None. Results (or a skip message) are printed.
    """
    period_df = df[(df['Year'].isin(years)) & (df['Country'].isin(countries))]

    # Identify and remove outliers (the outlier report itself is not used here)
    period_df_clean, _ = identify_outliers(period_df, ['Debt', value_col])

    # Check if data is sufficient
    if len(period_df_clean) < 2 or period_df_clean['Debt'].isnull().all() or period_df_clean[value_col].isnull().all():
        print(f"\n{period_name} ({region_name}): Insufficient data for analysis after outlier removal")
        return

    # Pearson's r is undefined when either series is constant (this happens
    # when IQR == 0, because the filter then keeps a single value); skip
    # instead of printing NaN statistics.
    if period_df_clean['Debt'].nunique() < 2 or period_df_clean[value_col].nunique() < 2:
        print(f"\n{period_name} ({region_name}): Correlation undefined - Debt or {value_col} is constant after outlier removal")
        return

    # Pearson correlation
    corr, p_value = pearsonr(period_df_clean['Debt'], period_df_clean[value_col])

    # Linear regression of value_col (y) on Debt (x)
    X = period_df_clean['Debt'].values.reshape(-1, 1)
    y = period_df_clean[value_col].values
    reg = LinearRegression().fit(X, y)
    slope = reg.coef_[0]
    intercept = reg.intercept_
    r_squared = reg.score(X, y)

    # Print results
    print(f"\n{period_name} ({region_name}):")
    print(f"Pearson Correlation: {corr:.4f}, p-value: {p_value:.4f}")
    print(f"Linear Regression: y = {slope:.6f}x + {intercept:.6f}")
    print(f"R-squared: {r_squared:.4f}")


# Run the analysis for every period, in the order Euro -> Non-Euro -> All Countries
for period_name, years in periods.items():
    for _region_label, _region_members in (
        ('Euro', euro_countries),
        ('Non-Euro', non_euro_countries),
        ('All Countries', all_countries),
    ):
        analyze_correlation(merged_df, period_name, years, _region_label, _region_members)


2010 (Euro):
Pearson Correlation: -0.1734, p-value: 0.4778
Linear Regression: y = -0.011409x + 0.017763
R-squared: 0.0301

2010 (Non-Euro):
Pearson Correlation: -0.5042, p-value: 0.2026
Linear Regression: y = -0.026287x + 0.027763
R-squared: 0.2543

2010 (All Countries):
Pearson Correlation: -0.1107, p-value: 0.5904
Linear Regression: y = -0.008789x + 0.019997
R-squared: 0.0122

2011-2013 (Euro):
Pearson Correlation: 0.0397, p-value: 0.7459
Linear Regression: y = 0.008979x + 0.023064
R-squared: 0.0016

2011-2013 (Non-Euro):
Pearson Correlation: 0.1277, p-value: 0.5429
Linear Regression: y = 0.030461x + 0.020604
R-squared: 0.0163

2011-2013 (All Countries):
Pearson Correlation: 0.0723, p-value: 0.4865
Linear Regression: y = 0.015925x + 0.022502
R-squared: 0.0052

2014-2019 (Euro):
Pearson Correlation: -0.2101, p-value: 0.0131
Linear Regression: y = -0.056718x + 0.009253
R-squared: 0.0441

2014-2019 (Non-Euro):
Pearson Correlation: 0.2422, p-value: 0.0836
Linear Regression: y = 0.053058

In [None]:
fdi_data = pd.read_csv('/content/drive/Shareddrives/Capstone/Capstone Project NY/Cleaned Data/YoY/FDI_YoY_Change__2010-2023_.csv') # change path if necessary

# Reshape both wide tables (one column per year, 2010-2023) into long
# Country/Year rows and cast the melted Year labels from str to int.
fdi_melted = (
    fdi_data
    .melt(id_vars=['Country'], value_vars=list(map(str, range(2010, 2024))),
          var_name='Year', value_name='FDI')
    .astype({'Year': int})
)
debt_melted = (
    debt_df
    .melt(id_vars=['Country'], value_vars=list(map(str, range(2010, 2024))),
          var_name='Year', value_name='Debt')
    .astype({'Year': int})
)

# Join FDI and debt on (Country, Year); unmatched rows are dropped.
merged_df = fdi_melted.merge(debt_melted, on=['Country', 'Year'])

# Year groupings analysed below ('2010-2013' overlaps '2010' and '2011-2013').
periods = {
    '2010': [2010],
    '2011-2013': list(range(2011, 2014)),
    '2014-2019': list(range(2014, 2020)),
    '2020-2021': [2020, 2021],
    '2022-2023': [2022, 2023],
    '2010-2013': list(range(2010, 2014)),
    'Overall': list(range(2010, 2024)),
}

# Function to identify and remove outliers using IQR method
def identify_outliers(df, columns):
    """Remove IQR outliers from ``df`` one column at a time and report them.

    For each name in ``columns`` (in order) the 1.5*IQR fences are computed on
    the rows that survived the previous columns, so the result depends on the
    column order. Rows whose value is NaN fail both fence comparisons: they
    are dropped from the cleaned frame but are not listed as outliers.

    Args:
        df: Frame with 'Country', 'Year' and every column in ``columns``.
        columns: Numeric column names to filter on.

    Returns:
        (df_clean, outliers_df): the surviving rows, and the flagged rows with
        'Country', 'Year', the offending value and a 'Variable' column naming
        the column that flagged them (an empty DataFrame when nothing is
        flagged). ``df`` itself is not modified.
    """
    df_clean = df.copy()
    outliers = []

    for col in columns:
        Q1 = df_clean[col].quantile(0.25)
        Q3 = df_clean[col].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR

        # Identify outliers. .loc + .assign builds a fresh frame, so tagging
        # the rows no longer writes into a chained-indexing result
        # (which raised SettingWithCopyWarning).
        is_outlier = (df_clean[col] < lower_bound) | (df_clean[col] > upper_bound)
        if is_outlier.any():
            outliers.append(
                df_clean.loc[is_outlier, ['Country', 'Year', col]].assign(Variable=col)
            )

        # Remove outliers for analysis (NaN rows fail both tests and go too)
        df_clean = df_clean[(df_clean[col] >= lower_bound) & (df_clean[col] <= upper_bound)]

    outliers_df = pd.concat(outliers, ignore_index=True) if outliers else pd.DataFrame()
    return df_clean, outliers_df

# Function to calculate Pearson correlation and linear regression
def analyze_correlation(df, period_name, years, region_name, countries, value_col='FDI'):
    """Print the Pearson correlation and linear fit of ``value_col`` on Debt.

    Filters ``df`` to ``years`` and ``countries``, drops IQR outliers in
    'Debt' and ``value_col`` via ``identify_outliers``, then prints r, the
    p-value, the fitted line (x = Debt, y = ``value_col``) and R-squared.

    Args:
        df: Long frame with 'Country', 'Year', 'Debt' and ``value_col``.
        period_name: Label printed for the period.
        years: Years to keep.
        region_name: Label printed for the country group.
        countries: Countries to keep.
        value_col: Column compared against Debt. Defaults to 'FDI',
            the column this cell hard-coded, so five-argument calls still work.

    Returns:
        None. Results (or a skip message) are printed.
    """
    period_df = df[(df['Year'].isin(years)) & (df['Country'].isin(countries))]

    # Identify and remove outliers (the outlier report itself is not used here)
    period_df_clean, _ = identify_outliers(period_df, ['Debt', value_col])

    # Check if data is sufficient
    if len(period_df_clean) < 2 or period_df_clean['Debt'].isnull().all() or period_df_clean[value_col].isnull().all():
        print(f"\n{period_name} ({region_name}): Insufficient data for analysis after outlier removal")
        return

    # Pearson's r is undefined when either series is constant (this happens
    # when IQR == 0, because the filter then keeps a single value); skip
    # instead of printing NaN statistics.
    if period_df_clean['Debt'].nunique() < 2 or period_df_clean[value_col].nunique() < 2:
        print(f"\n{period_name} ({region_name}): Correlation undefined - Debt or {value_col} is constant after outlier removal")
        return

    # Pearson correlation
    corr, p_value = pearsonr(period_df_clean['Debt'], period_df_clean[value_col])

    # Linear regression of value_col (y) on Debt (x)
    X = period_df_clean['Debt'].values.reshape(-1, 1)
    y = period_df_clean[value_col].values
    reg = LinearRegression().fit(X, y)
    slope = reg.coef_[0]
    intercept = reg.intercept_
    r_squared = reg.score(X, y)

    # Print results
    print(f"\n{period_name} ({region_name}):")
    print(f"Pearson Correlation: {corr:.4f}, p-value: {p_value:.4f}")
    print(f"Linear Regression: y = {slope:.6f}x + {intercept:.6f}")
    print(f"R-squared: {r_squared:.4f}")


# Run the analysis for every period, in the order Euro -> Non-Euro -> All Countries
for period_name, years in periods.items():
    for _region_label, _region_members in (
        ('Euro', euro_countries),
        ('Non-Euro', non_euro_countries),
        ('All Countries', all_countries),
    ):
        analyze_correlation(merged_df, period_name, years, _region_label, _region_members)

# Results for FDI correlation with debt

2010 (Euro):
Pearson Correlation: 0.0814, p-value: 0.7331
Linear Regression: y = 118.958376x + 13.215762
R-squared: 0.0066

2010 (Non-Euro):
Pearson Correlation: 0.0821, p-value: 0.8467
Linear Regression: y = 16.489515x + 16.716239
R-squared: 0.0067

2010 (All Countries):
Pearson Correlation: -0.0623, p-value: 0.7726
Linear Regression: y = -55.873671x + -12.802612
R-squared: 0.0039

2011-2013 (Euro):
Pearson Correlation: 0.1054, p-value: 0.4189
Linear Regression: y = 156.235202x + -38.538427
R-squared: 0.0111

2011-2013 (Non-Euro):
Pearson Correlation: 0.0695, p-value: 0.7528
Linear Regression: y = 69.320285x + 17.797501
R-squared: 0.0048

2011-2013 (All Countries):
Pearson Correlation: 0.0841, p-value: 0.4443
Linear Regression: y = 119.088118x + -24.431017
R-squared: 0.0071

2014-2019 (Euro):
Pearson Correlation: 0.2016, p-value: 0.0286
Linear Regression: y = 555.926968x + -11.630419
R-squared: 0.0407

2014-2019 (Non-Euro):
Pearson Correlation: -0.1407, p-value: 0.3348
Linear Regression: y = -68.817765x + 2.976844
R-squared: 0.0198

2014-2019 (All Countries):
Pearson Correlation: 0.2203, p-value: 0.0067
Linear Regression: y = 354.172340x + -7.214283
R-squared: 0.0485

2020-2021 (Euro):
Pearson Correlation: -0.1007, p-value: 0.5205
Linear Regression: y = -115.567499x + -4.306586
R-squared: 0.0101

2020-2021 (Non-Euro):
Pearson Correlation: -0.5757, p-value: 0.0196
Linear Regression: y = -226.671278x + 37.350478
R-squared: 0.3314

2020-2021 (All Countries):
Pearson Correlation: -0.1717, p-value: 0.2015
Linear Regression: y = -143.345694x + 5.135790
R-squared: 0.0295

2022-2023 (Euro):
Pearson Correlation: -0.2416, p-value: 0.1439
Linear Regression: y = -320.177524x + -42.914369
R-squared: 0.0584

2022-2023 (Non-Euro):
Pearson Correlation: 0.3544, p-value: 0.2138
Linear Regression: y = 235.538686x + -7.834341
R-squared: 0.1256

2022-2023 (All Countries):
Pearson Correlation: -0.2035, p-value: 0.1401
Linear Regression: y = -243.490918x + -31.202179
R-squared: 0.0414

2010-2013 (Euro):
Pearson Correlation: 0.0907, p-value: 0.4179
Linear Regression: y = 133.075591x + -25.318233
R-squared: 0.0082

2010-2013 (Non-Euro):
Pearson Correlation: 0.0240, p-value: 0.9034
Linear Regression: y = 14.860665x + 7.703543
R-squared: 0.0006

2010-2013 (All Countries):
Pearson Correlation: 0.0517, p-value: 0.5879
Linear Regression: y = 67.937165x + -16.044037
R-squared: 0.0027

Overall (Euro):
Pearson Correlation: 0.0128, p-value: 0.8346
Linear Regression: y = 21.960424x + -25.415539
R-squared: 0.0002

Overall (Non-Euro):
Pearson Correlation: -0.0711, p-value: 0.4687
Linear Regression: y = -38.655528x + 5.689427
R-squared: 0.0051

Overall (All Countries):
Pearson Correlation: 0.0648, p-value: 0.2112
Linear Regression: y = 75.409513x + -11.940441
R-squared: 0.0042


In [None]:
# Load the housing price indicator YoY table and preview its first rows.
# NOTE(review): this path sits directly under /content, not under the mounted
# /content/drive folder the other cells read from - confirm the file has been
# placed there before running.
house_data = pd.read_csv('/content/filtered_housing_price_indicator_YOY.csv')
house_data.head()

Unnamed: 0,Country,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023
0,Australia,9.42,-4.26,-2.54,4.29,7.25,7.64,4.85,7.02,-3.06,-5.75,4.09,15.0,1.37,-4.35
1,Austria,6.85,2.24,4.25,2.95,1.91,4.23,5.3,3.19,3.78,4.21,6.5,9.2,3.73,-10.33
2,Belgium,0.48,0.89,0.09,0.47,-1.13,1.02,0.83,1.48,0.94,2.24,3.34,3.94,-4.22,-3.51
3,Canada,7.53,3.23,3.42,1.46,3.3,4.31,9.08,10.41,2.03,-0.16,5.71,11.44,5.43,-5.59
4,Chile,4.42,8.73,7.67,7.33,6.66,7.06,2.87,5.55,8.1,6.46,1.28,7.06,-3.51,0.77


In [None]:
# Scale every 2010-2023 column by 1/100, like the wage conversion at the top
# of the notebook. This overwrites house_data in place, so re-running the cell
# divides the values by 100 again.
for year in (str(y) for y in range(2010, 2024)):
    house_data[year] = house_data[year].div(100)

house_data.head()

Unnamed: 0,Country,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023
0,Australia,0.0942,-0.0426,-0.0254,0.0429,0.0725,0.0764,0.0485,0.0702,-0.0306,-0.0575,0.0409,0.15,0.0137,-0.0435
1,Austria,0.0685,0.0224,0.0425,0.0295,0.0191,0.0423,0.053,0.0319,0.0378,0.0421,0.065,0.092,0.0373,-0.1033
2,Belgium,0.0048,0.0089,0.0009,0.0047,-0.0113,0.0102,0.0083,0.0148,0.0094,0.0224,0.0334,0.0394,-0.0422,-0.0351
3,Canada,0.0753,0.0323,0.0342,0.0146,0.033,0.0431,0.0908,0.1041,0.0203,-0.0016,0.0571,0.1144,0.0543,-0.0559
4,Chile,0.0442,0.0873,0.0767,0.0733,0.0666,0.0706,0.0287,0.0555,0.081,0.0646,0.0128,0.0706,-0.0351,0.0077


In [None]:
# Reshape both wide tables (one column per year, 2010-2023) into long
# Country/Year rows and cast the melted Year labels from str to int.
house_melted = (
    house_data
    .melt(id_vars=['Country'], value_vars=list(map(str, range(2010, 2024))),
          var_name='Year', value_name='Housing')
    .astype({'Year': int})
)
debt_melted = (
    debt_df
    .melt(id_vars=['Country'], value_vars=list(map(str, range(2010, 2024))),
          var_name='Year', value_name='Debt')
    .astype({'Year': int})
)

# Join housing and debt on (Country, Year); unmatched rows are dropped.
merged_df = house_melted.merge(debt_melted, on=['Country', 'Year'])

# Year groupings analysed below ('2010-2013' overlaps '2010' and '2011-2013').
periods = {
    '2010': [2010],
    '2011-2013': list(range(2011, 2014)),
    '2014-2019': list(range(2014, 2020)),
    '2020-2021': [2020, 2021],
    '2022-2023': [2022, 2023],
    '2010-2013': list(range(2010, 2014)),
    'Overall': list(range(2010, 2024)),
}

# Function to identify and remove outliers using IQR method
def identify_outliers(df, columns):
    """Remove IQR outliers from ``df`` one column at a time and report them.

    For each name in ``columns`` (in order) the 1.5*IQR fences are computed on
    the rows that survived the previous columns, so the result depends on the
    column order. Rows whose value is NaN fail both fence comparisons: they
    are dropped from the cleaned frame but are not listed as outliers.

    Args:
        df: Frame with 'Country', 'Year' and every column in ``columns``.
        columns: Numeric column names to filter on.

    Returns:
        (df_clean, outliers_df): the surviving rows, and the flagged rows with
        'Country', 'Year', the offending value and a 'Variable' column naming
        the column that flagged them (an empty DataFrame when nothing is
        flagged). ``df`` itself is not modified.
    """
    df_clean = df.copy()
    outliers = []

    for col in columns:
        Q1 = df_clean[col].quantile(0.25)
        Q3 = df_clean[col].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR

        # Identify outliers. .loc + .assign builds a fresh frame, so tagging
        # the rows no longer writes into a chained-indexing result
        # (which raised SettingWithCopyWarning).
        is_outlier = (df_clean[col] < lower_bound) | (df_clean[col] > upper_bound)
        if is_outlier.any():
            outliers.append(
                df_clean.loc[is_outlier, ['Country', 'Year', col]].assign(Variable=col)
            )

        # Remove outliers for analysis (NaN rows fail both tests and go too)
        df_clean = df_clean[(df_clean[col] >= lower_bound) & (df_clean[col] <= upper_bound)]

    outliers_df = pd.concat(outliers, ignore_index=True) if outliers else pd.DataFrame()
    return df_clean, outliers_df

# Function to calculate Pearson correlation and linear regression
def analyze_correlation(df, period_name, years, region_name, countries, value_col='Housing'):
    """Print the Pearson correlation and linear fit of ``value_col`` on Debt.

    Filters ``df`` to ``years`` and ``countries``, drops IQR outliers in
    'Debt' and ``value_col`` via ``identify_outliers``, then prints r, the
    p-value, the fitted line (x = Debt, y = ``value_col``) and R-squared.

    Args:
        df: Long frame with 'Country', 'Year', 'Debt' and ``value_col``.
        period_name: Label printed for the period.
        years: Years to keep.
        region_name: Label printed for the country group.
        countries: Countries to keep.
        value_col: Column compared against Debt. Defaults to 'Housing',
            the column this cell hard-coded, so five-argument calls still work.

    Returns:
        None. Results (or a skip message) are printed.
    """
    period_df = df[(df['Year'].isin(years)) & (df['Country'].isin(countries))]

    # Identify and remove outliers (the outlier report itself is not used here)
    period_df_clean, _ = identify_outliers(period_df, ['Debt', value_col])

    # Check if data is sufficient
    if len(period_df_clean) < 2 or period_df_clean['Debt'].isnull().all() or period_df_clean[value_col].isnull().all():
        print(f"\n{period_name} ({region_name}): Insufficient data for analysis after outlier removal")
        return

    # Pearson's r is undefined when either series is constant (this happens
    # when IQR == 0, because the filter then keeps a single value); skip
    # instead of printing NaN statistics.
    if period_df_clean['Debt'].nunique() < 2 or period_df_clean[value_col].nunique() < 2:
        print(f"\n{period_name} ({region_name}): Correlation undefined - Debt or {value_col} is constant after outlier removal")
        return

    # Pearson correlation
    corr, p_value = pearsonr(period_df_clean['Debt'], period_df_clean[value_col])

    # Linear regression of value_col (y) on Debt (x)
    X = period_df_clean['Debt'].values.reshape(-1, 1)
    y = period_df_clean[value_col].values
    reg = LinearRegression().fit(X, y)
    slope = reg.coef_[0]
    intercept = reg.intercept_
    r_squared = reg.score(X, y)

    # Print results
    print(f"\n{period_name} ({region_name}):")
    print(f"Pearson Correlation: {corr:.4f}, p-value: {p_value:.4f}")
    print(f"Linear Regression: y = {slope:.6f}x + {intercept:.6f}")
    print(f"R-squared: {r_squared:.4f}")


# Run the analysis for every period, in the order Euro -> Non-Euro -> All Countries
for period_name, years in periods.items():
    for _region_label, _region_members in (
        ('Euro', euro_countries),
        ('Non-Euro', non_euro_countries),
        ('All Countries', all_countries),
    ):
        analyze_correlation(merged_df, period_name, years, _region_label, _region_members)


2010 (Euro):
Pearson Correlation: -0.6272, p-value: 0.0010
Linear Regression: y = -0.291562x + 0.008897
R-squared: 0.3933

2010 (Non-Euro):
Pearson Correlation: -0.0995, p-value: 0.7990
Linear Regression: y = -0.033905x + 0.042626
R-squared: 0.0099

2010 (All Countries):
Pearson Correlation: -0.4949, p-value: 0.0040
Linear Regression: y = -0.259774x + 0.018838
R-squared: 0.2450

2011-2013 (Euro):
Pearson Correlation: -0.4406, p-value: 0.0002
Linear Regression: y = -0.358212x + -0.004883
R-squared: 0.1942

2011-2013 (Non-Euro):
Pearson Correlation: -0.3326, p-value: 0.0969
Linear Regression: y = -0.172547x + 0.028547
R-squared: 0.1106

2011-2013 (All Countries):
Pearson Correlation: -0.3673, p-value: 0.0003
Linear Regression: y = -0.283602x + 0.003755
R-squared: 0.1349

2014-2019 (Euro):
Pearson Correlation: -0.1340, p-value: 0.1227
Linear Regression: y = -0.103165x + 0.030002
R-squared: 0.0180

2014-2019 (Non-Euro):
Pearson Correlation: 0.1695, p-value: 0.2393
Linear Regression: y = 0

In [None]:
# Load the corruption score YoY table and preview its first rows.
# NOTE(review): the filename contains an unbalanced ')' ("(0-100)).csv") and
# the path sits directly under /content rather than the mounted Drive folder -
# confirm both match the file actually uploaded.
corruption_data = pd.read_csv('/content/filtered_corruption_score_YOY(0-100)).csv')
corruption_data.head()

Unnamed: 0,Country,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023
0,Australia,0.0,1.15,-3.41,-4.71,-1.23,-1.25,0.0,-2.53,0.0,0.0,0.0,-5.19,2.74,0.0
1,Austria,0.0,-1.27,-11.54,0.0,4.35,5.56,-1.32,0.0,1.33,1.32,-1.3,-2.63,-4.05,0.0
2,Belgium,0.0,5.63,0.0,0.0,1.33,1.32,0.0,-2.6,0.0,0.0,1.33,-3.95,0.0,0.0
3,Canada,2.3,-2.25,-3.45,-3.57,0.0,2.47,-1.2,0.0,-1.22,-4.94,0.0,-3.9,0.0,2.7
4,Chile,7.46,0.0,0.0,-1.39,2.82,-4.11,-5.71,1.52,0.0,0.0,0.0,0.0,0.0,-1.49


In [None]:
# Scale every 2010-2023 column by 1/100, like the wage conversion at the top
# of the notebook. This overwrites corruption_data in place, so re-running the
# cell divides the values by 100 again.
for year in (str(y) for y in range(2010, 2024)):
    corruption_data[year] = corruption_data[year].div(100)

corruption_data.head()

Unnamed: 0,Country,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023
0,Australia,0.0,0.0115,-0.0341,-0.0471,-0.0123,-0.0125,0.0,-0.0253,0.0,0.0,0.0,-0.0519,0.0274,0.0
1,Austria,0.0,-0.0127,-0.1154,0.0,0.0435,0.0556,-0.0132,0.0,0.0133,0.0132,-0.013,-0.0263,-0.0405,0.0
2,Belgium,0.0,0.0563,0.0,0.0,0.0133,0.0132,0.0,-0.026,0.0,0.0,0.0133,-0.0395,0.0,0.0
3,Canada,0.023,-0.0225,-0.0345,-0.0357,0.0,0.0247,-0.012,0.0,-0.0122,-0.0494,0.0,-0.039,0.0,0.027
4,Chile,0.0746,0.0,0.0,-0.0139,0.0282,-0.0411,-0.0571,0.0152,0.0,0.0,0.0,0.0,0.0,-0.0149


In [None]:
# Reshape both wide tables (one column per year, 2010-2023) into long
# Country/Year rows and cast the melted Year labels from str to int.
corruption_melted = (
    corruption_data
    .melt(id_vars=['Country'], value_vars=list(map(str, range(2010, 2024))),
          var_name='Year', value_name='corruption')
    .astype({'Year': int})
)
debt_melted = (
    debt_df
    .melt(id_vars=['Country'], value_vars=list(map(str, range(2010, 2024))),
          var_name='Year', value_name='Debt')
    .astype({'Year': int})
)

# Join corruption and debt on (Country, Year); unmatched rows are dropped.
merged_df = corruption_melted.merge(debt_melted, on=['Country', 'Year'])

# Year groupings analysed below ('2010-2013' overlaps '2010' and '2011-2013').
periods = {
    '2010': [2010],
    '2011-2013': list(range(2011, 2014)),
    '2014-2019': list(range(2014, 2020)),
    '2020-2021': [2020, 2021],
    '2022-2023': [2022, 2023],
    '2010-2013': list(range(2010, 2014)),
    'Overall': list(range(2010, 2024)),
}

# Function to identify and remove outliers using IQR method
def identify_outliers(df, columns):
    """Remove IQR outliers from ``df`` one column at a time and report them.

    For each name in ``columns`` (in order) the 1.5*IQR fences are computed on
    the rows that survived the previous columns, so the result depends on the
    column order. Rows whose value is NaN fail both fence comparisons: they
    are dropped from the cleaned frame but are not listed as outliers.

    Args:
        df: Frame with 'Country', 'Year' and every column in ``columns``.
        columns: Numeric column names to filter on.

    Returns:
        (df_clean, outliers_df): the surviving rows, and the flagged rows with
        'Country', 'Year', the offending value and a 'Variable' column naming
        the column that flagged them (an empty DataFrame when nothing is
        flagged). ``df`` itself is not modified.
    """
    df_clean = df.copy()
    outliers = []

    for col in columns:
        Q1 = df_clean[col].quantile(0.25)
        Q3 = df_clean[col].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR

        # Identify outliers. .loc + .assign builds a fresh frame, so tagging
        # the rows no longer writes into a chained-indexing result
        # (which raised SettingWithCopyWarning).
        is_outlier = (df_clean[col] < lower_bound) | (df_clean[col] > upper_bound)
        if is_outlier.any():
            outliers.append(
                df_clean.loc[is_outlier, ['Country', 'Year', col]].assign(Variable=col)
            )

        # Remove outliers for analysis (NaN rows fail both tests and go too)
        df_clean = df_clean[(df_clean[col] >= lower_bound) & (df_clean[col] <= upper_bound)]

    outliers_df = pd.concat(outliers, ignore_index=True) if outliers else pd.DataFrame()
    return df_clean, outliers_df

# Function to calculate Pearson correlation and linear regression
def analyze_correlation(df, period_name, years, region_name, countries):
    """Print the Debt-vs-corruption Pearson correlation and linear fit for one slice.

    The slice is the rows of ``df`` whose 'Year' is in ``years`` and whose
    'Country' is in ``countries``. IQR outliers in 'Debt' and 'corruption'
    are removed via ``identify_outliers`` before any statistic is computed.

    Parameters
    ----------
    df : pandas.DataFrame
        Long-format data with 'Country', 'Year', 'Debt' and 'corruption' columns.
    period_name : str
        Label printed for the time window.
    years : list of int
        Years included in the window.
    region_name : str
        Label printed for the country group.
    countries : list of str
        Countries included in the group.

    Returns
    -------
    None
        Results (or the reason the slice was skipped) are printed.
    """
    period_df = df[(df['Year'].isin(years)) & (df['Country'].isin(countries))]

    # Identify and remove outliers; the list of removed rows is not used here
    period_df_clean, _ = identify_outliers(period_df, ['Debt', 'corruption'])
    # Keep complete pairs only so the statistics below never see NaN
    period_df_clean = period_df_clean.dropna(subset=['Debt', 'corruption'])

    # Check if data is sufficient
    if len(period_df_clean) < 2:
        print(f"\n{period_name} ({region_name}): Insufficient data for analysis after outlier removal")
        return

    # Pearson's r is undefined when either variable has zero variance
    # (pearsonr would emit a warning and return NaN), so skip such slices.
    if period_df_clean['Debt'].nunique() < 2 or period_df_clean['corruption'].nunique() < 2:
        print(f"\n{period_name} ({region_name}): Debt or corruption is constant after outlier removal; correlation is undefined")
        return

    # Pearson correlation
    corr, p_value = pearsonr(period_df_clean['Debt'], period_df_clean['corruption'])

    # Linear regression of corruption (y) on Debt (x)
    X = period_df_clean['Debt'].values.reshape(-1, 1)
    y = period_df_clean['corruption'].values
    reg = LinearRegression().fit(X, y)
    slope = reg.coef_[0]
    intercept = reg.intercept_
    r_squared = reg.score(X, y)

    # Print results
    print(f"\n{period_name} ({region_name}):")
    print(f"Pearson Correlation: {corr:.4f}, p-value: {p_value:.4f}")
    print(f"Linear Regression: y = {slope:.6f}x + {intercept:.6f}")
    print(f"R-squared: {r_squared:.4f}")


# Analyze for each period and region
# Every period is analysed for the three country groups, in this fixed order.
for period_name, years in periods.items():
    for region_label, region_members in (
        ('Euro', euro_countries),
        ('Non-Euro', non_euro_countries),
        ('All Countries', all_countries),
    ):
        analyze_correlation(merged_df, period_name, years, region_label, region_members)


2010 (Euro):
Pearson Correlation: 0.1906, p-value: 0.4344
Linear Regression: y = 0.030510x + -0.008263
R-squared: 0.0363

2010 (Non-Euro):
Pearson Correlation: 0.5724, p-value: 0.1072
Linear Regression: y = 0.146380x + -0.024507
R-squared: 0.3277

2010 (All Countries):
Pearson Correlation: 0.0663, p-value: 0.7325
Linear Regression: y = 0.016465x + -0.015442
R-squared: 0.0044

2011-2013 (Euro):
Pearson Correlation: -0.1718, p-value: 0.1854
Linear Regression: y = -0.106401x + 0.003276
R-squared: 0.0295

2011-2013 (Non-Euro):
Pearson Correlation: 0.0786, p-value: 0.7150
Linear Regression: y = 0.034484x + -0.011654
R-squared: 0.0062

2011-2013 (All Countries):
Pearson Correlation: -0.1143, p-value: 0.2977
Linear Regression: y = -0.064544x + -0.000587
R-squared: 0.0131

2014-2019 (Euro):
Pearson Correlation: 0.0683, p-value: 0.4530
Linear Regression: y = 0.033190x + 0.001913
R-squared: 0.0047

2014-2019 (Non-Euro):
Pearson Correlation: 0.0375, p-value: 0.7919
Linear Regression: y = 0.01371

  corr, p_value = pearsonr(period_df_clean['Debt'], period_df_clean['corruption'])


In [None]:
import pandas as pd
# Load the interest-rate table (wide format: one row per country, one column per year).
# NOTE(review): this path is not under the mounted Drive folder used for the
# other datasets — confirm the file is available in the session before re-running.
int_data = pd.read_csv('/content/Filtered_Interest_Rates (2).csv')
# Preview the first rows
int_data.head()

Unnamed: 0,Country,CCode,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023
0,Australia,AUS,6.055527,1.445852,5.077615,6.353161,4.481619,6.237773,6.124607,1.536196,3.329588,1.578067,1.59,1.05,1.61,3.6
1,Austria,AUT,3.7512,3.5407,3.2696,1.923,2.1311,0.5434,0.7672,0.5705,0.6692,0.452,-0.0868,-0.3995,0.1813,2.8076
2,Belgium,BEL,3.75,4.14,4.11,2.31,2.45,0.73,0.86,0.7,0.7,0.77,-0.03,-0.36,0.26,2.79
3,Canada,CAN,-0.2377,-0.236773,1.76011,1.237717,1.038639,3.674533,1.968905,0.125752,2.19,1.91,1.6,0.81,1.88,2.94
4,Switzerland,CHE,2.41305,2.739347,2.622942,2.778422,3.365102,3.996448,3.248909,3.006367,1.849005,2.739665,3.35751,1.376511,0.14592,1.822187


In [None]:
# Rescale every yearly column from percent to a fraction (e.g. 3.75 -> 0.0375).
# int_data is overwritten in place, so running this cell again divides by 100 a second time.
for year in (str(y) for y in range(2010, 2024)):
    int_data[year] = int_data[year].div(100)

int_data.head()

Unnamed: 0,Country,CCode,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023
0,Australia,AUS,0.060555,0.014459,0.050776,0.063532,0.044816,0.062378,0.061246,0.015362,0.033296,0.015781,0.0159,0.0105,0.0161,0.036
1,Austria,AUT,0.037512,0.035407,0.032696,0.01923,0.021311,0.005434,0.007672,0.005705,0.006692,0.00452,-0.000868,-0.003995,0.001813,0.028076
2,Belgium,BEL,0.0375,0.0414,0.0411,0.0231,0.0245,0.0073,0.0086,0.007,0.007,0.0077,-0.0003,-0.0036,0.0026,0.0279
3,Canada,CAN,-0.002377,-0.002368,0.017601,0.012377,0.010386,0.036745,0.019689,0.001258,0.0219,0.0191,0.016,0.0081,0.0188,0.0294
4,Switzerland,CHE,0.024131,0.027393,0.026229,0.027784,0.033651,0.039964,0.032489,0.030064,0.01849,0.027397,0.033575,0.013765,0.001459,0.018222


In [None]:
# Reshape both wide tables (one column per year) into long form:
# one row per (Country, Year) pair, with Year stored as an integer.
int_melted = int_data.melt(
    id_vars=['Country'],
    value_vars=[str(y) for y in range(2010, 2024)],
    var_name='Year',
    value_name='Interest',
).astype({'Year': int})
debt_melted = debt_df.melt(
    id_vars=['Country'],
    value_vars=[str(y) for y in range(2010, 2024)],
    var_name='Year',
    value_name='Debt',
).astype({'Year': int})

# Join interest rates and debt on country and year. The default inner join
# keeps only the country-years that appear in both tables.
merged_df = int_melted.merge(debt_melted, on=['Country', 'Year'])

# Named analysis windows mapped to the years they cover
periods = {
    '2010': [2010],
    '2011-2013': list(range(2011, 2014)),
    '2014-2019': list(range(2014, 2020)),
    '2020-2021': [2020, 2021],
    '2022-2023': [2022, 2023],
    '2010-2013': list(range(2010, 2014)),
    'Overall': list(range(2010, 2024)),
}

# Function to identify and remove outliers using IQR method
def identify_outliers(df, columns):
    """Flag and remove IQR outliers, processing ``columns`` one at a time.

    For each column, in the order given, the fences ``Q1 - 1.5 * IQR`` and
    ``Q3 + 1.5 * IQR`` are computed on the rows that survived the previous
    columns. Rows outside the fences are recorded and then dropped.
    Rows whose value in a processed column is NaN fail the "inside the
    fences" comparison, so they are dropped too, but they are not reported
    as outliers.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'Country', 'Year' and every name in ``columns``.
        The input frame is not modified.
    columns : iterable of str
        Numeric columns to screen.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``df_clean`` is the surviving rows. ``outliers_df`` holds the removed
        outliers with columns 'Country', 'Year', the screened column and
        'Variable' (name of the screened column); it is an empty DataFrame
        when nothing was flagged.
    """
    df_clean = df.copy()
    outliers = []

    for col in columns:
        values = df_clean[col]
        q1 = values.quantile(0.25)
        q3 = values.quantile(0.75)
        iqr = q3 - q1
        lower_bound = q1 - 1.5 * iqr
        upper_bound = q3 + 1.5 * iqr

        # Identify outliers. .loc + .assign builds an independent frame, so
        # adding the 'Variable' label never writes onto a slice of df_clean
        # (which is what raised SettingWithCopyWarning before).
        is_outlier = (values < lower_bound) | (values > upper_bound)
        if is_outlier.any():
            outliers.append(
                df_clean.loc[is_outlier, ['Country', 'Year', col]].assign(Variable=col)
            )

        # Remove outliers for analysis (NaN compares False, so NaN rows go too)
        df_clean = df_clean[(values >= lower_bound) & (values <= upper_bound)]

    outliers_df = pd.concat(outliers, ignore_index=True) if outliers else pd.DataFrame()
    return df_clean, outliers_df

# Function to calculate Pearson correlation and linear regression
def analyze_correlation(df, period_name, years, region_name, countries):
    """Print the Debt-vs-Interest Pearson correlation and linear fit for one slice.

    The slice is the rows of ``df`` whose 'Year' is in ``years`` and whose
    'Country' is in ``countries``. IQR outliers in 'Debt' and 'Interest'
    are removed via ``identify_outliers`` before any statistic is computed.

    Parameters
    ----------
    df : pandas.DataFrame
        Long-format data with 'Country', 'Year', 'Debt' and 'Interest' columns.
    period_name : str
        Label printed for the time window.
    years : list of int
        Years included in the window.
    region_name : str
        Label printed for the country group.
    countries : list of str
        Countries included in the group.

    Returns
    -------
    None
        Results (or the reason the slice was skipped) are printed.
    """
    period_df = df[(df['Year'].isin(years)) & (df['Country'].isin(countries))]

    # Identify and remove outliers; the list of removed rows is not used here
    period_df_clean, _ = identify_outliers(period_df, ['Debt', 'Interest'])
    # Keep complete pairs only so the statistics below never see NaN
    period_df_clean = period_df_clean.dropna(subset=['Debt', 'Interest'])

    # Check if data is sufficient
    if len(period_df_clean) < 2:
        print(f"\n{period_name} ({region_name}): Insufficient data for analysis after outlier removal")
        return

    # Pearson's r is undefined when either variable has zero variance
    # (pearsonr would emit a warning and return NaN), so skip such slices.
    if period_df_clean['Debt'].nunique() < 2 or period_df_clean['Interest'].nunique() < 2:
        print(f"\n{period_name} ({region_name}): Debt or Interest is constant after outlier removal; correlation is undefined")
        return

    # Pearson correlation
    corr, p_value = pearsonr(period_df_clean['Debt'], period_df_clean['Interest'])

    # Linear regression of Interest (y) on Debt (x)
    X = period_df_clean['Debt'].values.reshape(-1, 1)
    y = period_df_clean['Interest'].values
    reg = LinearRegression().fit(X, y)
    slope = reg.coef_[0]
    intercept = reg.intercept_
    r_squared = reg.score(X, y)

    # Print results
    print(f"\n{period_name} ({region_name}):")
    print(f"Pearson Correlation: {corr:.4f}, p-value: {p_value:.4f}")
    print(f"Linear Regression: y = {slope:.6f}x + {intercept:.6f}")
    print(f"R-squared: {r_squared:.4f}")


# Analyze for each period and region
# Every period is analysed for the three country groups, in this fixed order.
for period_name, years in periods.items():
    for region_label, region_members in (
        ('Euro', euro_countries),
        ('Non-Euro', non_euro_countries),
        ('All Countries', all_countries),
    ):
        analyze_correlation(merged_df, period_name, years, region_label, region_members)


2010 (Euro):
Pearson Correlation: 0.5672, p-value: 0.0059
Linear Regression: y = 0.075176x + 0.036958
R-squared: 0.3217

2010 (Non-Euro):
Pearson Correlation: -0.4575, p-value: 0.1837
Linear Regression: y = -0.101832x + 0.041862
R-squared: 0.2093

2010 (All Countries):
Pearson Correlation: 0.3611, p-value: 0.0423
Linear Regression: y = 0.055986x + 0.039138
R-squared: 0.1304

2011-2013 (Euro):
Pearson Correlation: 0.2032, p-value: 0.1044
Linear Regression: y = 0.059402x + 0.034799
R-squared: 0.0413

2011-2013 (Non-Euro):
Pearson Correlation: 0.0126, p-value: 0.9504
Linear Regression: y = 0.006404x + 0.037085
R-squared: 0.0002

2011-2013 (All Countries):
Pearson Correlation: 0.0522, p-value: 0.6176
Linear Regression: y = 0.018480x + 0.036173
R-squared: 0.0027

2014-2019 (Euro):
Pearson Correlation: 0.1086, p-value: 0.2133
Linear Regression: y = 0.031071x + 0.015217
R-squared: 0.0118

2014-2019 (Non-Euro):
Pearson Correlation: -0.0016, p-value: 0.9907
Linear Regression: y = -0.000704x + 