In [1]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt

# Load the semicolon-delimited dataset; '..' is treated as a missing-value marker.
data = pd.read_csv('fin_dataset_africa copy.csv', delimiter=';')
data = data.replace('..', np.nan)
# Every column from the third onward is coerced to numeric (unparseable -> NaN).
data[data.columns[2:]] = data[data.columns[2:]].apply(pd.to_numeric, errors='coerce')
# Mean-impute missing values. numeric_only=True restricts the means to numeric
# columns: without it pandas warns on older versions and raises TypeError on
# pandas >= 2.0 when the frame still contains non-numeric columns.
data = data.fillna(data.mean(numeric_only=True))

dep_var = "GDP_per_capita_growth_annual_Percent"
endogenous_var = "Government_Debt_Percent_of_GDP"
exogenous_vars = [
    'Gross_capital_formation_Percent_of_GDP',
    'Gross_national_expenditure_Percent_of_GDP',
    'Net_barter_terms_of_trade_index_2015_100',
    'Population_growth_annual_Percent',
    'Trade_Percent_of_GDP'
    ]

# Candidate thresholds: 100 evenly spaced values across the observed debt range
thresholds = np.linspace(data[endogenous_var].min(), data[endogenous_var].max(), 100)

# Best fit so far (lowest sum of squared residuals)
best_fit = None
best_ssr = np.inf
best_threshold = None

# Grid search: for each candidate, regress growth on a high-debt dummy plus controls
for threshold in thresholds:
    # Build the design matrix locally instead of writing a scratch 'group'
    # column into `data` (which would leave the *last* threshold's dummy behind).
    group = (data[endogenous_var] > threshold).astype(int).rename('group')
    design = sm.add_constant(pd.concat([group, data[exogenous_vars]], axis=1))
    results = sm.OLS(data[dep_var], design).fit()

    ssr = results.ssr
    if ssr < best_ssr:
        best_ssr = ssr
        best_fit = results
        best_threshold = threshold

print("Best Threshold:", best_threshold)
best_fit.summary()


  data.fillna(data.mean(), inplace=True)


Best Threshold: 43.91111111111111


0,1,2,3
Dep. Variable:,GDP_per_capita_growth_annual_Percent,R-squared:,0.031
Model:,OLS,Adj. R-squared:,0.024
Method:,Least Squares,F-statistic:,4.796
Date:,"Thu, 18 Apr 2024",Prob (F-statistic):,7.94e-05
Time:,10:48:45,Log-Likelihood:,-2711.7
No. Observations:,912,AIC:,5437.0
Df Residuals:,905,BIC:,5471.0
Df Model:,6,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,0.3761,1.916,0.196,0.844,-3.384,4.136
group,-1.4757,0.320,-4.612,0.000,-2.104,-0.848
Gross_capital_formation_Percent_of_GDP,0.0448,0.020,2.193,0.029,0.005,0.085
Gross_national_expenditure_Percent_of_GDP,0.0116,0.012,0.939,0.348,-0.013,0.036
Net_barter_terms_of_trade_index_2015_100,0.0017,0.009,0.196,0.844,-0.015,0.019
Population_growth_annual_Percent,-0.2352,0.183,-1.287,0.199,-0.594,0.124
Trade_Percent_of_GDP,-0.0003,0.006,-0.046,0.963,-0.012,0.012

0,1,2,3
Omnibus:,482.84,Durbin-Watson:,1.529
Prob(Omnibus):,0.0,Jarque-Bera (JB):,17559.155
Skew:,-1.761,Prob(JB):,0.0
Kurtosis:,24.206,Cond. No.,2040.0


In [2]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from linearmodels.iv import IV2SLS
from numpy.linalg import matrix_rank

# Load and preprocess the data
data = pd.read_csv('fin_dataset_africa copy.csv', delimiter=';')
data = data.replace('..', np.nan)
data[data.columns[2:]] = data[data.columns[2:]].apply(pd.to_numeric, errors='coerce')
# Mean-impute missing values. numeric_only=True avoids the warning (older pandas)
# or TypeError (pandas >= 2.0) triggered by averaging non-numeric columns.
data = data.fillna(data.mean(numeric_only=True))

# Define variables
dep_var = "GDP_per_capita_growth_annual_Percent"
endogenous_var = "Government_Debt_Percent_of_GDP"
exogenous_vars = [
    'Gross_capital_formation_Percent_of_GDP',
    'Gross_national_expenditure_Percent_of_GDP',
    'Net_barter_terms_of_trade_index_2015_100',
    'Population_growth_annual_Percent',
    'Trade_Percent_of_GDP'
]
instrument = "Foreign_direct_investment_net_inflows_Percent_of_GDP"

# Define thresholds
thresholds = np.linspace(data[endogenous_var].min(), data[endogenous_var].max(), 100)

best_model = None
lowest_pvalue = np.inf
best_threshold = None

# A subset must have more rows than this to be fitted. This single check also
# covers the empty-subset case and the "rows <= number of regressors" case.
min_obs = len(exogenous_vars) + 2

# NOTE(review): the loop keeps whichever sub-sample (below OR above a threshold)
# yields the smallest p-value on the debt coefficient. Selecting on p-values
# across ~200 fits is a multiple-comparisons search -- confirm this is intended.
for threshold in thresholds:
    data_below = data[data[endogenous_var] <= threshold]
    data_above = data[data[endogenous_var] > threshold]

    for subset in (data_below, data_above):
        if subset.shape[0] <= min_obs:
            continue  # Skip this subset if not enough data

        X = sm.add_constant(subset[exogenous_vars])

        # Skip sub-samples whose exogenous design is rank deficient
        if matrix_rank(X) < X.shape[1]:
            continue

        iv = IV2SLS(dependent=subset[dep_var],
                    exog=X,
                    endog=subset[endogenous_var],
                    instruments=subset[[instrument]]).fit(cov_type='robust')

        # Check and record the p-value of the endogenous variable
        pvalue = iv.pvalues[endogenous_var]
        if pvalue < lowest_pvalue:
            lowest_pvalue = pvalue
            best_model = iv
            best_threshold = threshold

# Display results if a valid model was found
if best_model is not None:
    print("Best Threshold:", best_threshold)
    print(best_model.summary)
else:
    print("No valid model was found.")


  data.fillna(data.mean(), inplace=True)


Best Threshold: 126.24444444444444
                                   IV-2SLS Estimation Summary                                   
Dep. Variable:     GDP_per_capita_growth_annual_Percent   R-squared:                     -0.4922
Estimator:                                      IV-2SLS   Adj. R-squared:                -0.7054
No. Observations:                                    49   F-statistic:                    9.0133
Date:                                  Thu, Apr 18 2024   P-value (F-stat)                0.1728
Time:                                          10:48:46   Distribution:                  chi2(6)
Cov. Estimator:                                  robust                                         
                                                                                                
                                             Parameter Estimates                                             
                                           Parameter  Std. Err.     T-stat    P

In [3]:
# Create the regime indicator based on a threshold

from linearmodels.panel import PanelOLS

threshold = 60  # debt-to-GDP cut-off used to flag the high-debt regime

# Work on a copy of `data`: add the 0/1 regime flag, then index the rows by
# (Country_Name, Year) so the frame has an entity/time MultiIndex.
df = (
    data
    .assign(
        regime_indicator=data['Government_Debt_Percent_of_GDP'].ge(threshold).astype(int)
    )
    .set_index(['Country_Name', 'Year'])
)

# Regressors: the debt level, the regime flag, and an intercept
exog_vars = ['Government_Debt_Percent_of_GDP', 'regime_indicator']
exog = sm.add_constant(df[exog_vars])

# Entity fixed-effects panel regression of per-capita growth on the regressors
mod = PanelOLS(
    dependent=df['GDP_per_capita_growth_annual_Percent'],
    exog=exog,
    entity_effects=True,
)

# Standard errors clustered by entity
fitted_model = mod.fit(cov_type='clustered', cluster_entity=True)

# Output the results
print(fitted_model)

                                   PanelOLS Estimation Summary                                    
Dep. Variable:     GDP_per_capita_growth_annual_Percent   R-squared:                        0.0151
Estimator:                                     PanelOLS   R-squared (Between):             -0.0680
No. Observations:                                   912   R-squared (Within):               0.0151
Date:                                  Thu, Apr 18 2024   R-squared (Overall):              0.0072
Time:                                          10:48:48   Log-likelihood                   -2673.6
Cov. Estimator:                               Clustered                                           
                                                          F-statistic:                      6.6194
Entities:                                            48   P-value                           0.0014
Avg Obs:                                         19.000   Distribution:                   F(2,862)
Min Obs:  

In [4]:
import statsmodels.api as sm

# Turn any '..' placeholders into NaN first, THEN drop incomplete rows.
# (Dropping before replacing would let freshly created NaNs slip past this step.)
df = df.replace('..', np.nan)
df = df.dropna()

# shift(1) below takes "the previous row" within each country, so the rows must
# be ordered by the (Country_Name, Year) index for that to mean "previous year".
# NOTE(review): this still assumes no gaps in the years per country -- confirm.
df = df.sort_index()

# Instrument: previous-period government debt within each country
df['lag_debt'] = df.groupby('Country_Name')['Government_Debt_Percent_of_GDP'].shift(1)
# remove the first year for each country (its lag is NaN)
# NOTE(review): `df` is rebound in place, so re-running this cell trims one more
# year per country each time; re-run the cell that builds `df` first.
df = df.dropna()

# First-stage regressors: the lagged-debt instrument plus the exogenous controls
iv = df[['lag_debt', 'Gross_capital_formation_Percent_of_GDP',
    'Gross_national_expenditure_Percent_of_GDP',
    'Net_barter_terms_of_trade_index_2015_100',
    'Population_growth_annual_Percent',
    'Trade_Percent_of_GDP']]

endog = df['Government_Debt_Percent_of_GDP']
# Build the design matrix once and reuse it for fitting and prediction
first_stage_design = sm.add_constant(iv)
first_stage = sm.OLS(endog, first_stage_design).fit()

# In-sample fitted debt, stored on `df` under 'predicted_debt'
df['predicted_debt'] = first_stage.predict(first_stage_design)

print(first_stage.summary())

                                  OLS Regression Results                                  
Dep. Variable:     Government_Debt_Percent_of_GDP   R-squared:                       0.879
Model:                                        OLS   Adj. R-squared:                  0.878
Method:                             Least Squares   F-statistic:                     1039.
Date:                            Thu, 18 Apr 2024   Prob (F-statistic):               0.00
Time:                                    10:48:48   Log-Likelihood:                -3598.1
No. Observations:                             864   AIC:                             7210.
Df Residuals:                                 857   BIC:                             7243.
Df Model:                                       6                                         
Covariance Type:                        nonrobust                                         
                                                coef    std err          t      P>|t|     

In [6]:
# Second stage: regress growth on predicted debt, a regime interaction, and controls

control_cols = [
    'Gross_capital_formation_Percent_of_GDP',
    'Gross_national_expenditure_Percent_of_GDP',
    'Net_barter_terms_of_trade_index_2015_100',
    'Population_growth_annual_Percent',
    'Trade_Percent_of_GDP',
]

# Regime flag is 1 when predicted debt is at or above 60; the interaction term
# equals predicted debt inside that regime and 0 outside it.
df['regime'] = df['predicted_debt'].ge(60).astype(int)
df['interaction'] = df['regime'].mul(df['predicted_debt'])

control_vars = df[control_cols]

# Assemble the regressor frame: predicted debt, interaction, then the controls
all_vars = df[['predicted_debt', 'interaction']].join(control_vars)

# NOTE(review): 'predicted_debt' holds fitted values from a separate first-stage
# OLS, so the standard errors reported by this plain OLS do not account for
# first-stage estimation error -- treat the inference as indicative only.
second_stage = sm.OLS(
    df['GDP_per_capita_growth_annual_Percent'],
    sm.add_constant(all_vars),
).fit()

print(second_stage.summary())


                                     OLS Regression Results                                     
Dep. Variable:     GDP_per_capita_growth_annual_Percent   R-squared:                       0.014
Model:                                              OLS   Adj. R-squared:                  0.005
Method:                                   Least Squares   F-statistic:                     1.675
Date:                                  Thu, 18 Apr 2024   Prob (F-statistic):              0.112
Time:                                          10:49:12   Log-Likelihood:                -2553.1
No. Observations:                                   864   AIC:                             5122.
Df Residuals:                                       856   BIC:                             5160.
Df Model:                                             7                                         
Covariance Type:                              nonrobust                                         
                              

In [10]:
from statsmodels.sandbox.regression.gmm import IV2SLS

# NOTE(review): `lag_debt` is not referenced again in this cell -- the instrument
# matrix below reads the existing df['lag_debt'] column instead.
lag_debt = df.groupby('Country_Name')['Government_Debt_Percent_of_GDP'].shift(1)

# Exogenous controls shared by the structural equation and the instrument set
control_cols = [
    'Gross_capital_formation_Percent_of_GDP',
    'Gross_national_expenditure_Percent_of_GDP',
    'Net_barter_terms_of_trade_index_2015_100',
    'Population_growth_annual_Percent',
    'Trade_Percent_of_GDP',
]

# Outcome variable
endog = df['GDP_per_capita_growth_annual_Percent']

# Intercept + controls
exog = sm.add_constant(df[control_cols])

# Instrument matrix: intercept, lagged debt, and the same controls
instruments = sm.add_constant(df[['lag_debt'] + control_cols])

# Regressor treated as endogenous
endog_regressor = df['Government_Debt_Percent_of_GDP']

# Structural-equation regressors: exogenous block with the debt column appended last
exog_iv = exog.assign(Government_Debt_Percent_of_GDP=endog_regressor)

# Two-stage least squares: IV2SLS(endog, exog, instrument)
iv_model = IV2SLS(endog, exog_iv, instruments).fit()

# Print the results from IV2SLS
print(iv_model.summary())


                                   IV2SLS Regression Results                                    
Dep. Variable:     GDP_per_capita_growth_annual_Percent   R-squared:                       0.014
Model:                                           IV2SLS   Adj. R-squared:                  0.008
Method:                                       Two Stage   F-statistic:                     1.920
                                          Least Squares   Prob (F-statistic):             0.0750
Date:                                  Thu, 18 Apr 2024                                         
Time:                                          10:55:40                                         
No. Observations:                                   864                                         
Df Residuals:                                       857                                         
Df Model:                                             6                                         
                              