# Panel Regression

## Preparation and Checks

In [4]:
# load libraries
import pandas as pd
import numpy as np
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor
from linearmodels.panel import PanelOLS
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error

In [6]:
# Read the dissertation dataset from the CSV file in the working directory.
data = pd.read_csv(filepath_or_buffer="dissertation_dataset.csv")

In [8]:
# Add a one-period lag of GDP within each country.
# Sort by country and year first: shift(1) takes the previous *row* of each
# group, so it is only a time lag when the rows are in chronological order.
# NOTE(review): if a country has gaps in its years, the lag is the previous
# available observation rather than year - 1 -- confirm the panel has no gaps.
data = data.sort_values(['country', 'year'])
data['gdp_lag1'] = data.groupby('country')['gdp'].shift(1)

# define X (regressors)
X_vars = ['hdi', 'wgi', 'gdp_lag1', 'infant_mort', 'unemployment']

# define y (log1p-transformed due to its distribution)
data['log_homicide'] = np.log1p(data['homicide_ratio'])
y_var = 'log_homicide'

# drop rows with a missing outcome or regressor; this also removes each
# country's first observation, for which no GDP lag exists
data = data.dropna(subset=[y_var] + X_vars)

# standardize variables for better VIF interpretation
# (StandardScaler comes from the imports cell; no need to import it again)
scaler = StandardScaler()
X_standardized = pd.DataFrame(
    scaler.fit_transform(data[X_vars]),
    columns=X_vars,
    index=data.index,  # keep row labels aligned with `data` after dropna
)

# check multicollinearity: one VIF per regressor, computed on the
# standardized design matrix
vif_data = pd.DataFrame()
vif_data["Variable"] = X_vars
vif_data["VIF"] = [variance_inflation_factor(X_standardized.values, i)
                   for i in range(X_standardized.shape[1])]
print("Variance Inflation Factors:\n", vif_data)

Variance Inflation Factors:
        Variable       VIF
0           hdi  7.267018
1           wgi  2.407617
2      gdp_lag1  1.056889
3   infant_mort  4.859501
4  unemployment  1.013667


In [10]:
# Report how many rows are left in the working dataset.
print("=== Dataset Structure ===", f"Total rows: {len(data)}", sep="\n")

=== Dataset Structure ===
Total rows: 2911


In [12]:
# Work on an explicit copy so the column assignment below does not write
# into a view of an earlier frame.
data = data.copy()

# standardize regressors (otherwise issues with GDP)
scaler = StandardScaler()
data[X_vars] = scaler.fit_transform(data[X_vars])

# Index the frame by (country, year) for the panel model.
# Only reset the index when country/year are currently index levels (i.e. the
# cell is being re-run); an unconditional reset_index() on the first run would
# add a stray 'index' column holding the old row labels.
panel_keys = ['country', 'year']
if not set(panel_keys).issubset(data.columns):
    data = data.reset_index()
data = data.set_index(panel_keys)

## Modeling

In [15]:
# Dependent variable and regressors, taken from the (country, year)-indexed frame.
y = data.loc[:, y_var]
X = data.loc[:, X_vars]

# Two-way fixed effects (entity and time) panel regression.
model = PanelOLS(
    dependent=y,
    exog=X,
    entity_effects=True,
    time_effects=True,
)

# Standard errors clustered by entity (country).
fit_options = {'cov_type': 'clustered', 'cluster_entity': True}
result = model.fit(**fit_options)
print(result.summary)


                          PanelOLS Estimation Summary                           
Dep. Variable:           log_homicide   R-squared:                        0.0968
Estimator:                   PanelOLS   R-squared (Between):              0.0686
No. Observations:                2911   R-squared (Within):               0.0705
Date:                Mon, Jul 21 2025   R-squared (Overall):              0.0374
Time:                        01:46:00   Log-likelihood                   -231.75
Cov. Estimator:             Clustered                                           
                                        F-statistic:                      58.643
Entities:                         146   P-value                           0.0000
Avg Obs:                       19.938   Distribution:                  F(5,2735)
Min Obs:                       7.0000                                           
Max Obs:                       26.000   F-statistic (robust):             8.0704
                            

In [15]:
# Export the coefficient table of the fitted model to CSV.
# tables[1] of the summary is the parameter-estimates table; its first data
# row is the header and the remaining rows are one line per regressor.
summary_df = result.summary.tables[1]
header_row, *body_rows = summary_df.data

coeffs_df = pd.DataFrame(body_rows, columns=header_row)

coeffs_df.to_csv("panel_model_results.csv", index=False)