# Run logistic regression for figure 5

### Compile all settlement data into a single file

In [None]:
import os
import pandas as pd
import numpy as np
import statsmodels.api as sm
from scipy.stats.mstats import winsorize
from sklearn.utils import resample


In [None]:

# Compile the per-country settlement tables into one CSV ('all_compiled.csv')
# and derive the binary 'monitorable' outcome used by the regression.

# Columns kept from every per-country file
columns = [
    'iso', 'type', 'pop_un_adj', 'adm1_name',
    'NTL_weighted_percentage', 'state_capital_distance', 'in_state_capital'
]

# List of country codes; each one names a directory holding that country's CSV
countrycodes = ['AGO', 'BDI', 'BFA', 'CMR', 'CAF', 'TCD', 'COG', 'COD', 'ERI', 'ETH', 'KEN', 'MLI', 'MOZ', 'NER', 'NGA', 'RWA', 'SDN', 'SOM', 'SSD', 'TZA', 'UGA']

# List to collect one DataFrame per country
dfs = []

for countrycode in countrycodes:
    # Expected layout: <code>/<code>_grid3_with_ntl_percentage_complete.csv
    file_path = os.path.join(countrycode, f"{countrycode}_grid3_with_ntl_percentage_complete.csv")

    # Parse only the columns that are needed. read_csv returns `usecols`
    # columns in file order, so re-select with [columns] to fix the order.
    df = pd.read_csv(file_path, usecols=columns)[columns]

    dfs.append(df)

# Concatenate all countries into a single table with a fresh 0..n-1 index
df1 = pd.concat(dfs, ignore_index=True)

# Add the 'monitorable' column: 1 when NTL_weighted_percentage is strictly
# above 50, otherwise 0.
# NOTE(review): a missing NTL_weighted_percentage compares False and is
# therefore labelled 0 (not monitorable) instead of being dropped later --
# confirm this is intended.
df1['monitorable'] = (df1['NTL_weighted_percentage'] > 50).astype(int)

# Save the final DataFrame to a CSV file
df1.to_csv('all_compiled.csv', index=False)

print("Data saved in single file.")


### Run logistic regression

In [None]:
# Fit the figure 5 logistic regression of 'monitorable' on log population,
# state-capital membership, log distance to the state capital and country
# fixed effects, after undersampling the majority (monitorable == 0) class.

# Load the compiled settlement data
data = pd.read_csv('all_compiled.csv')
print(f"Number of entries: {data.shape[0]}")

# Winsorize the top 0.1% of populations, then log transform.
# Only observed values are passed to winsorize(): with its default
# nan_policy='propagate' scipy may overwrite NaNs with the cap value, which
# would turn missing populations into real-looking data that dropna() below
# can no longer remove.
pop_observed = data['pop_un_adj'].notna()
if pop_observed.any():
    data.loc[pop_observed, 'pop_un_adj'] = np.asarray(
        winsorize(data.loc[pop_observed, 'pop_un_adj'].to_numpy(), limits=[0, 0.001])
    )
data['log_pop_un_adj'] = np.log(data['pop_un_adj'] + 1)  # Adding 1 to avoid log(0)
data['log_state_capital_distance'] = np.log(data['state_capital_distance'] + 1)

# Convert 'iso' to categorical
data['iso'] = data['iso'].astype('category')

# Keep complete cases only. This is done BEFORE dummy-encoding because
# get_dummies turns a missing 'iso' into an all-zero row (i.e. the baseline
# country) that dropna() would not detect afterwards.
predictors = ['log_pop_un_adj', 'in_state_capital', 'log_state_capital_distance', 'iso']
complete_rows = data[predictors + ['monitorable']].dropna().copy()

# A country whose rows were all dropped would otherwise still produce an
# all-zero dummy column and make the design matrix singular.
complete_rows['iso'] = complete_rows['iso'].cat.remove_unused_categories()

# Define the dependent and independent variables
X = complete_rows[predictors]
y = complete_rows['monitorable']

# Add a constant to the independent variables
X = sm.add_constant(X)

# Country fixed effects: dummy-encode 'iso' only, dropping the first category
# as the reference level
X = pd.get_dummies(X, columns=['iso'], drop_first=True)

# Ensure all variables are numeric (unparseable entries become NaN)
X = X.apply(pd.to_numeric, errors='coerce')

# Convert boolean columns to integers
X = X.astype({col: 'int' for col in X.select_dtypes(include=['bool']).columns})

# Diagnostics: any NaN introduced by the coercion shows up here
print(X.isnull().sum())

# Diagnostics: there should be no object dtypes left in X
print(X.dtypes)

# Combine X and y for undersampling
data_combined = pd.concat([X, y], axis=1)

# Separate majority and minority classes
majority_class = data_combined[data_combined['monitorable'] == 0]
minority_class = data_combined[data_combined['monitorable'] == 1]

# Undersample the majority class to at most a 4:1 ratio. Sampling is without
# replacement, so the request is capped at the number of majority rows
# available; otherwise resample() raises when the data is less imbalanced
# than 4:1.
n_majority_keep = min(len(majority_class), len(minority_class) * 4)
majority_class_undersampled = resample(majority_class,
                                       replace=False,    # sample without replacement
                                       n_samples=n_majority_keep,
                                       random_state=42)  # for reproducibility

# Combine minority class with undersampled majority class
undersampled_data = pd.concat([majority_class_undersampled, minority_class])

# Separate the features and target variable after undersampling
X_undersampled = undersampled_data.drop(columns=['monitorable'])
y_undersampled = undersampled_data['monitorable']

# Fit the logistic regression model
logit_model = sm.Logit(y_undersampled, X_undersampled)
result = logit_model.fit(maxiter=1000)

# Print the summary of the model
print(result.summary())

# Save the summary to a text file
with open('logistic_regression_figure5_results.txt', 'w', encoding='utf-8') as f:
    f.write(result.summary().as_text())

print("Logistic regression conducted, figure 5 is in files.")