In [91]:
import pandas as pd
import os
import joblib 

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

### Load All Datasets

In [92]:
country = pd.read_csv('../datasets/country_metadata/country.csv')
literacy_rate_df = pd.read_csv('../datasets/indicators/litercay_rate_adult.csv', skiprows=4)
unemployment_rate_df = pd.read_csv('../datasets/indicators/unemployment_rate.csv', skiprows=4)
gdp_per_capita_df = pd.read_csv('../datasets/indicators/gdp_per_capita_ppp.csv', skiprows=4)
infant_mortality_df = pd.read_csv('../datasets/indicators/infant_mortality_rate.csv', skiprows=4)
health_expenditure_df = pd.read_csv('../datasets/indicators/health_expenditure_ppp.csv', skiprows=4)
poverty_risk_df = pd.read_csv('../datasets/indicators/poverty_headcount_ratio.csv', skiprows=4)

### Data Cleaning
Step 1: Remove all the aggregate regionals and combines all the indicators into one file (only from 1990 onwards)

In [93]:
def clean_dataset(df, column_name):
    # Get year columns from 1990 onwards
    year_cols = [col for col in df.columns if col.isdigit() and int(col) >= 1990]
    
    # Remove aggregate regions
    exclude_country = [
        'World', 'Africa', 'Asia', 'Europe', 'America', 'Caribbean', 'Pacific',
        'Central', 'Southern', 'Western', 'Eastern', 'Northern', 'Middle East',
        'Sub-Saharan', 'Latin America', 'North Africa','income', 'High income', 
        'Low income', 'Lower middle', 'Upper middle', 'dividend', 'economies',
        'OECD', 'EU', 'Euro', 'European Union', 'Arab', 'Arab World','OPEC', 
        'G7', 'G20', 'Least developed', 'Small states', 'Channel Islands',
        'Heavily indebted', 'Post-demographic', 'Fragile and conflict affected situations', 
        'Fragile', 'conflict', 'IDA', 'IBRD', 'Blend', 'Other small states', 
        'Pacific island small states', 'Small island developing states'
    ]
    
    # Filter to individual countries
    individual_countries = df[
        ~df['Country Name'].str.contains('|'.join(exclude_country), case=False, na=False)
    ].copy()
    
    # Convert to long format
    formatted_df = individual_countries.melt(
        id_vars=['Country Name', 'Country Code'],
        value_vars=year_cols,
        var_name='Year',
        value_name=column_name
    )
    
    # Clean the data
    formatted_df['Year'] = formatted_df['Year'].astype(int)
    formatted_df = formatted_df.dropna(subset=[column_name])
    formatted_df[column_name] = pd.to_numeric(formatted_df[column_name], errors='coerce')
    formatted_df = formatted_df.dropna(subset=[column_name])
    
    return formatted_df

# Clean each dataset using the original data
literacy_clean = clean_dataset(literacy_rate_df, 'Literacy Rate')
unemployment_clean = clean_dataset(unemployment_rate_df, 'Unemployment Rate')
gdp_clean = clean_dataset(gdp_per_capita_df, 'GDP per Capita')
infant_mortality_clean = clean_dataset(infant_mortality_df, 'Infant Mortality Rate')
health_expenditure_clean = clean_dataset(health_expenditure_df, 'Health Expenditure')
poverty_clean = clean_dataset(poverty_risk_df, 'Poverty Rate')

# Create a comprehensive merge starting with all country-year combinations
all_datasets = [literacy_clean, unemployment_clean, gdp_clean, 
                infant_mortality_clean, health_expenditure_clean, poverty_clean]

# Get all unique country-year combinations
base_df = pd.concat([df[['Country Name', 'Country Code', 'Year']] for df in all_datasets if not df.empty], 
                   ignore_index=True).drop_duplicates()

# Merge each indicator
poverty_dataset_clean = base_df.copy()

# Merge literacy rate
poverty_dataset_clean = poverty_dataset_clean.merge(
    literacy_clean[['Country Name', 'Country Code', 'Year', 'Literacy Rate']], 
    on=['Country Name', 'Country Code', 'Year'], how='left'
)

# Merge unemployment rate
poverty_dataset_clean = poverty_dataset_clean.merge(
    unemployment_clean[['Country Name', 'Country Code', 'Year', 'Unemployment Rate']], 
    on=['Country Name', 'Country Code', 'Year'], how='left'
)

# Merge GDP per capita
poverty_dataset_clean = poverty_dataset_clean.merge(
    gdp_clean[['Country Name', 'Country Code', 'Year', 'GDP per Capita']], 
    on=['Country Name', 'Country Code', 'Year'], how='left'
)

# Merge infant mortality
poverty_dataset_clean = poverty_dataset_clean.merge(
    infant_mortality_clean[['Country Name', 'Country Code', 'Year', 'Infant Mortality Rate']], 
    on=['Country Name', 'Country Code', 'Year'], how='left'
)

# Merge health expenditure
poverty_dataset_clean = poverty_dataset_clean.merge(
    health_expenditure_clean[['Country Name', 'Country Code', 'Year', 'Health Expenditure']], 
    on=['Country Name', 'Country Code', 'Year'], how='left'
)

# Merge poverty rate
poverty_dataset_clean = poverty_dataset_clean.merge(
    poverty_clean[['Country Name', 'Country Code', 'Year', 'Poverty Rate']], 
    on=['Country Name', 'Country Code', 'Year'], how='left'
)

# Add country metadata
poverty_dataset_clean = poverty_dataset_clean.merge(
    country[['Country Code', 'Region', 'IncomeGroup']], 
    on='Country Code', how='left'
)

# Filter out rows where ALL indicators are missing
indicator_columns = ['Literacy Rate', 'Unemployment Rate', 'GDP per Capita', 
                    'Infant Mortality Rate', 'Health Expenditure', 'Poverty Rate']
poverty_dataset_clean = poverty_dataset_clean.dropna(subset=indicator_columns, how='all')

poverty_dataset_clean.head()

Unnamed: 0,Country Name,Country Code,Year,Literacy Rate,Unemployment Rate,GDP per Capita,Infant Mortality Rate,Health Expenditure,Poverty Rate,Region,IncomeGroup
0,Burundi,BDI,1990,37.0,,598.164184,92.6,,,Sub-Saharan Africa,Low income
1,China,CHN,1990,78.0,,990.776191,43.3,,83.0,East Asia & Pacific,Upper middle income
2,Cabo Verde,CPV,1990,63.0,,1044.457524,42.1,,,Sub-Saharan Africa,Upper middle income
3,Ecuador,ECU,1990,88.0,,4631.553258,42.6,,,Latin America & Caribbean,Upper middle income
4,Guam,GUM,1990,99.0,,,,,,East Asia & Pacific,High income


Step 2: Fill NaN with median to preserves data quality while handling missing values appropriately

In [94]:
key_indicators = ['Literacy Rate', 'Unemployment Rate', 'GDP per Capita',
                  'Infant Mortality Rate', 'Health Expenditure', 'Poverty Rate']

# Fill NaN values with median for each indicator
for indicator in key_indicators:
    if poverty_dataset_clean[indicator].isnull().sum() > 0:
        # Calculate median
        median = poverty_dataset_clean[indicator].median()

        # Fill NaN
        nan = poverty_dataset_clean[indicator].isnull().sum()
        poverty_dataset_clean[indicator] = poverty_dataset_clean[indicator].fillna(median)

poverty_dataset_clean.head()

Unnamed: 0,Country Name,Country Code,Year,Literacy Rate,Unemployment Rate,GDP per Capita,Infant Mortality Rate,Health Expenditure,Poverty Rate,Region,IncomeGroup
0,Burundi,BDI,1990,37.0,6.337,598.164184,92.6,554.711017,1.8,Sub-Saharan Africa,Low income
1,China,CHN,1990,78.0,6.337,990.776191,43.3,554.711017,83.0,East Asia & Pacific,Upper middle income
2,Cabo Verde,CPV,1990,63.0,6.337,1044.457524,42.1,554.711017,1.8,Sub-Saharan Africa,Upper middle income
3,Ecuador,ECU,1990,88.0,6.337,4631.553258,42.6,554.711017,1.8,Latin America & Caribbean,Upper middle income
4,Guam,GUM,1990,99.0,6.337,8849.525778,19.7,554.711017,1.8,East Asia & Pacific,High income


Step 3: Save it into a new csv file

In [95]:
poverty_dataset_clean.to_csv('../datasets/poverty_dataset_clean.csv', index=False)

### Train Model

In [96]:
key_indicators = ['Literacy Rate', 'Unemployment Rate', 'GDP per Capita',
                  'Infant Mortality Rate', 'Health Expenditure']

# Create binary target variable for classification
poverty_dataset_clean['Poverty Risk'] = (poverty_dataset_clean['Poverty Rate'] > 25).astype(int)

# Create feature matrix and target variable
X = poverty_dataset_clean[key_indicators].copy()
y = poverty_dataset_clean['Poverty Risk'].copy()

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model
random_forest = RandomForestClassifier(n_estimators=100, random_state=42)
random_forest.fit(X_train, y_train)

# Evaluate
print("Model Accuracy:", random_forest.score(X_test, y_test))

# Save the model
os.makedirs('../models', exist_ok=True)
joblib.dump(random_forest, '../models/random_forest_model.pkl')
joblib.dump(key_indicators, '../models/feature_names.pkl')

Model Accuracy: 0.9516358463726885


['../models/feature_names.pkl']