# Ridge Regression to Estimate Coefficients

In [1]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import os 
from sklearn.model_selection import train_test_split
from create_datasets import createData, cleanData
from create_datasets import createSplits

## Data Preprocessing

In [2]:
# Build the two country-group datasets and run the cleaning step on each.
# NOTE(review): the return value of cleanData is discarded, so it presumably
# modifies its argument in place — confirm in create_datasets.
developed, developing = createData()
cleanData(developed)
cleanData(developing)

# Select the indicators flagged with included == 1 in Indicator.csv.
# The row labelled 2 is then dropped explicitly.
# NOTE(review): the reason for excluding row 2 is not visible here — confirm.
indicatorList = pd.read_csv('Indicator.csv')
indicatorsToPick = indicatorList.loc[indicatorList['included'] == 1].drop(index=2)

# Feature names, in file order, used as the keys for the per-indicator coefficients.
attributes = indicatorsToPick['feature_name'].tolist()


## Training and Validation of the Model

In [4]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from sklearn.linear_model import Ridge

def validate(ridge_model, splits):
    """Print the cross-validated RMSE of ``ridge_model``.

    Runs repeated K-fold cross-validation (10 folds, 3 repeats, fixed
    ``random_state=1``) with ``splits[0]`` as the feature matrix and
    ``splits[2]`` as the target, then prints the mean and standard deviation
    of the per-fold RMSE. Nothing is returned.

    Parameters
    ----------
    ridge_model : estimator
        The estimator handed to ``cross_val_score``.
    splits : sequence
        Indexable container; only elements 0 and 2 are read here.
        NOTE(review): presumably the training features and training targets
        produced by ``createSplits`` — confirm.
    """
    folds = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
    neg_rmse = cross_val_score(
        ridge_model,
        splits[0],
        splits[2],
        scoring='neg_root_mean_squared_error',
        cv=folds,
        n_jobs=-1,
    )
    # The scorer reports negated RMSE; flip the sign for display.
    rmse = np.abs(neg_rmse)
    print('RMSE: %.3f (%.3f)' % (rmse.mean(), rmse.std()))


def compute_coeff(is_developed, dev, split_size, start_year=2008, alpha=1.0):
    """Fit one Ridge model per entry of ``dev`` and collect its coefficients.

    For every index ``i`` in ``range(len(dev))`` this builds
    ``createSplits(i, dev, split_size)``, fits ``Ridge(alpha=alpha)`` on
    ``splits[0]`` / ``splits[2]``, prints the cross-validated RMSE through
    ``validate`` and prints ``ridge_model.score`` evaluated on the same data
    the model was fitted on (so it is an in-sample score, not a held-out one).

    Parameters
    ----------
    is_developed : int
        ``0`` prints the "Developing" banner, any other value prints the
        "Developed" banner. It has no other effect.
    dev : sized collection
        Passed through to ``createSplits``; its length determines how many
        models are fitted.
    split_size : float
        Passed through to ``createSplits``.
    start_year : int, default 2008
        Offset added to the loop index for the printed "Year:" label.
    alpha : float, default 1.0
        Regularisation strength given to ``Ridge``.

    Returns
    -------
    dict
        Maps each name in the module-level ``attributes`` list to a list
        holding one coefficient per fitted model, in loop order.

    Raises
    ------
    ValueError
        If a fitted model exposes fewer coefficients than there are
        attribute names, which would otherwise leave the per-attribute lists
        with missing entries and corrupt any later averaging.
    """
    if is_developed == 0:  # Developing countries
        print("Computing for Developing Countries/Regions")
    else:
        print("Computing for Developed Countries/Regions")

    coeff_dict = {name: [] for name in attributes}

    for i in range(len(dev)):
        splits = createSplits(i, dev, split_size)
        ridge_model = Ridge(alpha=alpha)
        ridge_model.fit(splits[0], splits[2])

        # Ridge.coef_ is 1-D for a 1-D target and 2-D (n_targets, n_features)
        # for a 2-D target; normalise to 2-D and use the first target's row so
        # both layouts are handled.
        coef_row = np.atleast_2d(ridge_model.coef_)[0]
        if len(coef_row) < len(attributes):
            raise ValueError(
                "Model for index %d produced %d coefficients but %d attributes "
                "are expected" % (i, len(coef_row), len(attributes))
            )

        validate(ridge_model, splits)

        # Coefficients are matched to attribute names by position; only the
        # first len(attributes) coefficients are used.
        for name, value in zip(attributes, coef_row):
            coeff_dict[name].append(value)

        # R^2 on the data the model was fitted on.
        print("Year:", start_year + i, "Score:", ridge_model.score(splits[0], splits[2]))
        print()
    return coeff_dict

# Per-model coefficient lists for each country group (split size 0.15).
attcoeff_developed = compute_coeff(1, developed, 0.15)
attcoeff_developing = compute_coeff(0, developing, 0.15)

# Average each indicator's coefficients across all fitted models.
# Both dictionaries are keyed by the indicators present in the developed result.
avgcoef_developed = {
    indicator: np.mean(values)
    for indicator, values in attcoeff_developed.items()
}
avgcoef_developing = {
    indicator: np.mean(attcoeff_developing[indicator])
    for indicator in attcoeff_developed
}

Computing for Developed Countries/Regions
RMSE: 2.625 (1.155)
Year: 2008 Score: 0.31093306830189427

RMSE: 3.505 (1.287)
Year: 2009 Score: 0.4740563864247139

RMSE: 3.898 (2.644)
Year: 2010 Score: 0.29342199682169345

RMSE: 2.817 (1.828)
Year: 2011 Score: 0.3914349835885348

RMSE: 3.117 (1.576)
Year: 2012 Score: 0.4335993469005748

RMSE: 2.724 (1.284)
Year: 2013 Score: 0.26937393975923707

RMSE: 2.396 (1.096)
Year: 2014 Score: 0.20203458155939158

RMSE: 1.921 (1.049)
Year: 2015 Score: 0.3751902182876109

RMSE: 2.024 (0.985)
Year: 2016 Score: 0.2922854295159457

RMSE: 1.885 (1.009)
Year: 2017 Score: 0.4769141347826949

Computing for Developing Countries/Regions
RMSE: 4.082 (1.166)
Year: 2008 Score: 0.023311219303607574

RMSE: 4.419 (1.013)
Year: 2009 Score: 0.12208779480543597

RMSE: 3.864 (0.912)
Year: 2010 Score: 0.07643068009505272

RMSE: 5.265 (3.651)
Year: 2011 Score: 0.08742585427199401

RMSE: 7.552 (8.161)
Year: 2012 Score: 0.019435991113821993

RMSE: 4.311 (2.414)
Year: 2013 Sco

## Estimating Coefficients for Developed and Developing Countries

In [7]:
# Tab-separated listing of the averaged Ridge coefficient per indicator
# for the developed group.
print("Coefficients for Developed Countries:\n")
print("Indicator\t\tImportance")
for indicator, importance in avgcoef_developed.items():
    print(f"{indicator}\t\t{importance}")

Coefficients for Developed Countries:

Indicator		Importance
pop_grow		0.04908902891882737
gini_index		-0.033724577549935304
unemp		-0.1070537560480144
life_exp		-0.143753498448783
poverty		0.7763622090100712
mil_xpnd		0.17958136951908127
lit_rate		0.10754194812237317
labour_force		-4.201714569963176e-09
refugee_asylum		-1.8129983704264176e-09


In [8]:
# Tab-separated listing of the averaged Ridge coefficient per indicator
# for the developing group.
print("Coefficients for Developing Countries:\n")
print("Indicator\t\tImportance")
for indicator, importance in avgcoef_developing.items():
    print(f"{indicator}\t\t{importance}")

Coefficients for Developing Countries:

Indicator		Importance
pop_grow		-0.34936465280201484
gini_index		0.011627498487542387
unemp		-0.06134737917253893
life_exp		-0.02084215995409148
poverty		-0.007253757105633314
mil_xpnd		-0.35165694738982883
lit_rate		-0.01871515840234888
labour_force		1.5889520135383243e-09
refugee_asylum		-1.7520402242582648e-07
