# Models for Analysis

In [1]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import os 
from sklearn.model_selection import train_test_split
from create_datasets import createData, cleanData
from create_datasets import createSplits

## Creating the required datasets and cleaning

In [2]:
developed,developing = createData()
cleanData(developed)
cleanData(developing)

# print(type(developed))

indicatorList = pd.read_csv('Indicator.csv')
indicatorsToPick = indicatorList[indicatorList['included']==1].drop(index=2)
attributes = list(indicatorsToPick.feature_name)


In [4]:
def print_coeff(coeff):
    attCoef = {}
    if len(attributes)==len(coeff):
        for i in range(len(coeff)):
            attCoef[attributes[i]] = coeff[i]
        print(attCoef)
    else:
        print("Len att:",len(attributes))
        print("Len coeff:",len(coeff))

## Decision Tree Regression

In [11]:
from sklearn.tree import DecisionTreeRegressor

DCR = DecisionTreeRegressor(random_state=0)

splits2010 = createSplits(0,developed,0.20)
DCR.fit(splits2010[0], splits2010[2])
print_coeff(DCR.feature_importances_)
print(DCR.score(splits2010[1],splits2010[3])) # Lol this is also terrible :)

{'pop_grow': 0.3090979551521467, 'gini_index': 0.026957213480958038, 'unemp': 0.42304318382451206, 'life_exp': 0.0, 'poverty': 0.0019014366737590584, 'mil_xpnd': 0.04160889551467804, 'lit_rate': 0.01482417466930584, 'labour_force': 0.10268424736742476, 'refugee_asylum': 0.0798828933172154}
-3.8389350453729687


## Lasso Regression for Feature Selection

In [25]:
from sklearn.linear_model import Lasso, LassoCV

model = Lasso(alpha=0.75)
splits2010 = createSplits(0,developed,0.15) # For Developed countries for the year 2010 
# print(splits2010[0])

model.fit(splits2010[0], splits2010[2])
print_coeff(model.coef_)

{'pop_grow': 0.0, 'gini_index': -0.0, 'unemp': -0.14712717411066126, 'life_exp': 0.0, 'poverty': -0.0, 'mil_xpnd': 0.21288755932488881, 'lit_rate': 0.0, 'labour_force': -9.013417546509046e-10, 'refugee_asylum': 3.7845592916933612e-06}


## Ridge Regression

In [38]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from sklearn.linear_model import Ridge

splits2010 = createSplits(0,developing,0.15)
ridge_model = Ridge(alpha=1.0)

ridge_model.fit(splits2010[0],splits2010[2])
# print(ridge_model.coef_[0])
print_coeff(ridge_model.coef_[0])
print(ridge_model.score(splits2010[1],splits2010[3]))

{'pop_grow': -0.39008838005730895, 'gini_index': 0.052294435688968216, 'unemp': -0.10387788228213231, 'life_exp': -0.02923358997740811, 'poverty': -0.01727669576061264, 'mil_xpnd': -0.098651465161892, 'lit_rate': -0.011724339020555925, 'labour_force': 2.1215463634107873e-09, 'refugee_asylum': -2.221463573056527e-07}
0.10817927772606349


In [39]:
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
# evaluate model
scores = cross_val_score(ridge_model, splits2010[0], splits2010[2], scoring='neg_root_mean_squared_error', cv=cv, n_jobs=-1)
# force scores to be positive
scores = np.absolute(scores)
print('RMSE: %.3f (%.3f)' % (np.mean(scores), np.std(scores)))

RMSE: 3.864 (0.912)


### Tuning Hyperparameters

In [40]:
from sklearn.model_selection import GridSearchCV

ridge = Ridge()
# define model evaluation method
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
# define grid
grid = dict()
grid['alpha'] = np.arange(0, 1, 0.01)
# define search
search = GridSearchCV(ridge, grid, scoring='neg_root_mean_squared_error', cv=cv, n_jobs=-1)
# perform the search
results = search.fit(splits2010[0],splits2010[2])
# summarize
print('R^2: %.3f' % results.score(splits2010[1],splits2010[3]))
print('Config: %s' % results.best_params_)

R^2: -3.373
Config: {'alpha': 0.99}


In [41]:
print('R^2: %.3f' % results.score(splits2010[0],splits2010[2]))
# print('Config: %s' % results.best_params_)

R^2: -3.736


## PCA for Developed and Developing 

Done to check what are the important features for each to try and bring about a comparison

In [24]:
X_dev = developed[0].drop(columns=['country','gdp_percap'])
X_deving = developing[0].drop(columns=['country','gdp_percap'])

X_dev.shape

(37, 9)

In [12]:
from sklearn.decomposition import PCA
import statsmodels.api as sm
PCA = PCA(n_components=5, random_state=0)
dev2010New = PCA.fit_transform(developed[0].iloc[:,1:-1])
paramdev2010 = PCA.get_params(deep=True)
Y = developed[0][['gdp_percap']]

model = sm.OLS(Y, dev2010New).fit()
predictions = model.predict(dev2010New) 
 
summary = str(model.summary())
print(summary)

                                 OLS Regression Results                                
Dep. Variable:             gdp_percap   R-squared (uncentered):                   0.049
Model:                            OLS   Adj. R-squared (uncentered):             -0.095
Method:                 Least Squares   F-statistic:                             0.3387
Date:                Sat, 28 Nov 2020   Prob (F-statistic):                       0.886
Time:                        23:08:47   Log-Likelihood:                         -85.075
No. Observations:                  38   AIC:                                      180.2
Df Residuals:                      33   BIC:                                      188.3
Df Model:                           5                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------

In [9]:
paramdev2010

{'copy': True,
 'iterated_power': 'auto',
 'n_components': 5,
 'random_state': 0,
 'svd_solver': 'auto',
 'tol': 0.0,
 'whiten': False}

## Multiple Linear Regression

In [6]:
import statsmodels.api as sm 

newXdev = sm.add_constant(developed[0].iloc[:,1:-1]) # adding a constant
Y = developed[0][['gdp_percap']]

model = sm.OLS(Y, newXdev).fit()
predictions = model.predict(newXdev) 
 
summary = str(model.summary())
print(summary)

                            OLS Regression Results                            
Dep. Variable:             gdp_percap   R-squared:                       0.225
Model:                            OLS   Adj. R-squared:                  0.011
Method:                 Least Squares   F-statistic:                     1.050
Date:                Sat, 28 Nov 2020   Prob (F-statistic):              0.424
Time:                        22:52:36   Log-Likelihood:                -80.282
No. Observations:                  38   AIC:                             178.6
Df Residuals:                      29   BIC:                             193.3
Df Model:                           8                                         
Covariance Type:            nonrobust                                         
                     coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------
pop_grow          -1.2065      0.494     -2.

## Linear SVR

In [49]:
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVR

dev2010 = developing[0]
gdp2010dev = dev2010[['gdp_percap']]
dev2010 = dev2010.drop(columns=['country','gdp_percap'])

X_train, X_test, y_train, y_test = train_test_split(dev2010,gdp2010dev,test_size=0.10, random_state=32)

In [50]:
linSVR = LinearSVR()   
linSVR.fit(X_train, y_train)

LinearSVR(C=1.0, dual=True, epsilon=0.0, fit_intercept=True,
          intercept_scaling=1.0, loss='epsilon_insensitive', max_iter=1000,
          random_state=None, tol=0.0001, verbose=0)

In [51]:
weights = list(linSVR.coef_.ravel())
weights

[6.050692278464393e-06,
 0.00013801698841019598,
 3.279312965091957e-05,
 0.00022965606142711403,
 1.811014389506626e-05,
 5.362013014345077e-06,
 0.00026107649418555484,
 -3.422856401219995e-07,
 3.889452323181e-05]

In [52]:
# linSVR.predict(X_test)
linSVR.score(X_test,y_test) 

-3372.4784120347394

In [12]:
from sklearn.model_selection import GridSearchCV
def svr_param_selection(X, y, nfolds):
    Cs = [0.001, 0.01, 0.1, 1, 10]
    # gammas = [0.001, 0.01, 0.1, 1]
    param_grid = {'C': Cs}
    grid_search = GridSearchCV(LinearSVR(), param_grid, cv=nfolds, scoring="neg_mean_absolute_error")
    grid_search.fit(X, y)
    # grid_search.best_params_
    return (grid_search.best_params_, grid_search.best_score_)

print(svr_param_selection(X_train,y_train,5))

({'C': 0.1}, -26.210578347783617)
