In [1]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import os 

In [10]:
indicatorList = pd.read_csv('Indicator.csv')

# Keep only the indicators flagged for inclusion.
indicatorsToPick = indicatorList[indicatorList['included'] == 1]

# The row named 'gdp_percap' is the target, not a feature: set it aside and
# remove it from the feature table. The row is located by its feature name
# instead of a hard-coded index label (previously index 2), so the selection
# keeps working if rows in Indicator.csv are added, removed or reordered.
is_target = indicatorsToPick['feature_name'] == 'gdp_percap'
gdp_percap = indicatorsToPick[is_target]
indicatorsToPick = indicatorsToPick[~is_target]

In [11]:
# Preview the indicators that remain as features (first 10 rows).
indicatorsToPick.head(10)

Unnamed: 0,Indicator_Code,Indicator_Name,included,feature_name
0,SP.POP.GROW,Population Growth (in %),1,pop_grow
1,SI.POV.GINI,Gini Index,1,gini_index
7,SL.UEM.TOTL.ZS,"Unemployment, total (% of labour force) (model...",1,unemp
10,SP.DYN.LE00.IN,"Life expectancy at birth, total (years)",1,life_exp
11,SI.POV.DDAY,Poverty headcount ratio at $1.90 a day (% of p...,1,poverty
12,MS.MIL.XPND.GD.ZS,Military expenditure (% of GDP),1,mil_xpnd
13,SE.ADT.LITR.ZS,"Literacy Rate, adult total",1,lit_rate
14,SL.TLF.TOTL.IN,"Labour force, total",1,labour_force
15,SM.POP.REFG,Refguee population by country of asylum,1,refugee_asylum


In [12]:
def getDF(code):
    """Load the data file for one indicator code as a DataFrame.

    The file is looked up in ``<current working directory>/Datasets/<code>/``.
    The first entry of that folder, in sorted name order, is read with
    ``pd.read_csv``.

    Parameters
    ----------
    code : str
        Indicator code; also the name of the sub-folder under ``Datasets``.

    Returns
    -------
    pandas.DataFrame
        Contents of the selected CSV file.

    Raises
    ------
    FileNotFoundError
        If the folder does not exist or contains no entries.
    """
    # os.path.join picks the right separator for the host OS; the previous
    # hand-written backslashes only resolved on Windows.
    folder = os.path.join(os.getcwd(), "Datasets", code)

    # os.listdir gives no ordering guarantee, so sort to make the choice of
    # file deterministic across runs and platforms.
    entries = sorted(os.listdir(folder))
    if not entries:
        raise FileNotFoundError("No data file found in " + folder)

    return pd.read_csv(os.path.join(folder, entries[0]))

In [13]:
# Indicator codes of the selected features, in table order.
indicators = indicatorsToPick['Indicator_Code'].tolist()

# The target indicator is loaded on its own; it later becomes the
# 'gdp_percap' column of each assembled dataset.
target = getDF("NY.GDP.PCAP.KD.ZG")

# One DataFrame per feature indicator, in the same order as `indicators`.
features = [getDF(code) for code in indicators]

In [14]:
# Short column names for the features, in the same order as the loaded tables.
attributes = indicatorsToPick['feature_name'].tolist()

In [15]:
# Countries treated as "developed" in this analysis.
# NOTE(review): rows are later matched against 'Country Name' by exact string;
# confirm that spellings such as "South Korea", "Hong Kong", "Macau",
# "Slovakia" and "Taiwan" actually occur in the dataset.
Developed = {
    "Andorra", "Austria", "Belgium", "Cyprus", "Czech Republic", "Denmark",
    "Estonia", "Faroe Islands", "Finland", "France", "Germany", "Greece",
    "Guernsey", "Holy See", "Iceland", "Ireland", "Italy", "Jersey", "Latvia",
    "Liechtenstein", "Lithuania", "Luxembourg", "Malta", "Monaco",
    "Netherlands", "Norway", "Portugal", "San Marino", "Slovakia", "Slovenia",
    "Spain", "Sweden", "Switzerland", "United Kingdom", "Hong Kong", "Israel",
    "Japan", "Macau", "Singapore", "South Korea", "Taiwan", "Bermuda",
    "Canada", "Puerto Rico", "United States", "Australia", "New Zealand",
}

# Every other 'Country Name' found in the first feature table counts as
# "developing".
Developing = set(features[0]['Country Name']) - Developed

In [16]:
n = len(features)

# Feature tables first, target table last: positions 0..n-1 are features and
# position n is the target in both result lists.
frames_with_target = features + [target]

# dev  -> rows belonging to developed countries
# dev1 -> rows belonging to developing countries
dev = [
    frame[frame['Country Name'].isin(Developed)]
    for frame in frames_with_target
]
dev1 = [
    frame[frame['Country Name'].isin(Developing)]
    for frame in frames_with_target
]


In [19]:
def createDataset(year,dfList):
    """Assemble one table holding every indicator's values for a single year.

    Parameters
    ----------
    year : str
        Label of the year column to take from every frame (e.g. ``"2010"``).
    dfList : list of pandas.DataFrame
        Feature frames in the same order as the module-level ``attributes``
        list, followed by the target frame as the last element. Each frame
        needs a ``year`` column; the first one also supplies "Country Name".

    Returns
    -------
    pandas.DataFrame
        Columns are ``country``, then one column per entry of ``attributes``,
        then ``gdp_percap``. Rows are combined on the frames' index.

    Notes
    -----
    Assigning the column names fails if ``len(dfList)`` is not
    ``len(attributes) + 1``.
    """
    # Country names come from the first frame; every frame contributes its
    # column for the requested year.
    columns = [dfList[0]["Country Name"]] + [frame[year] for frame in dfList]
    df = pd.concat(columns, axis=1, sort=False)
    df.columns = ['country'] + attributes + ['gdp_percap']
    return df

In [20]:
# Year column labels "2010" .. "2017" (the upper bound 2018 is exclusive).
years = list(map(str, range(2010, 2018)))

# One assembled dataset per year for each country group.
developed = [createDataset(year, dev) for year in years]
developing = [createDataset(year, dev1) for year in years]

## Cleaning the Data 

- Removing all rows with missing values for GDP
- Median replacement for all other parameters

### 1. Removing all rows that have NaNs/missing values in the target attribute

In [21]:
# A row without a target value cannot be used, so drop it from every yearly
# dataset of both groups. The lists are updated position by position.
for idx, (rich, poor) in enumerate(zip(developed, developing)):
    developed[idx] = rich.dropna(subset=["gdp_percap"])
    developing[idx] = poor.dropna(subset=["gdp_percap"])
developed[5].head()



Unnamed: 0,country,pop_grow,gini_index,unemp,life_exp,poverty,mil_xpnd,lit_rate,labour_force,refugee_asylum,gdp_percap
4,Andorra,-1.529058,,,,,,,,,2.997046
11,Australia,1.439217,,6.054,82.4,,1.950601,,12585539.0,36917.0,0.7324
12,Austria,1.120993,30.5,5.723,81.190244,0.7,0.698138,,4460833.0,72198.0,-0.11154
15,Belgium,0.579446,27.7,8.482,80.992683,0.1,0.921668,,5000910.0,35302.0,1.443273
33,Canada,0.746339,,6.906,81.9,,1.150126,,19679325.0,135890.0,-0.089286


### 2. Median replacement of missing values for the other features

In [22]:
def cleanData(df):
    """Fill missing values with the column median, in place.

    Parameters
    ----------
    df : list of pandas.DataFrame
        Frames to clean. In each frame the first column is skipped and every
        remaining column has its missing values replaced by that column's
        median, computed over the non-missing values.

    Returns
    -------
    None
        The frames inside ``df`` are modified directly.

    Notes
    -----
    A column with no non-missing values has no median and is left unchanged.
    """
    # Walk the frames that were actually passed in. The loop bound used to be
    # the length of the global `years`, which broke for lists of another size.
    for frame in df:
        for col in frame.columns[1:]:
            frame[col] = frame[col].fillna(frame[col].median())
            #print(j,np.nanmedian(df[i][j]))

In [23]:
cleanData(developed)
cleanData(developing)

In [24]:
def checkLinearRelationship(df):
    """Draw one scatter plot of ``gdp_percap`` against each feature column.

    The first and last columns of ``df`` are skipped; every column in between
    gets its own figure, shown immediately.
    """
    feature_cols = df.columns[1:-1]
    for name in feature_cols:
        plt.title("GDP vs " + name)
        plt.scatter(df[name], df["gdp_percap"])
        plt.show()

## Sparse PCA for Developed and Developing 

Sparse PCA is run separately on each group to identify which features matter most for developed and for developing countries, so that the two groups can be compared.

In [29]:
# Feature matrices for the first year in `years`: everything except the
# country label and the target column.
_non_feature_cols = ['country', 'gdp_percap']
X_dev = developed[0].drop(columns=_non_feature_cols)
X_deving = developing[0].drop(columns=_non_feature_cols)

X_dev.head()

Unnamed: 0,pop_grow,gini_index,unemp,life_exp,poverty,mil_xpnd,lit_rate,labour_force,refugee_asylum
4,-0.016577,32.05,7.787,80.606098,0.15,1.383402,95.85733,4276558.0,8910.0
11,1.55549,34.7,5.211,81.695122,0.3,1.856791,95.85733,11702473.0,21790.0
12,0.240394,30.3,4.82,80.580488,0.5,0.821233,95.85733,4276558.0,42613.0
15,0.913639,28.4,8.292,80.182927,0.1,1.084633,95.85733,4887403.0,17891.0
25,-0.78006,32.05,7.787,79.288537,0.15,1.383402,95.85733,4276558.0,8910.0


In [35]:
from sklearn.decomposition import SparsePCA

# Sparse PCA with 5 components on the developed-country feature matrix;
# random_state=0 pins the estimator's seed.
PCA = SparsePCA(n_components=5, random_state=0)
# Fit on X_dev and keep the transformed data.
dev2010New = PCA.fit_transform(X_dev)
# get_params returns the estimator's configuration (alpha, max_iter, method, ...),
# not anything learned from X_dev.
# NOTE(review): the fitted per-feature loadings are presumably exposed as
# PCA.components_ — confirm; that is likely what the feature comparison needs.
# NOTE(review): X_dev is passed unscaled and its columns differ by orders of
# magnitude (e.g. labour_force vs pop_grow) — confirm whether that is intended.
paramdev2010 = PCA.get_params(deep=True)

In [36]:
# Display the SparsePCA configuration dictionary.
paramdev2010

{'U_init': None,
 'V_init': None,
 'alpha': 1,
 'max_iter': 1000,
 'method': 'lars',
 'n_components': 5,
 'n_jobs': None,
 'normalize_components': 'deprecated',
 'random_state': 0,
 'ridge_alpha': 0.01,
 'tol': 1e-08,
 'verbose': False}