In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
import numpy as np

In [2]:
df = pd.read_csv("../data/WHO-SIMPLE.csv")

In [3]:
socio_factors = [
    'NUTRITION_564',
    'LBW_PREVALENCE',
    'NUTRITION_HA_2',
    'NUTRITION_WA_2',
    'NUTRITION_WH2',
    'NUTRITION_WH_2',
    'MDG_0000000026',
    'WHS_PBR', 
    'WSH_SANITATION_SAFELY_MANAGED', 
    'M_Est_smk_curr',
    'M_Est_smk_daily',
    'TOBACCO_0000000192',
    'GHED_CHEGDP_SHA2011', 
    'GDP',
    'MDG_0000000003', 
    'MDG_0000000025'
]

imr = 'MDG_0000000001'

In [4]:
relevant_factors = socio_factors.copy()
relevant_factors.insert(0, imr)

In [5]:
socio_df = df[relevant_factors].copy()

In [6]:
socio_df = socio_df.dropna(thresh=1700, axis=1)

In [7]:
socio_df.describe()

Unnamed: 0,MDG_0000000001,LBW_PREVALENCE,MDG_0000000026,GHED_CHEGDP_SHA2011,GDP,MDG_0000000003,MDG_0000000025
count,11289.0,2320.0,3276.0,3560.0,8911.0,2176.0,1746.0
mean,54.199007,10.087026,207.671245,6.300938,191429200000.0,43.188787,92.125885
std,48.087793,4.923792,302.623492,3.013298,974493600000.0,40.854141,16.488824
min,1.49318,2.4,2.0,1.03,8824448.0,0.0,5.7
25%,15.53903,6.2,16.0,4.24,1666522000.0,12.875,95.5
50%,38.97495,8.8,65.0,5.785,9059340000.0,28.0,99.0
75%,82.06532,12.8,290.25,7.96,57582240000.0,62.4,99.8
max,279.65781,36.2,2480.0,50.18,21433230000000.0,229.0,100.0


In [8]:
len(socio_df)

11289

In [9]:
socio_df = socio_df.dropna(how='any')

In [10]:
len(socio_df)

1004

## Center and scale data for each columns

In [11]:
all_columns = socio_df.columns

In [12]:
# Create the Scalar object
scaler = preprocessing.StandardScaler()

In [13]:
# Fit data on the Scaler object
scaled_arr = scaler.fit_transform(socio_df[all_columns])
scaled_df = pd.DataFrame(scaled_arr, columns=all_columns)

In [14]:
scaled_df.head()

Unnamed: 0,MDG_0000000001,LBW_PREVALENCE,MDG_0000000026,GHED_CHEGDP_SHA2011,GDP,MDG_0000000003,MDG_0000000025
0,0.41062,-0.932545,-0.224021,-0.192345,-0.312742,-0.418043,0.317643
1,0.032162,-0.96393,-0.242608,-0.875019,-0.310796,-0.464554,0.267382
2,-0.300794,1.609634,-0.382008,-1.714629,-0.244628,-0.352152,0.337748
3,0.387862,-0.116537,0.184885,0.643342,-0.178168,1.2835,0.247278
4,0.342449,-0.085152,0.194179,0.702193,-0.185725,1.136213,0.176912


## Random Forest Implementation

In [15]:
# Separate explanatory variables and response variable
explanatory_vars = all_columns.copy()
explanatory_vars = list(explanatory_vars)
explanatory_vars.remove(imr)

# Split data into train and test sets
x = scaled_df[explanatory_vars]
y = scaled_df[imr]
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=42)

In [16]:
rf = RandomForestRegressor()

In [17]:
rf.fit(x_train, y_train)



RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=10,
                      n_jobs=None, oob_score=False, random_state=None,
                      verbose=0, warm_start=False)

In [18]:
rf.score(x_test, y_test)

0.8302080402435795

In [19]:
feature_importances = rf.feature_importances_

In [20]:
for i, feature in enumerate(explanatory_vars):
    print("Feature:", feature, "Importance:", feature_importances[i])

Feature: LBW_PREVALENCE Importance: 0.06719591831145592
Feature: MDG_0000000026 Importance: 0.7459749649629666
Feature: GHED_CHEGDP_SHA2011 Importance: 0.05643541500340375
Feature: GDP Importance: 0.04659180452168099
Feature: MDG_0000000003 Importance: 0.05398734022598088
Feature: MDG_0000000025 Importance: 0.029814556974511987
