In [116]:
import warnings
warnings.filterwarnings('ignore')

In [117]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression

In [118]:
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced

In [119]:
# Read the cleaned per-country dataset (path is relative to the notebook's
# working directory) into a DataFrame.
file_path = Path('./cleaned_data/data_by_country.csv')
df = pd.read_csv(file_path)

# Preview the first rows to confirm the columns loaded as expected.
df.head(20)

Unnamed: 0,Country,Literacy_Rate,Buddhists,Christians,Folk_Religions,Hindus,Jews,Muslims,Other_Religions,Unaffiliated,...,gpd_2010,gpd_2015,gdp_per_capita_2000,gdp_per_capita_2005,gdp_per_capita_2010,gdp_per_capita_2015,mortality_rate_2000,mortality_rate_2005,mortality_rate_2010,mortality_rate_2015
0,Albania,96.8,1.0,18.0,1.0,1.0,1.0,80.3,1.0,1.4,...,11926920000.0,11386850000.0,1126.68334,2673.786584,4094.348386,3952.802538,106.1,105.7,103.5,92.24
1,Algeria,72.6,1.0,1.0,1.0,1.0,1.0,97.9,1.0,1.8,...,161207000000.0,165979000000.0,1765.027146,3113.094883,4480.786318,4177.889542,141.9,119.3,104.4,96.49
2,Angola,70.4,1.0,90.5,4.2,1.0,1.0,1.0,1.0,5.1,...,81699560000.0,87219290000.0,556.836182,1902.42215,3497.974488,3127.890598,383.6,322.7,269.7,239.8
3,Antigua and Barbuda,99.0,1.0,93.0,3.6,1.0,1.0,1.0,1.0,1.7,...,1148700000.0,1336693000.0,10872.29295,12557.54785,13048.96058,14285.32978,156.7,156.1,134.6,121.9
4,Argentina,97.9,1.0,85.2,1.0,1.0,1.0,1.0,1.0,12.2,...,423627000000.0,594749000000.0,7708.099115,5109.852245,10385.96443,13789.06042,138.5,124.9,118.7,112.7
5,Armenia,99.6,1.0,98.5,1.0,1.0,1.0,1.0,1.0,1.3,...,9260285000.0,10553340000.0,622.740923,1643.756889,3218.378299,3607.289299,143.4,139.7,137.9,121.3
6,Australia,99.0,2.7,67.3,1.0,1.4,1.0,2.4,1.0,24.2,...,1147590000000.0,1350530000000.0,21697.70848,34080.9999,52087.97229,56707.02208,76.93,68.13,63.06,62.17
7,Austria,98.0,1.0,80.4,1.0,1.0,1.0,5.4,1.0,13.5,...,392275000000.0,381971000000.0,24625.60072,38417.45779,46903.76159,44195.81759,93.72,83.7,74.43,64.34
8,Azerbaijan,99.8,1.0,3.0,1.0,1.0,1.0,96.9,1.0,1.0,...,52909290000.0,53074370000.0,655.119945,1578.40239,5843.533768,5500.310382,161.0,160.0,131.8,119.5
9,Bahrain,94.6,2.5,14.5,1.0,9.8,1.0,70.3,1.0,1.9,...,25713270000.0,31050640000.0,13636.41675,17959.39683,20722.07049,22634.08565,100.8,83.48,65.97,58.52


In [120]:
# Class boundaries, expressed in percent, for the derived mortality_rate column.
med = 10
high = 20

# Turn mortality_rate_2015 into a %: dividing by 1000 and multiplying by 100
# treats the source figure as a per-1,000 rate.
df['mortality_rate'] = df['mortality_rate_2015'] / 1000 * 100

# Bucket the percentage into three classes:
#   1 -> below `med`
#   2 -> from `med` to `high`, both bounds included
#   3 -> everything else (the np.select default)
# Syntax to add a column based on another column's range found at
# https://stackoverflow.com/questions/59642338/creating-new-column-based-on-condition-on-other-column-in-pandas-dataframe
df['mortality_state'] = np.select(
    condlist=[
        df['mortality_rate'] < med,
        (df['mortality_rate'] >= med) & (df['mortality_rate'] <= high),
    ],
    choicelist=[1, 2],
    default=3,
)
df.head(10)


Unnamed: 0,Country,Literacy_Rate,Buddhists,Christians,Folk_Religions,Hindus,Jews,Muslims,Other_Religions,Unaffiliated,...,gdp_per_capita_2000,gdp_per_capita_2005,gdp_per_capita_2010,gdp_per_capita_2015,mortality_rate_2000,mortality_rate_2005,mortality_rate_2010,mortality_rate_2015,mortality_rate,mortality_state
0,Albania,96.8,1.0,18.0,1.0,1.0,1.0,80.3,1.0,1.4,...,1126.68334,2673.786584,4094.348386,3952.802538,106.1,105.7,103.5,92.24,9.224,1
1,Algeria,72.6,1.0,1.0,1.0,1.0,1.0,97.9,1.0,1.8,...,1765.027146,3113.094883,4480.786318,4177.889542,141.9,119.3,104.4,96.49,9.649,1
2,Angola,70.4,1.0,90.5,4.2,1.0,1.0,1.0,1.0,5.1,...,556.836182,1902.42215,3497.974488,3127.890598,383.6,322.7,269.7,239.8,23.98,3
3,Antigua and Barbuda,99.0,1.0,93.0,3.6,1.0,1.0,1.0,1.0,1.7,...,10872.29295,12557.54785,13048.96058,14285.32978,156.7,156.1,134.6,121.9,12.19,2
4,Argentina,97.9,1.0,85.2,1.0,1.0,1.0,1.0,1.0,12.2,...,7708.099115,5109.852245,10385.96443,13789.06042,138.5,124.9,118.7,112.7,11.27,2
5,Armenia,99.6,1.0,98.5,1.0,1.0,1.0,1.0,1.0,1.3,...,622.740923,1643.756889,3218.378299,3607.289299,143.4,139.7,137.9,121.3,12.13,2
6,Australia,99.0,2.7,67.3,1.0,1.4,1.0,2.4,1.0,24.2,...,21697.70848,34080.9999,52087.97229,56707.02208,76.93,68.13,63.06,62.17,6.217,1
7,Austria,98.0,1.0,80.4,1.0,1.0,1.0,5.4,1.0,13.5,...,24625.60072,38417.45779,46903.76159,44195.81759,93.72,83.7,74.43,64.34,6.434,1
8,Azerbaijan,99.8,1.0,3.0,1.0,1.0,1.0,96.9,1.0,1.0,...,655.119945,1578.40239,5843.533768,5500.310382,161.0,160.0,131.8,119.5,11.95,2
9,Bahrain,94.6,2.5,14.5,1.0,9.8,1.0,70.3,1.0,1.9,...,13636.41675,17959.39683,20722.07049,22634.08565,100.8,83.48,65.97,58.52,5.852,1


In [121]:
# Set y: the three-level mortality_state class derived from mortality_rate_2015.
# It is kept as a one-column DataFrame so that y['mortality_state'] keeps working.
y = df[['mortality_state']]

# Set X: every column except
#   - Country, which is non-numeric and not needed as a feature, and
#   - all mortality columns, because the target is built from
#     mortality_rate_2015 (via mortality_rate) and the other years are not
#     used as predictors.
# df itself is left untouched (no `df = df.drop(...)`), so this cell can be
# re-run without a KeyError on the already-dropped columns.
X = df.drop(columns=[
    'Country',
    'mortality_rate_2000',
    'mortality_rate_2005',
    'mortality_rate_2010',
    'mortality_rate_2015',
    'mortality_rate',
    'mortality_state',
])

In [122]:
# Summary statistics for every feature column in X
X.describe()

Unnamed: 0,Literacy_Rate,Buddhists,Christians,Folk_Religions,Hindus,Jews,Muslims,Other_Religions,Unaffiliated,alcohol_rate_2000,...,pg_2010,pg_2015,gdp_2000,gpd_2005,gpd_2010,gpd_2015,gdp_per_capita_2000,gdp_per_capita_2005,gdp_per_capita_2010,gdp_per_capita_2015
count,139.0,139.0,139.0,139.0,139.0,139.0,139.0,139.0,139.0,139.0,...,139.0,139.0,139.0,139.0,139.0,139.0,139.0,139.0,139.0,139.0
mean,84.597122,4.627338,56.810072,2.492086,3.816547,1.536691,25.540288,1.126619,8.372662,6.075086,...,1.607997,1.508579,134974800000.0,194198900000.0,293931400000.0,327887900000.0,6721.934067,10399.044353,13445.236083,13556.432606
std,18.451735,15.201296,37.196149,4.840336,11.054644,6.327489,36.371413,0.820443,11.906068,4.343943,...,1.691831,1.262347,489974800000.0,576713600000.0,843478100000.0,1095522000000.0,10176.117043,15825.146048,19210.41063,18671.255326
min,28.7,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.038,...,-2.096943,-0.940754,204849600.0,261797600.0,366840000.0,437006200.0,124.460791,151.681566,234.235539,305.511144
25%,74.35,1.0,13.2,1.0,1.0,1.0,1.0,1.0,1.0,2.44,...,0.486291,0.580451,3026418000.0,6196934000.0,9993191000.0,11387510000.0,618.236654,995.758723,1554.010559,2087.25701
50%,93.1,1.0,71.2,1.0,1.0,1.0,4.6,1.0,3.0,5.48,...,1.354308,1.301979,10566580000.0,17003460000.0,32197270000.0,45780130000.0,2001.540049,3193.204358,5076.339872,5967.052204
75%,98.95,1.0,89.15,1.5,1.0,1.0,46.95,1.0,10.55,9.015,...,2.581132,2.490801,73849320000.0,117387500000.0,192767500000.0,197539000000.0,7368.332777,11159.22166,14365.283865,15385.6239
max,100.0,96.9,99.0,35.6,80.7,75.6,99.0,9.7,59.6,17.45,...,11.483371,5.790591,4968360000000.0,4831470000000.0,6087160000000.0,11061600000000.0,48659.59888,80988.13762,110885.9914,105462.0126


In [123]:
# Check the balance of our target values: count how many rows fall into each
# mortality_state class
y['mortality_state'].value_counts()

2    58
1    42
3    39
Name: mortality_state, dtype: int64

In [124]:
# Hold out a test set; the seed is fixed so the split is reproducible.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,  # scikit-learn's default when neither size is given, spelled out
    random_state=1,
)

In [125]:
# Shape of the training features as (rows, columns)
print(X_train.shape)

(104, 25)


In [126]:
# Shape of the training target as (rows, columns)
print(y_train.shape)

(104, 1)


## Easy Ensemble Classifier

In [127]:
# Train the EasyEnsembleClassifier
from imblearn.ensemble import EasyEnsembleClassifier
easy_e_model = EasyEnsembleClassifier(n_estimators=100, random_state=1)
# y_train is a one-column DataFrame; pass the column itself so the estimator
# receives a 1-D target rather than a column vector it has to flatten.
easy_e_model.fit(X_train, y_train['mortality_state'])

EasyEnsembleClassifier(n_estimators=100, random_state=1)

In [128]:
# Calculate the balanced accuracy score of the Easy Ensemble model on the
# held-out test set
y_pred = easy_e_model.predict(X_test)
balanced_accuracy_score(y_true=y_test, y_pred=y_pred)

0.6833333333333332

In [129]:
# Display the confusion matrix for the Easy Ensemble model.
# Predictions come straight from easy_e_model instead of the shared y_pred
# variable: the Balanced Random Forest section reassigns y_pred, so running
# cells out of order would otherwise show the wrong model's results here.
confusion_matrix(y_test, easy_e_model.predict(X_test))

array([[ 8,  0,  0],
       [ 2, 12,  1],
       [ 0,  9,  3]], dtype=int64)

In [130]:
# Print the imbalanced classification report for the Easy Ensemble model.
# Predictions come straight from easy_e_model instead of the shared y_pred
# variable, which the Balanced Random Forest section reassigns.
print(classification_report_imbalanced(y_test, easy_e_model.predict(X_test)))

                   pre       rec       spe        f1       geo       iba       sup

          1       0.80      1.00      0.93      0.89      0.96      0.93         8
          2       0.57      0.80      0.55      0.67      0.66      0.45        15
          3       0.75      0.25      0.96      0.38      0.49      0.22        12

avg / total       0.68      0.66      0.78      0.62      0.67      0.48        35



## Balanced Random Forest Classifier

In [131]:
# Train a BalancedRandomForestClassifier on the training data
from imblearn.ensemble import BalancedRandomForestClassifier

# Create a balanced random forest classifier; the seed is fixed for
# reproducibility.
brf_model = BalancedRandomForestClassifier(n_estimators = 100, random_state=1)

# Fit the model. y_train is a one-column DataFrame; pass the column itself so
# the estimator receives a 1-D target rather than a column vector.
brf_model = brf_model.fit(X_train, y_train['mortality_state'])
brf_model

BalancedRandomForestClassifier(random_state=1)

In [132]:
# Calculate the balanced accuracy score of the Balanced Random Forest model on
# the held-out test set. balanced_accuracy_score is already imported in the
# notebook's import cells, so it is not imported again here.
y_pred = brf_model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.7472222222222222

In [133]:
# Display the confusion matrix for the Balanced Random Forest model.
# confusion_matrix is already imported in the notebook's import cells.
# Predictions come straight from brf_model instead of the shared y_pred
# variable, which the Easy Ensemble section also assigns; this keeps the cell
# correct even when cells are run out of order.
confusion_matrix(y_test, brf_model.predict(X_test))

array([[ 7,  1,  0],
       [ 1, 13,  1],
       [ 1,  5,  6]], dtype=int64)

In [134]:
# Print the imbalanced classification report for the Balanced Random Forest
# model. classification_report_imbalanced is already imported in the notebook's
# import cells. Predictions come straight from brf_model instead of the shared
# y_pred variable, which the Easy Ensemble section also assigns.
print(classification_report_imbalanced(y_test, brf_model.predict(X_test)))

                   pre       rec       spe        f1       geo       iba       sup

          1       0.78      0.88      0.93      0.82      0.90      0.81         8
          2       0.68      0.87      0.70      0.76      0.78      0.62        15
          3       0.86      0.50      0.96      0.63      0.69      0.46        12

avg / total       0.76      0.74      0.84      0.73      0.78      0.61        35

