In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
import matplotlib.pyplot as plt

In [2]:
columns_to_drop = ['country', 'region', 'hle', 'year', 'cat_country', 'rounded_score']
full_dataset = pd.read_csv('../data/processed/processed_dataset.csv')
region_names = full_dataset.groupby('cat_region')['region'].first().to_dict()
dataset = full_dataset.drop(columns=columns_to_drop)
dataset.head()

Unnamed: 0,score,gdp,social_support,freedom,generosity,corruption,positive_affect,negative_affect,cat_region,scaled_hle
0,3.72359,7.3701,0.450662,0.718114,0.16764,0.881686,0.517637,0.258195,0,0.44
1,4.401778,7.539972,0.552308,0.678896,0.190099,0.850035,0.583926,0.237092,0,0.445714
2,4.758381,7.646709,0.539075,0.600127,0.12059,0.706766,0.618265,0.275324,0,0.451429
3,3.831719,7.619532,0.521104,0.495901,0.162427,0.731109,0.611387,0.267175,0,0.456
4,3.782938,7.705479,0.520637,0.530935,0.236032,0.77562,0.710385,0.267919,0,0.460571


In [67]:
full_dataset['region'].value_counts()

sub-saharan africa       463
asia                     321
latin amer. and carib    309
western europe           292
eastern europe           196
c.w. of ind. states      182
near east                171
northern africa           56
baltics                   46
northern america          32
oceania                   30
Name: region, dtype: int64

In [37]:
X = dataset.drop(columns=['cat_region'])
y = dataset['cat_region']

In [38]:
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [28]:
DecisionTreeClassifier(random_state=42).fit(x_train, y_train).score(x_test, y_test)

0.8166666666666667

In [39]:
RandomForestClassifier(random_state=42).fit(x_train, y_train).score(x_test, y_test)

0.919047619047619

In [31]:
KNeighborsClassifier().fit(x_train, y_train).score(x_test, y_test)

0.8261904761904761

In [41]:
rf_model = RandomForestClassifier(random_state=42).fit(x_train, y_train)
y_pred = rf_model.predict(x_test)
pd.DataFrame(metrics.classification_report(y_test, y_pred, output_dict=True, target_names=list(region_names.values())))

Unnamed: 0,asia,baltics,c.w. of ind. states,eastern europe,latin amer. and carib,near east,northern africa,northern america,oceania,sub-saharan africa,western europe,accuracy,macro avg,weighted avg
precision,0.907692,0.75,0.921053,0.818182,0.935484,1.0,1.0,1.0,1.0,0.9375,0.920635,0.919048,0.926413,0.922227
recall,0.921875,0.666667,0.945946,0.923077,0.935484,0.823529,0.636364,0.833333,0.666667,0.967742,0.983051,0.919048,0.845794,0.919048
f1-score,0.914729,0.705882,0.933333,0.86747,0.935484,0.903226,0.777778,0.909091,0.8,0.952381,0.95082,0.919048,0.87729,0.917738
support,64.0,9.0,37.0,39.0,62.0,34.0,11.0,6.0,6.0,93.0,59.0,0.919048,420.0,420.0


In [68]:
rf_model.feature_importances_

array([0.10401599, 0.17551988, 0.06953177, 0.05689076, 0.09031898,
       0.07648931, 0.10933178, 0.08078442, 0.2371171 ])

In [3]:
augmented_dataset = pd.read_csv('../data/processed/augmented_dataset.csv')
augmented_dataset['cat_region'].value_counts()

2     393
5     393
10    393
9     393
0     393
3     393
4     393
7     393
1     393
6     393
8     393
Name: cat_region, dtype: int64

In [71]:
X = augmented_dataset.drop(columns=['cat_region'])
y = augmented_dataset['cat_region']
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
rf_model = RandomForestClassifier(random_state=42).fit(x_train, y_train)
y_pred = rf_model.predict(x_test)
pd.DataFrame(metrics.classification_report(y_test, y_pred, output_dict=True, target_names=list(region_names.values())))

Unnamed: 0,asia,baltics,c.w. of ind. states,eastern europe,latin amer. and carib,near east,northern africa,northern america,oceania,sub-saharan africa,western europe,accuracy,macro avg,weighted avg
precision,0.938272,1.0,0.975309,0.940476,0.974359,1.0,0.9875,0.975309,1.0,0.986301,1.0,0.979191,0.979775,0.979769
recall,0.974359,1.0,1.0,1.0,0.962025,0.948718,1.0,1.0,1.0,0.923077,0.961538,0.979191,0.979065,0.979191
f1-score,0.955975,1.0,0.9875,0.969325,0.968153,0.973684,0.993711,0.9875,1.0,0.953642,0.980392,0.979191,0.97908,0.979141
support,78.0,79.0,79.0,79.0,79.0,78.0,79.0,79.0,79.0,78.0,78.0,0.979191,865.0,865.0


In [72]:
validation_dataset = pd.read_csv('../data/processed/validation_dataset.csv')

In [73]:
X_val = validation_dataset.drop(columns=['cat_region'])
y_val = validation_dataset['cat_region']

In [74]:
y_pred = rf_model.predict(X_val)
pd.DataFrame(metrics.classification_report(y_val, y_pred, output_dict=True, target_names=list(region_names.values())))

Unnamed: 0,asia,baltics,c.w. of ind. states,eastern europe,latin amer. and carib,near east,northern africa,northern america,oceania,sub-saharan africa,western europe,accuracy,macro avg,weighted avg
precision,0.957447,0.7,0.896552,0.866667,1.0,1.0,0.7,0.833333,1.0,0.971429,0.976744,0.939683,0.900197,0.945845
recall,0.9375,1.0,0.962963,0.896552,0.956522,0.807692,0.875,1.0,1.0,0.971429,0.954545,0.939683,0.942018,0.939683
f1-score,0.947368,0.823529,0.928571,0.881356,0.977778,0.893617,0.777778,0.909091,1.0,0.971429,0.965517,0.939683,0.916003,0.940735
support,48.0,7.0,27.0,29.0,46.0,26.0,8.0,5.0,5.0,70.0,44.0,0.939683,315.0,315.0


In [77]:
rf_model = RandomForestClassifier(random_state=42).fit(X, y)
rf_model.score(X_val, y_val)

0.9301587301587302

In [76]:
rf_model.score(x_test, y_test)

0.9791907514450867