In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import seaborn as sns

from sklearn.model_selection import train_test_split

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

from sklearn.preprocessing import StandardScaler, LabelEncoder

import sys
import six
sys.modules['sklearn.externals.six'] = six
from id3 import Id3Estimator, export_graphviz

import warnings
warnings.filterwarnings("ignore");

#### Process Data

In [2]:
df = pd.read_csv("archive/art_coverage_by_country_clean.csv")
df = df.dropna()

# df.head()
# df = df.drop(['Estimated number of people living with HIV_min',
#        'Estimated number of people living with HIV_median',
#        'Estimated number of people living with HIV_max', "Reported number of people receiving ART", "Estimated number of people living with HIV"], axis = 1);

df = df.drop(["Reported number of people receiving ART"], axis = 1);
df = df.reset_index();
df = df.drop(["index"], axis = 1);

s = "Estimated ART coverage among people living with HIV (%)";
for i in range(len(df[s])): 
       pos = df[s][i].find("[");
       df[s][i] = df[s][i][:pos];

s2: str = "Estimated number of people living with HIV"
for i in range(len(df[s2])): 
       pos = df[s2][i].find('[');
       df[s2][i] = df[s][i][:pos];

df.head()

Unnamed: 0,Country,Estimated number of people living with HIV,Estimated ART coverage among people living with HIV (%),Estimated number of people living with HIV_median,Estimated number of people living with HIV_min,Estimated number of people living with HIV_max,Estimated ART coverage among people living with HIV (%)_median,Estimated ART coverage among people living with HIV (%)_min,Estimated ART coverage among people living with HIV (%)_max,WHO Region
0,Afghanistan,13,13,7200.0,4100.0,11000.0,13.0,7.0,20.0,Eastern Mediterranean
1,Algeria,81,81,16000.0,15000.0,17000.0,81.0,75.0,86.0,Africa
2,Angola,27,27,330000.0,290000.0,390000.0,27.0,23.0,31.0,Africa
3,Argentina,61,61,140000.0,130000.0,150000.0,61.0,55.0,67.0,Americas
4,Armenia,53,53,3500.0,3000.0,4400.0,53.0,44.0,65.0,Europe


In [3]:
for i in range(len(df["WHO Region"])): 
    if df["WHO Region"][i] != "Africa" and df["WHO Region"][i] != "Americas":
        df["WHO Region"][i] = "Other"

df.head()

Unnamed: 0,Country,Estimated number of people living with HIV,Estimated ART coverage among people living with HIV (%),Estimated number of people living with HIV_median,Estimated number of people living with HIV_min,Estimated number of people living with HIV_max,Estimated ART coverage among people living with HIV (%)_median,Estimated ART coverage among people living with HIV (%)_min,Estimated ART coverage among people living with HIV (%)_max,WHO Region
0,Afghanistan,13,13,7200.0,4100.0,11000.0,13.0,7.0,20.0,Other
1,Algeria,81,81,16000.0,15000.0,17000.0,81.0,75.0,86.0,Africa
2,Angola,27,27,330000.0,290000.0,390000.0,27.0,23.0,31.0,Africa
3,Argentina,61,61,140000.0,130000.0,150000.0,61.0,55.0,67.0,Americas
4,Armenia,53,53,3500.0,3000.0,4400.0,53.0,44.0,65.0,Other


In [4]:
# label: list[str] = ['Eastern Mediterranean', 'Africa', 'Americas', 'Europe', 'Western Pacific', 'South-East Asia'];
# label: list[str] = ['Africa', 'Europe', 'Other'];
# cnt: list[int] = [i for i in range(len(label))];

# mp = {};
# for i in range(len(label)):
#     mp[label[i]] = cnt[i];

# df['WHO Region'] = df['WHO Region'].map(mp);

le = LabelEncoder()
df['WHO Region'] = le.fit_transform(df['WHO Region'])

df.head()

Unnamed: 0,Country,Estimated number of people living with HIV,Estimated ART coverage among people living with HIV (%),Estimated number of people living with HIV_median,Estimated number of people living with HIV_min,Estimated number of people living with HIV_max,Estimated ART coverage among people living with HIV (%)_median,Estimated ART coverage among people living with HIV (%)_min,Estimated ART coverage among people living with HIV (%)_max,WHO Region
0,Afghanistan,13,13,7200.0,4100.0,11000.0,13.0,7.0,20.0,2
1,Algeria,81,81,16000.0,15000.0,17000.0,81.0,75.0,86.0,0
2,Angola,27,27,330000.0,290000.0,390000.0,27.0,23.0,31.0,0
3,Argentina,61,61,140000.0,130000.0,150000.0,61.0,55.0,67.0,1
4,Armenia,53,53,3500.0,3000.0,4400.0,53.0,44.0,65.0,2


In [5]:
mp_country = {};
for i in df["Country"]: 
    if (i not in mp_country): mp_country[i] = 1;
    else: mp_country[i] += 1;


tmp = {};
for a, b in mp_country.items():
    tmp[a] = 1;

for i in range(len(df["Country"])):
    s: str = df["Country"][i];
    df['Country'][i] = float(df["WHO Region"][i] + tmp[s] * 0.01);
    tmp[s] += 1;

In [6]:
df = df.drop(["Country"], axis = 1);
df.head()

Unnamed: 0,Estimated number of people living with HIV,Estimated ART coverage among people living with HIV (%),Estimated number of people living with HIV_median,Estimated number of people living with HIV_min,Estimated number of people living with HIV_max,Estimated ART coverage among people living with HIV (%)_median,Estimated ART coverage among people living with HIV (%)_min,Estimated ART coverage among people living with HIV (%)_max,WHO Region
0,13,13,7200.0,4100.0,11000.0,13.0,7.0,20.0,2
1,81,81,16000.0,15000.0,17000.0,81.0,75.0,86.0,0
2,27,27,330000.0,290000.0,390000.0,27.0,23.0,31.0,0
3,61,61,140000.0,130000.0,150000.0,61.0,55.0,67.0,1
4,53,53,3500.0,3000.0,4400.0,53.0,44.0,65.0,2


#### Assign Data and target

In [7]:
target: list[int] = df['WHO Region'];
data = df.drop(['WHO Region'], axis = 1);

sc = StandardScaler();
data = sc.fit_transform(data);
# data = pp.scale(data);

In [19]:
x_train, x_test, y_train, y_test = train_test_split(data, target, test_size = 0.25, random_state=42);

#### Model

##### Decision Tree

In [8]:
Tree = DecisionTreeClassifier(random_state = 42, max_depth = 5, min_samples_split = 3);
Tree.fit(x_train, y_train);

In [9]:
pred = Tree.predict(x_test);

In [10]:
print(classification_report(y_test, pred));

              precision    recall  f1-score   support

           0       0.43      0.53      0.48        19
           1       0.40      0.43      0.41        14
           2       0.41      0.32      0.36        22

    accuracy                           0.42        55
   macro avg       0.42      0.42      0.42        55
weighted avg       0.42      0.42      0.41        55



##### Random Forest 0.2: (65, 2) -> 60%, 0.4: 57%, 0.3: (19, 8) -> 58%, 0.35: (26, 4) -> 62%

In [20]:
rng: range = range(1, 100);
dep: range = range(1, 10);

score: list[float] = [];

mx: float = 0;

for i in rng:
    for j in dep: 
        rf = RandomForestClassifier(n_estimators = i, max_depth = j, random_state = 42);
        rf.fit(x_train, y_train);
        pred = rf.predict(x_test);
        acc = rf.score(x_test, y_test);
        if (acc >= mx): 
            mx = acc;
            print(f"i: {i}, j: {j}, acc: {acc}");

i: 1, j: 1, acc: 0.5
i: 1, j: 2, acc: 0.5294117647058824
i: 1, j: 6, acc: 0.5294117647058824
i: 2, j: 1, acc: 0.5588235294117647
i: 2, j: 4, acc: 0.5588235294117647
i: 2, j: 5, acc: 0.5588235294117647
i: 3, j: 1, acc: 0.5588235294117647
i: 3, j: 2, acc: 0.5588235294117647
i: 3, j: 4, acc: 0.5588235294117647
i: 3, j: 5, acc: 0.5882352941176471
i: 4, j: 5, acc: 0.5882352941176471
i: 5, j: 4, acc: 0.5882352941176471
i: 5, j: 5, acc: 0.6176470588235294
i: 6, j: 5, acc: 0.6470588235294118


In [21]:
rf = RandomForestClassifier(n_estimators = 6, max_depth=5, random_state = 42);
rf.fit(x_train, y_train);

pred = rf.predict(x_test);

print(classification_report(y_test, pred));

              precision    recall  f1-score   support

           0       0.57      0.67      0.62        12
           1       0.50      0.50      0.50         4
           2       0.75      0.67      0.71        18

    accuracy                           0.65        34
   macro avg       0.61      0.61      0.61        34
weighted avg       0.66      0.65      0.65        34



In [14]:
# np.float = float

# estimator = Id3Estimator();
# estimator.fit(data, target);

# export_graphviz(estimator.tree_, 'tree.dot', data);

In [15]:
# !dot -Tpdf tree.dot -o tree.pdf

In [37]:
df1 = pd.read_csv("archive/art_coverage_by_country_clean.csv")
df2 = pd.read_csv("archive/art_pediatric_coverage_by_country_clean.csv")

df2 = df2.drop(["WHO Region"], axis = 1);

df_all = pd.merge(df1, df2, on = "Country");

df_all = df_all.dropna();

df_all = df_all.reset_index();
df_all = df_all.drop(["index", "Country"], axis = 1);


s = "Reported number of children receiving ART"
for i in range(len(df_all[s])): 
    Str = list(df_all[s][i].split())
    df_all[s][i] = float(Str[0]);

for i in range(len(df_all["WHO Region"])): 
    if df_all["WHO Region"][i] != "Africa" and df_all["WHO Region"][i] != "Europe":
        df_all["WHO Region"][i] = "Other"

# df.head()
df_all.head()

Unnamed: 0,Reported number of people receiving ART,Estimated number of people living with HIV,Estimated ART coverage among people living with HIV (%),Estimated number of people living with HIV_median,Estimated number of people living with HIV_min,Estimated number of people living with HIV_max,Estimated ART coverage among people living with HIV (%)_median,Estimated ART coverage among people living with HIV (%)_min,Estimated ART coverage among people living with HIV (%)_max,WHO Region,Reported number of children receiving ART,Estimated number of children needing ART based on WHO methods,Estimated ART coverage among children (%),Estimated number of children needing ART based on WHO methods_median,Estimated number of children needing ART based on WHO methods_min,Estimated number of children needing ART based on WHO methods_max,Estimated ART coverage among children (%)_median,Estimated ART coverage among children (%)_min,Estimated ART coverage among children (%)_max
0,920.0,7200[4100–11000],13[7–20],7200.0,4100.0,11000.0,13.0,7.0,20.0,Other,60.0,500[500-530],17[10-26],500.0,500.0,530.0,17.0,10.0,26.0
1,12800.0,16000[15000–17000],81[75–86],16000.0,15000.0,17000.0,81.0,75.0,86.0,Africa,770.0,500[500-520],95[95-95],500.0,500.0,520.0,95.0,95.0,95.0
2,88700.0,330000[290000–390000],27[23–31],330000.0,290000.0,390000.0,27.0,23.0,31.0,Africa,4800.0,38000[30000-47000],13[10-16],38000.0,30000.0,47000.0,13.0,10.0,16.0
3,85500.0,140000[130000–150000],61[55–67],140000.0,130000.0,150000.0,61.0,55.0,67.0,Other,1700.0,1800[1600-2100],92[84-95],1800.0,1600.0,2100.0,92.0,84.0,95.0
4,3100.0,6000[5300–6700],52[45–58],6000.0,5300.0,6700.0,52.0,45.0,58.0,Other,40.0,200[200-500],21[18-26],200.0,200.0,500.0,21.0,18.0,26.0


In [38]:
s2: str = "Estimated number of people living with HIV"
for i in range(len(df_all[s2])): 
       pos = df_all[s2][i].find('[');
       df_all[s2][i] = df_all[s2][i][:pos];

s2: str = "Estimated ART coverage among people living with HIV (%)";
for i in range(len(df_all[s2])): 
       pos = df_all[s2][i].find('[');
       df_all[s2][i] = df_all[s2][i][:pos];

s2: str = "Estimated number of children needing ART based on WHO methods"
for i in range(len(df_all[s2])): 
       pos = df_all[s2][i].find('[');
       df_all[s2][i] = df_all[s2][i][:pos];

s2: str = "Estimated ART coverage among children (%)"
for i in range(len(df_all[s2])): 
       pos = df_all[s2][i].find('[');
       df_all[s2][i] = df_all[s2][i][:pos];

for i in range(len(df_all["WHO Region"])): 
    if df_all["WHO Region"][i] != "Africa" and df_all["WHO Region"][i] != "Europe":
        df_all["WHO Region"][i] = "Other"
        
df_all.head()

Unnamed: 0,Reported number of people receiving ART,Estimated number of people living with HIV,Estimated ART coverage among people living with HIV (%),Estimated number of people living with HIV_median,Estimated number of people living with HIV_min,Estimated number of people living with HIV_max,Estimated ART coverage among people living with HIV (%)_median,Estimated ART coverage among people living with HIV (%)_min,Estimated ART coverage among people living with HIV (%)_max,WHO Region,Reported number of children receiving ART,Estimated number of children needing ART based on WHO methods,Estimated ART coverage among children (%),Estimated number of children needing ART based on WHO methods_median,Estimated number of children needing ART based on WHO methods_min,Estimated number of children needing ART based on WHO methods_max,Estimated ART coverage among children (%)_median,Estimated ART coverage among children (%)_min,Estimated ART coverage among children (%)_max
0,920.0,7200,13,7200.0,4100.0,11000.0,13.0,7.0,20.0,Other,60.0,500,17,500.0,500.0,530.0,17.0,10.0,26.0
1,12800.0,16000,81,16000.0,15000.0,17000.0,81.0,75.0,86.0,Africa,770.0,500,95,500.0,500.0,520.0,95.0,95.0,95.0
2,88700.0,330000,27,330000.0,290000.0,390000.0,27.0,23.0,31.0,Africa,4800.0,38000,13,38000.0,30000.0,47000.0,13.0,10.0,16.0
3,85500.0,140000,61,140000.0,130000.0,150000.0,61.0,55.0,67.0,Other,1700.0,1800,92,1800.0,1600.0,2100.0,92.0,84.0,95.0
4,3100.0,6000,52,6000.0,5300.0,6700.0,52.0,45.0,58.0,Other,40.0,200,21,200.0,200.0,500.0,21.0,18.0,26.0


In [39]:
# label = ['Eastern Mediterranean', 'Africa', 'Americas', 'Europe', 'Western Pacific', 'South-East Asia'];
# label: list[str] = ['Africa', 'Europe', 'Other'];
# cnt = [i for i in range(len(label))];

# mp = {};
# for i in range(len(label)):
#     mp[label[i]] = cnt[i];

# df_all['WHO Region'] = df_all['WHO Region'].map(mp);
s = "WHO Region";
le = LabelEncoder();
df_all[s] = le.fit_transform(df_all[s]);

In [40]:
df_all.head()

Unnamed: 0,Reported number of people receiving ART,Estimated number of people living with HIV,Estimated ART coverage among people living with HIV (%),Estimated number of people living with HIV_median,Estimated number of people living with HIV_min,Estimated number of people living with HIV_max,Estimated ART coverage among people living with HIV (%)_median,Estimated ART coverage among people living with HIV (%)_min,Estimated ART coverage among people living with HIV (%)_max,WHO Region,Reported number of children receiving ART,Estimated number of children needing ART based on WHO methods,Estimated ART coverage among children (%),Estimated number of children needing ART based on WHO methods_median,Estimated number of children needing ART based on WHO methods_min,Estimated number of children needing ART based on WHO methods_max,Estimated ART coverage among children (%)_median,Estimated ART coverage among children (%)_min,Estimated ART coverage among children (%)_max
0,920.0,7200,13,7200.0,4100.0,11000.0,13.0,7.0,20.0,2,60.0,500,17,500.0,500.0,530.0,17.0,10.0,26.0
1,12800.0,16000,81,16000.0,15000.0,17000.0,81.0,75.0,86.0,0,770.0,500,95,500.0,500.0,520.0,95.0,95.0,95.0
2,88700.0,330000,27,330000.0,290000.0,390000.0,27.0,23.0,31.0,0,4800.0,38000,13,38000.0,30000.0,47000.0,13.0,10.0,16.0
3,85500.0,140000,61,140000.0,130000.0,150000.0,61.0,55.0,67.0,2,1700.0,1800,92,1800.0,1600.0,2100.0,92.0,84.0,95.0
4,3100.0,6000,52,6000.0,5300.0,6700.0,52.0,45.0,58.0,2,40.0,200,21,200.0,200.0,500.0,21.0,18.0,26.0


In [41]:
target = df_all['WHO Region'];
data = df_all.drop(['WHO Region'], axis = 1);

sc = StandardScaler();
data = sc.fit_transform(data);

In [45]:
x_train, x_test, y_train, y_test = train_test_split(data, target, test_size = 0.4, random_state = 42);

In [46]:
tree = DecisionTreeClassifier(random_state = 42, max_depth = 5, min_samples_split = 3);
tree.fit(x_train, y_train);

pred = tree.predict(x_test);

print(classification_report(y_test, pred));

              precision    recall  f1-score   support

           0       0.64      0.82      0.72        17
           1       1.00      0.50      0.67         2
           2       0.80      0.63      0.71        19

    accuracy                           0.71        38
   macro avg       0.81      0.65      0.70        38
weighted avg       0.74      0.71      0.71        38



In [47]:
rng = range(1, 100);
dep = range(1, 10);

scores = []

mx: float = 0

for i in rng: 
    for k in dep: 
        rf = RandomForestClassifier(n_estimators = i, random_state = 42, max_depth = k);
        rf.fit(x_train, y_train)
        if (rf.score(x_test, y_test) >= mx): 
            mx = rf.score(x_test, y_test)
            print(f"i: {i}, k: {k}, score: {rf.score(x_test, y_test)}")
        scores.append(rf.score(x_test, y_test))

i: 1, k: 1, score: 0.8157894736842105
i: 1, k: 2, score: 0.8157894736842105
i: 1, k: 3, score: 0.8421052631578947
i: 8, k: 2, score: 0.8421052631578947
i: 9, k: 2, score: 0.8421052631578947
i: 10, k: 5, score: 0.8421052631578947
i: 11, k: 2, score: 0.868421052631579
i: 34, k: 4, score: 0.868421052631579
i: 35, k: 4, score: 0.868421052631579
i: 58, k: 7, score: 0.868421052631579


In [48]:
rf = RandomForestClassifier(n_estimators=58, max_depth=7, random_state=42)
rf.fit(x_train, y_train)

y_pred = rf.predict(x_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.80      0.94      0.86        17
           1       1.00      1.00      1.00         2
           2       0.94      0.79      0.86        19

    accuracy                           0.87        38
   macro avg       0.91      0.91      0.91        38
weighted avg       0.88      0.87      0.87        38



##### Africa & Americas

In [35]:
x_train, x_test, y_train, y_test = train_test_split(data, target, test_size = 0.35, random_state = 42);

In [36]:
rng = range(1, 100);
dep = range(1, 10);

scores = []

mx: float = 0

for i in rng: 
    for k in dep: 
        rf = RandomForestClassifier(n_estimators = i, random_state = 42, max_depth = k);
        rf.fit(x_train, y_train)
        if (rf.score(x_test, y_test) >= mx): 
            mx = rf.score(x_test, y_test)
            print(f"i: {i}, k: {k}, score: {rf.score(x_test, y_test)}")
        scores.append(rf.score(x_test, y_test))

i: 1, k: 1, score: 0.5757575757575758
i: 1, k: 2, score: 0.6666666666666666
i: 1, k: 3, score: 0.7272727272727273
i: 2, k: 2, score: 0.7272727272727273
i: 2, k: 3, score: 0.7575757575757576
i: 3, k: 3, score: 0.7575757575757576
i: 4, k: 3, score: 0.7878787878787878
i: 65, k: 3, score: 0.7878787878787878
i: 66, k: 3, score: 0.7878787878787878
i: 67, k: 3, score: 0.7878787878787878
