In [1]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.neural_network import MLPRegressor

In [2]:
from google.colab import drive
drive.mount('/content/MyDrive')

Drive already mounted at /content/MyDrive; to attempt to forcibly remount, call drive.mount("/content/MyDrive", force_remount=True).


In [3]:
!ls ./MyDrive/MyDrive/omneda_sudan/

dtm_sdn_smu-bi-weekly-13-_-17122024_v02_public_hdx.xlsx
education_sdn.csv
final_socioeco_demographics.csv
gender_sdn.csv
hdro_indicators_aggregates_sdn.csv
hdro_indicators_sdn.csv
indicators_sdn.csv
primary-secondary-enrollment-completion-rates.csv
sdg_data_sdn.csv
sdg_indicatorlist_sdn.csv
WPP2024_GEN_F01_DEMOGRAPHIC_INDICATORS_COMPACT.xlsx
WPP2024_GEN_F01_DEMOGRAPHIC_INDICATORS.xlsx


#### Import the data

In [4]:
df = pd.read_csv('./MyDrive/MyDrive/omneda_sudan/final_socioeco_demographics.csv')

In [5]:
df.head()

Unnamed: 0.1,Unnamed: 0,"Access to anti-retroviral drugs, female (%)","Access to anti-retroviral drugs, male (%)","Agriculture, forestry, and fishing, value added (% of GDP)",Current health expenditure (% of GDP),Domestic general government health expenditure (% of GDP),Domestic general government health expenditure (% of current health expenditure),Domestic private health expenditure (% of current health expenditure),"Immunization, DPT (% of children ages 12-23 months)","Immunization, HepB3 (% of one-year-old children)",...,People with basic handwashing facilities including soap and water (% of population),Political Stability and Absence of Violence/Terrorism: Estimate,Political Stability and Absence of Violence/Terrorism: Percentile Rank,Rural population,Rural population (% of total population),Tuberculosis treatment success rate (% of new cases),Urban population,Urban population (% of total population),Year,total_deaths_from_violence
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1950.0,
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1951.0,
2,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1952.0,
3,3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1953.0,
4,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1954.0,


In [6]:
df.columns

Index(['Unnamed: 0', 'Access to anti-retroviral drugs, female (%)',
       'Access to anti-retroviral drugs, male (%)',
       'Agriculture, forestry, and fishing, value added (% of GDP)',
       'Current health expenditure (% of GDP)',
       'Domestic general government health expenditure (% of GDP)',
       'Domestic general government health expenditure (% of current health expenditure)',
       'Domestic private health expenditure (% of current health expenditure)',
       'Immunization, DPT (% of children ages 12-23 months)',
       'Immunization, HepB3 (% of one-year-old children)',
       'Immunization, measles (% of children ages 12-23 months)',
       'Incidence of malaria (per 1,000 population at risk)',
       'Incidence of tuberculosis (per 100,000 people)',
       'Internally displaced persons, new displacement associated with conflict and violence (number of cases)',
       'Internally displaced persons, new displacement associated with disasters (number of cases)',
    

In [7]:
# df2=df[df['Tuberculosis treatment success rate (% of new cases)'] != 0]
df2=df.copy()
df2.shape

(74, 34)

#### Filter out the columns that are related to the Tuberclosis treatment success rate.
This is an assumption made that the other selected columns contribute to the success in treating tuberclosis

In [8]:
cols_X = ['Access to anti-retroviral drugs, female (%)',
        'Access to anti-retroviral drugs, male (%)',
        'Current health expenditure (% of GDP)',
        'Domestic general government health expenditure (% of current health expenditure)',
        'Domestic private health expenditure (% of current health expenditure)',
        'Immunization, DPT (% of children ages 12-23 months)',
        'Immunization, HepB3 (% of one-year-old children)',
        'Immunization, measles (% of children ages 12-23 months)',
        'Incidence of malaria (per 1,000 population at risk)',
        'Incidence of tuberculosis (per 100,000 people)',
        'People using at least basic drinking water services (% of population)',
        'People using at least basic sanitation services (% of population)',
        'People with basic handwashing facilities including soap and water (% of population)',
        'Rural population (% of total population)',
        'Urban population (% of total population)',
        'Year'
        ]
cols_Y = 'Tuberculosis treatment success rate (% of new cases)'

In [9]:
df[cols_X]

Unnamed: 0,"Access to anti-retroviral drugs, female (%)","Access to anti-retroviral drugs, male (%)",Current health expenditure (% of GDP),Domestic general government health expenditure (% of current health expenditure),Domestic private health expenditure (% of current health expenditure),"Immunization, DPT (% of children ages 12-23 months)","Immunization, HepB3 (% of one-year-old children)","Immunization, measles (% of children ages 12-23 months)","Incidence of malaria (per 1,000 population at risk)","Incidence of tuberculosis (per 100,000 people)",People using at least basic drinking water services (% of population),People using at least basic sanitation services (% of population),People with basic handwashing facilities including soap and water (% of population),Rural population (% of total population),Urban population (% of total population),Year
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1950.0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1951.0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1952.0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1953.0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1954.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
69,28,34,4,22,70,93,93,90,65,67,61,36,14,65,34,2019.0
70,31,35,3,34,58,90,90,86,73,62,62,36,12,64,35,2020.0
71,29,30,2,27,62,84,84,81,72,58,63,0,10,64,35,2021.0
72,28,30,0,0,0,68,68,66,71,54,64,0,10,64,35,2022.0


In [10]:
## one-hot encoding of the year column
## Split the data in to X (features) and y (target)

# df2.drop(columns='Year', axis=1)
X = pd.get_dummies(df2[cols_X], columns=['Year'])
# X = df2[cols_X]
y = df2[cols_Y]

In [11]:
# X_Train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,)
# there isn't much data that is available and since its also by year.
# instead of splitting randomingly, take 80 percent for training and rest for testing
# the split is manual

split = int(df2.shape[0] *.8)

X_train = X[:split]
X_test = X[split:]
y_train = y[:split]
y_test = y[split:]

#### Try Linear Regression

In [12]:
reg = LinearRegression().fit(X_train, y_train)
print(f"score : {reg.score(X_train, y_train)} \n")

for i, j in zip(y_test, reg.predict(X_test)):
  print("target vs prediction : {:2}, vs {:2.0f}, difference: {:6.2f}".format(i, j, i-j))

score : 1.0 

target vs prediction : 76, vs 77, difference:  -1.22
target vs prediction : 75, vs 84, difference:  -9.31
target vs prediction : 68, vs 82, difference: -13.55
target vs prediction : 75, vs 84, difference:  -9.05
target vs prediction : 82, vs 85, difference:  -2.81
target vs prediction : 82, vs 88, difference:  -5.63
target vs prediction : 79, vs 91, difference: -11.95
target vs prediction : 78, vs 90, difference: -11.59
target vs prediction : 80, vs 88, difference:  -7.96
target vs prediction : 84, vs 88, difference:  -4.49
target vs prediction : 83, vs 87, difference:  -3.62
target vs prediction : 86, vs 87, difference:  -0.89
target vs prediction : 88, vs 23, difference:  65.07
target vs prediction : 66, vs  8, difference:  57.58
target vs prediction :  0, vs  4, difference:  -3.98


#### Try Ridge regression

In [13]:
reg = Ridge(solver='svd').fit(X_train, y_train)
print(f"score on training set: {reg.score(X_train, y_train)}\n")

for i, j in zip(y_test, reg.predict(X_test)):
  print("target vs prediction : {:2}, vs {:2.0f}, difference: {:6.2f}".format(i, j, i-j))

score on training set: 0.9996718590603032

target vs prediction : 76, vs 76, difference:  -0.21
target vs prediction : 75, vs 82, difference:  -6.78
target vs prediction : 68, vs 79, difference: -10.77
target vs prediction : 75, vs 80, difference:  -5.29
target vs prediction : 82, vs 82, difference:   0.32
target vs prediction : 82, vs 81, difference:   0.77
target vs prediction : 79, vs 84, difference:  -5.19
target vs prediction : 78, vs 81, difference:  -2.52
target vs prediction : 80, vs 79, difference:   1.18
target vs prediction : 84, vs 77, difference:   7.03
target vs prediction : 83, vs 75, difference:   8.17
target vs prediction : 86, vs 72, difference:  14.17
target vs prediction : 88, vs 36, difference:  52.45
target vs prediction : 66, vs 10, difference:  56.44
target vs prediction :  0, vs  8, difference:  -7.53


## Trying out the MLP Regressor

In [14]:
reg= MLPRegressor(random_state=1, max_iter=200).fit(X_train, y_train)
print(f"score on training set: {reg.score(X_train, y_train)}\n")


for i, j in zip(y_test, reg.predict(X_test)):
  print("target vs prediction : {:2}, vs {:2.0f}, difference: {:6.2f}".format(i, j, i-j))

score on training set: 0.9985820243665755

target vs prediction : 76, vs 77, difference:  -0.92
target vs prediction : 75, vs 79, difference:  -4.45
target vs prediction : 68, vs 77, difference:  -9.30
target vs prediction : 75, vs 78, difference:  -2.95
target vs prediction : 82, vs 80, difference:   1.80
target vs prediction : 82, vs 80, difference:   2.14
target vs prediction : 79, vs 83, difference:  -3.61
target vs prediction : 78, vs 84, difference:  -5.80
target vs prediction : 80, vs 85, difference:  -5.02
target vs prediction : 84, vs 85, difference:  -1.37
target vs prediction : 83, vs 87, difference:  -3.93
target vs prediction : 86, vs 86, difference:   0.06
target vs prediction : 88, vs 78, difference:   9.89
target vs prediction : 66, vs 53, difference:  12.66
target vs prediction :  0, vs 12, difference: -11.59




In [15]:
reg= MLPRegressor(random_state=1, max_iter=500).fit(X_train, y_train)
print(f"score on training set: {reg.score(X_train, y_train)}\n")


for i, j in zip(y_test, reg.predict(X_test)):
  print("target vs prediction : {:2}, vs {:2.0f}, difference: {:6.2f}".format(i, j, i-j))

score on training set: 0.9995875644729398

target vs prediction : 76, vs 79, difference:  -3.04
target vs prediction : 75, vs 80, difference:  -5.00
target vs prediction : 68, vs 74, difference:  -6.07
target vs prediction : 75, vs 75, difference:  -0.40
target vs prediction : 82, vs 77, difference:   4.66
target vs prediction : 82, vs 77, difference:   4.76
target vs prediction : 79, vs 80, difference:  -1.21
target vs prediction : 78, vs 82, difference:  -4.42
target vs prediction : 80, vs 84, difference:  -3.68
target vs prediction : 84, vs 85, difference:  -1.36
target vs prediction : 83, vs 88, difference:  -4.55
target vs prediction : 86, vs 88, difference:  -1.91
target vs prediction : 88, vs 79, difference:   8.96
target vs prediction : 66, vs 53, difference:  13.09
target vs prediction :  0, vs  5, difference:  -5.06


