<a href="https://colab.research.google.com/github/OmdenaAI/SudanChapter_AnalyzeHealthcareAccessibility/blob/main/04_Data_analysis/Healthcare_Systems/HC_Indicators.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Healthcare Data Analysis (Indicator dataset)

### Import packages and load file

In [1]:
# imports
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from collections import Counter
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [2]:
# load file

# NOTE: the first data row of this CSV holds hashtag-style tag strings
# (e.g. "#indicator+code", "#numeric") instead of data, so numeric-looking
# columns such as VALUE are read with object dtype.
df = pd.read_csv("HC_indicators.csv")
df.head()

Unnamed: 0,GHO (CODE),GHO (DISPLAY),YEAR (DISPLAY),COUNTRY (DISPLAY),DIMENSION (NAME),VALUE
0,#indicator+code,#indicator+name,#date+year,#country+name,#dimension+name,#numeric
1,PHE_HHAIR_POP_CATEGORY_FUELS,Population with primary reliance on fuels and ...,2024,Sudan,Urban,0.0543
2,NCD_PAA,Prevalence of insufficient physical activity a...,2024,Sudan,Female,26.649185
3,LIFE_0000000033,nLx - person-years lived between ages x and x+n,2023,Sudan,Both sexes,451194.0274
4,TB_c_newinc,Tuberculosis - new and relapse cases,2023,Sudan,,27562


In [3]:
# accessing the GHO (DISPLAY) column

df['GHO (DISPLAY)']

Unnamed: 0,GHO (DISPLAY)
0,#indicator+name
1,Population with primary reliance on fuels and ...
2,Prevalence of insufficient physical activity a...
3,nLx - person-years lived between ages x and x+n
4,Tuberculosis - new and relapse cases
...,...
18297,nqx - probability of dying between ages x and x+n
18298,Tx - person-years lived above age x
18299,Distribution of causes of death among children...
18300,Population pushed below the $2.15 a day povert...


In [4]:
# display full string content

pd.set_option('display.max_colwidth', None)
df['GHO (DISPLAY)']

Unnamed: 0,GHO (DISPLAY)
0,#indicator+name
1,"Population with primary reliance on fuels and technologies for cooking, by fuel type (in millions)"
2,Prevalence of insufficient physical activity among adults aged 18+ years (age-standardized estimate) (%)
3,nLx - person-years lived between ages x and x+n
4,Tuberculosis - new and relapse cases
...,...
18297,nqx - probability of dying between ages x and x+n
18298,Tx - person-years lived above age x
18299,Distribution of causes of death among children aged < 5 years (%)
18300,"Population pushed below the $2.15 a day poverty line by household health expenditures (%, national, rural, urban)"


### Begin GHO (Display) column preprocessing
Creating tokens:
- Remove punctuation
- Uniform case
- Tokenization implementation
- Stopword removal

In [5]:
# downloading 'punkt_tab'
# Recent NLTK releases make word_tokenize load the 'punkt_tab' resource, so it
# is fetched here before any tokenizing happens. NLTK itself is already
# imported (and used) by the first cell, so no pip install is needed here.
nltk.download('punkt_tab')



[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [6]:
# preprocess the "GHO (DISPLAY)" column into tokens

# Build the stopword set once. Calling stopwords.words('english') inside the
# per-token filter rebuilds the word list for every token, and a list lookup is
# a linear scan; a frozenset gives constant-time membership tests.
ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))


def preprocess_text(text):
    """Turn one indicator name into a list of cleaned tokens.

    Steps, in order: delete punctuation, lowercase, tokenize with NLTK's
    word_tokenize, then drop English stopwords.

    Parameters
    ----------
    text : str
        Raw indicator name. Any non-string value (e.g. a missing cell)
        yields an empty token list instead of raising.

    Returns
    -------
    list of str
        Lowercased tokens with stopwords removed.
    """
    if not isinstance(text, str):
        return []

    # Punctuation is deleted, not replaced by a space, so hyphenated terms are
    # merged: "age-standardized" becomes "agestandardized".
    text = re.sub(r'[^\w\s]', '', text)  # remove punctuation
    text = text.lower()  # convert to lowercase
    tokens = word_tokenize(text)  # tokenize
    return [word for word in tokens if word not in ENGLISH_STOPWORDS]  # remove stopwords

df['processed_text'] = df['GHO (DISPLAY)'].apply(preprocess_text) # tokenize each indicator name into a new 'processed_text' column

In [7]:
df['processed_text']

Unnamed: 0,processed_text
0,[indicatorname]
1,"[population, primary, reliance, fuels, technologies, cooking, fuel, type, millions]"
2,"[prevalence, insufficient, physical, activity, among, adults, aged, 18, years, agestandardized, estimate]"
3,"[nlx, personyears, lived, ages, x, xn]"
4,"[tuberculosis, new, relapse, cases]"
...,...
18297,"[nqx, probability, dying, ages, x, xn]"
18298,"[tx, personyears, lived, age, x]"
18299,"[distribution, causes, death, among, children, aged, 5, years]"
18300,"[population, pushed, 215, day, poverty, line, household, health, expenditures, national, rural, urban]"


In [8]:
# flattening the per-row token lists into one list
all_tokens = [word for tokens in df['processed_text'] for word in tokens]

print("Number of tokens:", len(all_tokens))
# show only a short preview: printing every token floods the notebook output
print(all_tokens[:20])

Number of tokens: 135703


In [9]:
# checking the list of most common keywords
# (token, count) pairs for the 50 most frequent tokens, most frequent first
keywords = Counter(all_tokens).most_common(n=50)
print("Number of common tokens:", len(keywords))
print("Most common keywords:", keywords)

Number of common tokens: 50
Most common keywords: [('number', 4004), ('among', 3914), ('children', 3119), ('deaths', 3117), ('per', 3078), ('age', 2976), ('years', 2745), ('prevalence', 2706), ('population', 2531), ('5', 2016), ('x', 1995), ('estimate', 1951), ('aged', 1910), ('crude', 1854), ('rate', 1603), ('primary', 1592), ('1', 1560), ('000', 1558), ('reliance', 1538), ('fuels', 1538), ('technologies', 1538), ('cooking', 1538), ('bmi', 1485), ('type', 1438), ('alcohol', 1425), ('agestandardized', 1370), ('ages', 1362), ('births', 1256), ('live', 1230), ('adolescents', 1227), ('dying', 1223), ('fuel', 1151), ('death', 1148), ('xn', 1140), ('health', 1121), ('mortality', 1105), ('2', 1059), ('standard', 916), ('probability', 915), ('median', 910), ('deviations', 891), ('1000', 886), ('mean', 852), ('proportion', 843), ('millions', 826), ('adults', 790), ('causes', 777), ('distribution', 755), ('cases', 741), ('people', 740)]


In [10]:
# defining simple keyword groups
# Keyword groups per category. Insertion order matters: categorize() returns
# the first category whose keywords match.
categories = dict(
    prevalence=['prevalence', 'rate'],
    deaths=['deaths', 'dying', 'mortality'],
    health=['health', 'therapy', 'treatment', 'glucose', 'expenditure'],
    nutrition=['anaemia', 'underweight', 'overweight', 'wasted'],
    activity=['activity', 'active'],
)


def categorize(row):
    """Return the first category with a keyword contained in ``row``.

    Categories are tried in the order they appear in ``categories``, so a row
    matching several groups gets the earliest one (e.g. a row with both
    'prevalence' and 'activity' is labelled 'prevalence'). Rows that match no
    keyword are labelled 'other'.
    """
    hits = (
        label
        for label, keywords in categories.items()
        if any(keyword in row for keyword in keywords)
    )
    return next(hits, 'other')

# label every row from its token list
df['category'] = df['processed_text'].apply(categorize)

print(categories.items())

dict_items([('prevalence', ['prevalence', 'rate']), ('deaths', ['deaths', 'dying', 'mortality']), ('health', ['health', 'therapy', 'treatment', 'glucose', 'expenditure']), ('nutrition', ['anaemia', 'underweight', 'overweight', 'wasted']), ('activity', ['activity', 'active'])])


In [11]:
df['category']

Unnamed: 0,category
0,other
1,other
2,prevalence
3,other
4,other
...,...
18297,deaths
18298,other
18299,other
18300,health


In [12]:
print(df['category'].value_counts())

category
other         8096
prevalence    4309
deaths        3710
health        1699
nutrition      424
activity        64
Name: count, dtype: int64


### Cleaning the indicator dataset

In [13]:
df.head()

Unnamed: 0,GHO (CODE),GHO (DISPLAY),YEAR (DISPLAY),COUNTRY (DISPLAY),DIMENSION (NAME),VALUE,processed_text,category
0,#indicator+code,#indicator+name,#date+year,#country+name,#dimension+name,#numeric,[indicatorname],other
1,PHE_HHAIR_POP_CATEGORY_FUELS,"Population with primary reliance on fuels and technologies for cooking, by fuel type (in millions)",2024,Sudan,Urban,0.0543,"[population, primary, reliance, fuels, technologies, cooking, fuel, type, millions]",other
2,NCD_PAA,Prevalence of insufficient physical activity among adults aged 18+ years (age-standardized estimate) (%),2024,Sudan,Female,26.649185,"[prevalence, insufficient, physical, activity, among, adults, aged, 18, years, agestandardized, estimate]",prevalence
3,LIFE_0000000033,nLx - person-years lived between ages x and x+n,2023,Sudan,Both sexes,451194.0274,"[nlx, personyears, lived, ages, x, xn]",other
4,TB_c_newinc,Tuberculosis - new and relapse cases,2023,Sudan,,27562,"[tuberculosis, new, relapse, cases]",other


In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18302 entries, 0 to 18301
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   GHO (CODE)         18302 non-null  object
 1   GHO (DISPLAY)      18302 non-null  object
 2   YEAR (DISPLAY)     18302 non-null  object
 3   COUNTRY (DISPLAY)  18302 non-null  object
 4   DIMENSION (NAME)   14864 non-null  object
 5   VALUE              18302 non-null  object
 6   processed_text     18302 non-null  object
 7   category           18302 non-null  object
dtypes: object(8)
memory usage: 1.1+ MB


In [15]:
df.columns # checking column names to rename

Index(['GHO (CODE)', 'GHO (DISPLAY)', 'YEAR (DISPLAY)', 'COUNTRY (DISPLAY)',
       'DIMENSION (NAME)', 'VALUE', 'processed_text', 'category'],
      dtype='object')

In [16]:
# renaming column names

# rebind df to the renamed frame instead of mutating it in place
df = df.rename(
    columns={
        'DIMENSION (NAME)': 'dimension',
        'YEAR (DISPLAY)': 'year',
        'VALUE': 'value',
        'category': 'gho_category',
        'COUNTRY (DISPLAY)': 'country',
    }
)
df.columns

Index(['GHO (CODE)', 'GHO (DISPLAY)', 'year', 'country', 'dimension', 'value',
       'processed_text', 'gho_category'],
      dtype='object')

In [17]:
# Saving the output in a CSV file
# df.to_csv('output.csv', index=False)

In [18]:
# selecting features
# skip row 0 (it holds tag strings, not data), keep only the analysis columns,
# and copy so later assignments do not touch df
new_df = df.iloc[1:][['country', 'year', 'dimension', 'value', 'gho_category']].copy()

# Displaying information about the DataFrame
new_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18301 entries, 1 to 18301
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   country       18301 non-null  object
 1   year          18301 non-null  object
 2   dimension     14863 non-null  object
 3   value         18301 non-null  object
 4   gho_category  18301 non-null  object
dtypes: object(5)
memory usage: 715.0+ KB


In [19]:
new_df['value'].dtype

dtype('O')

In [20]:
# coercing 'value' dtype

# cast the string-typed 'value' column to float
new_df = new_df.astype({'value': float})
new_df['value'].dtype

dtype('float64')

In [21]:
print(new_df['value'].unique(), "\n", new_df['gho_category'].unique())  # Display all unique values


[5.43000000e-02 2.66491850e+01 4.51194027e+05 ... 1.20300000e-02
 1.95552175e+06 2.03940000e+00] 
 ['other' 'prevalence' 'deaths' 'activity' 'health' 'nutrition']


### Creating visuals
- Some EDA
- Create Visuals

In [22]:
# total of 'value' per (year, gho_category), kept as a one-column DataFrame
grouped = new_df.groupby(['year', 'gho_category'])['value'].sum().to_frame()

grouped.head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,value
year,gho_category,Unnamed: 2_level_1
1953,deaths,0.01203
1953,health,3.19
1953,other,2034855.0
1953,prevalence,2.0394
1954,deaths,4309.806
1954,other,1445.142
1954,prevalence,6.599375
1955,deaths,0.0
1955,health,0.0
1955,other,9801.046


In [23]:
# Restructuring the DataFrame
# move the gho_category index level into columns: one row per year, one
# column per category, NaN where a (year, category) pair has no data
reshaped_grouped = grouped['value'].unstack('gho_category')

# Display the result
print(reshaped_grouped)


gho_category   activity         deaths        health    nutrition  \
year                                                                
1953                NaN       0.012030  3.190000e+00          NaN   
1954                NaN    4309.806320           NaN          NaN   
1955                NaN       0.000000  0.000000e+00          NaN   
1956                NaN   10270.366070  0.000000e+00          NaN   
1957                NaN    1365.688640           NaN          NaN   
...                 ...            ...           ...          ...   
2020          90.621570  899959.496490  4.414681e+02   8159.67375   
2021          90.647975  959912.019965  5.832880e+06  19270.16155   
2022          90.022745  846113.008420  4.982395e+06   8326.60000   
2023          88.170300   25495.779520           NaN          NaN   
2024                NaN            NaN           NaN          NaN   

gho_category         other   prevalence  
year                                     
1953          2.03