In [1]:

import numpy as np # linear algebra
import pandas as pd 
import warnings

warnings.filterwarnings('ignore')


In [2]:
df = pd.read_csv('HIV_AIDS prevalence estimates table.csv', encoding='ISO-8859-1')  # aka latin1


In [3]:
df.head(20)

Unnamed: 0,Country/Region,Adult prevalence of HIV/AIDS,Number of people with HIV/AIDS,Annual deaths from HIV/AIDS,Year of estimate
0,ÿEswatini,28.30%,240000,3000,2024
1,ÿLesotho,26.20%,445000,7000,2025
2,ÿBotswana,22.80%,399100,5275,2025
3,ÿZimbabwe,22.10%,1660000,25600,2024
4,ÿSouth Africa,14.40%,9400000,80000,2025
5,ÿMozambique,12.65%,2485000,59100,2024
6,ÿMalawi,11.40%,1642570,15270,2024
7,ÿEquatorial Guinea,7.66%,74165,2400,2024
8,ÿTanzania,7%,3300000,39000,2025
9,ÿKenya,5.10%,1710000,24000,2023


In [4]:
df.tail()

Unnamed: 0,Country/Region,Adult prevalence of HIV/AIDS,Number of people with HIV/AIDS,Annual deaths from HIV/AIDS,Year of estimate
188,ÿAndorra,-,600,-,2025
189,ÿFederated States of Micronesia,-,149,-,2025
190,ÿJordan,-,2000,-,2025
191,ÿLuxembourg,-,5700,-,2025
192,ÿMaldives,-,660,-,2025


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 193 entries, 0 to 192
Data columns (total 5 columns):
 #   Column                          Non-Null Count  Dtype 
---  ------                          --------------  ----- 
 0   Country/Region                  193 non-null    object
 1   Adult prevalence of HIV/AIDS    193 non-null    object
 2   Number of people with HIV/AIDS  192 non-null    object
 3   Annual deaths from HIV/AIDS     193 non-null    object
 4   Year of estimate                193 non-null    object
dtypes: object(5)
memory usage: 7.7+ KB


In [6]:
df.describe()

Unnamed: 0,Country/Region,Adult prevalence of HIV/AIDS,Number of people with HIV/AIDS,Annual deaths from HIV/AIDS,Year of estimate
count,193,193,192,193,193
unique,193,73,168,56,16
top,ÿEswatini,-,11000,-,2024
freq,1,53,6,126,57


In [7]:
df.isnull().sum()

Country/Region                    0
Adult prevalence of HIV/AIDS      0
Number of people with HIV/AIDS    1
Annual deaths from HIV/AIDS       0
Year of estimate                  0
dtype: int64

In [8]:
df.duplicated().sum()

np.int64(0)

In [9]:
df.shape

(193, 5)

In [10]:
df.dtypes

Country/Region                    object
Adult prevalence of HIV/AIDS      object
Number of people with HIV/AIDS    object
Annual deaths from HIV/AIDS       object
Year of estimate                  object
dtype: object

In [11]:
df.columns


Index(['Country/Region', 'Adult prevalence of HIV/AIDS',
       'Number of people with HIV/AIDS', 'Annual deaths from HIV/AIDS',
       'Year of estimate'],
      dtype='object')

In [12]:
df.columns = [
    'Country',             
    'Prevalence_Rate',    
    'People_with_HIV',     
    'Annual_Deaths',      
    'Year'              
]

In [13]:
def clean_numeric(col):
    return (
        df[col]
        .astype(str)
        .str.replace(r'\[.*?\]', '', regex=True)  
        .str.replace(',', '', regex=False)        
        .str.replace('%', '', regex=False)        
        .str.replace('–', '', regex=False)        
        .str.replace('…', '', regex=False)        
        .str.replace('-', '', regex=False)        
        .str.replace(' ', '', regex=False)
        .str.replace(r'\[.*?\]', '', regex=True)  
        .str.replace(r'[^\d\.\-]', '', regex=True)
        .replace(['', 'nan', '-', '…'], np.nan) 
        .replace(['', 'nan'], np.nan)             
        .astype(float)                            
    )

In [14]:
df['Prevalence_Rate'] = clean_numeric('Prevalence_Rate')
df['People_with_HIV'] = clean_numeric('People_with_HIV')
df['Annual_Deaths'] = clean_numeric('Annual_Deaths')
df['Year'] = pd.to_numeric(df['Year'], errors='coerce')

In [15]:
df = df.dropna(subset=['People_with_HIV', 'Prevalence_Rate', 'Annual_Deaths', 'Year'])

In [16]:
df['Death_Rate_%'] = (df['Annual_Deaths'] / df['People_with_HIV']) * 100

In [17]:
print(df.columns.tolist())

['Country', 'Prevalence_Rate', 'People_with_HIV', 'Annual_Deaths', 'Year', 'Death_Rate_%']


## Predictive modeling

In [18]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score,classification_report
from sklearn.ensemble import RandomForestClassifier

In [19]:
# Optional: Encode country if needed
le = LabelEncoder()
df['Country_Code'] = le.fit_transform(df['Country'])

# Define threshold (e.g. top 25% death rates are "high")
threshold = df['Death_Rate_%'].quantile(0.75)
df['High_Death_Rate'] = (df['Death_Rate_%'] >= threshold).astype(int)

# Features and target
X = df[['Prevalence_Rate', 'People_with_HIV', 'Annual_Deaths', 'Year', 'Country_Code']]
y = df['High_Death_Rate']


In [20]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Optional: Standardize
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


In [21]:
def train_and_evaluate(model, model_name):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    acc = accuracy_score(y_test, y_pred) * 100
    print(f"\n🔍 {model_name} Accuracy: {acc:.2f}%")

    print("\n📋 Classification Report:")
    print(classification_report(y_test, y_pred))


In [22]:
models = [
    (RandomForestClassifier(), "Random Forest")
]

for model, name in models:
    train_and_evaluate(model, name)





🔍 Random Forest Accuracy: 92.31%

📋 Classification Report:
              precision    recall  f1-score   support

           0       0.91      1.00      0.95        10
           1       1.00      0.67      0.80         3

    accuracy                           0.92        13
   macro avg       0.95      0.83      0.88        13
weighted avg       0.93      0.92      0.92        13

