In [1]:
# We're using these tools to help us work with data and make predictions
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score
from sklearn.impute import SimpleImputer
import statsmodels.api as sm
import seaborn as sns
import matplotlib.pyplot as plt

# Load the dataset and check the headers and statistics
file_path = 'C:/dhs_zm/ZMBR71DT_birth recode/ZMBR71FL.DTA'
df = pd.read_stata(file_path, convert_categoricals=False)

# Extracting important columns
important_columns = ['b5', 'b7', 'b8', 'v008', 'v005', 'v106', 'v025', 'v190', 'v101', 'v113']

# Check headers and statistics for the selected columns
df_headers = df[important_columns].head()
df_describe = df[important_columns].describe()

print(df_headers, df_describe)

# Check how many rows have missing values
missing_info = df[important_columns].isnull().sum()
print("Missing values per column:\n", missing_info)

# If there are missing values, impute them
imputer = SimpleImputer(strategy='most_frequent')
df[important_columns] = imputer.fit_transform(df[important_columns])

# Confirm dataset is not empty
if df[important_columns].shape[0] == 0:
    raise ValueError("Dataset is empty after preprocessing!")

print("Imputation and preprocessing done!")


   b5   b7    b8  v008     v005  v106  v025  v190  v101  v113
0   1  NaN   0.0  1424  1892890     0     2     1     3    21
1   1  NaN   6.0  1424  1892890     0     2     1     3    21
2   1  NaN   8.0  1424  1892890     0     2     1     3    21
3   1  NaN  11.0  1424  1892890     0     2     1     3    21
4   0  2.0   NaN  1424  1892890     0     2     1     3    21                  b5           b7            b8         v008          v005  \
count  38446.000000  3495.000000  34951.000000  38446.00000  3.844600e+04   
mean       0.909093    22.993133     10.156190   1425.79691  9.786958e+05   
std        0.287480    45.009659      7.370261      1.59545  6.787447e+05   
min        0.000000     0.000000      0.000000   1423.00000  5.194300e+04   
25%        1.000000     0.000000      4.000000   1424.00000  5.905410e+05   
50%        1.000000     8.000000      9.000000   1426.00000  8.217910e+05   
75%        1.000000    24.000000     15.000000   1427.00000  1.147467e+06   
max        1

In [11]:
# Scale weights
df['weight'] = df['v005'] / 1_000_000

# Filter for births within 5 years preceding the survey
current_month = df['v008'].max()
five_year_cutoff = current_month - 60
df_recent = df[df['v008'] >= five_year_cutoff]

# Calculate weighted total live births
weighted_total_births = df_recent[df_recent['b5'] == 1]['weight'].sum()

# Calculate weighted infant deaths (age < 12 months)
weighted_infant_deaths = df_recent[(df_recent['b5'] == 0) & (df_recent['b7'] < 12)]['weight'].sum()

# Calculate weighted under-5 deaths (age < 60 months)
weighted_under5_deaths = df_recent[(df_recent['b5'] == 0) & (df_recent['b7'] < 60)]['weight'].sum()

# Calculate IMR and U5MR
imr = (weighted_infant_deaths / weighted_total_births) * 1000
u5mr = (weighted_under5_deaths / weighted_total_births) * 1000

print(f"IMR (weighted): {imr:.2f} per 1,000 live births")
print(f"U5MR (weighted): {u5mr:.2f} per 1,000 live births")


IMR (weighted): 56.14 per 1,000 live births
U5MR (weighted): 88.22 per 1,000 live births
