In [4]:
import numpy as np
import pandas as pd
import copy
import lightgbm as lgb
from sklearn.metrics import mean_squared_error
import json
import joblib
import optuna

### Feature Gathering

https://jmai.amegroups.org/article/view/8590/html  

The paper suggests the following are important factors in reducing maternal mortality:
- birth control measures/family planning with related reproductive health services
- pregnancy complications management or skilled care during pregnancy and childbirth
- medication of pregnant women with timely emergency obstetric care
- post-immunisations complications focus
- IUCD
- nutrition deficiency

https://www.sciencedirect.com/science/article/pii/S0169023X23000587?pes=vor&utm_source=scopus&getft_integrator=scopus#sec5

- high BMI
- max, mean, min elevation (defines reachability of area)
- proportion of population living with sanitation, electricity, and clean fuel for cooking 
- percentage of mothers taking iron folic acid for 100 days (to prevent anemia)
- average out-of-pocket spending per birth in a public facility 
- percentage of women who married when they were under 18
- total population sex ratio and newborn sex ratio (for demographic information)
- health insurance/financial coverage
- proportion of C-section births
- percentage of institutional births in a public facility
- health facilities available


https://www.thelancet.com/action/showPdf?pii=S2214-109X%2823%2900468-0

- safe abortion
- modern contraception
- antenatal, intrapartum, and postpartum care
- pre-existing medical conditions
- climate/eco-system
- economic, political, and cultural bases of societies
(eg, health insurance policies and health budgetary
allocations, health-care policies and legislations on
reproductive rights and access to care, societal norms
and expectations related to gender roles and cultural
beliefs surrounding pregnancy and childbirth).
- social determinants of health are the
conditions in which women are born, grow, work, and live
before pregnancy, and during pregnancy, childbirth, and
the postpartum period
- exposure to external agents (eg,
physical, chemical, and biological hazards, infections,
accidents, and violence)
- pregnancies < 18 or > 35
- existing conditions
- violence against women 
- nutrition deficiency, substance abuse, etc. 


measured:
- skilled birth attendance 
- fertility rate
- life expectancy
- neonatal mortality rate
- universal health coverage
- human development index 
- Risk of impoverishing expenditure for surgical care
- Antenatal care coverage (four visits)
- caesarean section rate
- Gender inequality index 
- Gini inequality index

https://iris.who.int/bitstream/handle/10665/381012/9789240108462-eng.pdf?sequence=1

- haemorrhage
- hypertensive disorders (pre-existing and during pregnancy)
- complications of unsafe abortion
- noncommunicable diseases and pre-existing conditions
    Gestational diabetes is the most common medical
    disorder in pregnancy (28). Other NCDs commonly
    experienced by pregnant women include asthma,
    cardiac conditions, epilepsy, haemoglobinopathies,
    and mental health and substance use condition
- social determinants of health, such as education, ethnicity, race, gender and income
- harmful gender norms, biases and inequalities that obstruct the rights of women and girls
- weak health systems that lack adequately trained and competent health workers and essential medical supplies, providing poor quality care with little accountability
- external factors, such as climate change, conflict and humanitarian crises, which cause instability and fragility 
- HIV

In [5]:
# NOTE(review): unpickling can execute arbitrary code; only load this file from a trusted source.
merged_data_1985 = pd.DataFrame(pd.read_pickle('../../raw_merged_data_v2_withinyears'))

# Keep observations dated strictly after 1984 and strictly before 2019.
observation_date = merged_data_1985['date']
within_study_window = (observation_date > 1984) & (observation_date < 2019)

# Other maternal-mortality measures are removed from the frame before features are chosen
# (presumably to avoid leaking the target — confirm).
other_outcome_columns = [
    'Maternal mortality ratio (modeled estimate, per 100,000 live births)',
    'Lifetime risk of maternal death (1 in: rate varies by country)',
    'Lifetime risk of maternal death (%)',
    'Number of maternal deaths',
]
merged_relevant = merged_data_1985[within_study_window].drop(columns=other_outcome_columns)

# Feature view: everything except the national-estimate ratio column.
merged_features = merged_relevant.drop(
    columns=['Maternal mortality ratio (national estimate, per 100,000 live births)']
)


In [82]:
# Hand-picked feature columns used to subset the test, validation and training splits below.
# The list contains 'date' and 'setting' plus indicator columns; the strings must match the
# dataset's column names exactly because they are used for column selection.
useful_features = ['date', 'setting', 'Women participating in own health care decisions (% of women age 15-49)',
                   'Women and girls who use menstrual materials (% of women and girls ages 15-49 who had a menstrual period within the last year)',
                   'Mortality rate, infant (per 1,000 live births)', 'Female genital mutilation prevalence (%)',
                   'Pregnant women receiving prenatal care (%)', 'Knowledge of any method of contraception (% of all women ages 15-49)',
                   'Demand for family planning satisfied by any methods (% of married women with demand for family planning)',
                   'Unmet need for contraception (% of married women ages 15-49)', 'Pregnant women receiving prenatal care of at least four visits (% of pregnant women)',
                   'Decision maker about a woman\'s own health care: someone else (% of women age 15-49)',
                    "Women\'s share of population ages 15+ living with HIV (%)", 
                    'Teenage mothers (% of women ages 15-19 who have had children or are currently pregnant)', "Births attended by skilled health staff (% of total)",
                    "Prevalence of overweight (% of adults)", "Survival to age 65, female (% of cohort)", 
                    "Stillbirth rate (per 1,000 total births)", "Prevalence of anemia among pregnant women (%)",
                    "Fertility rate, total (births per woman)", 
                    "Women with high empowerment in the attitude to violence domain (%) Quintile 1 (poorest)",
                    "1.0.0 Communicable, maternal, neonatal, and nutritional diseases prevalence (age standardized) (per 100 000 population) female",
                    "1.D.0 Neglected tropical diseases and malaria prevalence (age standardized) (per 100 000 population) female",
                    "1.F.0 Maternal and neonatal disorders prevalence (age standardized) (per 100 000 population) female",
                    "1.G.0 Nutritional deficiencies prevalence (age standardized) (per 100 000 population) female",
                    "2.0.0 Non-communicable diseases prevalence (age standardized) (per 100 000 population) female",
                    "2.G.0 Substance use disorders prevalence (age standardized) (per 100 000 population) female",
                    "3.B.06 Adverse effects of medical treatment prevalence (age standardized) (per 100 000 population) female",
                    "3.C.0 Self-harm and interpersonal violence prevalence (age standardized) (per 100 000 population) female",
                    "3.B.10 Exposure to forces of nature prevalence (age standardized) (per 100 000 population) female",
                    "3.C.04 Conflict and terrorism prevalence (age standardized) (per 100 000 population) female",
                    "School enrollment, secondary (% net) female", "Literacy rate, adult total (% of people ages 15 and above) female",
                    "income_num", "Borrowed for health or medical purposes (% age 15+)",
                    "2.B.04 Hypertensive heart disease prevalence (age standardized) (per 100 000 population) female",
                    "2.H.0 Diabetes and kidney diseases prevalence (age standardized) (per 100 000 population) female",
                    "Probability of Survival to Age 5",
                    "Population living in areas where elevation is below 5 meters (% of total population) rural",
                    "Coverage of social safety net programs (% of population) Quintile 1 (poorest)"
                   ]

In [83]:
# Sanity check: how many feature names were selected.
len(useful_features)

40

In [6]:
# List every column available in the feature frame, one name per line.
# Iterating a DataFrame yields its column labels.
for column_name in merged_features:
    print(column_name)

date
Women and girls who use menstrual materials (% of women and girls ages 15-49 who had a menstrual period within the last year)
setting
Cause of death, by communicable diseases and maternal, prenatal and nutrition conditions, ages 5-14 (% of population ages 5-14)
Mortality rate attributed to household and ambient air pollution, age-standardized, female (per 100,000 female population)
Cause of death, by communicable diseases and maternal, prenatal and nutrition conditions, ages 0-4 (% of population ages 0-4)
Cause of death, by communicable diseases and maternal, prenatal and nutrition conditions, ages 60+ (% of population ages 60+)
Women participating in own health care decisions (% of women age 15-49)
Antiretroviral therapy coverage (% of adult males living with HIV)
Cause of death, by communicable diseases and maternal, prenatal and nutrition conditions, ages 60+, male (% of male population ages 60+)
Comprehensive correct knowledge of HIV/AIDS, ages 15-24, male (2 prevent ways and 

### Loading Datasets

In [45]:
# Root folders of the two pre-split datasets read by the loading cells that follow.
# NOTE: both names are rebound further down to the '../../fs_fromlit_data/...' folders,
# so re-running the loading cells after that point reads different files.
filepath_income = '../../split_income_data'
filepath_year = '../../split_year_data'

In [46]:
# Every CSV carries an 'Unnamed: 0' column (presumably a saved index) that is discarded on load.
test_data_x_inc = pd.read_csv(filepath_income + '/test/X_test.csv').drop(columns=['Unnamed: 0'])
test_data_y_inc = pd.read_csv(filepath_income + '/test/y_test.csv').drop(columns=['Unnamed: 0'])

# fold -> [validation features, validation labels]
validation_inc = {
    fold: [
        pd.read_csv(filepath_income + '/val/X_val_' + str(fold) + '.csv').drop(columns=['Unnamed: 0']),
        pd.read_csv(filepath_income + '/val/y_val_' + str(fold) + '.csv').drop(columns=['Unnamed: 0']),
    ]
    for fold in range(5)
}

# fold -> [training features, training labels]; only the '_1' files are read here.
train_inc = {
    fold: [
        pd.read_csv(filepath_income + '/train/X_train_' + str(fold) + '_1.csv').drop(columns=['Unnamed: 0']),
        pd.read_csv(filepath_income + '/train/y_train_' + str(fold) + '_1.csv').drop(columns=['Unnamed: 0']),
    ]
    for fold in range(5)
}

In [47]:
# Every CSV carries an 'Unnamed: 0' column (presumably a saved index) that is discarded on load.
test_data_x_year = pd.read_csv(filepath_year + '/test/X_test.csv').drop(columns=['Unnamed: 0'])
test_data_y_year = pd.read_csv(filepath_year + '/test/y_test.csv').drop(columns=['Unnamed: 0'])

# fold -> [validation features, validation labels]
validation_year = {
    fold: [
        pd.read_csv(filepath_year + '/val/X_val_' + str(fold) + '.csv').drop(columns=['Unnamed: 0']),
        pd.read_csv(filepath_year + '/val/y_val_' + str(fold) + '.csv').drop(columns=['Unnamed: 0']),
    ]
    for fold in range(5)
}

# fold -> [training features, training labels]; only the '_1' files are read here.
train_year = {
    fold: [
        pd.read_csv(filepath_year + '/train/X_train_' + str(fold) + '_1.csv').drop(columns=['Unnamed: 0']),
        pd.read_csv(filepath_year + '/train/y_train_' + str(fold) + '_1.csv').drop(columns=['Unnamed: 0']),
    ]
    for fold in range(5)
}

### Thresholding Code

In [48]:
def drop_col(dataset, threshold):
    """Return a copy of ``dataset`` without its sparsest columns.

    A column is removed when its fraction of null entries is strictly greater
    than ``threshold``. The row index of the result is reset to 0..n-1; the
    input frame is left untouched.
    """
    n_rows = dataset.shape[0]
    sparse_columns = [
        label
        for position, label in enumerate(dataset.columns)
        if dataset.iloc[:, position].isnull().sum() / n_rows > threshold
    ]
    return dataset.drop(sparse_columns, axis=1).reset_index(drop=True)

def drop_row(dataset, threshold):
    """Return a copy of ``dataset`` without its sparsest rows.

    A row is removed when its fraction of null entries is strictly greater
    than ``threshold``. Every row is examined, including the first one, and
    rows are selected by position, so the function works for any index
    (not only a default 0..n-1 index). The result's index is reset to 0..n-1
    and the input frame is left untouched.
    """
    # Fraction of missing cells per row; with zero columns this is NaN, which
    # never compares greater than the threshold, so such rows are kept.
    missing_share = dataset.isnull().sum(axis=1) / dataset.shape[1]
    keep_row = ~(missing_share > threshold)

    # A plain boolean array avoids label alignment problems with duplicate or
    # non-integer index labels.
    rowthresh = dataset.loc[keep_row.to_numpy()]
    return rowthresh.reset_index(drop=True)

In [49]:
def total_nan_proportion(dataset):
    """Return the fraction of all cells in ``dataset`` that are null."""
    n_rows, n_cols = dataset.shape
    missing_cells = dataset.isnull().sum().sum()
    return missing_cells / (n_rows * n_cols)

In [50]:
def iterative_thresholding(dataset_to_threshold, threshold):
    """Repeatedly drop sparse rows, then sparse columns, until none remain.

    Each pass calls ``drop_row`` followed by ``drop_col`` with the same
    ``threshold``. Removing columns changes every row's missing fraction (and
    vice versa), so passes repeat until no row or column exceeds the
    threshold.

    Parameters
    ----------
    dataset_to_threshold : DataFrame
        Input frame; it is not modified. Its index is moved into a regular
        column by ``reset_index()``, so the working frame (and the result)
        carries that extra column, which also counts towards each row's
        column total.
    threshold : float
        Maximum tolerated fraction of null values per row / per column.

    Returns
    -------
    tuple
        ``(thresholded_frame, final_nan_proportion, changes)`` where
        ``changes`` holds the change in overall NaN proportion of every pass
        after which another pass was started.
    """
    last_proportion = total_nan_proportion(dataset_to_threshold)
    # reset_index() already returns a new frame, so the input stays untouched.
    dataset_copy = dataset_to_threshold.reset_index()
    changes = []

    while True:
        shape_before = dataset_copy.shape

        dataset_copy = drop_row(dataset_copy, threshold)
        dataset_copy = drop_col(dataset_copy, threshold)
        current_proportion = total_nan_proportion(dataset_copy)

        # Did this pass push any remaining row or column over the threshold?
        # All rows are checked, including the first one.
        missing = dataset_copy.isnull()
        n_rows, n_cols = dataset_copy.shape
        column_breached = bool((missing.sum(axis=0) / n_rows > threshold).any())
        row_breached = bool((missing.sum(axis=1) / n_cols > threshold).any())
        breached = column_breached or row_breached

        # Stop when nothing breaches, or when a pass removed nothing: in that
        # case another pass cannot help and would loop forever.
        if not breached or dataset_copy.shape == shape_before:
            break

        changes.append(current_proportion - last_proportion)
        last_proportion = current_proportion

    return dataset_copy, current_proportion, changes


### Making Datasets

#### Split by Country

In [51]:
# List the columns of the income-split test features, one name per line.
# Iterating a DataFrame yields its column labels.
for column_name in test_data_x_inc:
    print(column_name)

date
Women and girls who use menstrual materials (% of women and girls ages 15-49 who had a menstrual period within the last year)
setting
Cause of death, by communicable diseases and maternal, prenatal and nutrition conditions, ages 5-14 (% of population ages 5-14)
Cause of death, by communicable diseases and maternal, prenatal and nutrition conditions, ages 0-4 (% of population ages 0-4)
Cause of death, by communicable diseases and maternal, prenatal and nutrition conditions, ages 60+ (% of population ages 60+)
Women participating in own health care decisions (% of women age 15-49)
Antiretroviral therapy coverage (% of adult males living with HIV)
Cause of death, by communicable diseases and maternal, prenatal and nutrition conditions, ages 60+, male (% of male population ages 60+)
Comprehensive correct knowledge of HIV/AIDS, ages 15-24, male (2 prevent ways and reject 3 misconceptions)
Mortality rate, infant (per 1,000 live births)
Female genital mutilation prevalence (%)
Cause of d

In [52]:
# Every selected feature must exist in the test split; fail loudly otherwise.
_missing_test_features_inc = [name for name in useful_features if name not in test_data_x_inc.columns]
if _missing_test_features_inc:
    raise KeyError(f"Features missing from the income test split: {_missing_test_features_inc}")

# Select the features in the dataset's own column order, the same way the validation and
# training folds are subset (columns.intersection), so all saved splits share one column order.
test_inc = test_data_x_inc[test_data_x_inc.columns.intersection(useful_features)]

In [53]:
# fold -> [validation features restricted to the selected columns, copy of the labels]
validation_inc_coll = {}

for fold in range(5):
    fold_features, fold_labels = validation_inc[fold]
    # intersection keeps only the selected names that exist in this fold, in the fold's column order
    kept_columns = fold_features.columns.intersection(useful_features)
    validation_inc_coll[fold] = [fold_features[kept_columns], fold_labels.copy()]

In [54]:
# fold -> [training features restricted to the selected columns, copy of the labels]
train_inc_coll = {}

for fold in range(5):
    fold_features, fold_labels = train_inc[fold]
    # intersection keeps only the selected names that exist in this fold, in the fold's column order
    kept_columns = fold_features.columns.intersection(useful_features)
    train_inc_coll[fold] = [fold_features[kept_columns], fold_labels.copy()]
        

In [55]:
needed_thresholds = [0.85, 0.95, 1]

# Per fold, a flat list with three slots per threshold, in threshold order:
# [frame, rows_left, cols_left, frame, rows_left, cols_left, frame, rows_left, cols_left]
folds_thresh_inc_coll = {fold: [] for fold in range(5)}

for fold, fold_results in folds_thresh_inc_coll.items():
    # Features and label are thresholded together as one frame.
    features_and_label = pd.concat([train_inc_coll[fold][0], train_inc_coll[fold][1]], axis=1)

    for thresh in needed_thresholds:
        thresholded, _, _ = iterative_thresholding(features_and_label, thresh)
        fold_results.extend([thresholded, thresholded.shape[0], thresholded.shape[1]])

#### Split by Year

In [56]:
# Every selected feature must exist in the test split; fail loudly otherwise.
_missing_test_features_year = [name for name in useful_features if name not in test_data_x_year.columns]
if _missing_test_features_year:
    raise KeyError(f"Features missing from the year test split: {_missing_test_features_year}")

# Select the features in the dataset's own column order, the same way the validation and
# training folds are subset (columns.intersection), so all saved splits share one column order.
test_year = test_data_x_year[test_data_x_year.columns.intersection(useful_features)]

In [57]:
# fold -> [validation features restricted to the selected columns, copy of the labels]
validation_year_coll = {}

for fold in range(5):
    fold_features, fold_labels = validation_year[fold]
    # intersection keeps only the selected names that exist in this fold, in the fold's column order
    kept_columns = fold_features.columns.intersection(useful_features)
    validation_year_coll[fold] = [fold_features[kept_columns], fold_labels.copy()]

In [58]:
# fold -> [training features restricted to the selected columns, copy of the labels]
train_year_coll = {}

for fold in range(5):
    fold_features, fold_labels = train_year[fold]
    # intersection keeps only the selected names that exist in this fold, in the fold's column order
    kept_columns = fold_features.columns.intersection(useful_features)
    train_year_coll[fold] = [fold_features[kept_columns], fold_labels.copy()]
        

In [59]:
needed_thresholds = [0.85, 0.95, 1]

# Per fold, a flat list with three slots per threshold, in threshold order:
# [frame, rows_left, cols_left, frame, rows_left, cols_left, frame, rows_left, cols_left]
folds_thresh_year_coll = {fold: [] for fold in range(5)}

for fold, fold_results in folds_thresh_year_coll.items():
    # Features and label are thresholded together as one frame.
    features_and_label = pd.concat([train_year_coll[fold][0], train_year_coll[fold][1]], axis=1)

    for thresh in needed_thresholds:
        thresholded, _, _ = iterative_thresholding(features_and_label, thresh)
        fold_results.extend([thresholded, thresholded.shape[0], thresholded.shape[1]])

### Saving

In [60]:
# Output folders for the feature-selected datasets, one train/val/test triple per split.
# The saving cells write into these folders and do not create them.
train_filepath_inc = '../../fs_fromlit_data/income/train'
val_filepath_inc = '../../fs_fromlit_data/income/val'
test_filepath_inc = '../../fs_fromlit_data/income/test'

train_filepath_year = '../../fs_fromlit_data/year/train'
val_filepath_year = '../../fs_fromlit_data/year/val'
test_filepath_year = '../../fs_fromlit_data/year/test'

In [61]:
########### test set (income split)

# Both objects are already DataFrames; the index is written as the first CSV column.
test_inc.to_csv(test_filepath_inc + '/X_test.csv')
test_data_y_inc.to_csv(test_filepath_inc + '/y_test.csv')

In [62]:
########### test set (year split)

# Both objects are already DataFrames; the index is written as the first CSV column.
test_year.to_csv(test_filepath_year + '/X_test.csv')
test_data_y_year.to_csv(test_filepath_year + '/y_test.csv')

In [63]:
########### validation set (income split)

for fold in range(5):
    val_input_inc, val_labels_inc = validation_inc_coll[fold]
    val_input_inc.to_csv(val_filepath_inc + '/X_val_' + str(fold) + '.csv')
    val_labels_inc.to_csv(val_filepath_inc + '/y_val_' + str(fold) + '.csv')



In [64]:
########### validation set (year split)

for fold in range(5):
    val_input, val_labels = validation_year_coll[fold]
    val_input.to_csv(val_filepath_year + '/X_val_' + str(fold) + '.csv')
    val_labels.to_csv(val_filepath_year + '/y_val_' + str(fold) + '.csv')

In [65]:
########### train set (income and year splits)
threshs = ['85', '95', '1']

for fold in range(5):
    for idx, thresh in enumerate(threshs):
        # Each threshold occupies three slots (frame, n_rows, n_cols); the frame comes first.
        idx_multiple = idx * 3

        for thresholded_folds, out_dir in (
            (folds_thresh_inc_coll, train_filepath_inc),
            (folds_thresh_year_coll, train_filepath_year),
        ):
            data = thresholded_folds[fold][idx_multiple]

            # Features: everything except the label and the 'index' column added during thresholding.
            train_input = data.drop(
                columns=['Maternal mortality ratio (national estimate, per 100,000 live births)', 'index']
            )
            train_labels = data[['Maternal mortality ratio (national estimate, per 100,000 live births)']]

            train_input.to_csv(out_dir + '/X_train_' + str(fold) + '_' + thresh + '.csv')
            train_labels.to_csv(out_dir + '/y_train_' + str(fold) + '_' + thresh + '.csv')

In [66]:
########### train set (year split only)
# NOTE: this writes the same year-split files as the year half of the previous cell.
threshs = ['85', '95', '1']

for fold in range(5):
    year_results = folds_thresh_year_coll[fold]
    for idx, thresh in enumerate(threshs):
        # Each threshold occupies three slots (frame, n_rows, n_cols); the frame comes first.
        data_year = year_results[idx * 3]

        # Features: everything except the label and the 'index' column added during thresholding.
        train_input_year = data_year.drop(
            columns=['Maternal mortality ratio (national estimate, per 100,000 live births)', 'index']
        )
        train_labels_year = data_year[['Maternal mortality ratio (national estimate, per 100,000 live births)']]

        train_input_year.to_csv(train_filepath_year + '/X_train_' + str(fold) + '_' + thresh + '.csv')
        train_labels_year.to_csv(train_filepath_year + '/y_train_' + str(fold) + '_' + thresh + '.csv')

In [67]:
# Shape of fold 0's first stored frame (slot 0 = the first threshold in needed_thresholds).
folds_thresh_inc_coll[0][0].shape

(1964, 15)

In [68]:
# For every fold, show the shapes of the three thresholded frames (slots 0, 3 and 6) of each split.
for fold in range(5):
    for split_name, thresholded_folds in (('income', folds_thresh_inc_coll), ('year', folds_thresh_year_coll)):
        print(split_name)
        print(*(thresholded_folds[fold][slot].shape for slot in (0, 3, 6)))

income
(1964, 15) (1964, 33) (1964, 42)
year
(2068, 15) (2068, 35) (2068, 42)
income
(1964, 15) (1964, 35) (1964, 42)
year
(2070, 15) (2070, 33) (2070, 42)
income
(1964, 15) (1964, 36) (1964, 42)
year
(2066, 15) (2066, 34) (2066, 42)
income
(1964, 15) (1964, 36) (1964, 42)
year
(2070, 15) (2070, 34) (2070, 42)
income
(1964, 15) (1964, 35) (1964, 42)
year
(2070, 15) (2070, 34) (2070, 42)


### Loading Saved Datasets

In [69]:
# NOTE: rebinds filepath_income, which earlier pointed at '../../split_income_data'.
# After this cell runs, re-executing the raw-data loading cell would read from this folder instead.
filepath_income = '../../fs_fromlit_data/income'

In [70]:
# fold -> [validation features, validation labels], read back from the saved CSVs.
# The 'Unnamed: 0' column of each file is discarded.
validation_lit_country = {
    fold: [
        pd.read_csv(filepath_income + '/val/X_val_' + str(fold) + '.csv').drop(columns=['Unnamed: 0']),
        pd.read_csv(filepath_income + '/val/y_val_' + str(fold) + '.csv').drop(columns=['Unnamed: 0']),
    ]
    for fold in range(5)
}

In [71]:
# fold -> [X_85, y_85, X_95, y_95, X_1, y_1], read back from the saved CSVs.
# The 'Unnamed: 0' column of each file is discarded.
train_lit_country = {}

for fold in range(5):
    fold_frames = []
    for file_suffix in ('_85.csv', '_95.csv', '_1.csv'):
        for file_prefix in ('/train/X_train_', '/train/y_train_'):
            csv_path = filepath_income + file_prefix + str(fold) + file_suffix
            fold_frames.append(pd.read_csv(csv_path).drop(columns=['Unnamed: 0']))
    train_lit_country[fold] = fold_frames

In [72]:
# NOTE: rebinds filepath_year, which earlier pointed at '../../split_year_data'.
# After this cell runs, re-executing the raw-data loading cell would read from this folder instead.
filepath_year = '../../fs_fromlit_data/year'

In [73]:
# fold -> [validation features, validation labels], read back from the saved CSVs.
# The 'Unnamed: 0' column of each file is discarded.
validation_lit_year = {
    fold: [
        pd.read_csv(filepath_year + '/val/X_val_' + str(fold) + '.csv').drop(columns=['Unnamed: 0']),
        pd.read_csv(filepath_year + '/val/y_val_' + str(fold) + '.csv').drop(columns=['Unnamed: 0']),
    ]
    for fold in range(5)
}

In [74]:
# fold -> [X_85, y_85, X_95, y_95, X_1, y_1], read back from the saved CSVs.
# The 'Unnamed: 0' column of each file is discarded.
train_lit_year = {}

for fold in range(5):
    fold_frames = []
    for file_suffix in ('_85.csv', '_95.csv', '_1.csv'):
        for file_prefix in ('/train/X_train_', '/train/y_train_'):
            csv_path = filepath_year + file_prefix + str(fold) + file_suffix
            fold_frames.append(pd.read_csv(csv_path).drop(columns=['Unnamed: 0']))
    train_lit_year[fold] = fold_frames

## LightGBM

In [75]:
def objective(trial, x_train, y_train, x_val, y_val):
    """Optuna objective: fit one LightGBM regressor and return its validation MSE.

    Hyper-parameters are sampled from ``trial``, a model is trained on
    ``(x_train, y_train)`` and scored on ``(x_val, y_val)``. Lower is better.

    The trial parameter names ("number_trees", "max_tree_depth", "l1_norm",
    "l2_norm", ...) are the keys that end up in ``study.best_params``; they
    are not LightGBM argument names, so they are mapped explicitly here.
    """
    # Keys are the LightGBM keyword arguments; the sampling order is kept fixed.
    sampled_params = {
        "n_estimators": trial.suggest_int("number_trees", 10, 300),
        "max_depth": trial.suggest_int("max_tree_depth", 3, 25),
        "boosting": trial.suggest_categorical("boosting_type", ['gbdt', 'dart']),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.1, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 0, 10),
        "learning_rate": trial.suggest_float("learning_rate", 0, 1),
        "reg_alpha": trial.suggest_float("l1_norm", 0, 0.001),
        "reg_lambda": trial.suggest_float("l2_norm", 0, 0.001),
    }

    regressor = lgb.LGBMRegressor(random_state=42, verbosity=-1, **sampled_params)
    regressor.fit(x_train, y_train)

    # Arguments are passed as (prediction, target); MSE is symmetric, so the value is unaffected.
    return mean_squared_error(regressor.predict(x_val), y_val)  # Optuna minimizes this

### LightGBM by Country

In [77]:
# Folder that receives the fitted models, Optuna studies and JSON summaries of the income split.
# The training cell writes into it and does not create it.
output_dir_inc = '../../fs_fromlit_models/income/lightgbm'

In [78]:
# Tune and fit one LightGBM model per (fold, threshold) of the income split, then save the model,
# the best parameters, the Optuna study and a JSON summary.
for fold in range(0, 5):
    val_input_data = validation_lit_country[fold][0].copy()
    # because lightgbm cannot handle the comma (and a few other characters) in feature names
    val_input_data.columns = val_input_data.columns.str.replace(r'[\"\[\]\{\}\\:,]', '', regex=True)
    val_input_data['setting'] = val_input_data['setting'].astype("category")
    val_label = validation_lit_country[fold][1].copy()
    # Strip the same characters from the label column name so it is sanitised like the features.
    val_label.columns = val_label.columns.str.replace(r'[\"\[\]\{\}\\:,]', '', regex=True)

    for thresh_idx, thresh in enumerate(['85', '95', '1']):
        # Stored per fold as [X_85, y_85, X_95, y_95, X_1, y_1].
        train_input_data = train_lit_country[fold][thresh_idx * 2].copy()
        train_input_data['setting'] = train_input_data['setting'].astype("category")
        train_input_data.columns = train_input_data.columns.str.replace(r'[\"\[\]\{\}\\:,]', '', regex=True)

        train_label = train_lit_country[fold][thresh_idx * 2 + 1].copy()
        train_label.columns = train_label.columns.str.replace(r'[\"\[\]\{\}\\:,]', '', regex=True)

        # Give the model validation features in exactly the training column order;
        # a training column missing from validation raises a KeyError here.
        columns_needed = list(train_input_data.columns)
        val_relevant_input = val_input_data[columns_needed]

        # Create a study object and optimize the objective function.
        # The sampler is seeded so the search is repeatable.
        study = optuna.create_study(direction='minimize', sampler=optuna.samplers.TPESampler(seed=42))
        study.optimize(lambda trial: objective(trial, train_input_data, train_label, val_relevant_input, val_label), n_trials=300)

        # study.best_params uses the trial names from objective(), which are not LightGBM
        # argument names; map them explicitly and reuse the same random_state so the
        # refitted model matches the best trial.
        best = study.best_params
        best_model = lgb.LGBMRegressor(
            random_state=42,
            verbosity=-1,
            n_estimators=best["number_trees"],
            max_depth=best["max_tree_depth"],
            boosting=best["boosting_type"],
            learning_rate=best["learning_rate"],
            reg_alpha=best["l1_norm"],
            reg_lambda=best["l2_norm"],
            bagging_fraction=best["bagging_fraction"],
            bagging_freq=best["bagging_freq"],
        )
        best_model.fit(train_input_data, train_label)

        # save best model
        joblib.dump(best_model, output_dir_inc + '/best_model_' + str(fold) + '_' + thresh + '.pkl')
        joblib.dump(study.best_params, f"{output_dir_inc}/best_params_{fold}_{thresh}.pkl")

        # Save study for later visualization
        joblib.dump(study, f"{output_dir_inc}/optuna_study_{fold}_{thresh}.pkl")

        summary = {
            "dataset": str(fold) + '_' + thresh,
            "fold": fold,
            "threshold": thresh,
            "model": 'lightgbm',
            "best_params": study.best_params,
            "best_optuna_loss": study.best_value
        }

        with open(f"{output_dir_inc}/results_{fold}_{thresh}.json", "w") as f:
            json.dump(summary, f, indent=2)

[I 2025-08-10 12:41:34,151] A new study created in memory with name: no-name-2d1bc111-71f5-418b-b4b6-6bf601e5d8a9
[I 2025-08-10 12:41:34,241] Trial 0 finished with value: 10670.642440947368 and parameters: {'number_trees': 22, 'max_tree_depth': 8, 'boosting_type': 'gbdt', 'bagging_fraction': 0.4090397056954012, 'bagging_freq': 2, 'learning_rate': 0.4959951315949911, 'l1_norm': 0.0005253928117187177, 'l2_norm': 0.0007866744033898418}. Best is trial 0 with value: 10670.642440947368.
[I 2025-08-10 12:41:34,328] Trial 1 finished with value: 8565.570283457295 and parameters: {'number_trees': 52, 'max_tree_depth': 3, 'boosting_type': 'gbdt', 'bagging_fraction': 0.5746923663342927, 'bagging_freq': 2, 'learning_rate': 0.06330548774609801, 'l1_norm': 0.0003790820123838412, 'l2_norm': 0.0005065034271646862}. Best is trial 1 with value: 8565.570283457295.
[I 2025-08-10 12:41:38,064] Trial 2 finished with value: 9534.223354365953 and parameters: {'number_trees': 300, 'max_tree_depth': 14, 'boostin

### LightGBM by Year

In [79]:
# Folder that receives the fitted models, Optuna studies and JSON summaries of the year split.
# The training cell writes into it and does not create it.
output_dir_year = '../../fs_fromlit_models/year/lightgbm'

In [80]:
# Tune and fit one LightGBM model per (fold, threshold) of the year split, then save the model,
# the best parameters, the Optuna study and a JSON summary.
for fold in range(0, 5):
    val_input_data = validation_lit_year[fold][0].copy()
    # because lightgbm cannot handle the comma (and a few other characters) in feature names
    val_input_data.columns = val_input_data.columns.str.replace(r'[\"\[\]\{\}\\:,]', '', regex=True)
    val_input_data['setting'] = val_input_data['setting'].astype("category")
    val_label = validation_lit_year[fold][1].copy()
    # Strip the same characters from the label column name so it is sanitised like the features.
    val_label.columns = val_label.columns.str.replace(r'[\"\[\]\{\}\\:,]', '', regex=True)

    for thresh_idx, thresh in enumerate(['85', '95', '1']):
        # Stored per fold as [X_85, y_85, X_95, y_95, X_1, y_1].
        train_input_data = train_lit_year[fold][thresh_idx * 2].copy()
        train_input_data['setting'] = train_input_data['setting'].astype("category")
        train_input_data.columns = train_input_data.columns.str.replace(r'[\"\[\]\{\}\\:,]', '', regex=True)

        train_label = train_lit_year[fold][thresh_idx * 2 + 1].copy()
        train_label.columns = train_label.columns.str.replace(r'[\"\[\]\{\}\\:,]', '', regex=True)

        # Give the model validation features in exactly the training column order;
        # a training column missing from validation raises a KeyError here.
        columns_needed = list(train_input_data.columns)
        val_relevant_input = val_input_data[columns_needed]

        # Create a study object and optimize the objective function.
        # The sampler is seeded so the search is repeatable.
        study = optuna.create_study(direction='minimize', sampler=optuna.samplers.TPESampler(seed=42))
        study.optimize(lambda trial: objective(trial, train_input_data, train_label, val_relevant_input, val_label), n_trials=300)

        # study.best_params uses the trial names from objective(), which are not LightGBM
        # argument names; map them explicitly and reuse the same random_state so the
        # refitted model matches the best trial.
        best = study.best_params
        best_model = lgb.LGBMRegressor(
            random_state=42,
            verbosity=-1,
            n_estimators=best["number_trees"],
            max_depth=best["max_tree_depth"],
            boosting=best["boosting_type"],
            learning_rate=best["learning_rate"],
            reg_alpha=best["l1_norm"],
            reg_lambda=best["l2_norm"],
            bagging_fraction=best["bagging_fraction"],
            bagging_freq=best["bagging_freq"],
        )
        best_model.fit(train_input_data, train_label)

        # save best model
        joblib.dump(best_model, output_dir_year + '/best_model_' + str(fold) + '_' + thresh + '.pkl')
        joblib.dump(study.best_params, f"{output_dir_year}/best_params_{fold}_{thresh}.pkl")

        # Save study for later visualization
        joblib.dump(study, f"{output_dir_year}/optuna_study_{fold}_{thresh}.pkl")

        summary = {
            "dataset": str(fold) + '_' + thresh,
            "fold": fold,
            "threshold": thresh,
            "model": 'lightgbm',
            "best_params": study.best_params,
            "best_optuna_loss": study.best_value
        }

        with open(f"{output_dir_year}/results_{fold}_{thresh}.json", "w") as f:
            json.dump(summary, f, indent=2)

[I 2025-08-10 13:58:07,191] A new study created in memory with name: no-name-e6be1af3-7dda-4c83-87e3-f39c56d2eff4
[I 2025-08-10 13:58:07,381] Trial 0 finished with value: 4402.328935167294 and parameters: {'number_trees': 39, 'max_tree_depth': 25, 'boosting_type': 'dart', 'bagging_fraction': 0.8098567612297991, 'bagging_freq': 8, 'learning_rate': 0.6817046838102404, 'l1_norm': 0.0006807423771225078, 'l2_norm': 0.0004960791354674878}. Best is trial 0 with value: 4402.328935167294.
[I 2025-08-10 13:58:07,467] Trial 1 finished with value: 4717.513548434876 and parameters: {'number_trees': 53, 'max_tree_depth': 4, 'boosting_type': 'dart', 'bagging_fraction': 0.45148821722566895, 'bagging_freq': 2, 'learning_rate': 0.5965868609083141, 'l1_norm': 0.0008082776553766769, 'l2_norm': 0.000910313683052072}. Best is trial 0 with value: 4402.328935167294.
[I 2025-08-10 13:58:09,267] Trial 2 finished with value: 4353.7504732308835 and parameters: {'number_trees': 287, 'max_tree_depth': 11, 'boosting