In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import copy
import numpy as np
from sklearn.model_selection import GroupKFold

### Data Loading

In [11]:
# Root directories of the two pre-split datasets. The loading cells below read
# the '/test', '/val' and '/train' sub-folders underneath each of them.
filepath_income = '../split_income_data'  # source of every object suffixed `_inc`
filepath_year = '../split_year_data'  # source of every object suffixed `_year`

In [12]:
# Held-out test split: features and labels, each without the 'Unnamed: 0' column.
test_data_x_inc = pd.read_csv(filepath_income + '/test/X_test.csv').drop(columns=['Unnamed: 0'])
test_data_y_inc = pd.read_csv(filepath_income + '/test/y_test.csv').drop(columns=['Unnamed: 0'])

# Validation folds, keyed by fold number: {fold: [X, y]}.
validation_inc = {}
for fold in range(5):
    fold_tag = str(fold)
    val_features = pd.read_csv(filepath_income + '/val/X_val_' + fold_tag + '.csv')
    val_labels = pd.read_csv(filepath_income + '/val/y_val_' + fold_tag + '.csv')
    validation_inc[fold] = [
        val_features.drop(columns=['Unnamed: 0']),
        val_labels.drop(columns=['Unnamed: 0']),
    ]

# Training folds, same {fold: [X, y]} layout; only the '_1' files are read here.
train_inc = {}
for fold in range(5):
    fold_tag = str(fold)
    train_features = pd.read_csv(filepath_income + '/train/X_train_' + fold_tag + '_1.csv')
    train_labels = pd.read_csv(filepath_income + '/train/y_train_' + fold_tag + '_1.csv')
    train_inc[fold] = [
        train_features.drop(columns=['Unnamed: 0']),
        train_labels.drop(columns=['Unnamed: 0']),
    ]

In [13]:
# Held-out test split: features and labels, each without the 'Unnamed: 0' column.
test_data_x_year = pd.read_csv(filepath_year + '/test/X_test.csv').drop(columns=['Unnamed: 0'])
test_data_y_year = pd.read_csv(filepath_year + '/test/y_test.csv').drop(columns=['Unnamed: 0'])

# Validation folds, keyed by fold number: {fold: [X, y]}.
validation_year = {}
for fold in range(5):
    fold_tag = str(fold)
    val_features = pd.read_csv(filepath_year + '/val/X_val_' + fold_tag + '.csv')
    val_labels = pd.read_csv(filepath_year + '/val/y_val_' + fold_tag + '.csv')
    validation_year[fold] = [
        val_features.drop(columns=['Unnamed: 0']),
        val_labels.drop(columns=['Unnamed: 0']),
    ]

# Training folds, same {fold: [X, y]} layout; only the '_1' files are read here.
train_year = {}
for fold in range(5):
    fold_tag = str(fold)
    train_features = pd.read_csv(filepath_year + '/train/X_train_' + fold_tag + '_1.csv')
    train_labels = pd.read_csv(filepath_year + '/train/y_train_' + fold_tag + '_1.csv')
    train_year[fold] = [
        train_features.drop(columns=['Unnamed: 0']),
        train_labels.drop(columns=['Unnamed: 0']),
    ]

In [14]:
import pickle

# Pickled correlation-based selections, listed in the order 60, 70, 80.
# NOTE(review): pickle.load can execute arbitrary code -- only point this at
# files produced by this project.
corr_pickle_paths = (
    '../feature_selection_models/corr_60.pkl',
    '../feature_selection_models/corr_70.pkl',
    '../feature_selection_models/corr_80.pkl',
)

loaded_selections = []
for pickle_path in corr_pickle_paths:
    with open(pickle_path, 'rb') as f:
        loaded_selections.append(pickle.load(f))

corr_60, corr_70, corr_80 = loaded_selections

### Thresholding Code

In [15]:
def drop_col(dataset, threshold):
    """Return ``dataset`` without the columns that are too sparse.

    A column is removed when its fraction of null values is strictly greater
    than ``threshold``. The input frame is left untouched and the result gets
    a fresh 0..n-1 row index.
    """
    # fraction of null cells per column, in column order
    nan_share = dataset.isnull().sum(axis=0) / dataset.shape[0]

    sparse_columns = [
        name
        for name, share in zip(dataset.columns, nan_share)
        if share > threshold
    ]

    return dataset.drop(sparse_columns, axis=1).reset_index(drop=True)

def drop_row(dataset, threshold):
    """Return ``dataset`` without the rows that are too sparse.

    A row is removed when its fraction of null values is strictly greater
    than ``threshold``.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame to filter; it is not modified.
    threshold : float
        Largest tolerated fraction of null values in a row.

    Returns
    -------
    pandas.DataFrame
        The surviving rows with a fresh 0..n-1 index.
    """
    # Fraction of null cells per row, computed for every row including the
    # first one (the previous loop started at position 1 and never checked it).
    nan_share = dataset.isnull().sum(axis=1) / dataset.shape[1]

    # `~(share > threshold)` rather than `share <= threshold` so that NaN
    # fractions (frame without columns) keep the row, as before.
    keep_mask = ~(nan_share > threshold)

    # Select by position through a plain boolean array so the result does not
    # depend on the frame carrying a default RangeIndex.
    rowthresh = dataset.loc[keep_mask.to_numpy()]
    return rowthresh.reset_index(drop=True)

In [16]:
def total_nan_proportion(dataset):
    """Return the fraction of all cells in ``dataset`` that are null."""
    n_rows, n_cols = dataset.shape
    missing_cells = dataset.isnull().sum().sum()
    return missing_cells / (n_rows * n_cols)

In [17]:
def iterative_thresholding(dataset_to_threshold, threshold):
    """Repeatedly drop sparse rows and columns until none exceed ``threshold``.

    Each pass calls ``drop_row`` and then ``drop_col``. Removing rows changes
    every column's missing-value fraction (and vice versa), so the pass is
    repeated for as long as some row or column is still above the threshold.

    Parameters
    ----------
    dataset_to_threshold : pandas.DataFrame
        Data to filter. It is not modified.
    threshold : float
        Largest tolerated fraction of missing values per row / per column.

    Returns
    -------
    tuple
        ``(dataset, proportion, changes)`` where ``dataset`` is the filtered
        frame, ``proportion`` is ``total_nan_proportion(dataset)`` and
        ``changes`` lists the change in that proportion for every pass that
        had to be followed by another one.

    Notes
    -----
    The working copy is created with ``reset_index()``, so the previous index
    is kept as an extra leading column ('index' for an unnamed index) in the
    returned frame.
    NOTE(review): that column is never null, so it is also counted in the
    row-wise fractions computed by ``drop_row`` -- confirm this is intended.
    """
    last_proportion = total_nan_proportion(dataset_to_threshold)
    current_proportion = last_proportion
    # reset_index() already returns a new frame, so no deepcopy is needed
    dataset_copy = dataset_to_threshold.reset_index()
    changes = []

    while True:
        shape_before = dataset_copy.shape

        dataset_copy = drop_row(dataset_copy, threshold)
        dataset_copy = drop_col(dataset_copy, threshold)
        current_proportion = total_nan_proportion(dataset_copy)

        # Has dropping rows and/or columns pushed other rows/columns over the
        # threshold? Every row is checked, including the first one (the old
        # positional loop started at 1 and skipped it).
        null_mask = dataset_copy.isnull()
        col_share = null_mask.sum(axis=0) / dataset_copy.shape[0]
        row_share = null_mask.sum(axis=1) / dataset_copy.shape[1]
        breached = bool((col_share > threshold).any() or (row_share > threshold).any())

        # Stop when everything is within the threshold, or when a full pass
        # removed nothing: repeating it could not change the outcome and the
        # loop would never end.
        made_progress = dataset_copy.shape != shape_before
        if not breached or not made_progress:
            break

        changes.append(current_proportion - last_proportion)
        last_proportion = current_proportion

    return dataset_copy, current_proportion, changes


### Cross-Validation Correlation

#### Split by Country

##### Getting Correlation Based Subsets of Folds

In [18]:
# Columns of the income-split test features that also appear in each
# correlation-based selection.
test_inc_60_col = test_data_x_inc.columns.intersection(list(corr_60))
test_inc_70_col = test_data_x_inc.columns.intersection(list(corr_70))
test_inc_80_col = test_data_x_inc.columns.intersection(list(corr_80))

# Test features restricted to those columns.
test_inc_60 = test_data_x_inc.loc[:, test_inc_60_col]
test_inc_70 = test_data_x_inc.loc[:, test_inc_70_col]
test_inc_80 = test_data_x_inc.loc[:, test_inc_80_col]

In [19]:
# layout: {'60' | '70' | '80': {fold: [x, y]}} with folds 0..4
v_fold_60_inc, v_fold_70_inc, v_fold_80_inc = {}, {}, {}

# each selection paired with the per-fold store it fills
inc_val_targets = (
    (corr_60, v_fold_60_inc),
    (corr_70, v_fold_70_inc),
    (corr_80, v_fold_80_inc),
)

for fold in range(5):
    fold_x = validation_inc[fold][0].copy()
    fold_y = validation_inc[fold][1].copy()

    for selection, per_fold in inc_val_targets:
        kept_cols = fold_x.columns.intersection(list(selection))
        # the same label frame is shared by all three subsets of this fold
        per_fold[fold] = [fold_x[kept_cols], fold_y]

validation_inc_corr = {
    '60': v_fold_60_inc,
    '70': v_fold_70_inc,
    '80': v_fold_80_inc,
}
        

In [20]:
# layout: {'60' | '70' | '80': {fold: [x, y]}} with folds 0..4
t_fold_60_inc, t_fold_70_inc, t_fold_80_inc = {}, {}, {}

# each selection paired with the per-fold store it fills
inc_train_targets = (
    (corr_60, t_fold_60_inc),
    (corr_70, t_fold_70_inc),
    (corr_80, t_fold_80_inc),
)

for fold in range(5):
    fold_x = train_inc[fold][0].copy()
    fold_y = train_inc[fold][1].copy()

    for selection, per_fold in inc_train_targets:
        kept_cols = fold_x.columns.intersection(list(selection))
        # the same label frame is shared by all three subsets of this fold
        per_fold[fold] = [fold_x[kept_cols], fold_y]

train_inc_corr = {
    '60': t_fold_60_inc,
    '70': t_fold_70_inc,
    '80': t_fold_80_inc,
}
        

##### Thresholding

In [21]:
needed_thresholds = [0.85, 0.90, 0.95, 1]

# fold : [data, rows_left, col_left] repeated once per entry of
# needed_thresholds, in that order (85, 90, 95, 100)
folds_thresh_inc_60 = {fold: [] for fold in range(5)}
folds_thresh_inc_70 = {fold: [] for fold in range(5)}
folds_thresh_inc_80 = {fold: [] for fold in range(5)}

inc_60 = train_inc_corr['60']
inc_70 = train_inc_corr['70']
inc_80 = train_inc_corr['80']

# (training folds, result store) for each correlation-based subset
inc_threshold_jobs = (
    (inc_60, folds_thresh_inc_60),
    (inc_70, folds_thresh_inc_70),
    (inc_80, folds_thresh_inc_80),
)

for fold in range(5):
    for corr_folds, corr_results in inc_threshold_jobs:
        # features and labels side by side, so both are thresholded together
        fold_frame = pd.concat([corr_folds[fold][0], corr_folds[fold][1]], axis=1)

        for thresh in needed_thresholds:
            thresholded, _nan_share, _deltas = iterative_thresholding(fold_frame, thresh)
            corr_results[fold].extend(
                [thresholded, thresholded.shape[0], thresholded.shape[1]]
            )

#### Split by Year

##### Correlation based sub-folds

In [22]:
# Columns of the year-split test features that also appear in each
# correlation-based selection.
test_year_60_col = test_data_x_year.columns.intersection(list(corr_60))
test_year_70_col = test_data_x_year.columns.intersection(list(corr_70))
test_year_80_col = test_data_x_year.columns.intersection(list(corr_80))

# Test features restricted to those columns.
test_year_60 = test_data_x_year.loc[:, test_year_60_col]
test_year_70 = test_data_x_year.loc[:, test_year_70_col]
test_year_80 = test_data_x_year.loc[:, test_year_80_col]

In [23]:
# layout: {'60' | '70' | '80': {fold: [x, y]}} with folds 0..4
v_fold_60_year, v_fold_70_year, v_fold_80_year = {}, {}, {}

# each selection paired with the per-fold store it fills
year_val_targets = (
    (corr_60, v_fold_60_year),
    (corr_70, v_fold_70_year),
    (corr_80, v_fold_80_year),
)

for fold in range(5):
    fold_x = validation_year[fold][0].copy()
    fold_y = validation_year[fold][1].copy()

    for selection, per_fold in year_val_targets:
        kept_cols = fold_x.columns.intersection(list(selection))
        # the same label frame is shared by all three subsets of this fold
        per_fold[fold] = [fold_x[kept_cols], fold_y]

validation_year_corr = {
    '60': v_fold_60_year,
    '70': v_fold_70_year,
    '80': v_fold_80_year,
}
        

In [24]:
# layout: {'60' | '70' | '80': {fold: [x, y]}} with folds 0..4
t_fold_60_year, t_fold_70_year, t_fold_80_year = {}, {}, {}

# each selection paired with the per-fold store it fills
year_train_targets = (
    (corr_60, t_fold_60_year),
    (corr_70, t_fold_70_year),
    (corr_80, t_fold_80_year),
)

for fold in range(5):
    fold_x = train_year[fold][0].copy()
    fold_y = train_year[fold][1].copy()

    for selection, per_fold in year_train_targets:
        kept_cols = fold_x.columns.intersection(list(selection))
        # the same label frame is shared by all three subsets of this fold
        per_fold[fold] = [fold_x[kept_cols], fold_y]

train_year_corr = {
    '60': t_fold_60_year,
    '70': t_fold_70_year,
    '80': t_fold_80_year,
}
        

##### Thresholding

In [25]:
needed_thresholds = [0.85, 0.90, 0.95, 1]

# fold : [data, rows_left, col_left] repeated once per entry of
# needed_thresholds, in that order (85, 90, 95, 100)
folds_thresh_year_60 = {fold: [] for fold in range(5)}
folds_thresh_year_70 = {fold: [] for fold in range(5)}
folds_thresh_year_80 = {fold: [] for fold in range(5)}

year_60 = train_year_corr['60']
year_70 = train_year_corr['70']
year_80 = train_year_corr['80']

# (training folds, result store) for each correlation-based subset
year_threshold_jobs = (
    (year_60, folds_thresh_year_60),
    (year_70, folds_thresh_year_70),
    (year_80, folds_thresh_year_80),
)

for fold in range(5):
    for corr_folds, corr_results in year_threshold_jobs:
        # features and labels side by side, so both are thresholded together
        fold_frame = pd.concat([corr_folds[fold][0], corr_folds[fold][1]], axis=1)

        for thresh in needed_thresholds:
            thresholded, _nan_share, _deltas = iterative_thresholding(fold_frame, thresh)
            corr_results[fold].extend(
                [thresholded, thresholded.shape[0], thresholded.shape[1]]
            )

#### Saving

In [26]:
# Output roots for the `_inc` objects. The saving cells below append
# '/60', '/70' or '/80' plus a file name to each of them.
train_filepath_inc = '../fs_corr_income_data/train'
val_filepath_inc = '../fs_corr_income_data/val'
test_filepath_inc = '../fs_corr_income_data/test'

# Output roots for the `_year` objects, used the same way.
train_filepath_year = '../fs_corr_year_data/train'
val_filepath_year = '../fs_corr_year_data/val'
test_filepath_year = '../fs_corr_year_data/test'

In [27]:
########### test year set

# (features, X file, y file) per correlation subset; the labels are the same
# frame for all three.
year_test_exports = (
    (test_year_60, '/60/X_test.csv', '/60/y_test.csv'),
    (test_year_70, '/70/X_test.csv', '/70/y_test.csv'),
    (test_year_80, '/80/X_test.csv', '/80/y_test.csv'),
)

for test_features, x_file, y_file in year_test_exports:
    pd.DataFrame(test_features).to_csv(test_filepath_year + x_file)
    pd.DataFrame(test_data_y_year).to_csv(test_filepath_year + y_file)


In [28]:
########### test country set

# (features, X file, y file) per correlation subset; the labels are the same
# frame for all three.
inc_test_exports = (
    (test_inc_60, '/60/X_test.csv', '/60/y_test.csv'),
    (test_inc_70, '/70/X_test.csv', '/70/y_test.csv'),
    (test_inc_80, '/80/X_test.csv', '/80/y_test.csv'),
)

for test_features, x_file, y_file in inc_test_exports:
    pd.DataFrame(test_features).to_csv(test_filepath_inc + x_file)
    pd.DataFrame(test_data_y_inc).to_csv(test_filepath_inc + y_file)

In [29]:
########### validation set (year split)
v_year_60 = validation_year_corr['60']
v_year_70 = validation_year_corr['70']
v_year_80 = validation_year_corr['80']

# (folds, X file prefix, y file prefix) per correlation subset
year_val_exports = (
    (v_year_60, '/60/X_val_', '/60/y_val_'),
    (v_year_70, '/70/X_val_', '/70/y_val_'),
    (v_year_80, '/80/X_val_', '/80/y_val_'),
)

for fold in range(5):
    file_suffix = str(fold) + '.csv'
    for corr_folds, x_prefix, y_prefix in year_val_exports:
        val_input = corr_folds[fold][0]
        val_labels = corr_folds[fold][1]
        pd.DataFrame(val_input).to_csv(val_filepath_year + x_prefix + file_suffix)
        pd.DataFrame(val_labels).to_csv(val_filepath_year + y_prefix + file_suffix)


In [30]:
########### validation set (country split)
v_inc_60 = validation_inc_corr['60']
v_inc_70 = validation_inc_corr['70']
v_inc_80 = validation_inc_corr['80']

# (folds, X file prefix, y file prefix) per correlation subset
inc_val_exports = (
    (v_inc_60, '/60/X_val_', '/60/y_val_'),
    (v_inc_70, '/70/X_val_', '/70/y_val_'),
    (v_inc_80, '/80/X_val_', '/80/y_val_'),
)

for fold in range(5):
    file_suffix = str(fold) + '.csv'
    for corr_folds, x_prefix, y_prefix in inc_val_exports:
        val_input = corr_folds[fold][0]
        val_labels = corr_folds[fold][1]
        pd.DataFrame(val_input).to_csv(val_filepath_inc + x_prefix + file_suffix)
        pd.DataFrame(val_labels).to_csv(val_filepath_inc + y_prefix + file_suffix)


In [31]:
########### train set (year split)
t_year_60 = train_year_corr['60']
t_year_70 = train_year_corr['70']
t_year_80 = train_year_corr['80']

# file-name tags, parallel to the thresholds used when thresholding
threshs = ['85', '90', '95', '1']

# label column that is split off from the thresholded frames
year_target_col = 'Maternal mortality ratio (national estimate, per 100,000 live births)'

# (thresholding results, X file prefix, y file prefix) per correlation subset
year_train_exports = (
    (folds_thresh_year_60, '/60/X_train_', '/60/y_train_'),
    (folds_thresh_year_70, '/70/X_train_', '/70/y_train_'),
    (folds_thresh_year_80, '/80/X_train_', '/80/y_train_'),
)

for fold in range(5):
    for idx, thresh in enumerate(threshs):
        # each threshold occupies three slots (data, rows, cols); the data is first
        idx_multiple = idx*3
        file_suffix = str(fold) + '_' + thresh + '.csv'

        for corr_results, x_prefix, y_prefix in year_train_exports:
            thresholded = corr_results[fold][idx_multiple]
            # features: everything except the label and the 'index' column
            train_input = thresholded.drop([year_target_col], axis=1).drop(columns=['index'])
            train_labels = thresholded[[year_target_col]]
            pd.DataFrame(train_input).to_csv(train_filepath_year + x_prefix + file_suffix)
            pd.DataFrame(train_labels).to_csv(train_filepath_year + y_prefix + file_suffix)

In [32]:
########### train set (country split)
t_inc_60 = train_inc_corr['60']
t_inc_70 = train_inc_corr['70']
t_inc_80 = train_inc_corr['80']

# file-name tags, parallel to the thresholds used when thresholding
threshs = ['85', '90', '95', '1']

# label column that is split off from the thresholded frames
inc_target_col = 'Maternal mortality ratio (national estimate, per 100,000 live births)'

# (thresholding results, X file prefix, y file prefix) per correlation subset
inc_train_exports = (
    (folds_thresh_inc_60, '/60/X_train_', '/60/y_train_'),
    (folds_thresh_inc_70, '/70/X_train_', '/70/y_train_'),
    (folds_thresh_inc_80, '/80/X_train_', '/80/y_train_'),
)

for fold in range(5):
    for idx, thresh in enumerate(threshs):
        # each threshold occupies three slots (data, rows, cols); the data is first
        idx_multiple = idx*3
        file_suffix = str(fold) + '_' + thresh + '.csv'

        for corr_results, x_prefix, y_prefix in inc_train_exports:
            thresholded = corr_results[fold][idx_multiple]
            # features: everything except the label and the 'index' column
            train_input = thresholded.drop([inc_target_col], axis=1).drop(columns=['index'])
            train_labels = thresholded[[inc_target_col]]
            pd.DataFrame(train_input).to_csv(train_filepath_inc + x_prefix + file_suffix)
            pd.DataFrame(train_labels).to_csv(train_filepath_inc + y_prefix + file_suffix)

### Cross-Validation Model Selection

#### Split by Country

#### Split by Year

### Cross-Validation Literature Selection

#### Split by Country

#### Split by Year