In [None]:
# Make modules in the parent directory importable from this notebook.
import sys
sys.path.append('../')
# (Re)load the autoreload extension and enable mode 2, so edited modules are
# re-imported automatically before each cell executes.
%reload_ext autoreload 
%autoreload 2

In [None]:
# Explicit import: later cells call pd.read_csv, and `pd` was previously only
# bound if the wildcard import below happened to re-export it.
import pandas as pd

from classification_class import Classification
# NOTE(review): the helper functions used in later cells (target,
# prepare_target_classes, predictions_proba, ...) are presumably supplied by
# this wildcard import -- consider importing them by name.
from cross_country_functions import *

### Prepare the data:

In [None]:
# Load the feature table, using the first CSV column as the index.
features_csv = '../data/nga/nga_features.csv'
data_all = pd.read_csv(features_csv, index_col=0)
# Cast the index labels to strings
# (presumably so they match the target's index -- TODO confirm).
data_all.index = data_all.index.astype(str)

read the target of interest, e.g., mimi_simple:

In [None]:
# Name of the target to model; it is passed to target(), prepare_target_classes()
# and Classification(), and interpolated into the result file names below.
# Allowed values: ['va_ai', 'fol_ai', 'vb12_ai', 'fe_ai', 'zn_ai', 'mimi_simple']
t = 'mimi_simple'

In [None]:
# Read the selected target `t` for the Nigerian survey from the shared
# Ethiopia/Nigeria targets file.
y = target(
    '../data/ethiopia_nigeria_targets.csv',
    t,
    survey_id='NGA_2018_LSS_v01_M',
)

prepare the target for classification:

In [None]:
# Convert the raw target into classification labels.
# NOTE(review): `y` is rebound from itself, so re-running this cell alone applies
# prepare_target_classes() to already-prepared labels -- re-run the cell that
# reads `y` first.
y = prepare_target_classes(y, t)

### Classification:

**Important:** use the random state and the best hyperparameters obtained after resampling 5 times:

In [None]:
# Performance file written for target `t`; the value in row 0 of its
# `best_random_state` column is reused as the seed.
perf_csv = '../data/results/perf_%s_NGA_undersampling_3.2_xgboost.csv' % t
best_random_state = pd.read_csv(perf_csv).loc[0, 'best_random_state']

In [None]:
# Build the classification helper from the prepared target and the features,
# seeded with the stored random state and configured for undersampling.
classification = Classification(
    y,
    data_all,
    type_target=t,
    random_state=best_random_state,
    sampling='undersampling',
    sampling_strategy=1,
)

In [None]:
# Build the dummy model (presumably a naive baseline to compare against -- TODO confirm).
dummy = classification.dummy_classification()

In [None]:
# Predictions of the dummy model, obtained through the same predictions() call
# that is later used for the trained model.
dummy_pred = classification.predictions(dummy)

In [None]:
# Performance indicators computed from the dummy model's predictions.
dummy_performance_indicators = classification.perf_ind_classification(dummy_pred)

Train the model with the best hyperparameters:

In [None]:
# Path to the best-hyperparameter file stored for target `t`; it is handed to
# xgbclassification_best_model() to obtain the model.
besthyper_csv = '../data/results/besthyper_%s_NGA_undersampling_3.2_XGBoost.csv' % t
model = classification.xgbclassification_best_model(besthyper_csv)

get the predictions:

In [None]:
# Predictions of the trained model.
# NOTE: `predictions` is rebound in the later cell that applies threshold_probability.
predictions = classification.predictions(model)

get the performances:

In [None]:
# Performance indicators computed from the trained model's predictions; a later
# cell adds the 'average_pre_recall' and 'rocauc_score' entries to this object.
performance_indicators = classification.perf_ind_classification(predictions)

In [None]:
# Probability threshold passed to predictions_proba() in the next cell; when it
# is None, that cell uses classification.predictions(model) instead.
threshold_probability = 0.5

In [None]:
# Dictionary that collects the adjusted performance indicators computed below.
adjusted_performance_indicators = {}

# Probabilities returned by classification.y_proba() for the trained model.
y_proba = classification.y_proba(model)

#---Performance---
# Use thresholded predictions when a threshold is set, otherwise the model's own
# predictions. `is not None` is the identity test for None (the original `!= None`
# goes through __ne__ and is not a reliable None check).
if threshold_probability is not None:
    predictions = predictions_proba(y_proba, threshold_probability)
else:
    predictions = classification.predictions(model)
# NOTE(review): `performance_indicators` was computed in an earlier cell, before
# `predictions` is rebound here, and it is not recomputed -- confirm whether the
# thresholded predictions are meant to feed it.

# Precision-recall curve and its average score; store the score alongside the
# other performance indicators.
array_precision, array_recall, average_pre_recall = calculates_precision_recall_auc(
    y_proba, classification, drop_intermediate=True
)
performance_indicators['average_pre_recall'] = average_pre_recall

# ROC curve and its AUC score; store the score alongside the other indicators.
array_fpr, array_tpr, rocauc_score = calculates_roc_auc(
    y_proba, classification, drop_intermediate=True
)
performance_indicators['rocauc_score'] = rocauc_score

# Adjusted ROC-AUC and precision-recall values, saved in
# adjusted_performance_indicators.
adjusted_roc_auc, adjusted_average_pre_recall = get_adjusted_values(
    classification, rocauc_score, average_pre_recall
)
adjusted_performance_indicators['adjusted_rocauc'] = adjusted_roc_auc
adjusted_performance_indicators['adjusted_average_pre_recall'] = adjusted_average_pre_recall

### Variable importance:

get the shap values:

In [None]:
# SHAP values for the trained model, as returned by classification.shap_values().
shap_values=classification.shap_values(model)

Convert the coded variable names into meaningful variable names using the LSMS information:

In [None]:
# Table describing each feature; the first CSV column becomes the index.
explanations = pd.read_csv('../data/features_explanations.csv', index_col=0)
# Build the {codename: explanation} lookup used to rename feature columns.
codename_to_explanation = explanations.set_index('codename')['explanation']
dict_all = codename_to_explanation.to_dict()

In [None]:
# Training features held by the classification object under the 'X_train' key.
trainset = classification.train_test['X_train']

In [None]:
# Replace the coded feature column names using the codename -> explanation mapping.
# NOTE(review): `trainset` is rebound from itself, and it is not visible here
# whether replace_features_col_names() modifies its argument in place (which
# would also rename classification.train_test['X_train']) -- TODO confirm.
trainset = replace_features_col_names(trainset, dict_all)

In [None]:
# SHAP summary plot for the trained model over the renamed training features;
# the figure is not saved (save=False).
classification.shap_summary_plot(
    model,
    trainset,
    title="Feature importance - risk of inadequate overall intake (NGA)",
    display=10,
    iso3='NGA',
    titlefontsize=16,
    save=False,
)