In [38]:
##### Investigate why LOOCV performs worse on clinics with a high prevalence of CIP R
#%reset
import seaborn as sb
import matplotlib.pyplot as plt
import pandas as pd
import sklearn as sk
import numpy as np
import os 
from matplotlib.patches import Polygon
from sklearn import metrics
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_validate, RandomizedSearchCV, RepeatedStratifiedKFold
from sklearn.inspection import permutation_importance
from scipy.stats import loguniform
from imblearn.over_sampling import RandomOverSampler
from sklearn.metrics import confusion_matrix
from eli5.sklearn import PermutationImportance
from sklearn.metrics import f1_score, matthews_corrcoef, roc_auc_score
from Functions_AMR_gonorrhea import effective_unnecessary_threshold, get_best_hyperparameters, get_best_features, get_test_train_data, get_feature_effects, f1_mcc_score_threshold
hfont = {'fontname':'Helvetica'}
import pickle
## Load the encoded CIP dataset and list its columns
cip_csv_path = "CIP_data_encode_prev_not_dropped.csv"
CIP_data_no_drop = pd.read_csv(cip_csv_path)
print(CIP_data_no_drop.columns)

Index(['Unnamed: 0.4', 'Unnamed: 0.3', 'Unnamed: 0.2', 'Unnamed: 0.1',
       'Unnamed: 0', 'CLINIC', 'YEAR', 'GENDERSP', 'Susceptible', 'MSM',
       'MSMW', 'MSW', 'Oth/Unk/Missing', 'REGION', 'Midwest', 'Northeast',
       'Southeast', 'Southwest', 'West', 'PREV_REGION', 'PREV_CLINIC',
       'DELTA_REGION', 'DELTA_CLINIC'],
      dtype='object')


In [39]:
############# data
# Per-test-year (2005-2010) lookup tables for the three model families:
# lr, rf and nn. Each table is keyed by the test year.

#nn_data

# Hyperparameters. For lr only C varies between years; the rf and nn settings
# are the same for every year, so they are generated rather than repeated.
best_hyperparameters_by_year_lr = {
    year: {'solver': 'liblinear', 'penalty': 'l1', 'C': C_value}
    for year, C_value in [
        (2005, 54.85),
        (2006, 8.65),
        (2007, 35.730000000000004),
        (2008, 35.730000000000004),
        (2009, 8.65),
        (2010, 8.65),
    ]
}
best_hyperparameters_by_year_rf = {
    year: {'n_estimators': 181, 'min_samples_split': 20, 'min_samples_leaf': 5, 'max_depth': 99}
    for year in range(2005, 2011)
}
best_hyperparameters_by_year_nn = {
    year: {'solver': 'adam', 'learning_rate': 'adaptive', 'hidden_layer_sizes': (13,), 'alpha': 0.12080421346773286, 'activation': 'relu'}
    for year in range(2005, 2011)
}

# Feature lists (column names of the CIP dataset), one list per test year.
best_features_by_year_lr = {
    2005: ['PREV_CLINIC', 'MSW', 'DELTA_CLINIC', 'PREV_REGION', 'MSM', 'Northeast', 'Oth/Unk/Missing'],
    2006: ['DELTA_REGION', 'PREV_CLINIC', 'DELTA_CLINIC', 'MSW', 'PREV_REGION', 'Oth/Unk/Missing', 'MSM', 'Southwest', 'Southeast'],
    2007: ['MSW', 'PREV_CLINIC', 'MSM', 'Oth/Unk/Missing'],
    2008: ['PREV_CLINIC', 'DELTA_CLINIC', 'West', 'MSW', 'MSM', 'PREV_REGION', 'MSMW', 'Oth/Unk/Missing'],
    2009: ['PREV_CLINIC', 'Oth/Unk/Missing', 'DELTA_CLINIC'],
    2010: ['MSW', 'MSM', 'DELTA_CLINIC', 'PREV_CLINIC', 'Oth/Unk/Missing', 'West', 'Southwest', 'MSMW', 'DELTA_REGION', 'Southeast', 'PREV_REGION', 'Northeast'],
}
best_features_by_year_rf = {
    2005: ['MSW', 'DELTA_CLINIC', 'MSM', 'PREV_CLINIC', 'West', 'PREV_REGION', 'DELTA_REGION', 'Oth/Unk/Missing', 'MSMW'],
    2006: ['MSW', 'MSM', 'DELTA_CLINIC', 'PREV_CLINIC', 'MSMW', 'Northeast', 'Southeast', 'Oth/Unk/Missing'],
    2007: ['MSW', 'PREV_CLINIC', 'MSM', 'Oth/Unk/Missing', 'DELTA_CLINIC', 'Southeast', 'MSMW', 'Northeast'],
    2008: ['MSW', 'PREV_CLINIC', 'MSM', 'West', 'Northeast', 'DELTA_CLINIC', 'Oth/Unk/Missing', 'DELTA_REGION', 'Southwest'],
    2009: ['PREV_CLINIC', 'MSW', 'DELTA_REGION', 'Oth/Unk/Missing', 'West', 'Northeast', 'Southeast'],
    2010: ['MSW', 'DELTA_CLINIC', 'MSM', 'PREV_CLINIC', 'PREV_REGION', 'Oth/Unk/Missing', 'West', 'Southeast', 'MSMW', 'DELTA_REGION', 'Northeast'],
}
best_features_by_year_nn = {
    2005: ['MSM', 'DELTA_CLINIC', 'MSW', 'Southwest', 'MSMW', 'DELTA_REGION', 'Oth/Unk/Missing'],
    2006: ['DELTA_CLINIC', 'MSM', 'PREV_CLINIC', 'MSW', 'DELTA_REGION', 'PREV_REGION', 'MSMW', 'Northeast', 'Oth/Unk/Missing'],
    2007: ['PREV_CLINIC', 'MSW', 'MSM', 'Southwest', 'MSMW'],
    2008: ['MSM', 'DELTA_CLINIC', 'PREV_CLINIC', 'Northeast', 'MSW', 'MSMW'],
    2009: ['PREV_CLINIC', 'MSM', 'DELTA_CLINIC', 'Southeast', 'DELTA_REGION', 'Oth/Unk/Missing', 'Southwest', 'West', 'Northeast'],
    2010: ['MSM', 'MSW', 'PREV_CLINIC', 'DELTA_CLINIC', 'PREV_REGION', 'Southeast', 'MSMW', 'Southwest', 'Oth/Unk/Missing', 'Northeast', 'West'],
}

# auROC recorded for each model family and test year.
ROC_by_year_lr = {
    2005: 0.7322853685805495,
    2006: 0.7423710317796873,
    2007: 0.7048918256421187,
    2008: 0.6968498187602681,
    2009: 0.639172181709682,
    2010: 0.6797716960932587,
}
ROC_by_year_rf = {
    2005: 0.7586099894081821,
    2006: 0.7418094620985698,
    2007: 0.7033306886765269,
    2008: 0.6939654992567867,
    2009: 0.679491422061069,
    2010: 0.6840913725423555,
}
ROC_by_year_nn = {
    2005: 0.7372748012808253,
    2006: 0.7324350027761852,
    2007: 0.6936770334581981,
    2008: 0.7043116801835867,
    2009: 0.6705138326697341,
    2010: 0.6835202587500921,
}


In [16]:
### Get the proportion of MSM and MSW in each test clinic, per year, split by whether
### the clinic's CIP-R prevalence is above the reference prevalence for that year

years = [2005, 2006, 2007, 2008, 2009, 2010]

# Results keyed by test year. The *_MSM_* / *_MSW_* lists hold one proportion per clinic;
# the "above" lists are in the same order as the clinic names in clinics_above_average_prevalnce_all.
clinics_above_average_prevalnce_all = {}
clinics_above_MSM_all = {}
clinics_below_MSM_all = {}
clinics_above_MSW_all = {}
clinics_below_MSW_all = {}

for year in years:
    years_train = np.array(range(year - 5, year))
    CIP_data_testing_years = CIP_data_no_drop.loc[CIP_data_no_drop['YEAR'].isin([year])]

    # Only the last returned value (the reference CIP-R prevalence) is needed here;
    # no model is fitted in this cell.
    *_, cipro_R_prev = get_test_train_data(
        CIP_data_no_drop = CIP_data_no_drop,
        year = year,
        feature_names = best_features_by_year_rf[year],
        years_train = years_train,
        model_type = 1,
    )

    clinics_above_average_prevalence = []
    test_data_MSM_above = []
    test_data_MSM_below = []
    test_data_MSW_above = []
    test_data_MSW_below = []

    for clinic in CIP_data_testing_years["CLINIC"].unique():
        test_data = CIP_data_testing_years.loc[CIP_data_testing_years['CLINIC'] == clinic]
        y_test = 1 - test_data['Susceptible']
        cipro_R = y_test.sum()/len(y_test)

        # Clinics with no resistant isolate in the test year are left out of both groups.
        if not cipro_R > 0:
            continue

        proportion_MSM = test_data["MSM"].sum()/len(test_data["MSM"])
        proportion_MSW = test_data["MSW"].sum()/len(test_data["MSW"])

        if cipro_R_prev < cipro_R:
            clinics_above_average_prevalence.append(clinic)
            test_data_MSM_above.append(proportion_MSM)
            test_data_MSW_above.append(proportion_MSW)
        else:
            test_data_MSM_below.append(proportion_MSM)
            test_data_MSW_below.append(proportion_MSW)

    clinics_above_average_prevalnce_all[year] = clinics_above_average_prevalence
    clinics_above_MSM_all[year] = test_data_MSM_above
    clinics_below_MSM_all[year] = test_data_MSM_below
    clinics_above_MSW_all[year] = test_data_MSW_above
    clinics_below_MSW_all[year] = test_data_MSW_below



Oversample
Oversample
Oversample
Oversample
Oversample
Oversample


In [24]:
# Print the 2010 results: the clinics in the above-average-prevalence group, then the
# MSM proportions of the clinics in the above and below groups.
year = 2010
for per_year_result in (clinics_above_average_prevalnce_all, clinics_above_MSM_all, clinics_below_MSM_all):
    print(per_year_result[year])


['ALB', 'CHI', 'DEN', 'HON', 'LAX', 'MIA', 'MIN', 'NOR', 'NYC', 'ORA', 'PHI', 'PHX', 'POR', 'SDG', 'SEA', 'SFO']
[0.26666666666666666, 0.36, 0.2384937238493724, 0.358974358974359, 0.6477272727272727, 0.19617224880382775, 0.4647887323943662, 0.102880658436214, 0.18686868686868688, 0.5192307692307693, 0.12837837837837837, 0.35678391959798994, 0.6226415094339622, 0.8097560975609757, 0.6386554621848739, 0.591304347826087]
[0.14634146341463414, 0.05737704918032787, 0.03007518796992481, 0.031088082901554404, 0.0660377358490566, 0.11824324324324324, 0.04081632653061224, 0.042328042328042326, 0.04666666666666667, 0.18, 0.055793991416309016, 0.14285714285714285]


In [None]:
## What if I add a feature to the dataset that is the proportion of MSM in each clinic?




In [34]:
## Proportion of MSM per clinic and year, added to the dataset as the MSM_CLINIC feature
# Every row receives sum(MSM) / number of rows of its own (YEAR, CLINIC) group.
# Grouping on YEAR as well matters because each year surveyed different clinics.
# Assumes MSM is a 0/1 indicator column - TODO confirm.
# NOTE(review): this previously stored 1 - proportion (the non-MSM share) under the
# MSM_CLINIC name; it now stores the MSM proportion itself. Re-run anything that
# used the old column.
MSM_by_year_clinic = CIP_data_no_drop.groupby(["YEAR", "CLINIC"])["MSM"]
CIP_data_no_drop["MSM_CLINIC"] = MSM_by_year_clinic.transform("sum") / MSM_by_year_clinic.transform("size")



In [None]:
### Try the neural network LOOCV again, this time adding MSM_CLINIC to the important features

In [36]:
############# data
# Per-test-year (2005-2010) lookup tables for the three model families
# (lr, rf, nn), with the per-clinic MSM proportion added to the lr feature lists.
#
# The added feature is spelled 'MSM_CLINIC' to match the column created on
# CIP_data_no_drop; pandas column lookup is case-sensitive, so the previous
# spelling 'MSM_Clinic' would raise KeyError when selecting these features.
# NOTE(review): only the lr lists contain MSM_CLINIC. The LOOCV cell reads
# best_features_by_year_nn, which does not include it - confirm whether it should.

#nn_data

# Hyperparameters. For lr only C varies between years; the rf and nn settings
# are the same for every year.
best_hyperparameters_by_year_lr = {
    year: {'solver': 'liblinear', 'penalty': 'l1', 'C': C_value}
    for year, C_value in [
        (2005, 54.85),
        (2006, 8.65),
        (2007, 35.730000000000004),
        (2008, 35.730000000000004),
        (2009, 8.65),
        (2010, 8.65),
    ]
}
best_hyperparameters_by_year_rf = {
    year: {'n_estimators': 181, 'min_samples_split': 20, 'min_samples_leaf': 5, 'max_depth': 99}
    for year in range(2005, 2011)
}
best_hyperparameters_by_year_nn = {
    year: {'solver': 'adam', 'learning_rate': 'adaptive', 'hidden_layer_sizes': (13,), 'alpha': 0.12080421346773286, 'activation': 'relu'}
    for year in range(2005, 2011)
}

# Feature lists (column names of the CIP dataset), one list per test year.
best_features_by_year_lr = {
    2005: ['MSM_CLINIC', 'PREV_CLINIC', 'MSW', 'DELTA_CLINIC', 'PREV_REGION', 'MSM', 'Northeast', 'Oth/Unk/Missing'],
    2006: ['MSM_CLINIC', 'DELTA_REGION', 'PREV_CLINIC', 'DELTA_CLINIC', 'MSW', 'PREV_REGION', 'Oth/Unk/Missing', 'MSM', 'Southwest', 'Southeast'],
    2007: ['MSM_CLINIC', 'MSW', 'PREV_CLINIC', 'MSM', 'Oth/Unk/Missing'],
    2008: ['MSM_CLINIC', 'PREV_CLINIC', 'DELTA_CLINIC', 'West', 'MSW', 'MSM', 'PREV_REGION', 'MSMW', 'Oth/Unk/Missing'],
    2009: ['MSM_CLINIC', 'PREV_CLINIC', 'Oth/Unk/Missing', 'DELTA_CLINIC'],
    2010: ['MSM_CLINIC', 'MSW', 'MSM', 'DELTA_CLINIC', 'PREV_CLINIC', 'Oth/Unk/Missing', 'West', 'Southwest', 'MSMW', 'DELTA_REGION', 'Southeast', 'PREV_REGION', 'Northeast'],
}
best_features_by_year_rf = {
    2005: ['MSW', 'DELTA_CLINIC', 'MSM', 'PREV_CLINIC', 'West', 'PREV_REGION', 'DELTA_REGION', 'Oth/Unk/Missing', 'MSMW'],
    2006: ['MSW', 'MSM', 'DELTA_CLINIC', 'PREV_CLINIC', 'MSMW', 'Northeast', 'Southeast', 'Oth/Unk/Missing'],
    2007: ['MSW', 'PREV_CLINIC', 'MSM', 'Oth/Unk/Missing', 'DELTA_CLINIC', 'Southeast', 'MSMW', 'Northeast'],
    2008: ['MSW', 'PREV_CLINIC', 'MSM', 'West', 'Northeast', 'DELTA_CLINIC', 'Oth/Unk/Missing', 'DELTA_REGION', 'Southwest'],
    2009: ['PREV_CLINIC', 'MSW', 'DELTA_REGION', 'Oth/Unk/Missing', 'West', 'Northeast', 'Southeast'],
    2010: ['MSW', 'DELTA_CLINIC', 'MSM', 'PREV_CLINIC', 'PREV_REGION', 'Oth/Unk/Missing', 'West', 'Southeast', 'MSMW', 'DELTA_REGION', 'Northeast'],
}
best_features_by_year_nn = {
    2005: ['MSM', 'DELTA_CLINIC', 'MSW', 'Southwest', 'MSMW', 'DELTA_REGION', 'Oth/Unk/Missing'],
    2006: ['DELTA_CLINIC', 'MSM', 'PREV_CLINIC', 'MSW', 'DELTA_REGION', 'PREV_REGION', 'MSMW', 'Northeast', 'Oth/Unk/Missing'],
    2007: ['PREV_CLINIC', 'MSW', 'MSM', 'Southwest', 'MSMW'],
    2008: ['MSM', 'DELTA_CLINIC', 'PREV_CLINIC', 'Northeast', 'MSW', 'MSMW'],
    2009: ['PREV_CLINIC', 'MSM', 'DELTA_CLINIC', 'Southeast', 'DELTA_REGION', 'Oth/Unk/Missing', 'Southwest', 'West', 'Northeast'],
    2010: ['MSM', 'MSW', 'PREV_CLINIC', 'DELTA_CLINIC', 'PREV_REGION', 'Southeast', 'MSMW', 'Southwest', 'Oth/Unk/Missing', 'Northeast', 'West'],
}

# auROC recorded for each model family and test year.
ROC_by_year_lr = {
    2005: 0.7322853685805495,
    2006: 0.7423710317796873,
    2007: 0.7048918256421187,
    2008: 0.6968498187602681,
    2009: 0.639172181709682,
    2010: 0.6797716960932587,
}
ROC_by_year_rf = {
    2005: 0.7586099894081821,
    2006: 0.7418094620985698,
    2007: 0.7033306886765269,
    2008: 0.6939654992567867,
    2009: 0.679491422061069,
    2010: 0.6840913725423555,
}
ROC_by_year_nn = {
    2005: 0.7372748012808253,
    2006: 0.7324350027761852,
    2007: 0.6936770334581981,
    2008: 0.7043116801835867,
    2009: 0.6705138326697341,
    2010: 0.6835202587500921,
}


In [40]:
### Leave-one-clinic-out validation for the MLP, one panel per test year
# For each test year: fit the MLP on the data returned by get_test_train_data and draw the
# resulting curve in black, then for every clinic in the test year refit on the training
# years without that clinic and draw that clinic's curve on the same panel.
# Polygon, MLPClassifier, RandomOverSampler, hfont, the best_*_nn / ROC_by_year_nn tables and
# the Functions_AMR_gonorrhea helpers come from earlier cells.

# Primary sampler for the per-clinic train and test sets.
oversample = RandomOverSampler(sampling_strategy = 0.5, random_state=42)
# Fallback used when the primary sampler raises ValueError on a clinic's test set
# (presumably because the minority share already exceeds 0.5 - confirm). It is a separate
# object so that one failing clinic does not replace the primary sampler for every
# clinic and year that follows.
oversample_fallback = RandomOverSampler(sampling_strategy = 'minority', random_state=1)

labels = ["A", "B", "C", "D", "E", "F" ]
threshold_seq = np.linspace(0,1,101)
years = [2005, 2006, 2007, 2008, 2009, 2010]

fig, axs = plt.subplots(2,3, figsize=(15, 9), facecolor='w', edgecolor='k', sharex = 'all', sharey = 'all')

# Tick-label size for the panels whose tick labels are visible (left column / bottom row).
for outer_ax in (axs[0,0], axs[1,0], axs[1,1], axs[1,2]):
    outer_ax.tick_params(axis='both', which='major', labelsize=18)

for left_ax in (axs[0,0], axs[1,0]):
    left_ax.set_ylabel("Unnecessarily received\n  dual therapy (%)", fontsize = 18)
    # Text kwargs are only accepted by set_yticks/set_xticks together with explicit labels
    # (newer matplotlib raises otherwise); the label size is handled by tick_params above.
    left_ax.set_yticks(np.linspace(0,100,6))
for bottom_ax in (axs[1,0], axs[1,1], axs[1,2]):
    bottom_ax.set_xlabel("Received effective\n treatment (%)", fontsize = 18)
    bottom_ax.set_xticks([45, 55, 65, 75, 85, 95, 100])

fig.subplots_adjust(hspace = .15, wspace=.1)
axs = axs.ravel()

for i, year in enumerate(years):
    ax = axs[i]
    features = best_features_by_year_nn[year]
    years_train = np.array(range(year - 5, year))

    CIP_data_training_years = CIP_data_no_drop.loc[CIP_data_no_drop['YEAR'].isin(years_train)]
    CIP_data_testing_years = CIP_data_no_drop.loc[CIP_data_no_drop['YEAR'].isin([year])]

    # ---- first do for all clinics ----
    test_data, train_data, X_train, y_train, X_test, y_test, cipro_R_prev  =  get_test_train_data(CIP_data_no_drop = CIP_data_no_drop, year = year, feature_names = features, years_train = years_train, model_type = 0)

    # Only hidden_layer_sizes and alpha are taken from the tuned hyperparameters;
    # solver and activation are fixed here.
    model_nn = MLPClassifier(solver = 'lbfgs', activation = 'tanh', max_iter = 5000 ,hidden_layer_sizes= best_hyperparameters_by_year_nn[year]['hidden_layer_sizes'], alpha =  best_hyperparameters_by_year_nn[year]['alpha'], random_state=10, learning_rate = 'adaptive' )

    model_fit_train = model_nn.fit(X_train, y_train)
    y_predict_proba = model_fit_train.predict_proba(X_test)
    _, _, get_effective_threshold_all, incorrectly_get_X_threshold_all = effective_unnecessary_threshold(threshold_seq, y_predict_proba, y_test, cipro_R_prev)

    # ---- then leave each clinic out in turn ----
    for clinic in CIP_data_testing_years["CLINIC"].unique():
        # train data - does not have clinic
        train_data = CIP_data_training_years.loc[CIP_data_training_years['CLINIC'] != clinic]
        # test data - has only clinic
        test_data = CIP_data_testing_years.loc[CIP_data_testing_years['CLINIC'] == clinic]
        X_test_clinic = test_data[features]
        y_test_clinic = 1 - test_data['Susceptible']
        cipro_R = y_test_clinic.sum()/len(y_test_clinic)

        # Clinics without any resistant isolate in the test year are not drawn.
        if not cipro_R > 0:
            continue

        # Colour by whether the clinic is above the reference prevalence cipro_R_prev.
        col = '#DBABBE' if cipro_R_prev < cipro_R else '#ffc07c'
        alpha_graph = 0.4

        X_train, y_train = oversample.fit_resample(train_data[features], 1 - train_data['Susceptible'])
        try:
            X_test, y_test = oversample.fit_resample(X_test_clinic, y_test_clinic)
        except ValueError:
            # Primary ratio not applicable to this clinic: balance both sets with the
            # fallback sampler instead (training set re-balanced on top of the primary pass).
            X_train, y_train = oversample_fallback.fit_resample(X_train, y_train)
            X_test, y_test = oversample_fallback.fit_resample(X_test_clinic, y_test_clinic)

        model_fit_train = model_nn.fit(X_train, y_train)
        # Probabilities must come from the model just refitted without this clinic and must
        # line up with this clinic's y_test; reusing an earlier y_predict_proba would not.
        y_predict_proba = model_fit_train.predict_proba(X_test)
        # The clinic's own prevalence is used here, matching the clinic markers drawn below.
        _, _, get_effective_threshold_clinic, incorrectly_get_X_threshold_clinic = effective_unnecessary_threshold(threshold_seq, y_predict_proba, y_test, cipro_R)

        ax.plot(get_effective_threshold_clinic, incorrectly_get_X_threshold_clinic,color = col, linewidth = 3, alpha=alpha_graph)
        ax.plot(100, (1 - cipro_R)*100, marker='s', ls='none', ms=12, color = col, alpha=0.3)
        ax.plot((1-cipro_R)*100, 0, marker='*', ls='none', ms=12, color = col, alpha=0.3)

    # ---- all-clinic curve and panel decoration ----
    ax.plot(get_effective_threshold_all, incorrectly_get_X_threshold_all,color = "black", linewidth = 3)
    ax.plot(100, (1 - cipro_R_prev)*100, marker='s', ls='none', ms=14, color = "black", label = "Dual")
    ax.plot((1-cipro_R_prev)*100, 0, marker='*', ls='none', ms=14, color = "black", label = "Cipro")

    # Fix the limits before placing text so every panel positions its labels identically.
    ax.set_ylim([0-5,101])
    ax.set_xlim([44, 101])
    ax.text(ax.get_xlim()[0] , ax.get_ylim()[1] + 5, labels[i], fontsize = 30, **hfont)
    ax.axvline(x = 95, color = 'black', linestyle="--")
    ax.text(ax.get_xlim()[0] + 5 , ax.get_ylim()[1] - 35, f'auROC: {round(ROC_by_year_nn[year], 3)}', fontsize = 16, **hfont)
    ax.text(ax.get_xlim()[0] + 5 , ax.get_ylim()[1] - 45, f'CIP-R: {round(cipro_R_prev*100, 3)}%', fontsize = 16, **hfont)
    ax.set_title(year,fontsize=20)

    # Shade the x-range from 95 to 100 (right of the dashed line).
    ax.add_patch(Polygon([(95,100), (100,100), (100,0), (95,0)], alpha = 0.4))

axs[0].legend(prop={'size':14})
plt.tight_layout()