In [74]:
import seaborn as sb
import matplotlib.pyplot as plt
import pandas as pd
import sklearn as sk
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error
from sklearn import metrics
from sklearn.model_selection import cross_validate, RandomizedSearchCV, RepeatedStratifiedKFold
from sklearn.utils import resample
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.inspection import permutation_importance
from scipy.stats import loguniform
from imblearn.over_sampling import RandomOverSampler
from sklearn.metrics import confusion_matrix

## read data 
CIP_data = pd.read_csv("CIP_data_encode_prev.csv")
CIP_data_no_drop = pd.read_csv("CIP_data_encode_prev_not_dropped.csv")
print(CIP_data_no_drop.columns)

Index(['Unnamed: 0.1', 'Unnamed: 0', 'CLINIC', 'YEAR', 'GENDERSP',
       'Susceptible', 'MSM', 'MSMW', 'MSW', 'Oth/Unk/Missing', 'REGION',
       'Midwest', 'Northeast', 'Southeast', 'Southwest', 'West', 'PREV_REGION',
       'PREV_CLINIC'],
      dtype='object')


In [64]:
## FOR GRANT WE WANT ALL DATA 
all_data = pd.read_csv("GISP20002019.csv")
total_obs_all_data = all_data["TOTAL"].sum()
print(total_obs_all_data)
all_data.iloc[:, 5:len(all_data.columns)] = all_data.iloc[:, 5:len(all_data.columns)].apply(pd.to_numeric, errors='coerce')


112487


  all_data.iloc[:, 5:len(all_data.columns)] = all_data.iloc[:, 5:len(all_data.columns)].apply(pd.to_numeric, errors='coerce')


In [53]:
## To get summary stats, use original data 
CIP_data_full = pd.read_csv("CIP_Resistant_disagregated.csv")
print(CIP_data_full.columns)
total_obs = len(CIP_data_full)
print(total_obs)
#####
## initial stats for regional data 
#####
west = ['POR', 'PHX', 'HON', 'SDG', 'SFO', 'ANC', 'SEA', 'DEN', 'LVG', 'ORA', 'LBC', 'SLC', 'LAX']
southwest = ['OKC','MIN', 'ALB', 'DAL']
midwest = ['KCY','CHI', 'PON', 'CIN', 'JAC', 'IND', 'STL','DTR', 'MIL', 'COL', 'CLE']
southeast = ['GRB', 'NOR','WDC','MIA', 'BHM','FBG','ATL', 'RIC']
northeast = ['BUF','BOS', 'CAM', 'NYC', 'BAL', 'PHI']

CIP_data_full['REGION'] = CIP_data_full['CLINIC'].apply(lambda x: 
    'West' if (x in west) else (
        'Southwest' if (x in southwest) else(
            'Midwest' if (x in midwest) else(
                'Southeast' if (x in southeast) else(
                    'Northeast' if (x in northeast) else 'Other'))))) #drug_combinations = ['TetI']



regions = CIP_data_full['REGION'].unique()
regional_average = []
for region in regions:
    regional_data =len(CIP_data_full.loc[CIP_data_full['REGION'] == region])
    regional_average.append({region,(regional_data/total_obs)*100}) 
 
print(regional_average)
#####
## initial stats for gender data
#####

gendersps = CIP_data_full['GENDERSP'].unique()
gendersp_average = []
for gendersp in gendersps:
    gendersp_data =len(CIP_data_full.loc[CIP_data_full['GENDERSP'] == gendersp])
    gendersp_average.append({gendersp,(gendersp_data/total_obs)*100}) 
 
print(gendersp_average)


Index(['Unnamed: 0', 'CLINIC', 'YEAR', 'GENDERSP', 'Susceptible'], dtype='object')
112487
[{'Southwest', 12.560562553895116}, {'West', 35.42098198013993}, {17.462462328980237, 'Southeast'}, {'Northeast', 11.79602976343933}, {'Midwest', 22.75996337354539}]
[{'MSW', 70.61527109799354}, {22.368807062149404, 'MSM'}, {4.299163458888583, 'MSMW'}, {'Oth/Unk/Missing', 2.7167583809684674}]


In [68]:
### % resistant to CIP 
1 - sum(CIP_data_full["Susceptible"])/total_obs



####
all_resistance_to_CIPR = all_data[['CipR_PenR_TetR', 'CipR_PenI_TetI', 'CipR_TetR_PenI', 'CipR_PenI',
       'CipR_PenR_TetI', 'AziRS_CipR_PenR_TetR', 'CipR_PenR',
       'AziRS_CipR_TetR_PenI', 'CfxRS_CipR_PenR_TetR', 'AziRS_CipR_PenI_TetI',
       'CipR_TetI', 'CipR', 'CipR_TetR', 'CfxRS_CipR_TetR_PenI',
       'AziRS_CipR_PenR_TetI', 'CroRS_CfxRS_CipR_PenR_TetR']].sum(axis = 1) 

print(all_resistance_to_CIPR.sum()/total_obs)
print(all_resistance_to_CIPR.sum())

###14.45%?
### 0.14488785370753954

0.14488785370753954
16298.0


In [81]:
## permutation importance on test data 
model_nn = MLPClassifier(solver = 'lbfgs', activation = 'tanh', max_iter = 3000 ,hidden_layer_sizes= 12, alpha = 1.291549665014884, random_state=10, learning_rate = 'adaptive' )
#train data: 2000 - 2009
train_data = CIP_data_no_drop.loc[CIP_data_no_drop['YEAR'].isin([2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009])]
X_train = train_data[['MSM','MSMW', 'MSW', 'Oth/Unk/Missing','Northeast', 'Southeast', 'Southwest', 'West', 'Midwest','PREV_REGION', 'PREV_CLINIC']]
y_train = train_data['Susceptible']

oversample = RandomOverSampler(sampling_strategy = 0.5)
X_train, y_train = oversample.fit_resample(X_train,y_train)
model_fit = model_nn.fit(X_train, y_train)

#test data: 2010 - 2019 
test_data = CIP_data_no_drop.loc[CIP_data_no_drop['YEAR'].isin([2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019])]
X_test = test_data[['MSM','MSMW', 'MSW', 'Oth/Unk/Missing','Northeast', 'Southeast', 'Southwest', 'West', 'Midwest','PREV_REGION', 'PREV_CLINIC']]
y_test = test_data['Susceptible']
oversample = RandomOverSampler(sampling_strategy = 0.5)
X_test, y_test = oversample.fit_resample(X_test,y_test)

y_predict = model_nn.predict(X_test)

ROC_AUC_neural_network_apparent = metrics.roc_auc_score(y_test, y_predict)



In [84]:
print(ROC_AUC_neural_network_apparent)
perm = PermutationImportance(model_fit, random_state=1).fit(X_test,y_test)
eli5.show_weights(perm, feature_names = X_test.columns.tolist())

0.6200686486423123


Weight,Feature
0.0635  ± 0.0026,PREV_CLINIC
0.0173  ± 0.0016,PREV_REGION
0.0062  ± 0.0018,West
0.0052  ± 0.0027,MSW
0.0042  ± 0.0016,Southwest
0.0034  ± 0.0007,MSMW
0.0010  ± 0.0019,MSM
0.0001  ± 0.0002,Oth/Unk/Missing
-0.0009  ± 0.0019,Southeast
-0.0012  ± 0.0007,Midwest


West    0.062 +/- 0.001
Southwest0.057 +/- 0.001
MSMW    0.054 +/- 0.001
Southeast0.038 +/- 0.001
Midwest 0.017 +/- 0.001
Oth/Unk/Missing0.015 +/- 0.000
Northeast0.010 +/- 0.000
MSW     0.003 +/- 0.000
MSM     0.000 +/- 0.000


In [88]:
## now do hyperparameter tuning again post PI
space = dict()
space['activation'] = ['tanh', 'relu']
space['alpha'] = np.logspace(-1, 1, 10)
space['learning_rate'] = ['constant','adaptive']
space['hidden_layer_sizes'] = [(4), (6),(8), (12), (13)]

cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=10, random_state=1)
model_nn = MLPClassifier(solver = 'lbfgs', activation = 'tanh', max_iter = 3000 ,hidden_layer_sizes= 12, alpha = 1.291549665014884, random_state=10, learning_rate = 'adaptive' )

X = CIP_data_no_drop[['MSM','MSMW', 'MSW', 'Oth/Unk/Missing','Northeast', 'Southeast', 'Southwest', 'West', 'Midwest','PREV_REGION', 'PREV_CLINIC']]
y = CIP_data_no_drop['Susceptible']
oversample = RandomOverSampler(sampling_strategy = 0.5)
X, y = oversample.fit_resample(X,y)
model_fit = model_nn.fit(X, y)

search = RandomizedSearchCV(model_nn, space, scoring='roc_auc', n_jobs=-1, cv=cv, random_state=1)
result = search.fit(X, y)
# summarize result
print('Best Score: %s' % result.best_score_)
print('Best Hyperparameters: %s' % result.best_params_)

#Best Score: 0.8012056228019937
#Best Hyperparameters: {'learning_rate': 'constant', 'hidden_layer_sizes': 13, 'alpha': 1.291549665014884, 'activation': 'tanh'}

Best Score: 0.8012056228019937
Best Hyperparameters: {'learning_rate': 'constant', 'hidden_layer_sizes': 13, 'alpha': 1.291549665014884, 'activation': 'tanh'}


In [90]:
## now get ROC_AUC based on threshold of 0.5 with dropped dataset
model_nn = MLPClassifier(solver = 'lbfgs', activation = 'tanh', max_iter = 3000 ,hidden_layer_sizes= 13, alpha = 1.291549665014884, random_state=10, learning_rate = 'constant' )
#train data: 2000 - 2010 
train_data = CIP_data.loc[CIP_data['YEAR'].isin([2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009])]
X_train = train_data[['MSMW', 'MSW', 'Oth/Unk/Missing','Northeast', 'Southeast', 'Southwest', 'West', 'PREV_REGION', 'PREV_CLINIC']]
y_train = train_data['Susceptible']

oversample = RandomOverSampler(sampling_strategy = 0.5)
X_train, y_train = oversample.fit_resample(X_train,y_train)

#test data: 2011 - 2019 
test_data = CIP_data.loc[CIP_data['YEAR'].isin([2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019])]
X_test = test_data[['MSMW', 'MSW', 'Oth/Unk/Missing','Northeast', 'Southeast', 'Southwest', 'West', 'PREV_REGION', 'PREV_CLINIC']]
y_test = test_data['Susceptible']
oversample = RandomOverSampler(sampling_strategy = 0.5)
X_test, y_test = oversample.fit_resample(X_test,y_test)


#fit model on training data
model_fit = model_nn.fit(X_train, y_train)

#test data
y_predict = model_fit.predict(X_test)

ROC_AUC_nn = metrics.roc_auc_score(y_predict, y_test)


print('ROC_AUC_nn_apparent:', ROC_AUC_nn) 
#0.5677

In [None]:
## now get ROC_AUC based on threshold of 0.5 with non-dropped dataset
model_nn = MLPClassifier(solver = 'lbfgs', activation = 'tanh', max_iter = 3000 ,hidden_layer_sizes= 13, alpha = 1.291549665014884, random_state=10, learning_rate = 'constant' )
#train data: 2000 - 2010 
## permutation importance on test data 
model_nn = MLPClassifier(solver = 'lbfgs', activation = 'tanh', max_iter = 3000 ,hidden_layer_sizes= 12, alpha = 1.291549665014884, random_state=10, learning_rate = 'adaptive' )
#train data: 2000 - 2009
train_data = CIP_data_no_drop.loc[CIP_data_no_drop['YEAR'].isin([2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009])]
X_train = train_data[['MSM','MSMW', 'MSW', 'Oth/Unk/Missing','Northeast', 'Southeast', 'Southwest', 'West', 'Midwest','PREV_REGION', 'PREV_CLINIC']]
y_train = train_data['Susceptible']

oversample = RandomOverSampler(sampling_strategy = 0.5)
X_train, y_train = oversample.fit_resample(X_train,y_train)

#test data: 2010 - 2019 
test_data = CIP_data_no_drop.loc[CIP_data_no_drop['YEAR'].isin([2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019])]
X_test = test_data[['MSM','MSMW', 'MSW', 'Oth/Unk/Missing','Northeast', 'Southeast', 'Southwest', 'West', 'Midwest','PREV_REGION', 'PREV_CLINIC']]
y_test = test_data['Susceptible']
oversample = RandomOverSampler(sampling_strategy = 0.5)
X_test, y_test = oversample.fit_resample(X_test,y_test)

#fit model on training data
model_fit = model_nn.fit(X_train, y_train)

#test data
y_predict = model_fit.predict(X_test)

ROC_AUC_nn = metrics.roc_auc_score(y_predict, y_test)


print('ROC_AUC_nn_apparent:', ROC_AUC_nn) 

In [None]:
### now look at response of sensitivity and specificity to classification threshold 
### using split data - do not do bootstrapping
##model
model_nn = MLPClassifier(solver = 'lbfgs', activation = 'tanh', max_iter = 3000 ,hidden_layer_sizes= 12, alpha = 1.291549665014884, random_state=10, learning_rate = 'adaptive' )
#train data: 2000 - 2010 
train_data = CIP_data.loc[CIP_data['YEAR'].isin([2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010])]
X_train = train_data[['MSMW', 'MSW', 'Oth/Unk/Missing','Northeast', 'Southeast', 'Southwest', 'West', 'PREV_REGION', 'PREV_CLINIC']]
y_train = train_data['Susceptible']

oversample = RandomOverSampler(sampling_strategy = 0.5)
X_train, y_train = oversample.fit_resample(X_train,y_train)

#test data: 2011 - 2019 
test_data = CIP_data.loc[CIP_data['YEAR'].isin([2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019])]
X_test = test_data[['MSMW', 'MSW', 'Oth/Unk/Missing','Northeast', 'Southeast', 'Southwest', 'West', 'PREV_REGION', 'PREV_CLINIC']]
y_test = test_data['Susceptible']
oversample = RandomOverSampler(sampling_strategy = 0.5)
X_test, y_test = oversample.fit_resample(X_test,y_test)

#loop setup
threshold_seq = [0,0.05,0.1,0.15,0.2,0.25,0.3,0.35,0.4,0.45,0.5,0.55,0.6,0.65,0.7,0.75,0.8,0.85,0.9,0.95,1]
#only using split data - can't have optimisation-corrected metrics
#sensitivity_optimised = []
#specificity_optimised = [] #no bootstrapping, no 95% CI 

sensitivity_test_threshold = []
specificity_test_threshold = [] #no bootstrapping, no 95% CI 

for threshold in threshold_seq:
  print(threshold)
  bootstrapped_stats = []
  #1. Create model using all data and get the apparent sensitivity and specificty 
  train_data = CIP_data.loc[CIP_data['YEAR'].isin([2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010])]
  X_train = train_data[['MSMW', 'MSW', 'Oth/Unk/Missing','Northeast', 'Southeast', 'Southwest', 'West', 'PREV_REGION', 'PREV_CLINIC']]
  y_train = train_data['Susceptible']

  oversample = RandomOverSampler(sampling_strategy = 0.5)
  X_train, y_train = oversample.fit_resample(X_train,y_train)

  model_fit_train = model_nn.fit(X_train, y_train)

  ### Don't have to do apparent sensitivity and specificity, just get one estimate
  # apparent sensitivity and specificity 

  #y_predict_train = model_fit_train.predict(X_train)
  #y_predict_proba_train = model_fit.predict_proba(X_train)
 
  #y_predict_train = np.where(y_predict_proba_train[:, 1] > threshold, 1, 0)

  #tn_apparent , fp_apparent, fn_apparent, tp_apparent = confusion_matrix(y_true=y_train, y_pred=y_predict_train).ravel()

  #sensitivity_apparent = tp_apparent / (tp_apparent  + fn_apparent )
  #specificity_apparent  = tn_apparent / (tn_apparent + fp_apparent )

  #2. Test model on training data to get test specificity and sensitivity 
  y_predict = model_fit_train.predict(X_test)
  y_predict_proba = model_fit.predict_proba(X_test)
 
  y_predict_test = np.where(y_predict_proba[:, 1] > threshold, 1, 0)

  tn_test , fp_test , fn_test , tp_test  = confusion_matrix(y_true=y_test, y_pred=y_predict_test).ravel()

  sensitivity_test  = tp_test  / (tp_test   + fn_test )
  specificity_test   = tn_test / (tn_test + fp_test )
  
  sensitivity_test_threshold.append(sensitivity_test)
  specificity_test_threshold.append(specificity_test)
  #3. Get optimised sensitivity and specificity 
  #specificity_optimised = specificity_apparent - specificity_test ##
  #sensitivity_optimised = sensitivity_apparent - sensitivity_test ##
  #sensitivity_optimised.append(sensitivity_optimised)
  #specificity_optimised.append(specificity_optimised)

  
