In [2]:
import seaborn as sb
import matplotlib.pyplot as plt
import pandas as pd
import sklearn as sk
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error
from sklearn import metrics
from sklearn.model_selection import cross_validate, RandomizedSearchCV, RepeatedStratifiedKFold
from sklearn.utils import resample
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.inspection import permutation_importance
from scipy.stats import loguniform
from imblearn.over_sampling import RandomOverSampler
from sklearn.metrics import confusion_matrix

## load the encoded CIP dataset from its CSV file
cip_csv_path = "CIP_data_encode_prev_not_dropped.csv"
CIP_data_no_drop = pd.read_csv(cip_csv_path)


In [None]:
#𝑈(𝜆,𝑝)=𝜆𝑞(𝑝)−𝑐(𝑝)
#𝑈(𝜆,𝑝) = 𝜆𝑞(𝑝) − 𝑐(𝑝) = 𝜆[𝛼(𝑝) × 𝜇_{FLQ-R} + (1 − 𝜇_{FLQ-R})] − (1 − 𝛽(𝑝)) × (1 − 𝜇_{FLQ-R}).
#𝑝 is the classification threshold (𝑝=0 is equivalent to sensitivity 0% and specificity 100%, and 𝑝=1 is equivalent to sensitivity 100% and specificity 0%),
#𝑞(𝑝) is the expected proportion of individuals with RR-TB who receive effective treatment (i.e., a regimen that is consistent with the susceptibility of their M. tuberculosis strain to FLQs) if the classification threshold is set to 𝑝,
#𝑐(𝑝) is the expected proportion of individuals with RR-TB who unnecessarily receive DLM (instead of an FLQ) if the classification threshold is set to 𝑝; and
#𝜆 is a trade-off threshold that represents the policymaker’s willingness to accept an increase in the proportion of individuals who unnecessarily receive DLM (i.e., 𝑐(𝑝)) in order to increase the proportion of individuals who receive effective treatment (i.e., 𝑞(𝑝)).
#sensitivity 𝛼(𝑝) and specificity 𝛽(𝑝)
#mu is the prevalence of resistance


In [None]:
#So, what we would have is a utility value for the specific sensitivity and specificity obtained at a particular classification threshold.
#The prevalence of resistant strains is fixed by year, so we can't change the past; we can only say "at each time point, if we had done this, this is what would have happened".
# q_p = proportion that get effective treatment
# Do we have different lines for different years? Or do we just artificially say 'oh, if the prevalence was this much, and the threshold was this much, this is what the graph would be'? Maybe try that...



In [12]:
## function for getting utility 

def utility(lambda_to, sensitivity, specificity, prevalence):
    """Return the policy utility U(lambda, p) = lambda * q(p) - c(p).

    Parameters
    ----------
    lambda_to : float
        Trade-off weight applied to q(p), the proportion receiving effective
        treatment. The caller's value is used as given (it was previously
        overwritten with a hard-coded 5 inside the function).
    sensitivity : float
        Sensitivity alpha(p) of the classifier at the chosen threshold.
    specificity : float
        Specificity beta(p) of the classifier at the chosen threshold.
    prevalence : float
        Prevalence of resistance (mu), expressed as a proportion.

    Returns
    -------
    float
        lambda_to * q(p) - c(p).
    """
    # q(p): resistant cases that are detected, plus every non-resistant case.
    q_p = float(sensitivity*prevalence + (1-prevalence))
    # c(p): non-resistant cases that are wrongly flagged (false positives).
    c_p = float((1 - specificity)*(1- prevalence))
    return lambda_to*q_p - c_p

In [14]:
# Sweep the classification threshold and record the policy utility at each value.
threshold_seq = np.linspace(0,1,101)

# Model inputs; the same columns are used for the train and test splits.
feature_columns = ['MSM','MSMW', 'MSW', 'Oth/Unk/Missing','Northeast', 'Southeast', 'Southwest', 'West', 'Midwest','PREV_REGION', 'PREV_CLINIC']

# Utility settings used for every threshold in this sweep.
lambda_trade_off = 5
assumed_prevalence = 0.05

utility_results = []

### set up model
model_nn = MLPClassifier(solver = 'lbfgs', activation = 'tanh', max_iter = 3000 ,hidden_layer_sizes= 13, alpha = 1.291549665014884, random_state=10, learning_rate = 'constant' )

#train data: 2000 - 2010
train_data = CIP_data_no_drop.loc[CIP_data_no_drop['YEAR'].isin(list(range(2000, 2011)))]
X_train = train_data[feature_columns]
y_train = train_data['Susceptible']

oversample = RandomOverSampler(sampling_strategy = 0.5,random_state=42)
X_train, y_train = oversample.fit_resample(X_train,y_train)

#test data: 2011 - 2019
test_data = CIP_data_no_drop.loc[CIP_data_no_drop['YEAR'].isin(list(range(2011, 2020)))]
X_test = test_data[feature_columns]
y_test = test_data['Susceptible']
# NOTE(review): the test split is oversampled as well, so the metrics below are
# computed on resampled rather than original test rows - confirm this is intended.
oversample = RandomOverSampler(sampling_strategy = 0.5,random_state=42)
X_test, y_test = oversample.fit_resample(X_test,y_test)

# Fit once, outside the loop: neither the training data nor the model settings
# depend on the threshold, and random_state is fixed, so refitting for each of
# the 101 thresholds reproduced the same model at 101x the cost.
model_fit_train = model_nn.fit(X_train, y_train)

# Default-threshold predictions; not used by the sweep, kept so the name stays defined.
y_predict = model_fit_train.predict(X_test)
# Class probabilities are threshold-independent too, so compute them once.
y_predict_proba = model_fit_train.predict_proba(X_test)
positive_proba = y_predict_proba[:, 1]

# NOTE(review): the positive class here is 'Susceptible' == 1, while the utility
# notes describe sensitivity/prevalence in terms of resistance - verify the
# class orientation matches what utility() expects.
for threshold in threshold_seq:
    # Classify as positive when the predicted probability exceeds the threshold.
    y_predict_test = np.where(positive_proba > threshold, 1, 0)

    tn_test , fp_test , fn_test , tp_test  = confusion_matrix(y_true=y_test, y_pred=y_predict_test).ravel()

    sensitivity = tp_test / (tp_test + fn_test)
    specificity = tn_test / (tn_test + fp_test)
    utility_threshold = utility(lambda_trade_off, sensitivity, specificity, prevalence = assumed_prevalence)
    utility_results.append(utility_threshold)


KeyboardInterrupt: 

In [None]:

# Sweep the classification threshold and record test sensitivity and specificity.
threshold_seq = np.linspace(0,1,101)
#only using split data - can't have optimisation-corrected metrics (no bootstrapping, no 95% CI)

# Model inputs; the same columns are used for the train and test splits.
feature_columns = ['MSM','MSMW', 'MSW', 'Oth/Unk/Missing','Northeast', 'Southeast', 'Southwest', 'West', 'Midwest','PREV_REGION', 'PREV_CLINIC']

#want to keep the data the same between threshold runs
#   train: 2000 - 2010
train_data = CIP_data_no_drop.loc[CIP_data_no_drop['YEAR'].isin(list(range(2000, 2011)))]
X_train = train_data[feature_columns]
y_train = train_data['Susceptible']

oversample = RandomOverSampler(sampling_strategy = 0.5,random_state=42)
X_train, y_train = oversample.fit_resample(X_train,y_train)

#   test: 2011 only (try sensitivity and specificity for graph).
# Use list(range(2011, 2020)) here to evaluate on the full 2011 - 2019 period instead.
test_data = CIP_data_no_drop.loc[CIP_data_no_drop['YEAR'].isin([2011])]

X_test = test_data[feature_columns]
y_test = test_data['Susceptible']
# NOTE(review): the test split is oversampled as well, so the metrics below are
# computed on resampled rather than original test rows - confirm this is intended.
oversample = RandomOverSampler(sampling_strategy = 0.5,random_state=42)
X_test, y_test = oversample.fit_resample(X_test,y_test)

# per-threshold results
sensitivity_test_threshold = []
specificity_test_threshold = [] #no bootstrapping, no 95% CI

#1. Fit the model on the training data once. The fit does not depend on the
#   threshold, so it is done outside the loop rather than once per threshold.
#   model_nn is the MLPClassifier configured in an earlier cell.
model_fit_train = model_nn.fit(X_train, y_train)

#2. Score the test data once with the model fitted above. (The original read
#   probabilities from `model_fit`, a name never defined in this notebook.)
y_predict = model_fit_train.predict(X_test)
y_predict_proba = model_fit_train.predict_proba(X_test)
positive_proba = y_predict_proba[:, 1]

for threshold in threshold_seq:
    # Classify as positive when the predicted probability exceeds the threshold.
    y_predict_test = np.where(positive_proba > threshold, 1, 0)

    tn_test , fp_test , fn_test , tp_test  = confusion_matrix(y_true=y_test, y_pred=y_predict_test).ravel()

    sensitivity_test = tp_test / (tp_test + fn_test)
    specificity_test = tn_test / (tn_test + fp_test)

    # Save results
    sensitivity_test_threshold.append(sensitivity_test)
    specificity_test_threshold.append(specificity_test)