In [1]:
#%reset
import seaborn as sb
import matplotlib.pyplot as plt
import pandas as pd
import sklearn as sk
import numpy as np
import eli5
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error
from sklearn import metrics
from sklearn.model_selection import cross_validate, RandomizedSearchCV, RepeatedStratifiedKFold
from sklearn.utils import resample
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.inspection import permutation_importance
from scipy.stats import loguniform
from imblearn.over_sampling import RandomOverSampler
from sklearn.metrics import confusion_matrix
from eli5.sklearn import PermutationImportance
## read data 
CIP_data = pd.read_csv("CIP_data_encode_prev.csv")
CIP_data_no_drop = pd.read_csv("CIP_data_encode_prev_not_dropped.csv")
print(CIP_data_no_drop.columns)

Index(['Unnamed: 0.2', 'Unnamed: 0.1', 'Unnamed: 0', 'CLINIC', 'YEAR',
       'GENDERSP', 'Susceptible', 'MSM', 'MSMW', 'MSW', 'Oth/Unk/Missing',
       'REGION', 'Midwest', 'Northeast', 'Southeast', 'Southwest', 'West',
       'PREV_REGION', 'PREV_CLINIC', 'DELTA_REGION'],
      dtype='object')


In [None]:
########### GRAPH SHOWING TRENDS IN RESISTANCE ###########
### Overall data 
xAxisYears = [2000, 2005, 2010, 2015, 2019]
years = CIP_data_no_drop["YEAR"].unique()
reistance_by_year_overall = []
reistance_by_year_all = pd.DataFrame() 
for year in years:
    CIP_data_year = CIP_data_no_drop.loc[CIP_data_no_drop['YEAR'] == year]
    reistance_by_year_overall.append(1 - CIP_data_year["Susceptible"].sum()/len(CIP_data_year))


fig, (ax1, ax2) = plt.subplots(1, 2)
plt.setp(ax1, xlim=(2000, 2020), ylim=(0,.50), xticks=[2000, 2005, 2010, 2015, 2019])
plt.setp(ax2, xlim=(2000, 2020), ylim=(0,.50), xticks=[2000, 2005, 2010, 2015, 2019])
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12
plt.subplots_adjust(left=0.1,
                    bottom=0.1,
                    right=0.9,
                    top=0.9,
                    wspace=0.25,
                    hspace=1.4)
## By region

regions = CIP_data_no_drop["REGION"].unique()

plt.subplot(1, 2, 1) 
#plt.subplots(figsize=(7.5, 3.75))

for region in regions:
    CIP_data_region = CIP_data_no_drop.loc[CIP_data_no_drop['REGION'] == region]
    reistance_by_year = []
    for year in years:
        CIP_data_region_year = CIP_data_region.loc[CIP_data_region['YEAR'] == year]

        reistance_by_year.append(1 - CIP_data_region_year["Susceptible"].sum()/len(CIP_data_region_year))
    ax1.plot(years, reistance_by_year, label=region, alpha=0.75)

ax1.plot(years, reistance_by_year_overall, color = "black", label = "Overall")

legend_labels = [regions, 'Overall']


### By gender of sp 
genders = CIP_data_no_drop["GENDERSP"].unique()

plt.subplot(1, 2, 1) 
#plt.subplots(figsize=(7.5, 3.75))

#reistance_by_year_all = pd.DataFrame()
for gender in genders:
    CIP_data_region = CIP_data_no_drop.loc[CIP_data_no_drop['GENDERSP'] == gender]
    reistance_by_year = []
    for year in years:
        CIP_data_region_year = CIP_data_region.loc[CIP_data_region['YEAR'] == year]

        reistance_by_year.append(1 - CIP_data_region_year["Susceptible"].sum()/len(CIP_data_region_year))
        
    ax2.plot(years, reistance_by_year, label=gender, alpha=0.75)

ax2.plot(years, reistance_by_year_overall, color = "black", label = "Overall")

legend_labels = [genders, 'Overall']
ax1.set_xlabel('Year', fontsize = 16.0)
ax1.set_ylabel('Percent with ciprofloxacin resistance', fontsize = 16.0)
ax2.set_xlabel('Year', fontsize = 16.0)
ax1.legend()
ax2.legend(loc="upper left")
#ax1.annotate("A", xy=(0, 0.51), xycoords=trans,annotation_clip=False)
ax1.text(ax1.get_xlim()[0] , ax1.get_ylim()[1] + 0.01, 'A', fontsize = 16, **hfont)
ax2.text(ax1.get_xlim()[0] , ax1.get_ylim()[1] + 0.01, 'B', fontsize = 16, **hfont)

#ax1.set_title('ax1 title')
#ax2.set_title('ax2 title')

#plt.tight_layout()
print(ax1.get_xlim())
print(ax1.get_ylim())

#plt.savefig('Trends_in_ciprofloxacin_reisistance_region_gendersp.png')

In [20]:
### Hyperparameter tuning with entire dataset 

## now do hyperparameter tuning again post PI
space = dict()
#space['activation'] = ['tanh', 'relu']
space['alpha'] = np.logspace(-1, 1, 10)
#space['learning_rate'] = ['constant','adaptive']
space['hidden_layer_sizes'] = [(12), (13), (14), (16)]

cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=100, random_state=1)
model_nn = MLPClassifier(solver = 'lbfgs', activation = 'tanh', max_iter = 3000 ,hidden_layer_sizes= 12, alpha =  0.46415888336127786, random_state=42, learning_rate = 'adaptive' )

X = CIP_data_no_drop[['MSM','MSMW', 'MSW', 'Oth/Unk/Missing','Northeast', 'Southeast', 'Southwest', 'West', 'Midwest','PREV_REGION', 'PREV_CLINIC', 'DELTA_REGION']]
y = CIP_data_no_drop['Susceptible']
oversample = RandomOverSampler(sampling_strategy = 0.5,random_state=42)
X, y = oversample.fit_resample(X,y)
model_fit = model_nn.fit(X, y)

search = RandomizedSearchCV(model_nn, space, scoring='roc_auc', n_jobs=-1, cv=cv, random_state=1)
result = search.fit(X, y)
# summarize result
print('Best Score: %s' % result.best_score_)
print('Best Hyperparameters: %s' % result.best_params_)

#Best Score: 0.8081387251717022
#Best Hyperparameters: {'learning_rate': 'adaptive', 'hidden_layer_sizes': 16, 'alpha': 0.46415888336127786}

In [17]:
### Permutation importance w/ hyperparameters and ENTIRE dataset 
model_nn = MLPClassifier(solver = 'lbfgs', activation = 'tanh', max_iter = 3000 ,hidden_layer_sizes= 14, alpha = 1.291549665014884, random_state=10, learning_rate = 'adaptive' )
#train data: 2000 - 2010
train_data = CIP_data_no_drop.loc[CIP_data_no_drop['YEAR'].isin([2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009,2010])]
X_train = train_data[['MSM','MSMW', 'MSW', 'Oth/Unk/Missing','Northeast', 'Southeast', 'Southwest', 'West', 'Midwest','PREV_REGION', 'PREV_CLINIC', 'DELTA_REGION']]
y_train = 1 - train_data['Susceptible']

oversample = RandomOverSampler(sampling_strategy = 0.5,random_state=42)
X_train, y_train = oversample.fit_resample(X_train,y_train)
model_fit = model_nn.fit(X_train, y_train)

#test data: 2009 - 2019 
test_data = CIP_data_no_drop.loc[CIP_data_no_drop['YEAR'].isin([2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019])]
X_test = test_data[['MSM','MSMW', 'MSW', 'Oth/Unk/Missing','Northeast', 'Southeast', 'Southwest', 'West', 'Midwest','PREV_REGION', 'PREV_CLINIC', 'DELTA_REGION']]
y_test = 1 - test_data['Susceptible']
oversample = RandomOverSampler(sampling_strategy = 0.5,random_state=42)
X_test, y_test = oversample.fit_resample(X_test,y_test)

y_predict = model_nn.predict(X_test)

ROC_AUC_neural_network = metrics.roc_auc_score(y_test, y_predict)

print('ROC_AUC_nn:', ROC_AUC_neural_network) 

print(ROC_AUC_neural_network)
perm = PermutationImportance(model_fit, random_state=1).fit(X_test,y_test)
eli5.show_weights(perm, feature_names = X_test.columns.tolist())




ROC_AUC_nn: 0.6133720679704389
0.6133720679704389


Weight,Feature
0.1089  ± 0.0022,PREV_REGION
0.0255  ± 0.0010,PREV_CLINIC
0.0150  ± 0.0030,West
0.0126  ± 0.0012,DELTA_REGION
0.0117  ± 0.0022,MSM
0.0043  ± 0.0011,Southeast
0.0040  ± 0.0012,Southwest
0.0007  ± 0.0004,MSMW
0.0006  ± 0.0002,Oth/Unk/Missing
0.0000  ± 0.0007,Northeast


In [9]:
PI = permutation_importance(model_fit, X_test, y_test, n_repeats = 100, random_state = 42)
feature_names = ['MSM','MSMW', 'MSW', 'Oth/Unk/Missing','Northeast', 'Southeast', 'Southwest', 'West', 'Midwest','PREV_REGION', 'PREV_CLINIC','DELTA_REGION']
for i in PI.importances_mean.argsort()[::-1]:
    if PI.importances_mean[i] - 2 * PI.importances_std[i] > -1:
            print(f"{feature_names[i]:<8}"
            f"{PI.importances_mean[i]:.3f}"
            f" +/- {PI.importances_std[i]:.3f}")

PREV_REGION0.090 +/- 0.001
West    0.042 +/- 0.001
DELTA_REGION0.020 +/- 0.001
PREV_CLINIC0.018 +/- 0.001
Southeast0.007 +/- 0.001
MSMW    -0.002 +/- 0.001
Oth/Unk/Missing-0.002 +/- 0.000
Northeast-0.003 +/- 0.001
Southwest-0.006 +/- 0.001
MSM     -0.007 +/- 0.001
Midwest -0.014 +/- 0.001
MSW     -0.021 +/- 0.001


In [10]:
### Permutation importance w/ hyperparameters and ENTIRE dataset 
model_nn = MLPClassifier(solver = 'lbfgs', activation = 'tanh', max_iter = 3000 ,hidden_layer_sizes= 16, alpha =  1.46415888336127786, random_state=10, learning_rate = 'adaptive' )
#train data: 2000 - 2010
train_data = CIP_data_no_drop.loc[CIP_data_no_drop['YEAR'].isin([2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009,2010])]
X_train = train_data[['MSM','MSMW', 'MSW', 'Oth/Unk/Missing','Northeast', 'Southeast', 'Southwest', 'West', 'Midwest','PREV_REGION', 'PREV_CLINIC', 'DELTA_REGION']]
y_train = 1 - train_data['Susceptible']

oversample = RandomOverSampler(sampling_strategy = 0.5,random_state=42)
X_train, y_train = oversample.fit_resample(X_train,y_train)
model_fit = model_nn.fit(X_train, y_train)

#test data: 2009 - 2019 
test_data = CIP_data_no_drop.loc[CIP_data_no_drop['YEAR'].isin([2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019])]
X_test = test_data[['MSM','MSMW', 'MSW', 'Oth/Unk/Missing','Northeast', 'Southeast', 'Southwest', 'West', 'Midwest','PREV_REGION', 'PREV_CLINIC', 'DELTA_REGION']]
y_test = 1 - test_data['Susceptible']
oversample = RandomOverSampler(sampling_strategy = 0.5,random_state=42)
X_test, y_test = oversample.fit_resample(X_test,y_test)

y_predict = model_nn.predict(X_test)

ROC_AUC_neural_network = metrics.roc_auc_score(y_test, y_predict)

print('ROC_AUC_nn:', ROC_AUC_neural_network) 

print(ROC_AUC_neural_network)
perm = PermutationImportance(model_fit, random_state=1).fit(X_test,y_test)
eli5.show_weights(perm, feature_names = X_test.columns.tolist())

ROC_AUC_nn: 0.6037654070936272
0.6037654070936272


Weight,Feature
0.0907  ± 0.0023,PREV_REGION
0.0384  ± 0.0014,MSM
0.0362  ± 0.0008,MSW
0.0181  ± 0.0013,DELTA_REGION
0.0154  ± 0.0017,West
0.0130  ± 0.0020,PREV_CLINIC
0.0059  ± 0.0006,Midwest
0.0040  ± 0.0012,MSMW
0.0030  ± 0.0009,Northeast
0.0004  ± 0.0009,Southeast
