In [2]:
import seaborn as sb
import matplotlib.pyplot as plt
import pandas as pd
import sklearn as sk
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error
from sklearn import metrics
from sklearn.model_selection import cross_validate, RandomizedSearchCV, RepeatedStratifiedKFold
from sklearn.utils import resample
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.inspection import permutation_importance
from scipy.stats import loguniform
from imblearn.over_sampling import RandomOverSampler
from sklearn.metrics import confusion_matrix

## read data 
CIP_data_no_drop = pd.read_csv("CIP_data_encode_prev_not_dropped.csv")


In [37]:
## PI - frequency of importance 

n_iterations = 100
PI_dataframe = pd.DataFrame({
'MSM':0,
'MSMW':0,
'MSW':0,
'Other':0,
'Midwest':0,
'Northeast':0,
'West':0,
'Southeast':0,
'Southwest':0,
'Region':0,
'Clinic':0},index=[0])

PI_dataframe_all = pd.DataFrame()

model_nn = MLPClassifier(solver = 'lbfgs', activation = 'tanh', max_iter = 3000 ,hidden_layer_sizes= 13, alpha = 1.291549665014884, random_state=10, learning_rate = 'constant' )
feature_names = ['MSM','MSMW', 'MSW', 'Oth/Unk/Missing','Northeast', 'Southeast', 'Southwest', 'West', 'Midwest','PREV_REGION', 'PREV_CLINIC']
def get_features_based_on_importance_rank(features, importance_col_name, importance_value, n_features_wanted):
    """ create a sorted table containing feature name and its importance score
    :param features: list of feature names
    :param importance_col_name: the name of the column which shows the importance value or rank for each feature
    :param importance_value: the importance value or rank for each feature
    :param n_features_wanted: number of features you want to select
    :return a sorted DataFrame containing the name of feature and its corresponding importance
    """

    # creat a DataFrame
    d = {'Features': features, importance_col_name: importance_value}
    df = pd.DataFrame(d)
    # sort by descending
    df = df.sort_values(by=[importance_col_name], ascending=False)
    selected_feature_names = (df['Features'][0: n_features_wanted]).tolist()
    return selected_feature_names
for i in range(n_iterations):
    #Generate new test and train data 
            # train
    train_data = CIP_data_no_drop.loc[CIP_data_no_drop['YEAR'].isin([2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010])]
    X_train = train_data[['MSM','MSMW', 'MSW', 'Oth/Unk/Missing','Northeast', 'Southeast', 'Southwest', 'West', 'Midwest','PREV_REGION', 'PREV_CLINIC']]
    y_train = train_data['Susceptible']

    oversample = RandomOverSampler(sampling_strategy = 0.5)
    X_train, y_train = oversample.fit_resample(X_train,y_train)

            # test
    test_data = CIP_data_no_drop.loc[CIP_data_no_drop['YEAR'].isin([2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019])]
    X_test = test_data[['MSM','MSMW', 'MSW', 'Oth/Unk/Missing','Northeast', 'Southeast', 'Southwest', 'West', 'Midwest','PREV_REGION', 'PREV_CLINIC']]
    y_test = test_data['Susceptible']
    oversample = RandomOverSampler(sampling_strategy = 0.5)
    X_test, y_test = oversample.fit_resample(X_test,y_test)

    # Train model on training data - don't need to test on test data 
    model_fit_train = model_nn.fit(X_train, y_train)

    # Permutation importance 
    PI = permutation_importance(model_fit_train, X_test, y_test, n_repeats = 10, random_state = 42)
    important_features = get_features_based_on_importance_rank(feature_names, 'Importance', PI.importances_mean, 5)
    for feature in important_features:
        if feature == 'PREV_CLINIC':
                PI_dataframe["Clinic"] += 1
        elif feature == "PREV_REGION":
                PI_dataframe["Region"] += 1
        elif feature == "MSM":
                PI_dataframe["MSM"] += 1
        elif feature == "MSMW":
                PI_dataframe["MSMW"] += 1
        elif feature == "MSW":
                PI_dataframe["MSW"] += 1
        elif feature == "Oth/Unk/Missing":
                PI_dataframe["Other"] += 1  
        elif feature == "Northeast":
                PI_dataframe["Northeast"] += 1 
        elif feature == "Southeast":
                PI_dataframe["Southeast"] += 1    
        elif feature == "Southewest":
                PI_dataframe["Southwest"] += 1   
        elif feature == "Midwest":
                PI_dataframe["Midwest"] += 1    
        elif feature == "West":
                PI_dataframe["West"] += 1   

    #Save results. 1 = +ve, -1 = -ve 



  selected_feature_names = (df['Features'][0: n_features_wanted]).tolist()
  selected_feature_names = (df['Features'][0: n_features_wanted]).tolist()
  selected_feature_names = (df['Features'][0: n_features_wanted]).tolist()
  selected_feature_names = (df['Features'][0: n_features_wanted]).tolist()
  selected_feature_names = (df['Features'][0: n_features_wanted]).tolist()
  selected_feature_names = (df['Features'][0: n_features_wanted]).tolist()
  selected_feature_names = (df['Features'][0: n_features_wanted]).tolist()
  selected_feature_names = (df['Features'][0: n_features_wanted]).tolist()
  selected_feature_names = (df['Features'][0: n_features_wanted]).tolist()
  selected_feature_names = (df['Features'][0: n_features_wanted]).tolist()
  selected_feature_names = (df['Features'][0: n_features_wanted]).tolist()
  selected_feature_names = (df['Features'][0: n_features_wanted]).tolist()
  selected_feature_names = (df['Features'][0: n_features_wanted]).tolist()
  selected_feature_names 

In [38]:
print(PI_dataframe)

   MSM  MSMW  MSW  Other  Midwest  Northeast  West  Southeast  Southwest  \
0   83    40   90      8       24         34    40         34          0   

   Region  Clinic  
0      37     100  


In [None]:
y_pos = np.arange(len(feature_names))
importance = PI_dataframe.sum(axis=1)


plt.bar(y_pos, importance, align='center', alpha=0.5)
plt.xticks(y_pos, feature_names)
plt.ylabel('Frequency predictor identified as significant')
plt.xlabel('Features')