In [1]:
import nltk
import random
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
import string
from textblob.classifiers import NaiveBayesClassifier
import numpy as np
from textblob import TextBlob
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
%matplotlib inline
stopwords = nltk.corpus.stopwords.words('english')
ps = nltk.PorterStemmer()
pd.set_option('display.max_columns', 10000)
##Explore this blog: https://jakevdp.github.io/PythonDataScienceHandbook/05.08-random-forests.html for visualisation ideas

In [2]:
#Loads rated tweets into dataframe
df = pd.read_excel("C:/Users/ertur/Documents/Work/Workwork/ARUK/Submission - JMIR Aging/Revisions/Categorised tweets 1500.xlsx", converters={'Tweet':str,'Theme':int})

In [3]:
df = df.rename(columns = {'Tweet':'body_text', 'Theme':'label'})

In [4]:
df.shape

(1500, 2)

In [5]:
#removing cases where rating is missing
df = df.dropna()
df.shape

(1497, 2)

In [6]:
#obtaining sentiment and subjectivity
def sentAnal(df):
    for index, row in df.iterrows():
        temp = TextBlob(row['body_text'])
        df.loc[index,'Sentiment'] = temp.sentiment.polarity
    return df

In [7]:
df = sentAnal(df)

In [8]:
#removing tweets rated as uncertain or unknown
themes=[1,2,3,4,5,6]
df = df[df.label.isin(themes)]
df.shape

(1414, 3)

In [9]:
#converting assigned themes into corresponding rating of stigmatising and non-stigmatising
theme_map = {1:0, 2:0, 3:0, 4:1, 5:1, 6:1}
df['stig_label'] = df.label.map(theme_map)
df = df.drop('label', axis = 1)

In [10]:
#literature defined features are generated
def count_punct(text):
    count = sum([1 for char in text if char in string.punctuation])
    return round(count/(len(text) - text.count(" ")), 3)*100

df['body_len'] = df['body_text'].apply(lambda x: len(x) - x.count(" "))
df['punct%'] = df['body_text'].apply(lambda x: count_punct(x))

def clean_text(text):
    text = "".join([word.lower() for word in text if word not in string.punctuation])
    tokens = re.split('\W+', text)
    text = [ps.stem(word) for word in tokens if word not in stopwords]
    return text
def avg_word(sentence):
    words = sentence.split()
    return (sum(len(word) for word in words)/len(words))

# Average Word Length. simply take the sum of the length of all the words and divide it by the total length of the tweet as defined in function above
df['avg_word'] = df['body_text'].apply(lambda x: avg_word(x))

# Number of Words in tweet
df['word_count'] = df['body_text'].apply(lambda x: len(str(x).split(" ")))

# Number of characters. Here, we calculate the number of characters in each tweet. This is done by calculating the length of the tweet.
df['char_count'] = df['body_text'].str.len() ## this also includes spaces

# number of special characters like hashtags. we make use of the ‘starts with’ function because hashtags (or mentions) always appear at the beginning of a word.
df['hastags'] = df['body_text'].apply(lambda x: len([x for x in x.split() if x.startswith('#')]))

# number of numerics in tweet
df['numerics'] = df['body_text'].apply(lambda x: len([x for x in x.split() if x.isdigit()]))


In [11]:
#care-partner defined features are generated
#senile
Search_for_These_values = ['senile', 'SENILE'] 
pattern = '|'.join(Search_for_These_values) 
df['senile'] = df['body_text'].str.contains(pattern)
df['senile'] = df['senile'].map({True: 1, False: 0})
#demented
Search_for_These_values = ['demented', 'DEMENTED'] 
pattern = '|'.join(Search_for_These_values) 
df['demented'] = df['body_text'].str.contains(pattern)
df['demented'] = df['demented'].map({True: 1, False: 0})
#donald trump
Search_for_These_values = ['donald', 'trump', 'DONALD', 'TRUMP', '@realDonaldTrump'] 
pattern = '|'.join(Search_for_These_values) 
df['donaldtrump'] = df['body_text'].str.contains(pattern)
df['donaldtrump'] = df['donaldtrump'].map({True: 1, False: 0})
#memory
Search_for_These_values = ['MEMORY', 'memory'] 
pattern = '|'.join(Search_for_These_values) 
df['Memory'] = df['body_text'].str.contains(pattern)
df['Memory'] = df['Memory'].map({True: 1, False: 0})
#research
Search_for_These_values = ['research', 'RESEARCH'] 
pattern = '|'.join(Search_for_These_values) 
df['Research'] = df['body_text'].str.contains(pattern)
df['Research'] = df['Research'].map({True: 1, False: 0})
#crazy
Search_for_These_values = ['crazy', 'CRAZY'] 
pattern = '|'.join(Search_for_These_values) 
df['Crazy'] = df['body_text'].str.contains(pattern)
df['Crazy'] = df['Crazy'].map({True: 1, False: 0})
#senility
Search_for_These_values = ['senility', 'SENILITY'] 
pattern = '|'.join(Search_for_These_values) 
df['Senility'] = df['body_text'].str.contains(pattern)
df['Senility'] = df['Senility'].map({True: 1, False: 0})
# URL
Search_for_These_values = ['https'] 
pattern = '|'.join(Search_for_These_values) 
df['Link'] = df['body_text'].str.contains(pattern)
df['Link'] = df['Link'].map({True: 1, False: 0})
#caregiver
Search_for_These_values = ['caregiver', 'CAREGIVER'] 
pattern = '|'.join(Search_for_These_values) 
df['Caregiver'] = df['body_text'].str.contains(pattern)
df['Caregiver'] = df['Caregiver'].map({True: 1, False: 0})

In [12]:
df.shape

(1414, 19)

In [13]:
cols = df[df.columns.difference(["stig_label"])].columns

In [14]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df[cols], df['stig_label'], test_size=0.2, random_state = 1)

In [15]:
cols = df[df.columns.difference(["stig_label", "body_text"])].columns

In [16]:
print("The size of each training and testing datasets are:")
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

The size of each training and testing datasets are:
(1131, 18)
(283, 18)
(1131,)
(283,)


In [17]:
# instantiate the vectorizer
tfidf_vect = TfidfVectorizer(analyzer=clean_text)

# learn training data vocabulary, then use it to create a document-term matrix
tfidf_vect_fit = tfidf_vect.fit(X_train['body_text'])
tfidf_train = tfidf_vect_fit.transform(X_train['body_text'])


# transform testing data (using fitted vocabulary) into a document-term matrix
tfidf_test = tfidf_vect_fit.transform(X_test['body_text'])

X_train_vect = pd.concat([X_train[cols].reset_index(drop=True), 
           pd.DataFrame(tfidf_train.toarray(), columns=tfidf_vect.get_feature_names())], axis=1)

X_test_vect = pd.concat([X_test[cols].reset_index(drop=True), 
           pd.DataFrame(tfidf_test.toarray(), columns=tfidf_vect.get_feature_names())], axis=1)

<hr style="border:1px solid black"> </hr>

**Cross Fold Validation Code from Nick**, it's slower but everything is comparable and the test vectors have been vectorised by the training vectors. I've not included all modesl, but you should be able to see the pattern of how to do it if you want to

In [18]:
from sklearn.model_selection import StratifiedKFold

n_folds = 5 # number of folds to do being set
kf_Strat = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=0) #sets this as a 'train test split equiv'

In [19]:
#original estimators [10, 25, 50, 100, 250, 500]
#original depths [[25, 50, 100, 250, None]]

In [20]:
from sklearn.ensemble import RandomForestClassifier

# Random Forest Parameters to seach over
n_estimators_RF = [500, 525, 550, 600, 750, 1000] #list of estimators
max_depth_RF = [25, 10, None] #list of depths
num_param_RF = len(n_estimators_RF)*len(max_depth_RF) #calculates number of potentials

# To identify the best set of parameters later
param_RF = []

# For storing RF results
precision_RF = np.zeros((num_param_RF,n_folds), dtype=float) #an array of float 0s is made with
recall_RF = np.zeros((num_param_RF,n_folds), dtype=float)
fscore_RF = np.zeros((num_param_RF,n_folds), dtype=float)
accuracy_RF = np.zeros((num_param_RF,n_folds), dtype=float)

In [21]:
#original n_estimators_GB = [10, 50, 100, 150, 200]
#original depth_GB = [10, 20, 30, 50, None]

In [22]:
from sklearn.ensemble import GradientBoostingClassifier

# Gradient Boost Parameters to seach over
n_estimators_GB = [200, 250, 300, 350, 400]
depth_GB = [10, None]
num_param_GB = len(n_estimators_GB)*len(depth_GB)

# To identify the best set of parameters later
param_GB = []

# For storing RF results
precision_GB = np.zeros((num_param_GB,n_folds), dtype=float)
recall_GB = np.zeros((num_param_GB,n_folds), dtype=float)
fscore_GB = np.zeros((num_param_GB,n_folds), dtype=float)
accuracy_GB = np.zeros((num_param_GB,n_folds), dtype=float)

In [23]:
# SVM rbf
from sklearn.svm import SVC

cost = [0.0001, 0.001, 0.01, 0.1, 1, 10] # Keep SVM cost at a max of 10, any bigger and you are most likely overfitting
num_param_SVM = len(cost)

# To identify the best set of parameters later
param_SVM = []

precision_SVM = np.zeros((num_param_SVM,n_folds), dtype=float)
recall_SVM = np.zeros((num_param_SVM,n_folds), dtype=float)
fscore_SVM = np.zeros((num_param_SVM,n_folds), dtype=float)
accuracy_SVM = np.zeros((num_param_SVM,n_folds), dtype=float)

In [24]:
# SVM Linear
from sklearn.svm import SVC

cost = [0.0001, 0.001, 0.01, 0.1, 1, 10] # Keep SVM cost at a max of 10, any bigger and you are most likely overfitting
num_param_SVML = len(cost)

# To identify the best set of parameters later
param_SVML = []

precision_SVML = np.zeros((num_param_SVM,n_folds), dtype=float)
recall_SVML = np.zeros((num_param_SVM,n_folds), dtype=float)
fscore_SVML = np.zeros((num_param_SVM,n_folds), dtype=float)
accuracy_SVML = np.zeros((num_param_SVM,n_folds), dtype=float)

In [25]:
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.preprocessing import MinMaxScaler

# For monitoring training
from IPython.display import clear_output
from tqdm import tqdm #progress bars

fld_cnt = 0
for train_index, test_index in kf_Strat.split(df[cols], df['stig_label']):
    
    print("Processing fold " + str((fld_cnt+1)))
    
    cols = df[df.columns.difference(["stig_label"])].columns
    
    X_train_CFV = df.loc[df.index[train_index],cols]
    y_train_CFV =df.loc[df.index[train_index],'stig_label']
    X_test_CFV = df.loc[df.index[test_index],cols]
    y_test_CFV =df.loc[df.index[test_index],'stig_label']
    
    cols = df[df.columns.difference(["stig_label", "body_text"])].columns

    # instantiate the vectorizer
    tfidf_vect = TfidfVectorizer(analyzer=clean_text)

    # learn training data vocabulary, then use it to create a document-term matrix
    tfidf_vect_fit = tfidf_vect.fit(X_train_CFV['body_text'])
    
    tfidf_train_CFV = tfidf_vect_fit.transform(X_train_CFV['body_text'])

    # transform testing data (using fitted vocabulary) into a document-term matrix
    tfidf_test_CFV = tfidf_vect_fit.transform(X_test_CFV['body_text'])

    X_train_CFV_vect = pd.concat([X_train_CFV[cols].reset_index(drop=True), 
               pd.DataFrame(tfidf_train_CFV.toarray(), columns=tfidf_vect.get_feature_names())], axis=1)

    X_test_CFV_vect = pd.concat([X_test_CFV[cols].reset_index(drop=True), 
               pd.DataFrame(tfidf_test_CFV.toarray(), columns=tfidf_vect.get_feature_names())], axis=1) 
    
    # Scale the data to reduce influence of features with large values and to speed up training
    min_max_scaler = MinMaxScaler()
    X_train_CFV_vect = min_max_scaler.fit_transform(X_train_CFV_vect)
    X_test_CFV_vect = min_max_scaler.transform(X_test_CFV_vect)   


    # Perform Random Forest Grid Search and store results 
    print("................. Random Forest")
    grid_cnt = 0
    for est in tqdm(n_estimators_RF):
        for dpth in max_depth_RF:
            
            if fld_cnt == 0:
                param_RF.append([est, dpth])

            rf_CFV = RandomForestClassifier(n_estimators=est, max_depth=dpth, n_jobs=-1, random_state=0) 
            rf_model_CFV = rf_CFV.fit(X_train_CFV_vect, y_train_CFV)

            y_pred_CFV_RF = rf_model_CFV.predict(X_test_CFV_vect)

            precision_RF[grid_cnt, fld_cnt], recall_RF[grid_cnt, fld_cnt], fscore_RF[grid_cnt, fld_cnt], train_support = score(y_test_CFV, y_pred_CFV_RF, pos_label=1, average="binary")
            accuracy_RF[grid_cnt, fld_cnt] = (y_pred_CFV_RF==y_test_CFV).sum()/len(y_pred_CFV_RF)

            grid_cnt += 1
                
    # Perform Gradient Boost Grid Search and store results     
    print("................. Gradient Boost")
    grid_cnt = 0
    for n_est in tqdm(n_estimators_GB):
        for depth in depth_GB:
            
            if fld_cnt == 0:
                param_GB.append([n_est, depth])
            
            gb_CFV = GradientBoostingClassifier(n_estimators=n_est, max_depth=depth, random_state=0)
            gb_model_CFV = gb_CFV.fit(X_train_CFV_vect, y_train_CFV)

            y_pred_CFV_GB = gb_model_CFV.predict(X_test_CFV_vect)

            precision_GB[grid_cnt, fld_cnt], recall_GB[grid_cnt, fld_cnt], fscore_GB[grid_cnt, fld_cnt], train_support = score(y_test_CFV, y_pred_CFV_GB, pos_label=1, average="binary")
            accuracy_GB[grid_cnt, fld_cnt] = (y_pred_CFV_GB==y_test_CFV).sum()/len(y_pred_CFV_GB)

            grid_cnt += 1   
            
            
    #print("................. RBF SVM")     
    #grid_cnt = 0
    #for cst in tqdm(cost):
        
    #    if fld_cnt == 0:
    #        param_SVM.append(cst)
    
   #     svmClas_CFV = SVC(C = cst, probability=True, random_state=0)
        
    #    svmClas_model_CFV = svmClas_CFV.fit(X_train_CFV_vect, y_train_CFV)
        
     #   y_pred_CFV_SVM = svmClas_model_CFV.predict(X_test_CFV_vect)
    
      #  precision_SVM[grid_cnt, fld_cnt], recall_SVM[grid_cnt, fld_cnt], fscore_SVM[grid_cnt, fld_cnt], train_support = score(y_test_CFV, y_pred_CFV_SVM, pos_label=1, average="binary")
       # accuracy_SVM[grid_cnt, fld_cnt] = (y_pred_CFV_SVM==y_test_CFV).sum()/len(y_pred_CFV_SVM)            
            
       # grid_cnt += 1  
    
    #print("................. Linear SVM")     
    #grid_cnt = 0
    #for cst in tqdm(cost):
        
     #   if fld_cnt == 0:
      #      param_SVML.append(cst)
    
       # svmClas_CFVL = SVC(kernel='linear', C = cst, probability=True, random_state=0)
        
        #svmClas_model_CFVL = svmClas_CFVL.fit(X_train_CFV_vect, y_train_CFV)
        
        #y_pred_CFV_SVML = svmClas_model_CFV.predict(X_test_CFV_vect)
    
        #precision_SVML[grid_cnt, fld_cnt], recall_SVML[grid_cnt, fld_cnt], fscore_SVML[grid_cnt, fld_cnt], train_support = score(y_test_CFV, y_pred_CFV_SVML, pos_label=1, average="binary")
        #accuracy_SVML[grid_cnt, fld_cnt] = (y_pred_CFV_SVML==y_test_CFV).sum()/len(y_pred_CFV_SVML)            
            
        #grid_cnt += 1  

    #clear_output(wait=True)                      
    fld_cnt += 1

Processing fold 1


  0%|          | 0/6 [00:00<?, ?it/s]

................. Random Forest


100%|██████████| 6/6 [01:09<00:00, 11.56s/it]
  0%|          | 0/5 [00:00<?, ?it/s]

................. Gradient Boost


100%|██████████| 5/5 [37:38<00:00, 451.73s/it]


Processing fold 2


  0%|          | 0/6 [00:00<?, ?it/s]

................. Random Forest


100%|██████████| 6/6 [00:45<00:00,  7.52s/it]
  0%|          | 0/5 [00:00<?, ?it/s]

................. Gradient Boost


100%|██████████| 5/5 [22:38<00:00, 271.61s/it]


Processing fold 3


  0%|          | 0/6 [00:00<?, ?it/s]

................. Random Forest


100%|██████████| 6/6 [00:40<00:00,  6.67s/it]
  0%|          | 0/5 [00:00<?, ?it/s]

................. Gradient Boost


100%|██████████| 5/5 [22:00<00:00, 264.07s/it]


Processing fold 4


  0%|          | 0/6 [00:00<?, ?it/s]

................. Random Forest


100%|██████████| 6/6 [00:46<00:00,  7.80s/it]
  0%|          | 0/5 [00:00<?, ?it/s]

................. Gradient Boost


100%|██████████| 5/5 [23:17<00:00, 279.55s/it]


Processing fold 5


  0%|          | 0/6 [00:00<?, ?it/s]

................. Random Forest


100%|██████████| 6/6 [00:45<00:00,  7.53s/it]
  0%|          | 0/5 [00:00<?, ?it/s]

................. Gradient Boost


100%|██████████| 5/5 [31:58<00:00, 383.74s/it]


In [26]:
# choosing best RF paramters 
prec_final_RF = np.mean(precision_RF, axis = 1)
recall_final_RF = np.mean(recall_RF, axis = 1)        
fscore_final_RF = np.mean(fscore_RF, axis = 1)  
accuracy_final_RF = np.mean(accuracy_RF, axis = 1)  

# Choosing final system set-up based on best F-score
max_ind_RF = np.argmax(accuracy_final_RF)
final_param_RF = param_RF[max_ind_RF]  

print("RF best n_estimators is: " + str(final_param_RF[0]) + " best max_depth is: " + str(final_param_RF[1]))

prec_best_RF = prec_final_RF[max_ind_RF]
recall_best_RF = recall_final_RF[max_ind_RF]
fscore_best_RF = fscore_final_RF[max_ind_RF]
accuracy_best_RF = accuracy_final_RF[max_ind_RF]

print( "RF CFV Training Results: Mean Prec " + str(round(prec_best_RF,3)) + 
      ", Mean Recall " + str(round(recall_best_RF,3)) + 
      ", Mean Fscore " + str(round(fscore_best_RF,3)) + 
      ", Mean Accuracy " + str(round(accuracy_best_RF,3)) )

RF best n_estimators is: 500 best max_depth is: 25
RF CFV Training Results: Mean Prec 0.979, Mean Recall 0.956, Mean Fscore 0.967, Mean Accuracy 0.965


In [27]:
# choosing best GB paramters 
prec_final_GB = np.mean(precision_GB, axis = 1)
recall_final_GB = np.mean(recall_GB, axis = 1)        
fscore_final_GB = np.mean(fscore_GB, axis = 1)  
accuracy_final_GB = np.mean(accuracy_GB, axis = 1)  

# Choosing final system set-up based on best F-score
max_ind_GB = np.argmax(accuracy_final_GB)
final_param_GB = param_GB[max_ind_GB]  

print("GB best n_estimators is: " + str(final_param_GB[0]) + " best depth is: " + str(final_param_GB[1]))

prec_best_GB = prec_final_GB[max_ind_GB]
recall_best_GB = recall_final_GB[max_ind_GB]
fscore_best_GB = fscore_final_GB[max_ind_GB]
accuracy_best_GB = accuracy_final_GB[max_ind_GB]

print( "RF CFV Training Results: Mean Prec " + str(round(prec_best_GB,3)) + 
      ", Mean Recall " + str(round(recall_best_GB,3)) + 
      ", Mean Fscore " + str(round(fscore_best_GB,3)) + 
      ", Mean Accuracy " + str(round(accuracy_best_GB,3)) )

GB best n_estimators is: 200 best depth is: 10
RF CFV Training Results: Mean Prec 0.966, Mean Recall 0.938, Mean Fscore 0.952, Mean Accuracy 0.948


In [54]:
# choosing best SVM paramters 
prec_final_SVM = np.mean(precision_SVM, axis = 1)
recall_final_SVM = np.mean(recall_SVM, axis = 1)        
fscore_final_SVM = np.mean(fscore_SVM, axis = 1)  
accuracy_final_SVM = np.mean(accuracy_SVM, axis = 1) 

# Choosing final system set-up based on best F-score
max_ind_SVM = np.argmax(accuracy_final_SVM)
final_param_SVM = param_SVM[max_ind_SVM]  

print("SVM best cost setting is: " + str(final_param_SVM))

prec_best_SVM = prec_final_SVM[max_ind_SVM]
recall_best_SVM = recall_final_SVM[max_ind_SVM]
fscore_best_SVM = fscore_final_SVM[max_ind_SVM]
accuracy_best_SVM = accuracy_final_SVM[max_ind_SVM]

print( "SVM CFV Training Results: Mean Prec " + str(round(prec_best_SVM,3)) + 
      ", Mean Recall " + str(round(recall_best_SVM,3)) + 
      ", Mean Fscore " + str(round(fscore_best_SVM,3)) + 
      ", Mean Accuracy " + str(round(accuracy_best_SVM,3)) )

SVM best cost setting is: 10
SVM CFV Training Results: Mean Prec 0.988, Mean Recall 0.931, Mean Fscore 0.958, Mean Accuracy 0.955


In [55]:
# choosing best SVML paramters 
prec_final_SVML = np.mean(precision_SVML, axis = 1)
recall_final_SVML = np.mean(recall_SVML, axis = 1)        
fscore_final_SVML = np.mean(fscore_SVML, axis = 1)  
accuracy_final_SVML = np.mean(accuracy_SVML, axis = 1) 

# Choosing final system set-up based on best F-score
max_ind_SVML = np.argmax(accuracy_final_SVML)
final_param_SVML = param_SVML[max_ind_SVML]  

print("SVM linear best cost setting is: " + str(final_param_SVML))

prec_best_SVML = prec_final_SVML[max_ind_SVML]
recall_best_SVML = recall_final_SVML[max_ind_SVML]
fscore_best_SVML = fscore_final_SVML[max_ind_SVML]
accuracy_best_SVML = accuracy_final_SVML[max_ind_SVML]

print( "SVM linear CFV Training Results: Mean Prec " + str(round(prec_best_SVML,3)) + 
      ", Mean Recall " + str(round(recall_best_SVML,3)) + 
      ", Mean Fscore " + str(round(fscore_best_SVML,3)) + 
      ", Mean Accuracy " + str(round(accuracy_best_SVML,3)) )

SVM linear best cost setting is: 0.0001
SVM linear CFV Training Results: Mean Prec 0.988, Mean Recall 0.931, Mean Fscore 0.958, Mean Accuracy 0.955


**End of Nick's Code**

<hr style="border:1px solid black"> </hr>

In [56]:
pd.options.display.float_format = '{:,.2f}'.format
np.set_printoptions(precision=2)

In [57]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_predict
import time
from sklearn import metrics
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import roc_auc_score

In [58]:
 def conFusion_metricsOutput(var_y_test, var_y_pred, modname):
    
    res = []   
    confusion = metrics.confusion_matrix(var_y_test, var_y_pred)
    TP = confusion[1, 1] #True Positives (TP): we correctly predicted that tweets do have stigma
    TN = confusion[0, 0] #True Negatives (TN): we correctly predicted that tweets don't have stigma
    FP = confusion[0, 1] #False Positives (FP): we incorrectly predicted that tweets do have stigma (a "Type I error")
    FN = confusion[1, 0] #False Negatives (FN): we incorrectly predicted that tweets don't have stigma (a "Type II error")

    ## Classification Accuracy: Overall, how often is the classifier correct?
    accuracy = ((TP + TN) / float(TP + TN + FP + FN))
    #print(metrics.accuracy_score(y_test, y_pred))

    #Classification Error: Overall, how often is the classifier incorrect?
    #Also known as "Misclassification Rate"
    misclassication_rate = ((FP + FN) / float(TP + TN + FP + FN))
    #print(1 - metrics.accuracy_score(y_test, y_pred))

    #Recall/Sensitivity: When the actual value is positive, how often is the prediction correct?
    #How "sensitive" is the classifier to detecting positive instances?
    #Also known as "True Positive Rate"
    recall_tpr = (TP / float(TP + FN))
    #print(metrics.recall_score(y_test, y_pred))

    #Specificity: When the actual value is negative, how often is the prediction correct?
    #How "specific" (or "selective") is the classifier in predicting positive instances?
    specificity = (TN / float(TN + FP))

    #False Positive Rate: When the actual value is negative, how often is the prediction incorrect?
    fpr = (FP / float(TN + FP))

    #Precision: When a positive value is predicted, how often is the prediction correct?
    #How "precise" is the classifier when predicting positive instances?
    precision = (TP / float(TP + FP))
    #print(metrics.precision_score(y_test, y_pred))
    
    fnr = (FN / float(TP + FN))
    
    false_negatives = FN
    
    false_positives = FP
    
    res.append([accuracy, misclassication_rate, recall_tpr, specificity, fpr, precision, fnr, false_negatives, false_positives])
    
    data=pd.DataFrame(res ,columns=['accuracy','misclassication_rate', 'recall_tpr' , 'specificity' , 'fpr', 'precision', 'fnr', 'false_negatives', 'false_positives'], index=[modname]).T
    print(TP, TN, FP, FN)
    
    return data

In [None]:
rf = RandomForestClassifier(n_estimators=150, max_depth=None, n_jobs=-1, random_state=0)
k_fold = KFold(n_splits=5)

y_pred_rf_cv = cross_val_predict(rf, X_features_cv, df['stig_label'], cv=k_fold, n_jobs = -1)

auc = roc_auc_score(df['stig_label'], y_pred_rf_cv)

In [None]:
res_df = conFusion_metricsOutput(df['stig_label'], y_pred_rf_cv, modname = 'rf_cv')
res_df
print(auc)

In [None]:
#individual k-folds
rf = RandomForestClassifier(n_estimators=150, max_depth=None, n_jobs=-1, random_state=0)
k_fold = KFold(n_splits=5)
%time accuracy =        cross_val_score(rf, X_features_cv, df['stig_label'], cv = k_fold, n_jobs = -1, scoring='accuracy')
%time precision_score =  cross_val_score(rf,  X_features_cv, df['stig_label'], cv = k_fold, n_jobs = -1, scoring='precision')
%time recall_score =    cross_val_score(rf,  X_features_cv, df['stig_label'], cv = k_fold, n_jobs = -1, scoring='recall')


print(accuracy, precision_score, recall_score)

In [None]:
def train_RF(n_est, depth):
    rf = RandomForestClassifier(n_estimators=n_est, max_depth=depth, n_jobs=-1, random_state=0)
    rf_model = rf.fit(X_train_vect, y_train)
    y_pred = rf_model.predict(X_test_vect)
    precision, recall, fscore, train_support = score(y_test, y_pred, pos_label=1, average="binary")
    print('Est: {} / Depth: {} ---- Precision: {} / Recall: {} / Accuracy: {}'.format(
    n_est, depth, precision.mean(), recall.mean(),
    (y_pred==y_test).sum() / len(y_pred), 3))

In [None]:
#Extend if extremes show best
%precision %.2f
np.set_printoptions(precision=2)
#200 estimators with 20 depth gives best results first time so redo up to 250

for n_est in [10, 50, 100, 150, 200, 250]:
    for depth in [10, 20, 30, 50, None]:
        train_RF(n_est, depth)

In [None]:
#best is 200 est, depth 20

In [None]:
#rf hold out test

rf = RandomForestClassifier(n_estimators=200, max_depth=20, n_jobs=-1, random_state=0) 

rf_model = rf.fit(X_train_vect, y_train)

y_pred = rf_model.predict(X_test_vect)
auc = roc_auc_score(y_test, y_pred)

In [None]:
res_df = res_df.join(conFusion_metricsOutput(y_test, y_pred, 'rf_holdout'))

print(auc)

In [None]:
gb = GradientBoostingClassifier(n_estimators=150, max_depth=None, random_state=0)
k_fold = KFold(n_splits=5)

y_pred_gb_cv = cross_val_predict(gb, X_features_cv, df['stig_label'], cv=k_fold, n_jobs = -1)

auc = roc_auc_score(df['stig_label'], y_pred_gb_cv)

In [None]:
res_df = res_df.join(conFusion_metricsOutput(df['stig_label'], y_pred_gb_cv, modname = 'gb_cv'))
print(auc)

In [None]:
def train_GB(n_est, depth):
    gb = GradientBoostingClassifier(n_estimators=n_est, max_depth=depth, random_state=0)
    gb_model = gb.fit(X_train_vect, y_train)
    y_pred = gb_model.predict(X_test_vect)
    precision, recall, fscore, train_support = score(y_test, y_pred, pos_label=1, average="binary")
    print('Est: {} / Depth: {} ---- Precision: {} / Recall: {} / Accuracy: {}'.format(
    n_est, depth, precision.mean(), recall.mean(),
    (y_pred==y_test).sum() / len(y_pred), 3))

In [None]:
for n_est in [10, 50, 100, 150, 200]:
    for depth in [10, 20, 30, 50, None]:
        train_GB(n_est, depth)

In [None]:
#Best for GB is 10 estimators depth 10

In [None]:
gb = GradientBoostingClassifier(n_estimators=10, max_depth=10, random_state=0)


gb_model = gb.fit(X_train_vect, y_train)



y_pred = gb_model.predict(X_test_vect)


In [None]:
res_df = res_df.join(conFusion_metricsOutput(y_test, y_pred, 'gb_holdout'))



In [None]:
auc = roc_auc_score(y_test, y_pred)
print(auc)

In [None]:
from sklearn.svm import SVC

In [None]:
svmClas = SVC(probability=True)

y_pred_svm_cv = cross_val_predict(svmClas, X_features_cv, df['stig_label'], cv=k_fold, n_jobs = -1)

auc = roc_auc_score(df['stig_label'], y_pred_svm_cv)

res_df = res_df.join(conFusion_metricsOutput(df['stig_label'], y_pred_svm_cv, modname = 'svm_cv'))
res_df
print(auc)

In [None]:
SVM_cost = [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000, 100000, 1000000]

for cst in SVM_cost:

    svmClas = SVC(C = cst, random_state=0)

    svmClas.fit(X_train_vect, y_train)

    y_pred = svmClas.predict(X_test_vect)
    precision, recall, fscore, train_support = score(y_test, y_pred, pos_label=1, average="binary")
    print('Cost function: {} --- Precision: {} / Recall: {} / Accuracy: {}'.format(
    cst, precision.mean(), recall.mean(),
    (y_pred==y_test).sum() / len(y_pred), 3))

In [None]:
#cost function of 10000 is best

In [None]:
svmClas = SVC(C = 10000, random_state=0)
svmClas.fit(X_train_vect, y_train)

pred = svmClas.predict(X_test_vect)

In [None]:
res_df = res_df.join(conFusion_metricsOutput(y_test, pred, 'svm_holdout'))


In [None]:
auc = roc_auc_score(y_test, pred)

print(auc)

In [None]:
svmClas = SVC(kernel='linear', probability=True)
y_pred_svm_lin_cv = cross_val_predict(svmClas, X_features_cv, df['stig_label'], cv=k_fold, n_jobs = -1)

auc = roc_auc_score(df['stig_label'], y_pred_svm_lin_cv)

res_df = res_df.join(conFusion_metricsOutput(df['stig_label'], y_pred_svm_lin_cv, modname = 'svm_lin_cv'))
res_df
print(auc)

In [None]:
SVM_cost = [0.0001, 0.001, 0.01, 0.1, 1, 10]

for cst in SVM_cost:

    svmClas = SVC(kernel='linear', C = cst, random_state=0)

    svmClas.fit(X_train_vect, y_train)
    y_pred = svmClas.predict(X_test_vect)
    precision, recall, fscore, train_support = score(y_test, y_pred, pos_label=1, average="binary")
    print('Cost function: {} --- Precision: {} / Recall: {} / Accuracy: {}'.format(
    cst, precision.mean(), recall.mean(),
    (y_pred==y_test).sum() / len(y_pred), 3))

In [None]:
#best cost function for SVM linear is 1

In [None]:
svmClas = SVC(kernel='linear', C = 1, random_state=0)
svmClas.fit(X_train_vect, y_train)

pred = svmClas.predict(X_test_vect)

In [None]:
res_df = res_df.join(conFusion_metricsOutput(y_test, pred, 'svm_lin_holdout'))
auc = roc_auc_score(y_test, pred)
print(auc)

In [None]:
res_df 

In [None]:
# according to accuracy and false negatives, Random Forest holdout and SVM linear holdout are best