In [46]:
import nltk
import random
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
import string
from textblob.classifiers import NaiveBayesClassifier
import numpy as np
from textblob import TextBlob
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
%matplotlib inline
stopwords = nltk.corpus.stopwords.words('english')
ps = nltk.PorterStemmer()
pd.set_option('display.max_columns', 10000)
##Explore this blog: https://jakevdp.github.io/PythonDataScienceHandbook/05.08-random-forests.html for visualisation ideas

In [47]:
#Loads rated tweets into dataframe
df = pd.read_excel("C:/Users/ertur/Documents/Work/Workwork/ARUK/Submission - JMIR Aging/Revisions/Categorised tweets 1500.xlsx", converters={'Tweet':str,'Theme':int})

In [48]:
df = df.rename(columns = {'Tweet':'body_text', 'Theme':'label'})

In [49]:
df.shape

(1500, 2)

In [50]:
#removing cases where rating is missing
df = df.dropna()
df.shape

(1497, 2)

In [51]:
#obtaining sentiment and subjectivity
def sentAnal(df):
    for index, row in df.iterrows():
        temp = TextBlob(row['body_text'])
        df.loc[index,'Sentiment'] = temp.sentiment.polarity
    return df

In [52]:
df = sentAnal(df)

In [53]:
#removing tweets rated as uncertain or unknown
themes=[1,2,3,4,5,6]
df = df[df.label.isin(themes)]
df.shape

(1414, 3)

In [54]:
#converting assigned themes into corresponding rating of stigmatising and non-stigmatising
theme_map = {1:0, 2:0, 3:0, 4:1, 5:1, 6:1}
df['stig_label'] = df.label.map(theme_map)
df = df.drop('label', axis = 1)

In [55]:
#literature defined features are generated
def count_punct(text):
    count = sum([1 for char in text if char in string.punctuation])
    return round(count/(len(text) - text.count(" ")), 3)*100

df['body_len'] = df['body_text'].apply(lambda x: len(x) - x.count(" "))
df['punct%'] = df['body_text'].apply(lambda x: count_punct(x))

def clean_text(text):
    text = "".join([word.lower() for word in text if word not in string.punctuation])
    tokens = re.split('\W+', text)
    text = [ps.stem(word) for word in tokens if word not in stopwords]
    return text
def avg_word(sentence):
    words = sentence.split()
    return (sum(len(word) for word in words)/len(words))

# Average Word Length. simply take the sum of the length of all the words and divide it by the total length of the tweet as defined in function above
df['avg_word'] = df['body_text'].apply(lambda x: avg_word(x))

# Number of Words in tweet
df['word_count'] = df['body_text'].apply(lambda x: len(str(x).split(" ")))

# Number of characters. Here, we calculate the number of characters in each tweet. This is done by calculating the length of the tweet.
df['char_count'] = df['body_text'].str.len() ## this also includes spaces

# number of special characters like hashtags. we make use of the ‘starts with’ function because hashtags (or mentions) always appear at the beginning of a word.
df['hastags'] = df['body_text'].apply(lambda x: len([x for x in x.split() if x.startswith('#')]))

# number of numerics in tweet
df['numerics'] = df['body_text'].apply(lambda x: len([x for x in x.split() if x.isdigit()]))


In [56]:
#care-partner defined features are generated
#senile
Search_for_These_values = ['senile', 'SENILE'] 
pattern = '|'.join(Search_for_These_values) 
df['senile'] = df['body_text'].str.contains(pattern)
df['senile'] = df['senile'].map({True: 1, False: 0})
#demented
Search_for_These_values = ['demented', 'DEMENTED'] 
pattern = '|'.join(Search_for_These_values) 
df['demented'] = df['body_text'].str.contains(pattern)
df['demented'] = df['demented'].map({True: 1, False: 0})
#donald trump
Search_for_These_values = ['donald', 'trump', 'DONALD', 'TRUMP', '@realDonaldTrump'] 
pattern = '|'.join(Search_for_These_values) 
df['donaldtrump'] = df['body_text'].str.contains(pattern)
df['donaldtrump'] = df['donaldtrump'].map({True: 1, False: 0})
#memory
Search_for_These_values = ['MEMORY', 'memory'] 
pattern = '|'.join(Search_for_These_values) 
df['Memory'] = df['body_text'].str.contains(pattern)
df['Memory'] = df['Memory'].map({True: 1, False: 0})
#research
Search_for_These_values = ['research', 'RESEARCH'] 
pattern = '|'.join(Search_for_These_values) 
df['Research'] = df['body_text'].str.contains(pattern)
df['Research'] = df['Research'].map({True: 1, False: 0})
#crazy
Search_for_These_values = ['crazy', 'CRAZY'] 
pattern = '|'.join(Search_for_These_values) 
df['Crazy'] = df['body_text'].str.contains(pattern)
df['Crazy'] = df['Crazy'].map({True: 1, False: 0})
#senility
Search_for_These_values = ['senility', 'SENILITY'] 
pattern = '|'.join(Search_for_These_values) 
df['Senility'] = df['body_text'].str.contains(pattern)
df['Senility'] = df['Senility'].map({True: 1, False: 0})
# URL
Search_for_These_values = ['https'] 
pattern = '|'.join(Search_for_These_values) 
df['Link'] = df['body_text'].str.contains(pattern)
df['Link'] = df['Link'].map({True: 1, False: 0})
#caregiver
Search_for_These_values = ['caregiver', 'CAREGIVER'] 
pattern = '|'.join(Search_for_These_values) 
df['Caregiver'] = df['body_text'].str.contains(pattern)
df['Caregiver'] = df['Caregiver'].map({True: 1, False: 0})

In [57]:
df.shape

(1414, 19)

In [58]:
cols = df[df.columns.difference(["stig_label"])].columns

In [59]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df[cols], df['stig_label'], test_size=0.2, random_state = 1)

In [60]:
cols = df[df.columns.difference(["stig_label", "body_text"])].columns

In [61]:
print("The size of each training and testing datasets are:")
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

The size of each training and testing datasets are:
(1131, 18)
(283, 18)
(1131,)
(283,)


In [62]:
# instantiate the vectorizer
tfidf_vect = TfidfVectorizer(analyzer=clean_text)

# learn training data vocabulary, then use it to create a document-term matrix
tfidf_vect_fit = tfidf_vect.fit(X_train['body_text'])
tfidf_train = tfidf_vect_fit.transform(X_train['body_text'])


# transform testing data (using fitted vocabulary) into a document-term matrix
tfidf_test = tfidf_vect_fit.transform(X_test['body_text'])

X_train_vect = pd.concat([X_train[cols].reset_index(drop=True), 
           pd.DataFrame(tfidf_train.toarray(), columns=tfidf_vect.get_feature_names())], axis=1)

X_test_vect = pd.concat([X_test[cols].reset_index(drop=True), 
           pd.DataFrame(tfidf_test.toarray(), columns=tfidf_vect.get_feature_names())], axis=1)

<hr style="border:1px solid black"> </hr>

**Cross Fold Validation Code from Nick**, it's slower but everything is comparable and the test vectors have been vectorised by the training vectors. I've not included all modesl, but you should be able to see the pattern of how to do it if you want to

In [63]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_predict
import time
from sklearn import metrics
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import roc_auc_score

In [64]:
from sklearn.model_selection import StratifiedKFold

n_folds = 5 # number of folds to do being set
kf_Strat = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=0) #sets this as a 'train test split equiv'

In [67]:
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.preprocessing import MinMaxScaler

# For monitoring training
from IPython.display import clear_output
from tqdm import tqdm #progress bars

results = {}
fld_cnt = 1
for train_index, test_index in kf_Strat.split(df[cols], df['stig_label']):
    
    print("Processing fold " + str((fld_cnt)))
    
    cols = df[df.columns.difference(["stig_label"])].columns
    
    X_train_CFV = df.loc[df.index[train_index],cols]
    y_train_CFV =df.loc[df.index[train_index],'stig_label']
    X_test_CFV = df.loc[df.index[test_index],cols]
    y_test_CFV =df.loc[df.index[test_index],'stig_label']
    
    cols = df[df.columns.difference(["stig_label", "body_text"])].columns

    # instantiate the vectorizer
    tfidf_vect = TfidfVectorizer(analyzer=clean_text)

    # learn training data vocabulary, then use it to create a document-term matrix
    tfidf_vect_fit = tfidf_vect.fit(X_train_CFV['body_text'])
    
    tfidf_train_CFV = tfidf_vect_fit.transform(X_train_CFV['body_text'])

    # transform testing data (using fitted vocabulary) into a document-term matrix
    tfidf_test_CFV = tfidf_vect_fit.transform(X_test_CFV['body_text'])

    X_train_CFV_vect = pd.concat([X_train_CFV[cols].reset_index(drop=True), 
               pd.DataFrame(tfidf_train_CFV.toarray(), columns=tfidf_vect.get_feature_names())], axis=1)

    X_test_CFV_vect = pd.concat([X_test_CFV[cols].reset_index(drop=True), 
               pd.DataFrame(tfidf_test_CFV.toarray(), columns=tfidf_vect.get_feature_names())], axis=1) 
    
    # Scale the data to reduce influence of features with large values and to speed up training
    min_max_scaler = MinMaxScaler()
    X_train_CFV_vect = min_max_scaler.fit_transform(X_train_CFV_vect)
    X_test_CFV_vect = min_max_scaler.transform(X_test_CFV_vect)   
    
    rf_CFV = RandomForestClassifier(n_estimators=500, max_depth=25, n_jobs=-1, random_state=0)
    
    rf_model_CFV = rf_CFV.fit(X_train_CFV_vect, y_train_CFV)
    y_pred_CFV_RF = rf_model_CFV.predict(X_test_CFV_vect)
    confusion = metrics.confusion_matrix(y_test_CFV, y_pred_CFV_RF)
    TP = confusion[1, 1] #True Positives (TP): we correctly predicted that tweets do have stigma
    TN = confusion[0, 0] #True Negatives (TN): we correctly predicted that tweets don't have stigma
    FP = confusion[0, 1] #False Positives (FP): we incorrectly predicted that tweets do have stigma (a "Type I error")
    FN = confusion[1, 0]
    auc = roc_auc_score(y_test_CFV, y_pred_CFV_RF)
    results[fld_cnt] = [TP,TN,FP,FN, auc]
    fld_cnt += 1
    


Processing fold 1
Processing fold 2
Processing fold 3
Processing fold 4
Processing fold 5


In [68]:
results

{1: [150, 124, 4, 5, 0.9682459677419355],
 2: [145, 125, 3, 10, 0.956023185483871],
 3: [151, 124, 3, 5, 0.9721633353523118],
 4: [145, 125, 2, 11, 0.9568695739955584],
 5: [152, 123, 4, 3, 0.9745745491490984]}

In [73]:
for i in results:
    temp = results[i]
    TP = temp[0]
    TN = temp[1]
    FP = temp[2]
    FN = temp[3]
    auc_t= temp[4]
    if i == 1:
        accuracy = ((TP + TN) / float(TP + TN + FP + FN))
        misclassication_rate = ((FP + FN) / float(TP + TN + FP + FN))
        recall_tpr = (TP / float(TP + FN))
        specificity = (TN / float(TN + FP))
        fpr = (FP / float(TN + FP))
        precision = (TP / float(TP + FP))
        fnr = (FN / float(TP + FN))
        false_negatives = FN
        false_positives = FP
        auc = auc_t
    else:
        accuracy += ((TP + TN) / float(TP + TN + FP + FN))
        misclassication_rate += ((FP + FN) / float(TP + TN + FP + FN))
        recall_tpr += (TP / float(TP + FN))
        specificity += (TN / float(TN + FP))
        fpr += (FP / float(TN + FP))
        precision += (TP / float(TP + FP))
        fnr += (FN / float(TP + FN))
        false_negatives += FN
        false_positives += FP
        auc += auc_t
res = [accuracy/5, misclassication_rate/5, recall_tpr/5, specificity/5, fpr/5,  precision/5, fnr/5, false_negatives/5, false_positives/5, auc/5 ]
print('RF CV results:')
res



RF CV results:


[0.9646467684134026,
 0.0353532315865975,
 0.9562613730355667,
 0.9748892716535433,
 0.02511072834645669,
 0.9790057432914574,
 0.04373862696443341,
 6.8,
 3.2,
 0.9655753223445551]

In [76]:
results = {}
fld_cnt = 1
for train_index, test_index in kf_Strat.split(df[cols], df['stig_label']):
    
    print("Processing fold " + str((fld_cnt)))
    
    cols = df[df.columns.difference(["stig_label"])].columns
    
    X_train_CFV = df.loc[df.index[train_index],cols]
    y_train_CFV =df.loc[df.index[train_index],'stig_label']
    X_test_CFV = df.loc[df.index[test_index],cols]
    y_test_CFV =df.loc[df.index[test_index],'stig_label']
    
    cols = df[df.columns.difference(["stig_label", "body_text"])].columns

    # instantiate the vectorizer
    tfidf_vect = TfidfVectorizer(analyzer=clean_text)

    # learn training data vocabulary, then use it to create a document-term matrix
    tfidf_vect_fit = tfidf_vect.fit(X_train_CFV['body_text'])
    
    tfidf_train_CFV = tfidf_vect_fit.transform(X_train_CFV['body_text'])

    # transform testing data (using fitted vocabulary) into a document-term matrix
    tfidf_test_CFV = tfidf_vect_fit.transform(X_test_CFV['body_text'])

    X_train_CFV_vect = pd.concat([X_train_CFV[cols].reset_index(drop=True), 
               pd.DataFrame(tfidf_train_CFV.toarray(), columns=tfidf_vect.get_feature_names())], axis=1)

    X_test_CFV_vect = pd.concat([X_test_CFV[cols].reset_index(drop=True), 
               pd.DataFrame(tfidf_test_CFV.toarray(), columns=tfidf_vect.get_feature_names())], axis=1) 
    
    # Scale the data to reduce influence of features with large values and to speed up training
    min_max_scaler = MinMaxScaler()
    X_train_CFV_vect = min_max_scaler.fit_transform(X_train_CFV_vect)
    X_test_CFV_vect = min_max_scaler.transform(X_test_CFV_vect)   
    
    GB_CFV = GradientBoostingClassifier(n_estimators=200, max_depth=10, random_state=0)
    
    GB_model_CFV = GB_CFV.fit(X_train_CFV_vect, y_train_CFV)
    y_pred_CFV_GB = GB_model_CFV.predict(X_test_CFV_vect)
    confusion = metrics.confusion_matrix(y_test_CFV, y_pred_CFV_GB)
    TP = confusion[1, 1] #True Positives (TP): we correctly predicted that tweets do have stigma
    TN = confusion[0, 0] #True Negatives (TN): we correctly predicted that tweets don't have stigma
    FP = confusion[0, 1] #False Positives (FP): we incorrectly predicted that tweets do have stigma (a "Type I error")
    FN = confusion[1, 0]
    auc = roc_auc_score(y_test_CFV, y_pred_CFV_GB)
    results[fld_cnt] = [TP,TN,FP,FN, auc]
    fld_cnt += 1
    

Processing fold 1
Processing fold 2
Processing fold 3
Processing fold 4
Processing fold 5


In [77]:
for i in results:
    temp = results[i]
    TP = temp[0]
    TN = temp[1]
    FP = temp[2]
    FN = temp[3]
    auc_t= temp[4]
    if i == 1:
        accuracy = ((TP + TN) / float(TP + TN + FP + FN))
        misclassication_rate = ((FP + FN) / float(TP + TN + FP + FN))
        recall_tpr = (TP / float(TP + FN))
        specificity = (TN / float(TN + FP))
        fpr = (FP / float(TN + FP))
        precision = (TP / float(TP + FP))
        fnr = (FN / float(TP + FN))
        false_negatives = FN
        false_positives = FP
        auc = auc_t
    else:
        accuracy += ((TP + TN) / float(TP + TN + FP + FN))
        misclassication_rate += ((FP + FN) / float(TP + TN + FP + FN))
        recall_tpr += (TP / float(TP + FN))
        specificity += (TN / float(TN + FP))
        fpr += (FP / float(TN + FP))
        precision += (TP / float(TP + FP))
        fnr += (FN / float(TP + FN))
        false_negatives += FN
        false_positives += FP
        auc += auc_t
res = [accuracy/5, misclassication_rate/5, recall_tpr/5, specificity/5, fpr/5,  precision/5, fnr/5, false_negatives/5, false_positives/5, auc/5 ]
print('GB CV results:')
res

GB CV results:


[0.9476655890534547,
 0.052334410946545375,
 0.9382464846980977,
 0.9591535433070867,
 0.04084645669291338,
 0.9656388604235797,
 0.061753515301902394,
 9.6,
 5.2,
 0.9487000140025922]

In [78]:
results = {}
fld_cnt = 1
for train_index, test_index in kf_Strat.split(df[cols], df['stig_label']):
    
    print("Processing fold " + str((fld_cnt)))
    
    cols = df[df.columns.difference(["stig_label"])].columns
    
    X_train_CFV = df.loc[df.index[train_index],cols]
    y_train_CFV =df.loc[df.index[train_index],'stig_label']
    X_test_CFV = df.loc[df.index[test_index],cols]
    y_test_CFV =df.loc[df.index[test_index],'stig_label']
    
    cols = df[df.columns.difference(["stig_label", "body_text"])].columns

    # instantiate the vectorizer
    tfidf_vect = TfidfVectorizer(analyzer=clean_text)

    # learn training data vocabulary, then use it to create a document-term matrix
    tfidf_vect_fit = tfidf_vect.fit(X_train_CFV['body_text'])
    
    tfidf_train_CFV = tfidf_vect_fit.transform(X_train_CFV['body_text'])

    # transform testing data (using fitted vocabulary) into a document-term matrix
    tfidf_test_CFV = tfidf_vect_fit.transform(X_test_CFV['body_text'])

    X_train_CFV_vect = pd.concat([X_train_CFV[cols].reset_index(drop=True), 
               pd.DataFrame(tfidf_train_CFV.toarray(), columns=tfidf_vect.get_feature_names())], axis=1)

    X_test_CFV_vect = pd.concat([X_test_CFV[cols].reset_index(drop=True), 
               pd.DataFrame(tfidf_test_CFV.toarray(), columns=tfidf_vect.get_feature_names())], axis=1) 
    
    # Scale the data to reduce influence of features with large values and to speed up training
    min_max_scaler = MinMaxScaler()
    X_train_CFV_vect = min_max_scaler.fit_transform(X_train_CFV_vect)
    X_test_CFV_vect = min_max_scaler.transform(X_test_CFV_vect)   
    
    svmClas_CFV = SVC(C = 10, probability=True, random_state=0)
    
    svmClas_CFV = svmClas_CFV.fit(X_train_CFV_vect, y_train_CFV)
    y_pred_CFV_svmClas = svmClas_CFV.predict(X_test_CFV_vect)
    confusion = metrics.confusion_matrix(y_test_CFV, y_pred_CFV_svmClas)
    TP = confusion[1, 1] #True Positives (TP): we correctly predicted that tweets do have stigma
    TN = confusion[0, 0] #True Negatives (TN): we correctly predicted that tweets don't have stigma
    FP = confusion[0, 1] #False Positives (FP): we incorrectly predicted that tweets do have stigma (a "Type I error")
    FN = confusion[1, 0]
    auc = roc_auc_score(y_test_CFV, y_pred_CFV_svmClas)
    results[fld_cnt] = [TP,TN,FP,FN, auc]
    fld_cnt += 1

Processing fold 1
Processing fold 2
Processing fold 3
Processing fold 4
Processing fold 5


In [79]:
for i in results:
    temp = results[i]
    TP = temp[0]
    TN = temp[1]
    FP = temp[2]
    FN = temp[3]
    auc_t= temp[4]
    if i == 1:
        accuracy = ((TP + TN) / float(TP + TN + FP + FN))
        misclassication_rate = ((FP + FN) / float(TP + TN + FP + FN))
        recall_tpr = (TP / float(TP + FN))
        specificity = (TN / float(TN + FP))
        fpr = (FP / float(TN + FP))
        precision = (TP / float(TP + FP))
        fnr = (FN / float(TP + FN))
        false_negatives = FN
        false_positives = FP
        auc = auc_t
    else:
        accuracy += ((TP + TN) / float(TP + TN + FP + FN))
        misclassication_rate += ((FP + FN) / float(TP + TN + FP + FN))
        recall_tpr += (TP / float(TP + FN))
        specificity += (TN / float(TN + FP))
        fpr += (FP / float(TN + FP))
        precision += (TP / float(TP + FP))
        fnr += (FN / float(TP + FN))
        false_negatives += FN
        false_positives += FP
        auc += auc_t
res = [accuracy/5, misclassication_rate/5, recall_tpr/5, specificity/5, fpr/5,  precision/5, fnr/5, false_negatives/5, false_positives/5, auc/5 ]
print('SVM RBF CV results:')
res

SVM RBF CV results:


[0.9554494649525098,
 0.04455053504749016,
 0.9305045492142267,
 0.9858513779527559,
 0.014148622047244094,
 0.9878592847667569,
 0.06949545078577336,
 10.8,
 1.8,
 0.9581779635834913]

In [131]:
results = {}
fld_cnt = 1
for train_index, test_index in kf_Strat.split(df[cols], df['stig_label']):
    
    print("Processing fold " + str((fld_cnt)))
    
    cols = df[df.columns.difference(["stig_label"])].columns
    
    X_train_CFV = df.loc[df.index[train_index],cols]
    y_train_CFV =df.loc[df.index[train_index],'stig_label']
    X_test_CFV = df.loc[df.index[test_index],cols]
    y_test_CFV =df.loc[df.index[test_index],'stig_label']
    
    cols = df[df.columns.difference(["stig_label", "body_text"])].columns

    # instantiate the vectorizer
    tfidf_vect = TfidfVectorizer(analyzer=clean_text)

    # learn training data vocabulary, then use it to create a document-term matrix
    tfidf_vect_fit = tfidf_vect.fit(X_train_CFV['body_text'])
    
    tfidf_train_CFV = tfidf_vect_fit.transform(X_train_CFV['body_text'])

    # transform testing data (using fitted vocabulary) into a document-term matrix
    tfidf_test_CFV = tfidf_vect_fit.transform(X_test_CFV['body_text'])

    X_train_CFV_vect = pd.concat([X_train_CFV[cols].reset_index(drop=True), 
               pd.DataFrame(tfidf_train_CFV.toarray(), columns=tfidf_vect.get_feature_names())], axis=1)

    X_test_CFV_vect = pd.concat([X_test_CFV[cols].reset_index(drop=True), 
               pd.DataFrame(tfidf_test_CFV.toarray(), columns=tfidf_vect.get_feature_names())], axis=1) 
    
    # Scale the data to reduce influence of features with large values and to speed up training
    min_max_scaler = MinMaxScaler()
    X_train_CFV_vect = min_max_scaler.fit_transform(X_train_CFV_vect)
    X_test_CFV_vect = min_max_scaler.transform(X_test_CFV_vect)   
    
    svmClasL_CFV = SVC(kernel='linear',C = 0.1, probability=True, random_state=0)
    
    svmClasL_model_CFV = svmClasL_CFV.fit(X_train_CFV_vect, y_train_CFV)
    y_pred_CFV_svmClasL = svmClasL_model_CFV.predict(X_test_CFV_vect)
    confusion = metrics.confusion_matrix(y_test_CFV, y_pred_CFV_svmClasL)
    TP = confusion[1, 1] #True Positives (TP): we correctly predicted that tweets do have stigma
    TN = confusion[0, 0] #True Negatives (TN): we correctly predicted that tweets don't have stigma
    FP = confusion[0, 1] #False Positives (FP): we incorrectly predicted that tweets do have stigma (a "Type I error")
    FN = confusion[1, 0]
    auc = roc_auc_score(y_test_CFV, y_pred_CFV_svmClasL)
    results[fld_cnt] = [TP,TN,FP,FN, auc]
    fld_cnt += 1

Processing fold 1
Processing fold 2
Processing fold 3
Processing fold 4
Processing fold 5


In [132]:
for i in results:
    temp = results[i]
    TP = temp[0]
    TN = temp[1]
    FP = temp[2]
    FN = temp[3]
    auc_t= temp[4]
    if i == 1:
        accuracy = ((TP + TN) / float(TP + TN + FP + FN))
        misclassication_rate = ((FP + FN) / float(TP + TN + FP + FN))
        recall_tpr = (TP / float(TP + FN))
        specificity = (TN / float(TN + FP))
        fpr = (FP / float(TN + FP))
        precision = (TP / float(TP + FP))
        fnr = (FN / float(TP + FN))
        false_negatives = FN
        false_positives = FP
        auc = auc_t
    else:
        accuracy += ((TP + TN) / float(TP + TN + FP + FN))
        misclassication_rate += ((FP + FN) / float(TP + TN + FP + FN))
        recall_tpr += (TP / float(TP + FN))
        specificity += (TN / float(TN + FP))
        fpr += (FP / float(TN + FP))
        precision += (TP / float(TP + FP))
        fnr += (FN / float(TP + FN))
        false_negatives += FN
        false_positives += FP
        auc += auc_t
res = [accuracy/5, misclassication_rate/5, recall_tpr/5, specificity/5, fpr/5,  precision/5, fnr/5, false_negatives/5, false_positives/5, auc/5 ]
print('SVM Linear CV results:')
res

SVM Linear CV results:


[0.960396461418941,
 0.03960353858105907,
 0.9369396195202647,
 0.9890009842519685,
 0.010999015748031495,
 0.9905262336735314,
 0.0630603804797353,
 9.8,
 1.4,
 0.9629703018861167]

**End of Nick's Code**

<hr style="border:1px solid black"> </hr>

In [56]:
pd.options.display.float_format = '{:,.2f}'.format
np.set_printoptions(precision=2)

In [58]:
 def conFusion_metricsOutput(var_y_test, var_y_pred, modname):
    
    res = []   
    confusion = metrics.confusion_matrix(var_y_test, var_y_pred)
    TP = confusion[1, 1] #True Positives (TP): we correctly predicted that tweets do have stigma
    TN = confusion[0, 0] #True Negatives (TN): we correctly predicted that tweets don't have stigma
    FP = confusion[0, 1] #False Positives (FP): we incorrectly predicted that tweets do have stigma (a "Type I error")
    FN = confusion[1, 0] #False Negatives (FN): we incorrectly predicted that tweets don't have stigma (a "Type II error")

    ## Classification Accuracy: Overall, how often is the classifier correct?
    accuracy = ((TP + TN) / float(TP + TN + FP + FN))
    #print(metrics.accuracy_score(y_test, y_pred))

    #Classification Error: Overall, how often is the classifier incorrect?
    #Also known as "Misclassification Rate"
    misclassication_rate = ((FP + FN) / float(TP + TN + FP + FN))
    #print(1 - metrics.accuracy_score(y_test, y_pred))

    #Recall/Sensitivity: When the actual value is positive, how often is the prediction correct?
    #How "sensitive" is the classifier to detecting positive instances?
    #Also known as "True Positive Rate"
    recall_tpr = (TP / float(TP + FN))
    #print(metrics.recall_score(y_test, y_pred))

    #Specificity: When the actual value is negative, how often is the prediction correct?
    #How "specific" (or "selective") is the classifier in predicting positive instances?
    specificity = (TN / float(TN + FP))

    #False Positive Rate: When the actual value is negative, how often is the prediction incorrect?
    fpr = (FP / float(TN + FP))

    #Precision: When a positive value is predicted, how often is the prediction correct?
    #How "precise" is the classifier when predicting positive instances?
    precision = (TP / float(TP + FP))
    #print(metrics.precision_score(y_test, y_pred))
    
    fnr = (FN / float(TP + FN))
    
    false_negatives = FN
    
    false_positives = FP
    
    
    res.append([accuracy, misclassication_rate, recall_tpr, specificity, fpr, precision, fnr, false_negatives, false_positives])
    
    data=pd.DataFrame(res ,columns=['accuracy','misclassication_rate', 'recall_tpr' , 'specificity' , 'fpr', 'precision', 'fnr', 'false_negatives', 'false_positives'], index=[modname]).T
    print(TP, TN, FP, FN)
    
    return data

In [None]:
rf = RandomForestClassifier(n_estimators=500, max_depth=25, n_jobs=-1, random_state=0)
k_fold = KFold(n_splits=5)

y_pred_rf_cv = cross_val_predict(rf, X_features_cv, df['stig_label'], cv=k_fold, n_jobs = -1)

auc = roc_auc_score(df['stig_label'], y_pred_rf_cv)

In [None]:
res_df = conFusion_metricsOutput(df['stig_label'], y_pred_rf_cv, modname = 'rf_cv')
res_df
print(auc)

In [None]:
def train_RF(n_est, depth):
    rf = RandomForestClassifier(n_estimators=n_est, max_depth=depth, n_jobs=-1, random_state=0)
    rf_model = rf.fit(X_train_vect, y_train)
    y_pred = rf_model.predict(X_test_vect)
    precision, recall, fscore, train_support = score(y_test, y_pred, pos_label=1, average="binary")
    print('Est: {} / Depth: {} ---- Precision: {} / Recall: {} / Accuracy: {}'.format(
    n_est, depth, precision.mean(), recall.mean(),
    (y_pred==y_test).sum() / len(y_pred), 3))

In [None]:
#Extend if extremes show best
%precision %.2f
np.set_printoptions(precision=2)
#200 estimators with 20 depth gives best results first time so redo up to 250

for n_est in [10, 50, 100, 150, 200, 250]:
    for depth in [10, 20, 30, 50, None]:
        train_RF(n_est, depth)

In [None]:
#best is 200 est, depth 20

In [None]:
#rf hold out test

rf = RandomForestClassifier(n_estimators=200, max_depth=20, n_jobs=-1, random_state=0) 

rf_model = rf.fit(X_train_vect, y_train)

y_pred = rf_model.predict(X_test_vect)
auc = roc_auc_score(y_test, y_pred)

In [None]:
res_df = res_df.join(conFusion_metricsOutput(y_test, y_pred, 'rf_holdout'))

print(auc)

In [None]:
gb = GradientBoostingClassifier(n_estimators=150, max_depth=None, random_state=0)
k_fold = KFold(n_splits=5)

y_pred_gb_cv = cross_val_predict(gb, X_features_cv, df['stig_label'], cv=k_fold, n_jobs = -1)

auc = roc_auc_score(df['stig_label'], y_pred_gb_cv)

In [None]:
res_df = res_df.join(conFusion_metricsOutput(df['stig_label'], y_pred_gb_cv, modname = 'gb_cv'))
print(auc)

In [None]:
def train_GB(n_est, depth):
    gb = GradientBoostingClassifier(n_estimators=n_est, max_depth=depth, random_state=0)
    gb_model = gb.fit(X_train_vect, y_train)
    y_pred = gb_model.predict(X_test_vect)
    precision, recall, fscore, train_support = score(y_test, y_pred, pos_label=1, average="binary")
    print('Est: {} / Depth: {} ---- Precision: {} / Recall: {} / Accuracy: {}'.format(
    n_est, depth, precision.mean(), recall.mean(),
    (y_pred==y_test).sum() / len(y_pred), 3))

In [None]:
for n_est in [10, 50, 100, 150, 200]:
    for depth in [10, 20, 30, 50, None]:
        train_GB(n_est, depth)

In [None]:
#Best for GB is 10 estimators depth 10

In [None]:
gb = GradientBoostingClassifier(n_estimators=10, max_depth=10, random_state=0)


gb_model = gb.fit(X_train_vect, y_train)



y_pred = gb_model.predict(X_test_vect)


In [None]:
res_df = res_df.join(conFusion_metricsOutput(y_test, y_pred, 'gb_holdout'))



In [None]:
auc = roc_auc_score(y_test, y_pred)
print(auc)

In [None]:
from sklearn.svm import SVC

In [None]:
svmClas = SVC(probability=True)

y_pred_svm_cv = cross_val_predict(svmClas, X_features_cv, df['stig_label'], cv=k_fold, n_jobs = -1)

auc = roc_auc_score(df['stig_label'], y_pred_svm_cv)

res_df = res_df.join(conFusion_metricsOutput(df['stig_label'], y_pred_svm_cv, modname = 'svm_cv'))
res_df
print(auc)

In [None]:
SVM_cost = [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000, 100000, 1000000]

for cst in SVM_cost:

    svmClas = SVC(C = cst, random_state=0)

    svmClas.fit(X_train_vect, y_train)

    y_pred = svmClas.predict(X_test_vect)
    precision, recall, fscore, train_support = score(y_test, y_pred, pos_label=1, average="binary")
    print('Cost function: {} --- Precision: {} / Recall: {} / Accuracy: {}'.format(
    cst, precision.mean(), recall.mean(),
    (y_pred==y_test).sum() / len(y_pred), 3))

In [None]:
#cost function of 10000 is best

In [None]:
svmClas = SVC(C = 10000, random_state=0)
svmClas.fit(X_train_vect, y_train)

pred = svmClas.predict(X_test_vect)

In [None]:
res_df = res_df.join(conFusion_metricsOutput(y_test, pred, 'svm_holdout'))


In [None]:
auc = roc_auc_score(y_test, pred)

print(auc)

In [None]:
svmClas = SVC(kernel='linear', probability=True)
y_pred_svm_lin_cv = cross_val_predict(svmClas, X_features_cv, df['stig_label'], cv=k_fold, n_jobs = -1)

auc = roc_auc_score(df['stig_label'], y_pred_svm_lin_cv)

res_df = res_df.join(conFusion_metricsOutput(df['stig_label'], y_pred_svm_lin_cv, modname = 'svm_lin_cv'))
res_df
print(auc)

In [None]:
SVM_cost = [0.0001, 0.001, 0.01, 0.1, 1, 10]

for cst in SVM_cost:

    svmClas = SVC(kernel='linear', C = cst, random_state=0)

    svmClas.fit(X_train_vect, y_train)
    y_pred = svmClas.predict(X_test_vect)
    precision, recall, fscore, train_support = score(y_test, y_pred, pos_label=1, average="binary")
    print('Cost function: {} --- Precision: {} / Recall: {} / Accuracy: {}'.format(
    cst, precision.mean(), recall.mean(),
    (y_pred==y_test).sum() / len(y_pred), 3))

In [None]:
#best cost function for SVM linear is 1

In [None]:
svmClas = SVC(kernel='linear', C = 1, random_state=0)
svmClas.fit(X_train_vect, y_train)

pred = svmClas.predict(X_test_vect)

In [None]:
res_df = res_df.join(conFusion_metricsOutput(y_test, pred, 'svm_lin_holdout'))
auc = roc_auc_score(y_test, pred)
print(auc)

In [None]:
res_df 

In [None]:
# according to accuracy and false negatives, Random Forest holdout and SVM linear holdout are best