In [2]:
import pandas as pd
import numpy as np
import nltk
from nltk import FreqDist
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import confusion_matrix

# Import Data and Perform Overall Data Preparation

## Import Data

In [3]:
new_reddit_data = pd.read_csv(r'C:\Users\Home\Documents\Text Analytics\Group Project\Raw Data\correlation.csv')
reddit_data = pd.read_csv(r'C:\Users\Home\Documents\Text Analytics\Group Project\Raw Data\Training-Validation.csv')

trimmed_data = reddit_data.append(new_reddit_data, ignore_index = True)


trimmed_data.drop(['score', 'url', 'num_comments', 'created'], axis = 1, inplace = True)

## Tokenize and clean each post and title and stick the results in lists

In [4]:
#stopwords = stopwords.words('english')

#Tokenize "body".
tokenized_posts_list = []
# Note: word_tokenize() only accepts 1 string at a time. I must loop through the strings and then tokenize it.
for i in trimmed_data['body']:
    tokens_list = []
    tokens = nltk.word_tokenize(str(i))
    for j in tokens:
        if j.isalpha():
            tokens_list.append(j.lower())
    tokenized_posts_list.append(tokens_list)
    
#Tokenize "title". 
tokenized_titles_list = []
# Note: word_tokenize() only accepts 1 string at a time. I must loop through the strings and then tokenize it.
for i in trimmed_data['title']:
    tokens_list = []
    tokens = nltk.word_tokenize(str(i))
    for j in tokens:
        if j.isalpha():
            tokens_list.append(j.lower())
    tokenized_titles_list.append(tokens_list)

#Add list of fully tokenized and cleaned posts onto existing dataframe. This will allow us to analyze each post and count
#stuff in order to create our features
trimmed_data['Cleaned and Tokenized Titles'] = tokenized_titles_list
trimmed_data['Cleaned and Tokenized Posts'] = tokenized_posts_list

## Create the "Gold Standard" by classifying each subreddit as 1 (likely depression or correlated with depression) or 0 (not likely depression).

In [5]:
subreddits_indicative_of_depression = ['depression','depression_help','mentalhealth','addiction', 'cripplingalcoholism', 'SuicideWatch']

manual_classification = []
for i in trimmed_data['subreddit']:
    
    if i in subreddits_indicative_of_depression: 
        manual_classification.append(1)
    else:
        manual_classification.append(0)
        
trimmed_data['Manual Classification'] = manual_classification

trimmed_data.head()

Unnamed: 0,title,id,subreddit,body,Cleaned and Tokenized Titles,Cleaned and Tokenized Posts,Manual Classification
0,Our most-broken and least-understood rules is ...,doqwow,depression,We understand that most people who reply immed...,"[our, and, rules, is, helpers, may, not, invit...","[we, understand, that, most, people, who, repl...",1
1,"Regular Check-In Post, with important reminder...",m246c4,depression,Welcome to /r/depression's check-in post - a p...,"[regular, post, with, important, reminders, ab...","[welcome, to, post, a, place, to, take, a, mom...",1
2,I Feel Like I Live In A World Where Evil Wins,m2oac5,depression,"Hello everyone, little short intro here. I am...","[i, feel, like, i, live, in, a, world, where, ...","[hello, everyone, little, short, intro, here, ...",1
3,I just need to tell someone this,m2om39,depression,\nI’m 17. Today is my Dads 70th birthday. Tha...,"[i, just, need, to, tell, someone, this]","[i, m, today, is, my, dads, birthday, that, mi...",1
4,Im pretty sure i stay up so late because I don...,m2mlcb,depression,"I dont know if that makes sense, but thats how...","[im, pretty, sure, i, stay, up, so, late, beca...","[i, dont, know, if, that, makes, sense, but, t...",1


# Creation of the "Features Grocery Store" - So we can just go "shopping" whenever we need to (meaning create new dataframes with desired features)

## Psycholinguistic Markers (note: the researchers did not remove stopwords; neither will I)

### Aisle 1: Counts of punctuations and words

In [6]:
#Number of Punctuation Characters
punctuation = ['~','-',':',';','"',',','.','?','!']
number_of_punctuation_characters = []

for i in trimmed_data['body']:
    number = 0
    for j in punctuation:
        number += str(i).count(j)
    number_of_punctuation_characters.append(number)

#Number of words
number_of_words = []

for i in trimmed_data['Cleaned and Tokenized Posts']:
    number_of_words.append(len(i))
    
#Number of unique words
number_of_unique_words = []

for i in trimmed_data['Cleaned and Tokenized Posts']:
    number_of_unique_words.append(len(set(i)))

### Aisle 2: Parts of Speech

In [7]:
#The remaining needed Psycholinguistic Markers are all parts of speech. The first step is to tag the P.O.S once for the whole document
#run the P.O.S. tagger on the entire tokenized "tokenized_posts_list" list (remember: this is a list of lists)
POS_list = []
for i in trimmed_data['Cleaned and Tokenized Posts']:
    part_of_speech_tags = nltk.pos_tag(i)
    POS_list.append(part_of_speech_tags)

#Number of Verbs
number_of_all_verbs = []
for i in POS_list:
    all_verbs = [(word,tag) for (word,tag) in i if tag.startswith('V')]
    number_of_all_verbs.append(len(all_verbs))
    
#Number of Adjectives
number_of_adjectives = []
for i in POS_list:
    adjectives = [(word,tag) for (word,tag) in i if tag.startswith('J')]
    number_of_adjectives.append(len(adjectives))
    
#Number of Conjunctions (coordinating conjunctions only)
number_of_conjunctions = []
for i in POS_list:
    conjunctions = [(word,tag) for (word,tag) in i if tag.startswith('CC')]
    number_of_conjunctions.append(len(conjunctions))

#Number of Prepositions (includes prepositions and subordinating conjunctions)
number_of_prepositions = []
for i in POS_list:
    prepositions = [(word,tag) for (word,tag) in i if tag.startswith('IN')]
    number_of_prepositions.append(len(prepositions))
    
#Number of Infinitives
number_of_infinitives = []
for i in POS_list:
    infinitives = [(word,tag) for (word,tag) in i if tag.endswith('VB')]
    number_of_infinitives.append(len(infinitives))
    
#Number of Past Tense Verbs
number_of_past_tense_verbs = []
for i in POS_list:
    past_tense_verbs = [(word,tag) for (word,tag) in i if tag.startswith('VBD')]
    number_of_past_tense_verbs.append(len(past_tense_verbs))
    
#Number of First Person Verbs (this will be a little inexact; this is the rough combo of VBD and VBP)
number_of_first_person_verbs = []
for i in POS_list:
    first_person_verbs = [(word,tag) for (word,tag) in i if tag.startswith('VBD') or tag.startswith('VBP')]
    number_of_first_person_verbs.append(len(first_person_verbs))
    
#Number of Third-Person Verbs (this will undercount. VBZ is 3rd person singular only; combining with anything else will overcount alot)
number_of_third_person_verbs = []
for i in POS_list:
    third_person_verbs = [(word,tag) for (word,tag) in i if tag.startswith('VBZ')]
    number_of_third_person_verbs.append(len(third_person_verbs))
    
#Number of Pronouns
number_of_pronouns = []
NLTK_pronouns = ['PRP', 'PRP$', 'WP', 'WP$']
for i in POS_list:
    pronouns = [(word,tag) for (word,tag) in i if tag in NLTK_pronouns]
    number_of_pronouns.append(len(pronouns))
    
#Number of First-Person Pronouns
number_of_first_person_pronouns = []
list_of_first_person_pronouns = ['We', 'us', 'our','ourselves', 'I', 'me', 'my', 'mine', 'myself']
for i in POS_list:
    first_person_pronouns = [(word,tag) for (word,tag) in i if word in list_of_first_person_pronouns]
    number_of_first_person_pronouns.append(len(first_person_pronouns))
    
#Number of Singular First-Person Pronouns
number_of_singular_first_person_pronouns = []
list_of_singular_first_person_pronouns = ['I', 'me', 'my', 'mine', 'myself']
for i in POS_list:
    singular_first_person_pronouns = [(word,tag) for (word,tag) in i if word in list_of_singular_first_person_pronouns]
    number_of_singular_first_person_pronouns.append(len(singular_first_person_pronouns))
    
#Number of Plural First-Person Pronouns
number_of_plural_first_person_pronouns = []
list_of_plural_first_person_pronouns = ['We', 'us', 'our','ourselves']
for i in POS_list:
    plural_first_person_pronouns = [(word,tag) for (word,tag) in i if word in list_of_plural_first_person_pronouns]
    number_of_plural_first_person_pronouns.append(len(plural_first_person_pronouns))
    


### Aisle 3: Entire Sentences

In [8]:
#Because of the non-standard use of punctuation to end sentences, this count is not perfect. But, eyeballing the first 5 entries
#it looks good enough
number_of_sentences = []
for i in trimmed_data['body']:
    sentences = nltk.sent_tokenize(str(i))
    number_of_sentences.append(len(sentences))

# Dataframe Creation

## "Paper 1" Dataframe - The Psycholinguistic Markers

### Step 1: Creation of all needed new columns

In [9]:
#(N punctuation characters) / (N words)
punctuation_divided_by_words = [i / j if j > 0 else 'divide by zero' for i, j in zip(number_of_punctuation_characters, number_of_words)]

#(N unique words) / (N words)
unique_words_divided_by_words = [i / j if j > 0 else 'divide by zero' for i, j in zip(number_of_unique_words, number_of_words)]

#(N verbs) / (N adjectives)
verbs_divided_by_adjectives = [i / j if j > 0 else 'divide by zero' for i, j in zip(number_of_all_verbs, number_of_adjectives)]

#(N conjunctions + N prepositions) / (N sentences)
conjunctions_plus_prepositions_divided_by_sentences = [(i + k) / j if j > 0 else 'divide by zero' for i,k,j in zip(number_of_conjunctions, number_of_prepositions, number_of_sentences)]

#(N infinitives) / (N verbs)
infintives_divided_by_verbs = [i / j if j > 0 else 'divide by zero' for i, j in zip(number_of_infinitives, number_of_all_verbs)]

#(N singular first person past tense verbs) / (N verbs) (note: inexactly calculated)
SFPPT_verbs_divided_by_verbs = [i / j if j > 0 else 'divide by zero' for i, j in zip(number_of_past_tense_verbs, number_of_all_verbs)]

#(N first person verbs) / (N verbs) (inexactly calculated)
first_person_verbs_divided_by_verbs = [i / j if j > 0 else 'divide by zero' for i, j in zip(number_of_first_person_verbs, number_of_all_verbs)]

#(N third person verbs) / (N verbs) (inexactly calculated)
third_person_verbs_divided_by_verbs = [i / j if j > 0 else 'divide by zero' for i, j in zip(number_of_third_person_verbs, number_of_all_verbs)]

#(N first person pronouns) / (N pronouns)
first_person_pronouns_divided_by_pronouns = [i / j if j > 0 else 'divide by zero' for i, j in zip(number_of_first_person_pronouns, number_of_pronouns)]

#(N singular first person pronouns) / (N pronouns)
singular_first_person_pronouns_divided_by_pronouns = [i / j if j > 0 else 'divide by zero' for i, j in zip(number_of_singular_first_person_pronouns, number_of_pronouns)]

#(N plural first person pronouns) / (N pronouns)
plural_first_person_pronouns_divided_by_pronouns = [i / j if j > 0 else 'divide by zero' for i, j in zip(number_of_plural_first_person_pronouns, number_of_pronouns)]

### Step 2: Creation of "Dataframe With Psycholinguistic Markers"

In [10]:
PM_data = ({'Manual Classification': trimmed_data['Manual Classification'],
            'Subreddit': trimmed_data['subreddit'],
            'Post Title': trimmed_data['title'], 
            'Post Body': trimmed_data['body'], 
            '(N punctuation characters) / (N words)': punctuation_divided_by_words,
            '(N unique words) / (N words)': unique_words_divided_by_words,
            '(N verbs) / (N adjectives)': verbs_divided_by_adjectives,
            '(N conjunctions + N prepositions) / (N sentences)': conjunctions_plus_prepositions_divided_by_sentences,
            '(N infinitives) / (N verbs)': infintives_divided_by_verbs,
            '(N singular first person past tense verbs) / (N verbs)': SFPPT_verbs_divided_by_verbs,
            '(N first person verbs) / (N verbs)': first_person_verbs_divided_by_verbs,
            '(N third person verbs) / (N verbs)': third_person_verbs_divided_by_verbs,
            '(N first person pronouns) / (N pronouns)': first_person_pronouns_divided_by_pronouns,
            '(N singular first person pronouns) / (N pronouns)': singular_first_person_pronouns_divided_by_pronouns,
            '(N plural first person pronouns) / (N pronouns)': plural_first_person_pronouns_divided_by_pronouns})

PM_dataframe = pd.DataFrame(data = PM_data)

#### For the purposes of excluding the new data from model training, split the PM_dataframe into two.

In [11]:
#Segregate the new entries into their own dataframe (subreddits: "addiction", "cripplingalcoholism", and "SuicideWatch")
new_data_filter = (PM_dataframe['Subreddit'] == 'addiction') | (PM_dataframe['Subreddit'] == 'cripplingalcoholism') | (PM_dataframe['Subreddit'] == 'SuicideWatch')

new_data = PM_dataframe.loc[new_data_filter].reset_index(drop = True)

PM_dataframe.drop(PM_dataframe[new_data_filter].index, inplace = True)

In [12]:
PM_dataframe.head()

Unnamed: 0,Manual Classification,Subreddit,Post Title,Post Body,(N punctuation characters) / (N words),(N unique words) / (N words),(N verbs) / (N adjectives),(N conjunctions + N prepositions) / (N sentences),(N infinitives) / (N verbs),(N singular first person past tense verbs) / (N verbs),(N first person verbs) / (N verbs),(N third person verbs) / (N verbs),(N first person pronouns) / (N pronouns),(N singular first person pronouns) / (N pronouns),(N plural first person pronouns) / (N pronouns)
0,1,depression,Our most-broken and least-understood rules is ...,We understand that most people who reply immed...,0.133423,0.447439,1.88406,3.594595,0.376923,0.00769231,0.292308,0.169231,0.125,0.046875,0.078125
1,1,depression,"Regular Check-In Post, with important reminder...",Welcome to /r/depression's check-in post - a p...,0.150943,0.566038,1.91304,3.6,0.340909,0.0227273,0.25,0.159091,0.12,0,0.12
2,1,depression,I Feel Like I Live In A World Where Evil Wins,"Hello everyone, little short intro here. I am...",0.135802,0.510288,2.86364,3.071429,0.206349,0.0952381,0.380952,0.0952381,0.444444,0.111111,0.333333
3,1,depression,I just need to tell someone this,\nI’m 17. Today is my Dads 70th birthday. Tha...,0.0766129,0.504032,2.29167,2.235294,0.2,0.0727273,0.309091,0.327273,0.393939,0.393939,0
4,1,depression,Im pretty sure i stay up so late because I don...,"I dont know if that makes sense, but thats how...",0.0833333,0.916667,2.0,2.0,0.0,0.0,0.5,0.5,divide by zero,divide by zero,divide by zero


In [13]:
new_data.head()

Unnamed: 0,Manual Classification,Subreddit,Post Title,Post Body,(N punctuation characters) / (N words),(N unique words) / (N words),(N verbs) / (N adjectives),(N conjunctions + N prepositions) / (N sentences),(N infinitives) / (N verbs),(N singular first person past tense verbs) / (N verbs),(N first person verbs) / (N verbs),(N third person verbs) / (N verbs),(N first person pronouns) / (N pronouns),(N singular first person pronouns) / (N pronouns),(N plural first person pronouns) / (N pronouns)
0,1,SuicideWatch,New wiki on how to avoid accidentally encourag...,We've been seeing a worrying increase in pro-s...,0.149872,0.362883,2.15823,2.987805,0.334311,0.0381232,0.237537,0.170088,0.135338,0.0150376,0.120301
1,1,SuicideWatch,Please remember that NO ACTIVISM of any kind i...,"Activism, i.e. advocating or fundraising for s...",0.202247,0.61236,2.07143,3.8,0.206897,0.137931,0.206897,0.344828,0.363636,0.0,0.363636
2,1,SuicideWatch,Love of my life died. Don’t know what to do,\nI ’m 19 years old and we were going to be to...,0.122807,0.467836,2.82353,1.2,0.291667,0.166667,0.5625,0.104167,0.25,0.25,0.0
3,1,SuicideWatch,The repetition is driving me insane. I can't s...,I'm so sick of it. Always tired despite spendi...,0.135802,0.728395,2.66667,2.6,0.3125,0.0625,0.25,0.0625,0.5,0.5,0.0
4,1,SuicideWatch,What’s the point of being alive when you’re poor?,Either go homeless and starve to death or work...,0.0645161,0.752688,2.5,2.8,0.4,0.0,0.45,0.05,0.0,0.0,0.0


# Machine Learning Models

## Grab the Psycholingustic Markers and new_data dataframes and get rid of any row with "divide by zero".
## Please note that I researched the given warning message and the Python community agrees that it is a bug caused by Numpy and Pandas fighting with each other. It is safe to ignore this warning when conducting this particular operation.

### Psycholinguistic Markers

In [14]:
#Copy existing dataframe
PM_dataframe_for_ML_models = PM_dataframe.copy()

#Drop any row which contains the text string "divide by zero"
all_columns = list(PM_dataframe_for_ML_models.columns)

for i in all_columns:
    divide_by_zero_filter = PM_dataframe_for_ML_models[i] == 'divide by zero'
    PM_dataframe_for_ML_models.drop(PM_dataframe_for_ML_models[divide_by_zero_filter].index, inplace = True)

PM_dataframe_for_ML_models.reset_index(drop = True, inplace = True)

  res_values = method(rvalues)


### New Data

In [15]:
#Copy existing dataframe
new_data_for_ML_models = new_data.copy()

#Drop any row which contains the text string "divide by zero"
all_columns = list(new_data_for_ML_models.columns)

for i in all_columns:
    divide_by_zero_filter = new_data_for_ML_models[i] == 'divide by zero'
    new_data_for_ML_models.drop(new_data_for_ML_models[divide_by_zero_filter].index, inplace = True)

new_data_for_ML_models.reset_index(drop = True, inplace = True)

## Model Setup: Generate Variable List, Standardize Values, and Partition Data

### Psycholinguistic Dataframe

In [16]:
#Setting Up Numerical Variables (there are no categorical variables). Since I create the dataframe in the above code, I can
#safely drop 'manual classification', 'subreddit', 'post title', and 'post body' by directly slicing the list by index
nvar_list = list(PM_dataframe_for_ML_models.columns)
del nvar_list[0:4]

#Drop unneeded text columns from dataframe which will be used in Machine Learning models
PM_dataframe_for_ML_models.drop(['Subreddit', 'Post Title', 'Post Body'], axis = 1, inplace=True)

#Standardizing Numerical variables
standardized_PM_dataframe_for_ML_models = PM_dataframe_for_ML_models.copy()

original_column_values = PM_dataframe_for_ML_models[nvar_list]
sample_mean = PM_dataframe_for_ML_models[nvar_list].mean()
sample_stddev = PM_dataframe_for_ML_models[nvar_list].std()

standardized_PM_dataframe_for_ML_models[nvar_list] = ((original_column_values - sample_mean)/sample_stddev)

### New Data Dataframe

In [17]:
#Setting Up Numerical Variables (there are no categorical variables). Since I create the dataframe in the above code, I can
#safely drop 'manual classification', 'subreddit', 'post title', and 'post body' by directly slicing the list by index
nvar_list = list(new_data_for_ML_models.columns)
del nvar_list[0:4]

#Drop unneeded text columns from dataframe which will be used in Machine Learning models
new_data_for_ML_models.drop(['Subreddit', 'Post Title', 'Post Body'], axis = 1, inplace=True)

#Standardizing Numerical variables
standardized_new_data_for_ML_models = new_data_for_ML_models.copy()

original_column_values = new_data_for_ML_models[nvar_list]
sample_mean = new_data_for_ML_models[nvar_list].mean()
sample_stddev = new_data_for_ML_models[nvar_list].std()

standardized_new_data_for_ML_models[nvar_list] = ((original_column_values - sample_mean)/sample_stddev)

### Data Partitioning

In [18]:
#Data Partition
#Splitting the data into our partitions will return two dataframes, so we must prep like so:
test_partition_size = .2
data_to_be_partitioned = standardized_PM_dataframe_for_ML_models

non_test_data, test_data = train_test_split(data_to_be_partitioned, test_size = test_partition_size, random_state = 1)

#use standardized_new_data_for_ML_models in place of existing test data. This way the model will be tested off of subreddits unrelated to training

test_data = standardized_new_data_for_ML_models

## Logistic Regression

### Model in the Training/Validation Partitions

In [19]:
#Establish, for Python's sake, the independent and dependent variables
DV = 'Manual Classification'
y = non_test_data[DV]
x = non_test_data.drop(columns = [DV])

#Create Lin Hao's function to return the logistic regression coefficient results in a nice format
def summary_coef(model_object):
    n_predictors = x.shape[1]
    model_coef = pd.DataFrame(model_object.coef_.reshape(1,n_predictors),columns = x.columns.values)
    model_coef['Intercept'] = model_object.intercept_
    return (model_coef.transpose())

#Setup Logistic Regression with k-folds = 5
kfolds = 5

#Establish alpha range for Python to search within
min_alpha = .01
max_alpha = 100

max_C = 1/min_alpha
min_C = 1/max_alpha

#Because there are infinite values between min_alpha and max_alpha, we must specify how many alphas Python should look for
#Python will then divide that interval into an even number of searches. We need numpy for this
n_candidates = 5000
c_list= list(np.linspace(min_C, max_C, num = n_candidates))

#Run logistic regression, use "scoring = roc_auc" to get the Area Under the Curve, and send it into the "nice formatting" function
classifier_optimal = LogisticRegressionCV(Cs = c_list, cv=kfolds, scoring = 'roc_auc', penalty = 'l1',solver='saga',max_iter=200, random_state=1, n_jobs = -1).fit(x,y)
print(summary_coef(classifier_optimal))

#Find the optimal selected alpha
print("\nThis model's optimal alpha in the validation partition is",1/classifier_optimal.C_)

# predict probabilities
logistic_regression_predicted_probabilities_validation_partition = classifier_optimal.predict_proba(x)

# keep probabilities for the positive outcome only
logistic_regression_predicted_probabilities_validation_partition = logistic_regression_predicted_probabilities_validation_partition[:, 1]

# calculate AUC
AUC_logistic_regression_validation_partition = roc_auc_score(y, logistic_regression_predicted_probabilities_validation_partition)
print('The AUC of the optimal model in the validation partition is', AUC_logistic_regression_validation_partition)

                                                           0
(N punctuation characters) / (N words)              0.370752
(N unique words) / (N words)                        0.844662
(N verbs) / (N adjectives)                         -0.280246
(N conjunctions + N prepositions) / (N sentences)   0.111276
(N infinitives) / (N verbs)                         0.106644
(N singular first person past tense verbs) / (N... -0.878462
(N first person verbs) / (N verbs)                  0.140895
(N third person verbs) / (N verbs)                 -0.352125
(N first person pronouns) / (N pronouns)            0.000000
(N singular first person pronouns) / (N pronouns)   1.025793
(N plural first person pronouns) / (N pronouns)    -0.204425
Intercept                                           0.959950

This model's optimal alpha in the validation partition is [1.53831004]
The AUC of the optimal model in the validation partition is 0.8762721987674392


### Model in the Test Partition

In [21]:
#Establish, for Python's sake, the independent and dependent variables
DV = 'Manual Classification'
y2 = test_data[DV]
x2 = test_data.drop(columns = [DV])

#Run logistic regression, use "scoring = roc_auc" to get the Area Under the Curve, and send it into the "nice formatting" function
classifier_optimal_2 = LogisticRegressionCV(Cs = c_list, cv=kfolds, penalty = 'l1',solver='saga',max_iter=200, random_state=1, n_jobs = -1).fit(x,y)
print(summary_coef(classifier_optimal_2))

#Find the optimal selected alpha
print("\nThis model's optimal alpha in the test partition is",1/classifier_optimal_2.C_)

# predict probabilities
logistic_regression_predicted_probabilities_test_partition = classifier_optimal_2.predict_proba(x2)

# keep probabilities for the positive outcome only
logistic_regression_predicted_probabilities_test_partition = logistic_regression_predicted_probabilities_test_partition[:, 1]

# calculate AUC
# AUC_logistic_regression_test_partition = roc_auc_score(y2, logistic_regression_predicted_probabilities_test_partition)
# print('The AUC of the optimal model in the test partition is', AUC_logistic_regression_test_partition)

                                                           0
(N punctuation characters) / (N words)              0.357888
(N unique words) / (N words)                        0.834146
(N verbs) / (N adjectives)                         -0.275073
(N conjunctions + N prepositions) / (N sentences)   0.102943
(N infinitives) / (N verbs)                         0.091333
(N singular first person past tense verbs) / (N... -0.864351
(N first person verbs) / (N verbs)                  0.117717
(N third person verbs) / (N verbs)                 -0.353645
(N first person pronouns) / (N pronouns)            0.000000
(N singular first person pronouns) / (N pronouns)   1.015815
(N plural first person pronouns) / (N pronouns)    -0.199772
Intercept                                           0.951465

This model's optimal alpha in the test partition is [3.99961596]


### Logistic Regression Confusion Matrix (validation partition)

#### Note: since we are dealing with flagging depression, we are most concerned with capturing all True Positives and avoiding False Negatives, since we want to help people to are depressed (capture True Positives) and avoid accidentally ignoring people who are depressed (avoid False Negatives). Thus, we want to measure positive precision and positive recall together as the F-score.

In [24]:
#convert predicted probabilties into 1s and 0s
y_predicted_logistic_regression = []
for i in logistic_regression_predicted_probabilities_test_partition:
    if i >= .5:
        y_predicted_logistic_regression.append(1)
    else:
        y_predicted_logistic_regression.append(0)
        
#create confusion matrix: confusion_matrix(y_true, y_pred)
#Note from sci-k itlearn.org:
#Thus in binary classification, the count of true negatives is M[0,0], false negatives is M[1,0], 
    #true positives is M[1,1], and false positives is M[0,1].
logistic_regression_confusion_matrix = confusion_matrix(y2, y_predicted_logistic_regression)

logistic_regression_true_positive = logistic_regression_confusion_matrix[1,1]
logistic_regression_true_negative = logistic_regression_confusion_matrix[0,0]
logistic_regression_false_positive = logistic_regression_confusion_matrix[0,1]
logistic_regression_false_negative = logistic_regression_confusion_matrix[1,0]

#Calculate positive precision
logistic_regression_positive_precision = logistic_regression_true_positive / (logistic_regression_true_positive + logistic_regression_false_positive)
print('The positive precision for the logistic regression model in the test partition is',logistic_regression_positive_precision)

#Calculate positive recall
logistic_regression_positive_recall = logistic_regression_true_positive / (logistic_regression_true_positive + logistic_regression_false_negative)
print('The positive recall for the logistic regression model in the test partition is',logistic_regression_positive_recall)

#Calculate F score
logistic_regression_F_score = (2 * logistic_regression_positive_precision * logistic_regression_positive_recall) / (logistic_regression_positive_precision + logistic_regression_positive_recall)
print('The F-score for the logistic regression model in the test partition is',logistic_regression_F_score)

The positive precision for the logistic regression model in the test partition is 1.0
The positive recall for the logistic regression model in the test partition is 0.7151187005414411
The F-score for the logistic regression model in the test partition is 0.8338999514327343


## k-NN

### Model in the Training/Validation Partitions

In [25]:
#Establish, for Python's sake, the independent and dependent variables
DV = 'Manual Classification'
y = non_test_data[DV]
x = non_test_data.drop(columns = [DV])

# Run Nearest Neighbors with k-fold cross validation with k=5
kfolds = 5

# Here we specify within which range of Ks we will search through and save that range into a dictionary
max_k = 200
parameters_grid = {'n_neighbors': list(range(1, max_k + 1))}

#Setup a k-NN model which will search through all of our specified hyperparameters (the ks) and then apply the model to our data
k_NN = GridSearchCV(KNeighborsClassifier(metric = 'euclidean'), parameters_grid, scoring='roc_auc', cv=kfolds, n_jobs=-1)
k_NN.fit(x,y)
classifier_best_KNN = k_NN.best_estimator_

# Display optimal k
print('The optimal k in the validation partition is',classifier_best_KNN.n_neighbors)

# predict probabilities
k_NN_predicted_probabilities_validation_partition = classifier_best_KNN.predict_proba(x)

# keep probabilities for the positive outcome only
k_NN_predicted_probabilities_validation_partition = k_NN_predicted_probabilities_validation_partition[:, 1]

# calculate AUC
AUC_k_NN_validation_partition = roc_auc_score(y, k_NN_predicted_probabilities_validation_partition)
print('The AUC of the optimal model in the validation partition is', AUC_k_NN_validation_partition)

The optimal k in the validation partition is 99
The AUC of the optimal model in the validation partition is 0.8915001437163503


### Model in the Test Partition

In [35]:
#Establish, for Python's sake, the independent and dependent variables
DV = 'Manual Classification'
y2 = test_data[DV]
x2 = test_data.drop(columns = [DV])

#Setup a k-NN model which will search through all of our specified hyperparameters (the ks) and then apply the model to our data
k_NN_2 = GridSearchCV(KNeighborsClassifier(metric = 'euclidean'), parameters_grid, cv=kfolds, n_jobs=-1)
k_NN_2.fit(x,y)
classifier_best_KNN_2 = k_NN_2.best_estimator_

# Display optimal k
print('The optimal k in the test partition is',classifier_best_KNN_2.n_neighbors)

#predict probabilities
k_NN_predicted_probabilities_test_partition = classifier_best_KNN_2.predict_proba(x2)

#keep probabilities for the positive outcome only
k_NN_predicted_probabilities_test_partition = k_NN_predicted_probabilities_test_partition[:, 1]

# calculate AUC
# AUC_k_NN_test_partition = roc_auc_score(y2, k_NN_predicted_probabilities_test_partition)
# print('The AUC of the optimal model in the test partition is', AUC_k_NN_test_partition)

The optimal k in the test partition is 49


### k-NN Confusion Matrix (test partition)

#### Note: since we are dealing with flagging depression, we are most concerned with capturing all True Positives and avoiding False Negatives, since we want to help people to are depressed (capture True Positives) and avoid accidentally ignoring people who are depressed (avoid False Negatives). Thus, we want to measure positive precision and positive recall together as the F-score.

In [37]:
#convert predicted probabilties into 1s and 0s
y_predicted_k_NN = []
for i in k_NN_predicted_probabilities_test_partition:
    if i >= .5:
        y_predicted_k_NN.append(1)
    else:
        y_predicted_k_NN.append(0)
        
# create confusion matrix: confusion_matrix(y_true, y_pred)
# Note from sci-k itlearn.org:
# Thus in binary classification, the count of true negatives is M[0,0], false negatives is M[1,0], 
#     true positives is M[1,1], and false positives is M[0,1].
k_NN_confusion_matrix = confusion_matrix(y2,y_predicted_k_NN)

k_NN_true_positive = k_NN_confusion_matrix[1,1]
k_NN_true_negative = k_NN_confusion_matrix[0,0]
k_NN_false_positive = k_NN_confusion_matrix[0,1]
k_NN_false_negative = k_NN_confusion_matrix[1,0]

#Calculate positive precision
k_NN_positive_precision = k_NN_true_positive / (k_NN_true_positive + k_NN_false_positive)
print('The positive precision for the k-NN model in the test partition is',k_NN_positive_precision)

#Calculate positive recall
k_NN_positive_recall = k_NN_true_positive / (k_NN_true_positive + k_NN_false_negative)
print('The positive recall for the k-NN model in the test partition is',k_NN_positive_recall)

#Calculate F score
k_NN_F_score = (2 * k_NN_positive_precision * k_NN_positive_recall) / (k_NN_positive_precision + k_NN_positive_recall)
print('The F-score for the k-NN model in the test partition is',k_NN_F_score)

#Calculate accuracy rate
print("The model's accuracy against the test partition is",classifier_best_KNN_2.score(x2,y2))

#Calculate F score
k_NN_F_score = (2 * k_NN_positive_precision * k_NN_positive_recall) / (k_NN_positive_precision + k_NN_positive_recall)
print('The F-score for the k-NN model in the test partition is',k_NN_F_score)

The positive precision for the k-NN model in the test partition is 1.0
The positive recall for the k-NN model in the test partition is 0.7067888379841732
The F-score for the k-NN model in the test partition is 0.8282088823816496
The model's accuracy against the test partition is 0.7067888379841732
The F-score for the k-NN model in the test partition is 0.8282088823816496


## Classification Tree (data standardization does not matter, though our data is already standardized at this point)

### Model in the Validation Partition

In [31]:
#Establish, for Python's sake, the independent and dependent variables
DV = 'Manual Classification'
y = non_test_data[DV]
x = non_test_data.drop(columns = [DV])

# Run Classification tree with k-fold cross validation with k=5
kfolds = 5

# Here we specify within which range of depths we will search for the best pruned tree
maximum_depth = 100
minimum_depth = 1
parameter_grid = {'max_depth': list(range(minimum_depth, maximum_depth + 1))}

classification_tree = GridSearchCV(DecisionTreeClassifier(criterion='entropy', random_state=1), parameter_grid, scoring='roc_auc', cv=kfolds, n_jobs=-1)
classification_tree.fit(x,y)
best_pruned_tree = classification_tree.best_estimator_

# Display the level of depth of the best pruned tree
print('The best pruned tree in the validation partition is of depth',best_pruned_tree.get_depth())

# predict probabilities
classification_tree_predicted_probabilities_validation_partition = best_pruned_tree.predict_proba(x)

# keep probabilities for the positive outcome only
classification_tree_predicted_probabilities_validation_partition = classification_tree_predicted_probabilities_validation_partition[:, 1]

# calculate AUC
AUC_classification_tree_validation_partition = roc_auc_score(y, classification_tree_predicted_probabilities_validation_partition)
print('The AUC of the best pruned tree in the validation partition is', AUC_classification_tree_validation_partition)

The best pruned tree in the validation partition is of depth 5
The AUC of the best pruned tree in the validation partition is 0.8904330884489143


### Model in the Test Partition

In [32]:
#Establish, for Python's sake, the independent and dependent variables
DV = 'Manual Classification'
y2 = test_data[DV]
x2 = test_data.drop(columns = [DV])

# Run Classification tree with k-fold cross test with k=5
kfolds = 5

# Here we specify within which range of depths we will search for the best pruned tree
maximum_depth = 100
minimum_depth = 1
parameter_grid = {'max_depth': list(range(minimum_depth, maximum_depth + 1))}

classification_tree_2 = GridSearchCV(DecisionTreeClassifier(criterion='entropy', random_state=1), parameter_grid, cv=kfolds, n_jobs=-1)
classification_tree_2.fit(x,y)
best_pruned_tree_2 = classification_tree_2.best_estimator_

# Display the level of depth of the best pruned tree
print('The best pruned tree in the test partition is of depth',best_pruned_tree_2.get_depth())

# predict 1s and 0s
classification_tree_predicted_probabilities_test_partition = best_pruned_tree_2.predict_proba(x2)

# keep probabilities for the positive outcome only
classification_tree_predicted_probabilities_test_partition = classification_tree_predicted_probabilities_test_partition[:, 1]

# # calculate AUC
# AUC_classification_tree_test_partition = roc_auc_score(y2, classification_tree_predicted_probabilities_test_partition)
# print('The AUC of the best pruned tree in the test partition is', AUC_classification_tree_test_partition)

The best pruned tree in the test partition is of depth 4


### Classification Tree Confusion Matrix (test partition)

#### Note: since we are dealing with flagging depression, we are most concerned with capturing all True Positives and avoiding False Negatives, since we want to help people to are depressed (capture True Positives) and avoid accidentally ignoring people who are depressed (avoid False Negatives). Thus, we want to measure positive precision and positive recall together as the F-score.

In [38]:
#convert predicted probabilties into 1s and 0s
y_predicted_classification_tree = []
for i in classification_tree_predicted_probabilities_test_partition:
    if i >= .5:
        y_predicted_classification_tree.append(1)
    else:
        y_predicted_classification_tree.append(0)
        
# create confusion matrix: confusion_matrix(y_true, y_pred)
# Note from sci-k itlearn.org:
# Thus in binary classification, the count of true negatives is M[0,0], false negatives is M[1,0], 
    #true positives is M[1,1], and false positives is M[0,1].
classification_tree_confusion_matrix = confusion_matrix(y2,y_predicted_classification_tree)

classification_tree_true_positive = classification_tree_confusion_matrix[1,1]
classification_tree_true_negative = classification_tree_confusion_matrix[0,0]
classification_tree_false_positive = classification_tree_confusion_matrix[0,1]
classification_tree_false_negative = classification_tree_confusion_matrix[1,0]

#Calculate positive precision
classification_tree_positive_precision = classification_tree_true_positive / (classification_tree_true_positive + classification_tree_false_positive)
print('The positive precision for the classification tree model in the test partition is',classification_tree_positive_precision)

#Calculate positive recall
classification_tree_positive_recall = classification_tree_true_positive / (classification_tree_true_positive + classification_tree_false_negative)
print('The positive recall for the classification tree model in the test partition is',classification_tree_positive_recall)

#Calculate F score
classification_tree_F_score = (2 * classification_tree_positive_precision * classification_tree_positive_recall) / (classification_tree_positive_precision + classification_tree_positive_recall)
print('The F-score for the classification tree model in the test partition is',classification_tree_F_score)

#Calculate accuracy rate
print("The model's accuracy against the test partition is",best_pruned_tree_2.score(x2,y2))

#Calculate F score
classification_tree_F_score = (2 * classification_tree_positive_precision * classification_tree_positive_recall) / (classification_tree_positive_precision + classification_tree_positive_recall)
print('The F-score for the classification tree model in the test partition is',classification_tree_F_score)

The positive precision for the classification tree model in the test partition is 1.0
The positive recall for the classification tree model in the test partition is 0.7784256559766763
The F-score for the classification tree model in the test partition is 0.8754098360655737
The model's accuracy against the test partition is 0.7784256559766763
The F-score for the classification tree model in the test partition is 0.8754098360655737


## Random Forest

### Model in the validation partition

In [48]:
#Establish, for Python's sake, the independent and dependent variables
DV = 'Manual Classification'
y = non_test_data[DV]
x = non_test_data.drop(columns = [DV])

# Run Random Forest with k-fold cross validation with k=5
kfolds = 5

#Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 3)]

#Number of features to consider at every split
#From the Random Forest Python documentation: 
    #Empirical good default values are max_features=sqrt(n_features) for classification tasks
    #This translates to the 'sqrt' default option that the model contains
max_features = ['sqrt']

#Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 3)]
max_depth.append(None)

#Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]

#Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]

#Create the random grid, and run the classifier on our x and y
param_grid = {'n_estimators': n_estimators,'max_features': max_features,'max_depth': max_depth,
              'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf}

rf = RandomForestClassifier()
grid_cv = GridSearchCV(rf, param_grid, scoring = 'roc_auc', cv = kfolds, n_jobs=-1)
grid_cv.fit(x, y)

random_forest_classifier_optimal_validation_partition = grid_cv.best_estimator_

# predict probabilities using the classifier and keep probabilities for the positive outcome only
random_forest_predicted_probabilities_validation_partition = random_forest_classifier_optimal_validation_partition.predict_proba(x)[:,1]

# Get the AUC of the best Random Forest Classifier
AUC_random_forest_validation_partition = roc_auc_score(y, random_forest_predicted_probabilities_validation_partition)
print('The AUC for the random forest model in the validation partition is', AUC_random_forest_validation_partition )

The AUC for the random forest model in the validation partition is 0.9854008140835842


### Model in the test partition

In [50]:
#Establish, for Python's sake, the independent and dependent variables
DV = 'Manual Classification'
y2 = test_data[DV]
x2 = test_data.drop(columns = [DV])

# Run Random Forest with k-fold cross test with k=5
kfolds = 5

#Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 3)]

#Number of features to consider at every split
#From the Random Forest Python documentation: 
    #Empirical good default values are max_features=sqrt(n_features) for classification tasks
    #This translates to the 'sqrt' default option that the model contains
max_features = ['sqrt']

#Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 3)]
max_depth.append(None)

#Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]

#Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]

#Create the random grid, and run the classifier on our x and y
param_grid = {'n_estimators': n_estimators,'max_features': max_features,'max_depth': max_depth,
              'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf}

rf = RandomForestClassifier()
grid_cv = GridSearchCV(rf, param_grid, cv = kfolds, n_jobs=-1)
grid_cv.fit(x, y)

random_forest_classifier_optimal_test_partition = grid_cv.best_estimator_

# predict probabilities and keep only positive outcomes
random_forest_predicted_probabilities_test_partition = random_forest_classifier_optimal_test_partition.predict_proba(x2)[:,1]

#Get the AUC of the best Random Forest Classifier
# AUC_random_forest_test_partition = roc_auc_score(y2, random_forest_predicted_probabilities_test_partition)
# print('The AUC for the random forest model in the test partition is', AUC_random_forest_test_partition)

### Random Forest Confusion Matrix (test partition)

#### Note: since we are dealing with flagging depression, we are most concerned with capturing all True Positives and avoiding False Negatives, since we want to help people to are depressed (capture True Positives) and avoid accidentally ignoring people who are depressed (avoid False Negatives). Thus, we want to measure positive precision and positive recall together as the F-score.

In [51]:
#convert predicted probabilties into 1s and 0s
y_predicted_random_forest = []
for i in random_forest_predicted_probabilities_test_partition:
    if i >= .5:
        y_predicted_random_forest.append(1)
    else:
        y_predicted_random_forest.append(0)
        
# create confusion matrix: confusion_matrix(y_true, y_pred)
# Note from sci-k itlearn.org:
# Thus in binary classification, the count of true negatives is M[0,0], false negatives is M[1,0], 
#     true positives is M[1,1], and false positives is M[0,1].
random_forest_confusion_matrix = confusion_matrix(y2,y_predicted_random_forest)

random_forest_true_positive = random_forest_confusion_matrix[1,1]
random_forest_true_negative = random_forest_confusion_matrix[0,0]
random_forest_false_positive = random_forest_confusion_matrix[0,1]
random_forest_false_negative = random_forest_confusion_matrix[1,0]

#Calculate positive precision
random_forest_positive_precision = random_forest_true_positive / (random_forest_true_positive + random_forest_false_positive)
print('The positive precision for the random forest model in the test partition is',random_forest_positive_precision)

#Calculate positive recall
random_forest_positive_recall = random_forest_true_positive / (random_forest_true_positive + random_forest_false_negative)
print('The positive recall for the random forest model in the test partition is',random_forest_positive_recall)

#Calculate F score
random_forest_F_score = (2 * random_forest_positive_precision * random_forest_positive_recall) / (random_forest_positive_precision + random_forest_positive_recall)
print('The F-score for the random forest model in the test partition is',random_forest_F_score)

#Calculate accuracy rate
print("The model's accuracy against the test partition is",random_forest_classifier_optimal_test_partition.score(x2,y2))

#Calculate F score
random_forest_F_score = (2 * random_forest_positive_precision * random_forest_positive_recall) / (random_forest_positive_precision + random_forest_positive_recall)
print('The F-score for the random forest model in the test partition is',random_forest_F_score)

The positive precision for the random forest model in the test partition is 1.0
The positive recall for the random forest model in the test partition is 0.7317784256559767
The F-score for the random forest model in the test partition is 0.8451178451178452
The model's accuracy against the test partition is 0.7317784256559767
The F-score for the random forest model in the test partition is 0.8451178451178452


## AdaBoost

### Model in the validation partition

In [39]:
#Establish, for Python's sake, the independent and dependent variables
DV = 'Manual Classification'
y = non_test_data[DV]
x = non_test_data.drop(columns = [DV])

# Run Random Forest with k-fold cross validation with k=5
kfolds = 5

#Set a parameter grid for the GridSearchCV
param_grid = {"base_estimator__criterion": ["gini", "entropy"], "base_estimator__splitter": ["best", "random"], 
              "n_estimators": [10, 50, 100, 500]}

#Assigning models and functions
DTC = DecisionTreeClassifier(random_state = 1, max_features = "auto", max_depth = 10)
adaboost = AdaBoostClassifier(base_estimator = DTC, random_state=1)

#Use a GridSearchCV to find the optimal model candidate
grid_cv = GridSearchCV(adaboost, param_grid, scoring = 'roc_auc', cv = kfolds, n_jobs=-1)
grid_cv.fit(x, y)
adaboost_classifier_optimal_validation_partition = grid_cv.best_estimator_

# predict probabilities using the classifier and keep probabilities for the positive outcome only
adaboost_predicted_probabilities_validation_partition = adaboost_classifier_optimal_validation_partition.predict_proba(x)[:,1]

# Get the AUC in the validation partition
AUC_adaboost_validation_partition = roc_auc_score(y, adaboost_predicted_probabilities_validation_partition)
print('The AUC for the adaboost model in the validation partition is', AUC_adaboost_validation_partition)

The AUC for the adaboost model in the validation partition is 1.0


### Model in the test partition

In [46]:
#Establish, for Python's sake, the independent and dependent variables
DV = 'Manual Classification'
y2 = test_data[DV]
x2 = test_data.drop(columns = [DV])

# Run Random Forest with k-fold cross test with k=5
kfolds = 5

#Set a parameter grid for the GridSearchCV
param_grid = {"base_estimator__criterion": ["gini", "entropy"], "base_estimator__splitter": ["best", "random"], 
              "n_estimators": [10, 50, 100, 500]}

#Assigning models and functions
DTC = DecisionTreeClassifier(random_state = 1, max_features = "auto", max_depth = 10)
adaboost = AdaBoostClassifier(base_estimator = DTC, random_state=1)

#Use a GridSearchCV to find the optimal model candidate
grid_cv = GridSearchCV(adaboost, param_grid, cv = kfolds, n_jobs=-1)
grid_cv.fit(x, y)
adaboost_classifier_optimal_test_partition = grid_cv.best_estimator_

# predict probabilities and keep only predictions for the positive outcome
adaboost_predicted_probabilities_test_partition = adaboost_classifier_optimal_test_partition.predict_proba(x2)[:,1]

# # Get the AUC in the test partition
# AUC_adaboost_test_partition = roc_auc_score(y2, adaboost_predicted_probabilities_test_partition)
# print('The AUC for the adaboost model in the test partition is', AUC_adaboost_test_partition)

### AdaBoost Confusion Matrix (test partition)

#### Note: since we are dealing with flagging depression, we are most concerned with capturing all True Positives and avoiding False Negatives, since we want to help people to are depressed (capture True Positives) and avoid accidentally ignoring people who are depressed (avoid False Negatives). Thus, we want to measure positive precision and positive recall together as the F-score.

In [47]:
#convert predicted probabilties into 1s and 0s
y_predicted_adaboost = []
for i in adaboost_predicted_probabilities_test_partition:
    if i >= .5:
        y_predicted_adaboost.append(1)
    else:
        y_predicted_adaboost.append(0)
        
# create confusion matrix: confusion_matrix(y_true, y_pred)
# Note from sci-k itlearn.org:
# Thus in binary classification, the count of true negatives is M[0,0], false negatives is M[1,0], 
    #true positives is M[1,1], and false positives is M[0,1].
adaboost_confusion_matrix = confusion_matrix(y2,y_predicted_adaboost)

adaboost_true_positive = adaboost_confusion_matrix[1,1]
adaboost_true_negative = adaboost_confusion_matrix[0,0]
adaboost_false_positive = adaboost_confusion_matrix[0,1]
adaboost_false_negative = adaboost_confusion_matrix[1,0]

#Calculate positive precision
adaboost_positive_precision = adaboost_true_positive / (adaboost_true_positive + adaboost_false_positive)
print('The positive precision for the adaboost model in the test partition is',adaboost_positive_precision)

#Calculate positive recall
adaboost_positive_recall = adaboost_true_positive / (adaboost_true_positive + adaboost_false_negative)
print('The positive recall for the adaboost model in the test partition is',adaboost_positive_recall)

#Calculate F score
adaboost_F_score = (2 * adaboost_positive_precision * adaboost_positive_recall) / (adaboost_positive_precision + adaboost_positive_recall)
print('The F-score for the adaboost model in the test partition is',adaboost_F_score)

#Calculate accuracy rate
print("The model's accuracy against the test partition is",adaboost_classifier_optimal_test_partition.score(x2,y2))

#Calculate F score
adaboost_F_score = (2 * adaboost_positive_precision * adaboost_positive_recall) / (adaboost_positive_precision + adaboost_positive_recall)
print('The F-score for the adaboost model in the test partition is',adaboost_F_score)

The positive precision for the adaboost model in the test partition is 1.0
The positive recall for the adaboost model in the test partition is 0.7650978758850479
The F-score for the adaboost model in the test partition is 0.8669183577159036
The model's accuracy against the test partition is 0.7650978758850479
The F-score for the adaboost model in the test partition is 0.8669183577159036


# Create dataframe of overall results for easy viewing

In [52]:
model_names = ['k-NN', 'Classification Tree', 'Random Forest', 'AdaBoost']

# AUC_in_validation_partition = [AUC_k_NN_validation_partition,
#                               AUC_classification_tree_validation_partition,
#                               AUC_random_forest_validation_partition,
#                               AUC_adaboost_validation_partition]


# AUC_in_test_partition = [AUC_k_NN_test_partition,
#                               AUC_classification_tree_test_partition,
#                               AUC_random_forest_test_partition,
#                               AUC_adaboost_test_partition]


positive_precision = [k_NN_positive_precision,
                      classification_tree_positive_precision,
                      random_forest_positive_precision,
                      adaboost_positive_precision]
                    

positive_recall = [k_NN_positive_recall,
                      classification_tree_positive_recall,
                      random_forest_positive_recall,
                      adaboost_positive_recall]
                      

F_score = [k_NN_F_score,
                      classification_tree_F_score,
                      random_forest_F_score,
                      adaboost_F_score]
                      
summary_data = ({'Model Names': model_names, 
                'Positive Precision': positive_precision,
                'Positive Recall': positive_recall,
                'F-Score': F_score})

summary_df = pd.DataFrame(summary_data)

summary_df.head()

Unnamed: 0,Model Names,Positive Precision,Positive Recall,F-Score
0,k-NN,1.0,0.706789,0.828209
1,Classification Tree,1.0,0.778426,0.87541
2,Random Forest,1.0,0.731778,0.845118
3,AdaBoost,1.0,0.765098,0.866918


# Move AdaBoost accuracy against "new data" down here for easy finding

In [53]:
#AdaBoost accuracy
adaboost_accuracy = (adaboost_true_positive + adaboost_true_negative) / (adaboost_true_positive + adaboost_true_negative + adaboost_false_positive + adaboost_false_negative)
print('The Adaboost model accuracy on the new data in the test partition is', adaboost_accuracy)

The Adaboost model accuracy on the new data in the test partition is 0.7650978758850479
