In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk import FreqDist
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.metrics import roc_auc_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import confusion_matrix

# Import Data and Perform Overall Data Preparation

## Import Data

In [2]:
reddit_data = pd.read_csv(r'C:\Users\Home\Documents\Text Analytics\Group Project\Raw Data\Training-Validation.csv')
trimmed_data = reddit_data.copy()

trimmed_data.drop(['score', 'url', 'num_comments', 'created'], axis = 1, inplace = True)

## Tokenize and clean each post and title and stick the results in lists

In [3]:
#stopwords = stopwords.words('english')

#Tokenize "body".
tokenized_posts_list = []
# Note: word_tokenize() only accepts 1 string at a time. I must loop through the strings and then tokenize it.
for i in trimmed_data['body']:
    tokens_list = []
    tokens = nltk.word_tokenize(str(i))
    for j in tokens:
        if j.isalpha():
            tokens_list.append(j.lower())
    tokenized_posts_list.append(tokens_list)
    
#Tokenize "title". 
tokenized_titles_list = []
# Note: word_tokenize() only accepts 1 string at a time. I must loop through the strings and then tokenize it.
for i in trimmed_data['title']:
    tokens_list = []
    tokens = nltk.word_tokenize(str(i))
    for j in tokens:
        if j.isalpha():
            tokens_list.append(j.lower())
    tokenized_titles_list.append(tokens_list)

#Add list of fully tokenized and cleaned posts onto existing dataframe. This will allow us to analyze each post and count
#stuff in order to create our features
trimmed_data['Cleaned and Tokenized Titles'] = tokenized_titles_list
trimmed_data['Cleaned and Tokenized Posts'] = tokenized_posts_list

## Create the "Gold Standard" by classifying each subreddit as 1 (likely depression) or 0 (not likely depression).

In [4]:
subreddits_indicative_of_depression = ['depression','depression_help','mentalhealth']

manual_classification = []
for i in trimmed_data['subreddit']:
    
    if i in subreddits_indicative_of_depression: 
        manual_classification.append(1)
    else:
        manual_classification.append(0)
        
trimmed_data['Manual Classification'] = manual_classification

trimmed_data.head()

# Creation of the "Features Grocery Store" - So we can just go "shopping" whenever we need to (meaning create new dataframes with desired features)

## Psycholinguistic Markers (note: the researchers did not remove stopwords; neither will I)

### Aisle 1: Counts of punctuations and words

In [7]:
#Number of Punctuation Characters
punctuation = ['~','-',':',';','"',',','.','?','!']
number_of_punctuation_characters = []

for i in trimmed_data['body']:
    number = 0
    for j in punctuation:
        number += str(i).count(j)
    number_of_punctuation_characters.append(number)

#Number of words
number_of_words = []

for i in trimmed_data['Cleaned and Tokenized Posts']:
    number_of_words.append(len(i))
    
#Number of unique words
number_of_unique_words = []

for i in trimmed_data['Cleaned and Tokenized Posts']:
    number_of_unique_words.append(len(set(i)))

### Aisle 2: Parts of Speech

In [8]:
#The remaining needed Psycholinguistic Markers are all parts of speech. The first step is to tag the P.O.S once for the whole document
#run the P.O.S. tagger on the entire tokenized "tokenized_posts_list" list (remember: this is a list of lists)
POS_list = []
for i in trimmed_data['Cleaned and Tokenized Posts']:
    part_of_speech_tags = nltk.pos_tag(i)
    POS_list.append(part_of_speech_tags)

#Number of Verbs
number_of_all_verbs = []
for i in POS_list:
    all_verbs = [(word,tag) for (word,tag) in i if tag.startswith('V')]
    number_of_all_verbs.append(len(all_verbs))
    
#Number of Adjectives
number_of_adjectives = []
for i in POS_list:
    adjectives = [(word,tag) for (word,tag) in i if tag.startswith('J')]
    number_of_adjectives.append(len(adjectives))
    
#Number of Conjunctions (coordinating conjunctions only)
number_of_conjunctions = []
for i in POS_list:
    conjunctions = [(word,tag) for (word,tag) in i if tag.startswith('CC')]
    number_of_conjunctions.append(len(conjunctions))

#Number of Prepositions (includes prepositions and subordinating conjunctions)
number_of_prepositions = []
for i in POS_list:
    prepositions = [(word,tag) for (word,tag) in i if tag.startswith('IN')]
    number_of_prepositions.append(len(prepositions))
    
#Number of Infinitives
number_of_infinitives = []
for i in POS_list:
    infinitives = [(word,tag) for (word,tag) in i if tag.endswith('VB')]
    number_of_infinitives.append(len(infinitives))
    
#Number of Past Tense Verbs
number_of_past_tense_verbs = []
for i in POS_list:
    past_tense_verbs = [(word,tag) for (word,tag) in i if tag.startswith('VBD')]
    number_of_past_tense_verbs.append(len(past_tense_verbs))
    
#Number of First Person Verbs (this will be a little inexact; this is the rough combo of VBD and VBP)
number_of_first_person_verbs = []
for i in POS_list:
    first_person_verbs = [(word,tag) for (word,tag) in i if tag.startswith('VBD') or tag.startswith('VBP')]
    number_of_first_person_verbs.append(len(first_person_verbs))
    
#Number of Third-Person Verbs (this will undercount. VBZ is 3rd person singular only; combining with anything else will overcount alot)
number_of_third_person_verbs = []
for i in POS_list:
    third_person_verbs = [(word,tag) for (word,tag) in i if tag.startswith('VBZ')]
    number_of_third_person_verbs.append(len(third_person_verbs))
    
#Number of Pronouns
number_of_pronouns = []
NLTK_pronouns = ['PRP', 'PRP$', 'WP', 'WP$']
for i in POS_list:
    pronouns = [(word,tag) for (word,tag) in i if tag in NLTK_pronouns]
    number_of_pronouns.append(len(pronouns))
    
#Number of First-Person Pronouns
number_of_first_person_pronouns = []
list_of_first_person_pronouns = ['We', 'us', 'our','ourselves', 'I', 'me', 'my', 'mine', 'myself']
for i in POS_list:
    first_person_pronouns = [(word,tag) for (word,tag) in i if word in list_of_first_person_pronouns]
    number_of_first_person_pronouns.append(len(first_person_pronouns))
    
#Number of Singular First-Person Pronouns
number_of_singular_first_person_pronouns = []
list_of_singular_first_person_pronouns = ['I', 'me', 'my', 'mine', 'myself']
for i in POS_list:
    singular_first_person_pronouns = [(word,tag) for (word,tag) in i if word in list_of_singular_first_person_pronouns]
    number_of_singular_first_person_pronouns.append(len(singular_first_person_pronouns))
    
#Number of Plural First-Person Pronouns
number_of_plural_first_person_pronouns = []
list_of_plural_first_person_pronouns = ['We', 'us', 'our','ourselves']
for i in POS_list:
    plural_first_person_pronouns = [(word,tag) for (word,tag) in i if word in list_of_plural_first_person_pronouns]
    number_of_plural_first_person_pronouns.append(len(plural_first_person_pronouns))
    


### Aisle 3: Entire Sentences

In [9]:
#Because of the non-standard use of punctuation to end sentences, this count is not perfect. But, eyeballing the first 5 entries
#it looks good enough
number_of_sentences = []
for i in trimmed_data['body']:
    sentences = nltk.sent_tokenize(str(i))
    number_of_sentences.append(len(sentences))

# Dataframe Creation

## "Paper 1" Dataframe - The Psycholinguistic Markers

### Step 1: Creation of all needed new columns

In [12]:
#(N punctuation characters) / (N words)
punctuation_divided_by_words = [i / j if j > 0 else 'divide by zero' for i, j in zip(number_of_punctuation_characters, number_of_words)]

#(N unique words) / (N words)
unique_words_divided_by_words = [i / j if j > 0 else 'divide by zero' for i, j in zip(number_of_unique_words, number_of_words)]

#(N verbs) / (N adjectives)
verbs_divided_by_adjectives = [i / j if j > 0 else 'divide by zero' for i, j in zip(number_of_all_verbs, number_of_adjectives)]

#(N conjunctions + N prepositions) / (N sentences)
conjunctions_plus_prepositions_divided_by_sentences = [(i + k) / j if j > 0 else 'divide by zero' for i,k,j in zip(number_of_conjunctions, number_of_prepositions, number_of_sentences)]

#(N infinitives) / (N verbs)
infintives_divided_by_verbs = [i / j if j > 0 else 'divide by zero' for i, j in zip(number_of_infinitives, number_of_all_verbs)]

#(N singular first person past tense verbs) / (N verbs) (note: inexactly calculated)
SFPPT_verbs_divided_by_verbs = [i / j if j > 0 else 'divide by zero' for i, j in zip(number_of_past_tense_verbs, number_of_all_verbs)]

#(N first person verbs) / (N verbs) (inexactly calculated)
first_person_verbs_divided_by_verbs = [i / j if j > 0 else 'divide by zero' for i, j in zip(number_of_first_person_verbs, number_of_all_verbs)]

#(N third person verbs) / (N verbs) (inexactly calculated)
third_person_verbs_divided_by_verbs = [i / j if j > 0 else 'divide by zero' for i, j in zip(number_of_third_person_verbs, number_of_all_verbs)]

#(N first person pronouns) / (N pronouns)
first_person_pronouns_divided_by_pronouns = [i / j if j > 0 else 'divide by zero' for i, j in zip(number_of_first_person_pronouns, number_of_pronouns)]

#(N singular first person pronouns) / (N pronouns)
singular_first_person_pronouns_divided_by_pronouns = [i / j if j > 0 else 'divide by zero' for i, j in zip(number_of_singular_first_person_pronouns, number_of_pronouns)]

#(N plural first person pronouns) / (N pronouns)
plural_first_person_pronouns_divided_by_pronouns = [i / j if j > 0 else 'divide by zero' for i, j in zip(number_of_plural_first_person_pronouns, number_of_pronouns)]

### Step 2: Creation of "Dataframe With Psycholinguistic Markers"

In [13]:
PM_data = ({'Manual Classification': trimmed_data['Manual Classification'], 
            'Subreddit': trimmed_data['subreddit'],
            'Post Title': trimmed_data['title'], 
            'Post Body': trimmed_data['body'], 
            '(N punctuation characters) / (N words)': punctuation_divided_by_words,
            '(N unique words) / (N words)': unique_words_divided_by_words,
            '(N verbs) / (N adjectives)': verbs_divided_by_adjectives,
            '(N conjunctions + N prepositions) / (N sentences)': conjunctions_plus_prepositions_divided_by_sentences,
            '(N infinitives) / (N verbs)': infintives_divided_by_verbs,
            '(N singular first person past tense verbs) / (N verbs)': SFPPT_verbs_divided_by_verbs,
            '(N first person verbs) / (N verbs)': first_person_verbs_divided_by_verbs,
            '(N third person verbs) / (N verbs)': third_person_verbs_divided_by_verbs,
            '(N first person pronouns) / (N pronouns)': first_person_pronouns_divided_by_pronouns,
            '(N singular first person pronouns) / (N pronouns)': singular_first_person_pronouns_divided_by_pronouns,
            '(N plural first person pronouns) / (N pronouns)': plural_first_person_pronouns_divided_by_pronouns})

PM_dataframe = pd.DataFrame(data = PM_data)

In [14]:
PM_dataframe.head()

Unnamed: 0,Manual Classification,Subreddit,Post Title,Post Body,(N punctuation characters) / (N words),(N unique words) / (N words),(N verbs) / (N adjectives),(N conjunctions + N prepositions) / (N sentences),(N infinitives) / (N verbs),(N singular first person past tense verbs) / (N verbs),(N first person verbs) / (N verbs),(N third person verbs) / (N verbs),(N first person pronouns) / (N pronouns),(N singular first person pronouns) / (N pronouns),(N plural first person pronouns) / (N pronouns)
0,1,depression,Our most-broken and least-understood rules is ...,We understand that most people who reply immed...,0.133423,0.447439,1.88406,3.594595,0.376923,0.0769231,0.292308,0.169231,0.125,0.046875,0.078125
1,1,depression,"Regular Check-In Post, with important reminder...",Welcome to /r/depression's check-in post - a p...,0.150943,0.566038,1.91304,3.6,0.340909,0.0681818,0.25,0.159091,0.12,0,0.12
2,1,depression,I Feel Like I Live In A World Where Evil Wins,"Hello everyone, little short intro here. I am...",0.135802,0.510288,2.86364,3.071429,0.206349,0.222222,0.380952,0.0952381,0.444444,0.111111,0.333333
3,1,depression,I just need to tell someone this,\nI’m 17. Today is my Dads 70th birthday. Tha...,0.0766129,0.504032,2.29167,2.235294,0.2,0.127273,0.309091,0.327273,0.393939,0.393939,0
4,1,depression,Im pretty sure i stay up so late because I don...,"I dont know if that makes sense, but thats how...",0.0833333,0.916667,2.0,2.0,0.0,0.0,0.5,0.5,divide by zero,divide by zero,divide by zero


# Machine Learning Models (with Psycholinguistic Markers as features)

## Grab the Psycholingustic Markers dataframe and get rid of any row with "divide by zero".
## Please note that I researched the given warning message and the Python community agrees that it is a bug caused by Numpy and Pandas fighting with each other. It is safe to ignore this warning when conducting this particular operation.

In [17]:
#Copy existing dataframe
PM_dataframe_for_ML_models = PM_dataframe.copy()

#Drop any row which contains the text string "divide by zero"
all_columns = list(PM_dataframe_for_ML_models.columns)

for i in all_columns:
    divide_by_zero_filter = PM_dataframe_for_ML_models[i] == 'divide by zero'
    PM_dataframe_for_ML_models.drop(PM_dataframe_for_ML_models[divide_by_zero_filter].index, inplace = True)

PM_dataframe_for_ML_models.reset_index(drop = True, inplace = True)
PM_dataframe_for_ML_models.head()

  res_values = method(rvalues)


Unnamed: 0,Manual Classification,Subreddit,Post Title,Post Body,(N punctuation characters) / (N words),(N unique words) / (N words),(N verbs) / (N adjectives),(N conjunctions + N prepositions) / (N sentences),(N infinitives) / (N verbs),(N singular first person past tense verbs) / (N verbs),(N first person verbs) / (N verbs),(N third person verbs) / (N verbs),(N first person pronouns) / (N pronouns),(N singular first person pronouns) / (N pronouns),(N plural first person pronouns) / (N pronouns)
0,1,depression,Our most-broken and least-understood rules is ...,We understand that most people who reply immed...,0.133423,0.447439,1.88406,3.594595,0.376923,0.0769231,0.292308,0.169231,0.125,0.046875,0.078125
1,1,depression,"Regular Check-In Post, with important reminder...",Welcome to /r/depression's check-in post - a p...,0.150943,0.566038,1.91304,3.6,0.340909,0.0681818,0.25,0.159091,0.12,0.0,0.12
2,1,depression,I Feel Like I Live In A World Where Evil Wins,"Hello everyone, little short intro here. I am...",0.135802,0.510288,2.86364,3.071429,0.206349,0.222222,0.380952,0.0952381,0.444444,0.111111,0.333333
3,1,depression,I just need to tell someone this,\nI’m 17. Today is my Dads 70th birthday. Tha...,0.0766129,0.504032,2.29167,2.235294,0.2,0.127273,0.309091,0.327273,0.393939,0.393939,0.0
4,1,depression,im 14. do i deserve to be depressed? or am i j...,my day started off with my mom yelling at me b...,0.062954,0.469734,3.28571,9.0,0.206522,0.326087,0.532609,0.0543478,0.703704,0.703704,0.0


## Model Setup: Generate Variable List, Standardize Values, and Partition Data

In [18]:
#Setting Up Numerical Variables (there are no categorical variables). Since I create the dataframe in the above code, I can
#safely drop 'manual classification', 'subreddit', 'post title', and 'post body' by directly slicing the list by index
nvar_list = list(PM_dataframe_for_ML_models.columns)
del nvar_list[0:4]

#Drop unneeded text columns from dataframe which will be used in Machine Learning models
PM_dataframe_for_ML_models.drop(['Subreddit', 'Post Title', 'Post Body'], axis = 1, inplace=True)

#Standardizing Numerical variables
standardized_PM_dataframe_for_ML_models = PM_dataframe_for_ML_models.copy()

original_column_values = PM_dataframe_for_ML_models[nvar_list]
sample_mean = PM_dataframe_for_ML_models[nvar_list].mean()
sample_stddev = PM_dataframe_for_ML_models[nvar_list].std()

standardized_PM_dataframe_for_ML_models[nvar_list] = ((original_column_values - sample_mean)/sample_stddev)



#Data Partition
#Splitting the data into our partitions will return two dataframes, so we must prep like so:
test_partition_size = .2
data_to_be_partitioned = standardized_PM_dataframe_for_ML_models

non_test_data, test_data = train_test_split(data_to_be_partitioned, test_size = test_partition_size, random_state = 1)

#non_test_data.head()

## Logistic Regression

### Model in the Training/Validation Partitions

In [25]:
#Establish, for Python's sake, the independent and dependent variables
DV = 'Manual Classification'
y = non_test_data[DV]
x = non_test_data.drop(columns = [DV])

#Create Lin Hao's function to return the logistic regression coefficient results in a nice format
def summary_coef(model_object):
    n_predictors = x.shape[1]
    model_coef = pd.DataFrame(model_object.coef_.reshape(1,n_predictors),columns = x.columns.values)
    model_coef['Intercept'] = model_object.intercept_
    return (model_coef.transpose())

#Setup Logistic Regression with k-folds = 5
kfolds = 5

#Establish alpha range for Python to search within
min_alpha = .01
max_alpha = 100

max_C = 1/min_alpha
min_C = 1/max_alpha

#Because there are infinite values between min_alpha and max_alpha, we must specify how many alphas Python should look for
#Python will then divide that interval into an even number of searches. We need numpy for this
n_candidates = 5000
c_list= list(np.linspace(min_C, max_C, num = n_candidates))

#Run logistic regression, use "scoring = roc_auc" to get the Area Under the Curve, and send it into the "nice formatting" function
classifier_optimal = LogisticRegressionCV(Cs = c_list, cv=kfolds, scoring = 'roc_auc', penalty = 'l1',solver='saga',max_iter=200, random_state=1, n_jobs = -1).fit(x,y)
print(summary_coef(classifier_optimal))

#Find the optimal selected alpha
print("\nThis model's optimal alpha in the validation partition is",1/classifier_optimal.C_)

# predict probabilities
logistic_regression_predicted_probabilities_validation_partition = classifier_optimal.predict_proba(x)

# keep probabilities for the positive outcome only
logistic_regression_predicted_probabilities_validation_partition = logistic_regression_predicted_probabilities_validation_partition[:, 1]

# calculate AUC
AUC_logistic_regression_validation_partition = roc_auc_score(y, logistic_regression_predicted_probabilities_validation_partition)
print('The AUC of the optimal model in the validation partition is', AUC_logistic_regression_validation_partition)

                                                           0
(N punctuation characters) / (N words)              0.360646
(N unique words) / (N words)                        0.891401
(N verbs) / (N adjectives)                         -0.282906
(N conjunctions + N prepositions) / (N sentences)   0.105490
(N infinitives) / (N verbs)                        -0.059542
(N singular first person past tense verbs) / (N... -0.754415
(N first person verbs) / (N verbs)                 -0.140026
(N third person verbs) / (N verbs)                 -0.464957
(N first person pronouns) / (N pronouns)            0.000000
(N singular first person pronouns) / (N pronouns)   1.033547
(N plural first person pronouns) / (N pronouns)    -0.197621
Intercept                                           0.957425

This model's optimal alpha in the validation partition is [3.99961596]
The AUC of the optimal model in the validation partition is 0.8724610111449711


### Model in the Test Partition

In [26]:
#Establish, for Python's sake, the independent and dependent variables
DV = 'Manual Classification'
y2 = test_data[DV]
x2 = test_data.drop(columns = [DV])

#Run logistic regression, use "scoring = roc_auc" to get the Area Under the Curve, and send it into the "nice formatting" function
classifier_optimal_2 = LogisticRegressionCV(Cs = c_list, cv=kfolds, scoring = 'roc_auc', penalty = 'l1',solver='saga',max_iter=200, random_state=1, n_jobs = -1).fit(x2,y2)
print(summary_coef(classifier_optimal_2))

#Find the optimal selected alpha
print("\nThis model's optimal alpha in the test partition is",1/classifier_optimal_2.C_)

# predict probabilities
logistic_regression_predicted_probabilities_test_partition = classifier_optimal_2.predict_proba(x2)

# keep probabilities for the positive outcome only
logistic_regression_predicted_probabilities_test_partition = logistic_regression_predicted_probabilities_test_partition[:, 1]

# calculate AUC
AUC_logistic_regression_test_partition = roc_auc_score(y2, logistic_regression_predicted_probabilities_test_partition)
print('The AUC of the optimal model in the test partition is', AUC_logistic_regression_test_partition)

                                                           0
(N punctuation characters) / (N words)              0.415034
(N unique words) / (N words)                        0.830570
(N verbs) / (N adjectives)                         -0.079822
(N conjunctions + N prepositions) / (N sentences)   0.273052
(N infinitives) / (N verbs)                        -0.015371
(N singular first person past tense verbs) / (N... -0.778614
(N first person verbs) / (N verbs)                 -0.122146
(N third person verbs) / (N verbs)                 -0.501808
(N first person pronouns) / (N pronouns)            0.000000
(N singular first person pronouns) / (N pronouns)   1.310021
(N plural first person pronouns) / (N pronouns)    -0.298459
Intercept                                           1.180626

This model's optimal alpha in the test partition is [1.36972789]
The AUC of the optimal model in the test partition is 0.8792966183086489


### Logistic Regression Confusion Matrix (validation partition)

#### Note: since we are dealing with flagging depression, we are most concerned with capturing all True Positives and avoiding False Negatives, since we want to help people to are depressed (capture True Positives) and avoid accidentally ignoring people who are depressed (avoid False Negatives). Thus, we want to measure positive precision and positive recall together as the F-score.

In [27]:
#convert predicted probabilties into 1s and 0s
y_predicted_logistic_regression = []
for i in logistic_regression_predicted_probabilities_validation_partition:
    if i >= .5:
        y_predicted_logistic_regression.append(1)
    else:
        y_predicted_logistic_regression.append(0)
        
#create confusion matrix: confusion_matrix(y_true, y_pred)
#Note from sci-k itlearn.org:
#Thus in binary classification, the count of true negatives is M[0,0], false negatives is M[1,0], 
    #true positives is M[1,1], and false positives is M[0,1].
logistic_regression_confusion_matrix = confusion_matrix(y, y_predicted_logistic_regression)

logistic_regression_true_positive = logistic_regression_confusion_matrix[1,1]
logistic_regression_true_negative = logistic_regression_confusion_matrix[0,0]
logistic_regression_false_positive = logistic_regression_confusion_matrix[0,1]
logistic_regression_false_negative = logistic_regression_confusion_matrix[1,0]

#Calculate positive precision
logistic_regression_positive_precision = logistic_regression_true_positive / (logistic_regression_true_positive + logistic_regression_false_positive)
print('The positive precision for the logistic regression model in the validation partition is',logistic_regression_positive_precision)

#Calculate positive recall
logistic_regression_positive_recall = logistic_regression_true_positive / (logistic_regression_true_positive + logistic_regression_false_negative)
print('The positive recall for the logistic regression model in the validation partition is',logistic_regression_positive_recall)

#Calculate F score
logistic_regression_F_score = (2 * logistic_regression_positive_precision * logistic_regression_positive_recall) / (logistic_regression_positive_precision + logistic_regression_positive_recall)
print('The F-score for the logistic regression model in the validation partition is',logistic_regression_F_score)

The positive precision for the logistic regression model in the validation partition is 0.8342516069788797
The positive recall for the logistic regression model in the validation partition is 0.8648262732032366
The F-score for the logistic regression model in the validation partition is 0.8492638466931526


## k-NN

### Model in the Training/Validation Partitions

In [28]:
#Establish, for Python's sake, the independent and dependent variables
DV = 'Manual Classification'
y = non_test_data[DV]
x = non_test_data.drop(columns = [DV])

# Run Nearest Neighbors with k-fold cross validation with k=5
kfolds = 5

# Here we specify within which range of Ks we will search through and save that range into a dictionary
max_k = 200
parameters_grid = {'n_neighbors': list(range(1, max_k + 1))}

#Setup a k-NN model which will search through all of our specified hyperparameters (the ks) and then apply the model to our data
k_NN = GridSearchCV(KNeighborsClassifier(metric = 'euclidean'), parameters_grid, scoring='roc_auc', cv=kfolds, n_jobs=-1)
k_NN.fit(x,y)
classifier_best_KNN = k_NN.best_estimator_

# Display optimal k
print('The optimal k in the validation partition is',classifier_best_KNN.n_neighbors)

# predict probabilities
k_NN_predicted_probabilities_validation_partition = classifier_best_KNN.predict_proba(x)

# keep probabilities for the positive outcome only
k_NN_predicted_probabilities_validation_partition = k_NN_predicted_probabilities_validation_partition[:, 1]

# calculate AUC
AUC_k_NN_validation_partition = roc_auc_score(y, k_NN_predicted_probabilities_validation_partition)
print('The AUC of the optimal model in the validation partition is', AUC_k_NN_validation_partition)

The optimal k in the validation partition is 63
The AUC of the optimal model in the validation partition is 0.8927928182003623


### Model in the Test Partition

In [29]:
#Establish, for Python's sake, the independent and dependent variables
DV = 'Manual Classification'
y2 = test_data[DV]
x2 = test_data.drop(columns = [DV])

#Setup a k-NN model which will search through all of our specified hyperparameters (the ks) and then apply the model to our data
k_NN_2 = GridSearchCV(KNeighborsClassifier(metric = 'euclidean'), parameters_grid, scoring='roc_auc', cv=kfolds, n_jobs=-1)
k_NN_2.fit(x2,y2)
classifier_best_KNN_2 = k_NN_2.best_estimator_

# Display optimal k
print('The optimal k in the test partition is',classifier_best_KNN_2.n_neighbors)

# predict probabilities
k_NN_predicted_probabilities_test_partition = classifier_best_KNN_2.predict_proba(x2)

# keep probabilities for the positive outcome only
k_NN_predicted_probabilities_test_partition = k_NN_predicted_probabilities_test_partition[:, 1]

# calculate AUC
AUC_k_NN_test_partition = roc_auc_score(y2, k_NN_predicted_probabilities_test_partition)
print('The AUC of the optimal model in the test partition is', AUC_k_NN_test_partition)

The optimal k in the test partition is 104
The AUC of the optimal model in the test partition is 0.8913776980235555


### k-NN Confusion Matrix (validation partition)

#### Note: since we are dealing with flagging depression, we are most concerned with capturing all True Positives and avoiding False Negatives, since we want to help people to are depressed (capture True Positives) and avoid accidentally ignoring people who are depressed (avoid False Negatives). Thus, we want to measure positive precision and positive recall together as the F-score.

In [30]:
#convert predicted probabilties into 1s and 0s
y_predicted_k_NN = []
for i in k_NN_predicted_probabilities_validation_partition:
    if i >= .5:
        y_predicted_k_NN.append(1)
    else:
        y_predicted_k_NN.append(0)
        
#create confusion matrix: confusion_matrix(y_true, y_pred)
#Note from sci-k itlearn.org:
#Thus in binary classification, the count of true negatives is M[0,0], false negatives is M[1,0], 
    #true positives is M[1,1], and false positives is M[0,1].
k_NN_confusion_matrix = confusion_matrix(y,y_predicted_k_NN)

k_NN_true_positive = k_NN_confusion_matrix[1,1]
k_NN_true_negative = k_NN_confusion_matrix[0,0]
k_NN_false_positive = k_NN_confusion_matrix[0,1]
k_NN_false_negative = k_NN_confusion_matrix[1,0]

#Calculate positive precision
k_NN_positive_precision = k_NN_true_positive / (k_NN_true_positive + k_NN_false_positive)
print('The positive precision for the k-NN model in the validation partition is',k_NN_positive_precision)

#Calculate positive recall
k_NN_positive_recall = k_NN_true_positive / (k_NN_true_positive + k_NN_false_negative)
print('The positive recall for the k-NN model in the validation partition is',k_NN_positive_recall)

#Calculate F score
k_NN_F_score = (2 * k_NN_positive_precision * k_NN_positive_recall) / (k_NN_positive_precision + k_NN_positive_recall)
print('The F-score for the k-NN model in the validation partition is',k_NN_F_score)

The positive precision for the k-NN model in the validation partition is 0.8584409373505499
The positive recall for the k-NN model in the validation partition is 0.8543550690147549
The F-score for the k-NN model in the validation partition is 0.8563931297709924


## Classification Tree (data standardization does not matter, though our data is already standardized at this point)

### Model in the Validation Partition

In [31]:
#Establish, for Python's sake, the independent and dependent variables
DV = 'Manual Classification'
y = non_test_data[DV]
x = non_test_data.drop(columns = [DV])

# Run Classification tree with k-fold cross validation with k=5
kfolds = 5

# Here we specify within which range of depths we will search for the best pruned tree
maximum_depth = 100
minimum_depth = 1
parameter_grid = {'max_depth': list(range(minimum_depth, maximum_depth + 1))}

classification_tree = GridSearchCV(DecisionTreeClassifier(criterion='entropy', random_state=1), parameter_grid, scoring='roc_auc', cv=kfolds, n_jobs=-1)
classification_tree.fit(x,y)
best_pruned_tree = classification_tree.best_estimator_

# Display the level of depth of the best pruned tree
print('The best pruned tree in the validation partition is of depth',best_pruned_tree.get_depth())

# predict probabilities
classification_tree_predicted_probabilities_validation_partition = best_pruned_tree.predict_proba(x)

# keep probabilities for the positive outcome only
classification_tree_predicted_probabilities_validation_partition = classification_tree_predicted_probabilities_validation_partition[:, 1]

# calculate AUC
AUC_classification_tree_validation_partition = roc_auc_score(y, classification_tree_predicted_probabilities_validation_partition)
print('The AUC of the best pruned tree in the validation partition is', AUC_classification_tree_validation_partition)

The best pruned tree in the validation partition is of depth 5
The AUC of the best pruned tree in the validation partition is 0.8872916112920872


### Model in the Test Partition

In [32]:
#Establish, for Python's sake, the independent and dependent variables
DV = 'Manual Classification'
y2 = test_data[DV]
x2 = test_data.drop(columns = [DV])

# Run Classification tree with k-fold cross test with k=5
kfolds = 5

# Here we specify within which range of depths we will search for the best pruned tree
maximum_depth = 100
minimum_depth = 1
parameter_grid = {'max_depth': list(range(minimum_depth, maximum_depth + 1))}

classification_tree_2 = GridSearchCV(DecisionTreeClassifier(criterion='entropy', random_state=1), parameter_grid, scoring='roc_auc', cv=kfolds, n_jobs=-1)
classification_tree_2.fit(x2,y2)
best_pruned_tree_2 = classification_tree_2.best_estimator_

# Display the level of depth of the best pruned tree
print('The best pruned tree in the test partition is of depth',best_pruned_tree_2.get_depth())

# predict probabilities
classification_tree_predicted_probabilities_test_partition = best_pruned_tree_2.predict_proba(x2)

# keep probabilities for the positive outcome only
classification_tree_predicted_probabilities_test_partition = classification_tree_predicted_probabilities_test_partition[:, 1]

# calculate AUC
AUC_classification_tree_test_partition = roc_auc_score(y2, classification_tree_predicted_probabilities_test_partition)
print('The AUC of the best pruned tree in the test partition is', AUC_classification_tree_test_partition)

The best pruned tree in the test partition is of depth 3
The AUC of the best pruned tree in the test partition is 0.8772020168831826


### Classification Tree Confusion Matrix (validation partition)

#### Note: since we are dealing with flagging depression, we are most concerned with capturing all True Positives and avoiding False Negatives, since we want to help people to are depressed (capture True Positives) and avoid accidentally ignoring people who are depressed (avoid False Negatives). Thus, we want to measure positive precision and positive recall together as the F-score.

In [33]:
#convert predicted probabilties into 1s and 0s
y_predicted_classification_tree = []
for i in classification_tree_predicted_probabilities_validation_partition:
    if i >= .5:
        y_predicted_classification_tree.append(1)
    else:
        y_predicted_classification_tree.append(0)
        
#create confusion matrix: confusion_matrix(y_true, y_pred)
#Note from sci-k itlearn.org:
#Thus in binary classification, the count of true negatives is M[0,0], false negatives is M[1,0], 
    #true positives is M[1,1], and false positives is M[0,1].
classification_tree_confusion_matrix = confusion_matrix(y,y_predicted_classification_tree)

classification_tree_true_positive = classification_tree_confusion_matrix[1,1]
classification_tree_true_negative = classification_tree_confusion_matrix[0,0]
classification_tree_false_positive = classification_tree_confusion_matrix[0,1]
classification_tree_false_negative = classification_tree_confusion_matrix[1,0]

#Calculate positive precision
classification_tree_positive_precision = classification_tree_true_positive / (classification_tree_true_positive + classification_tree_false_positive)
print('The positive precision for the classification tree model in the validation partition is',classification_tree_positive_precision)

#Calculate positive recall
classification_tree_positive_recall = classification_tree_true_positive / (classification_tree_true_positive + classification_tree_false_negative)
print('The positive recall for the classification tree model in the validation partition is',classification_tree_positive_recall)

#Calculate F score
classification_tree_F_score = (2 * classification_tree_positive_precision * classification_tree_positive_recall) / (classification_tree_positive_precision + classification_tree_positive_recall)
print('The F-score for the classification tree model in the validation partition is',classification_tree_F_score)

The positive precision for the classification tree model in the validation partition is 0.8568738229755178
The positive recall for the classification tree model in the validation partition is 0.866254164683484
The F-score for the classification tree model in the validation partition is 0.8615384615384615


## Random Forest

### Model in the validation partition

In [34]:
#Establish, for Python's sake, the independent and dependent variables
DV = 'Manual Classification'
y = non_test_data[DV]
x = non_test_data.drop(columns = [DV])

# Run Random Forest with k-fold cross validation with k=5
kfolds = 5

#Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 3)]

#Number of features to consider at every split
#From the Random Forest Python documentation: 
    #Empirical good default values are max_features=sqrt(n_features) for classification tasks
    #This translates to the 'sqrt' default option that the model contains
max_features = ['sqrt']

#Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 3)]
max_depth.append(None)

#Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]

#Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]

#Create the random grid, and run the classifier on our x and y
param_grid = {'n_estimators': n_estimators,'max_features': max_features,'max_depth': max_depth,
              'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf}

rf = RandomForestClassifier()
grid_cv = GridSearchCV(rf, param_grid, scoring = 'roc_auc', cv = kfolds, n_jobs=-1)
grid_cv.fit(x, y)

random_forest_classifier_optimal_validation_partition = grid_cv.best_estimator_

# predict probabilities using the classifier and keep probabilities for the positive outcome only
random_forest_predicted_probabilities_validation_partition = random_forest_classifier_optimal_validation_partition.predict_proba(x)[:,1]

# Get the AUC of the best Random Forest Classifier
AUC_random_forest_validation_partition = roc_auc_score(y, random_forest_predicted_probabilities_validation_partition)
print('The AUC for the random forest model in the validation partition is', AUC_random_forest_validation_partition )

The AUC for the random forest model in the validation partition is 0.9992806455800268


### Model in the test partition

In [37]:
#Establish, for Python's sake, the independent and dependent variables
DV = 'Manual Classification'
y2 = test_data[DV]
x2 = test_data.drop(columns = [DV])

# Run Random Forest with k-fold cross test with k=5
kfolds = 5

#Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 3)]

#Number of features to consider at every split
#From the Random Forest Python documentation: 
    #Empirical good default values are max_features=sqrt(n_features) for classification tasks
    #This translates to the 'sqrt' default option that the model contains
max_features = ['sqrt']

#Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 3)]
max_depth.append(None)

#Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]

#Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]

#Create the random grid, and run the classifier on our x and y
param_grid = {'n_estimators': n_estimators,'max_features': max_features,'max_depth': max_depth,
              'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf}

rf = RandomForestClassifier()
grid_cv = GridSearchCV(rf, param_grid, scoring = 'roc_auc', cv = kfolds, n_jobs=-1)
grid_cv.fit(x2, y2)

random_forest_classifier_optimal_test_partition = grid_cv.best_estimator_

# predict probabilities using the classifier and keep probabilities for the positive outcome only
random_forest_predicted_probabilities_test_partition = random_forest_classifier_optimal_test_partition.predict_proba(x2)[:,1]

#Get the AUC of the best Random Forest Classifier
AUC_random_forest_test_partition = roc_auc_score(y2, random_forest_predicted_probabilities_test_partition)
print('The AUC for the random forest model in the test partition is', AUC_random_forest_test_partition)

The AUC for the random forest model in the test partition is 0.9915584087347723


### Random Forest Confusion Matrix (validation partition)

#### Note: since we are dealing with flagging depression, we are most concerned with capturing all True Positives and avoiding False Negatives, since we want to help people to are depressed (capture True Positives) and avoid accidentally ignoring people who are depressed (avoid False Negatives). Thus, we want to measure positive precision and positive recall together as the F-score.

In [38]:
#convert predicted probabilties into 1s and 0s
y_predicted_random_forest = []
for i in random_forest_predicted_probabilities_validation_partition:
    if i >= .5:
        y_predicted_random_forest.append(1)
    else:
        y_predicted_random_forest.append(0)
        
#create confusion matrix: confusion_matrix(y_true, y_pred)
#Note from sci-k itlearn.org:
#Thus in binary classification, the count of true negatives is M[0,0], false negatives is M[1,0], 
    #true positives is M[1,1], and false positives is M[0,1].
random_forest_confusion_matrix = confusion_matrix(y,y_predicted_random_forest)

random_forest_true_positive = random_forest_confusion_matrix[1,1]
random_forest_true_negative = random_forest_confusion_matrix[0,0]
random_forest_false_positive = random_forest_confusion_matrix[0,1]
random_forest_false_negative = random_forest_confusion_matrix[1,0]

#Calculate positive precision
random_forest_positive_precision = random_forest_true_positive / (random_forest_true_positive + random_forest_false_positive)
print('The positive precision for the random forest model in the validation partition is',random_forest_positive_precision)

#Calculate positive recall
random_forest_positive_recall = random_forest_true_positive / (random_forest_true_positive + random_forest_false_negative)
print('The positive recall for the random forest model in the validation partition is',random_forest_positive_recall)

#Calculate F score
random_forest_F_score = (2 * random_forest_positive_precision * random_forest_positive_recall) / (random_forest_positive_precision + random_forest_positive_recall)
print('The F-score for the random forest model in the validation partition is',random_forest_F_score)

The positive precision for the random forest model in the validation partition is 0.9775386055217595
The positive recall for the random forest model in the validation partition is 0.99428843407901
The F-score for the random forest model in the validation partition is 0.9858423784804153


## AdaBoost

### Model in the validation partition

In [19]:
#Establish, for Python's sake, the independent and dependent variables
DV = 'Manual Classification'
y = non_test_data[DV]
x = non_test_data.drop(columns = [DV])

# Run Random Forest with k-fold cross validation with k=5
kfolds = 5

#Set a parameter grid for the GridSearchCV
param_grid = {"base_estimator__criterion": ["gini", "entropy"], "base_estimator__splitter": ["best", "random"], 
              "n_estimators": [10, 50, 100, 500]}

#Assigning models and functions
DTC = DecisionTreeClassifier(random_state = 1, max_features = "auto", max_depth = 10)
adaboost = AdaBoostClassifier(base_estimator = DTC, random_state=1)

#Use a GridSearchCV to find the optimal model candidate
grid_cv = GridSearchCV(adaboost, param_grid, scoring = 'roc_auc', cv = kfolds, n_jobs=-1)
grid_cv.fit(x, y)
adaboost_classifier_optimal_validation_partition = grid_cv.best_estimator_

# predict probabilities using the classifier and keep probabilities for the positive outcome only
adaboost_predicted_probabilities_validation_partition = adaboost_classifier_optimal_validation_partition.predict_proba(x)[:,1]

# Get the AUC of the best ADA Boost - test
AUC_adaboost_validation_partition = roc_auc_score(y, adaboost_predicted_probabilities_validation_partition)
print('The AUC for the adaboost model in the validation partition is', AUC_adaboost_validation_partition)

The AUC for the adaboost model in the validation partition is 1.0


### Model in the test partition

In [20]:
#Establish, for Python's sake, the independent and dependent variables
DV = 'Manual Classification'
y2 = test_data[DV]
x2 = test_data.drop(columns = [DV])

# Run Random Forest with k-fold cross test with k=5
kfolds = 5

#Set a parameter grid for the GridSearchCV
param_grid = {"base_estimator__criterion": ["gini", "entropy"], "base_estimator__splitter": ["best", "random"], 
              "n_estimators": [10, 50, 100, 500]}

#Assigning models and functions
DTC = DecisionTreeClassifier(random_state = 1, max_features = "auto", max_depth = 10)
adaboost = AdaBoostClassifier(base_estimator = DTC, random_state=1)

#Use a GridSearchCV to find the optimal model candidate
grid_cv = GridSearchCV(adaboost, param_grid, scoring = 'roc_auc', cv = kfolds, n_jobs=-1)
grid_cv.fit(x2, y2)
adaboost_classifier_optimal_test_partition = grid_cv.best_estimator_

# predict probabilities using the classifier and keep probabilities for the positive outcome only
adaboost_predicted_probabilities_test_partition = adaboost_classifier_optimal_test_partition.predict_proba(x2)[:,1]

# Get the AUC of the best ADA Boost - test
AUC_adaboost_test_partition = roc_auc_score(y2, adaboost_predicted_probabilities_test_partition)
print('The AUC for the adaboost model in the test partition is', AUC_adaboost_test_partition)

The AUC for the adaboost model in the test partition is 1.0


### AdaBoost Confusion Matrix (validation partition)

#### Note: since we are dealing with flagging depression, we are most concerned with capturing all True Positives and avoiding False Negatives, since we want to help people to are depressed (capture True Positives) and avoid accidentally ignoring people who are depressed (avoid False Negatives). Thus, we want to measure positive precision and positive recall together as the F-score.

In [21]:
#convert predicted probabilties into 1s and 0s
y_predicted_adaboost = []
for i in adaboost_predicted_probabilities_validation_partition:
    if i >= .5:
        y_predicted_adaboost.append(1)
    else:
        y_predicted_adaboost.append(0)
        
#create confusion matrix: confusion_matrix(y_true, y_pred)
#Note from sci-k itlearn.org:
#Thus in binary classification, the count of true negatives is M[0,0], false negatives is M[1,0], 
    #true positives is M[1,1], and false positives is M[0,1].
adaboost_confusion_matrix = confusion_matrix(y,y_predicted_adaboost)

adaboost_true_positive = adaboost_confusion_matrix[1,1]
adaboost_true_negative = adaboost_confusion_matrix[0,0]
adaboost_false_positive = adaboost_confusion_matrix[0,1]
adaboost_false_negative = adaboost_confusion_matrix[1,0]

#Calculate positive precision
adaboost_positive_precision = adaboost_true_positive / (adaboost_true_positive + adaboost_false_positive)
print('The positive precision for the adaboost model in the validation partition is',adaboost_positive_precision)

#Calculate positive recall
adaboost_positive_recall = adaboost_true_positive / (adaboost_true_positive + adaboost_false_negative)
print('The positive recall for the adaboost model in the validation partition is',adaboost_positive_recall)

#Calculate F score
adaboost_F_score = (2 * adaboost_positive_precision * adaboost_positive_recall) / (adaboost_positive_precision + adaboost_positive_recall)
print('The F-score for the adaboost model in the validation partition is',adaboost_F_score)

The positive precision for the adaboost model in the validation partition is 1.0
The positive recall for the adaboost model in the validation partition is 1.0
The F-score for the adaboost model in the validation partition is 1.0


# Create dataframe of overall results for easy viewing

In [42]:
model_names = ['Logistic Regression', 'k-NN', 'Classification Tree', 'Random Forest', 'AdaBoost']

AUC_in_validation_partition = [AUC_logistic_regression_validation_partition,
                              AUC_k_NN_validation_partition,
                              AUC_classification_tree_validation_partition,
                              AUC_random_forest_validation_partition,
                              AUC_adaboost_validation_partition]

AUC_in_test_partition = [AUC_logistic_regression_test_partition,
                              AUC_k_NN_test_partition,
                              AUC_classification_tree_test_partition,
                              AUC_random_forest_test_partition,
                              AUC_adaboost_test_partition]

positive_precision = [logistic_regression_positive_precision,
                      k_NN_positive_precision,
                      classification_tree_positive_precision,
                      random_forest_positive_precision,
                      adaboost_positive_precision]

positive_recall = [logistic_regression_positive_recall,
                      k_NN_positive_recall,
                      classification_tree_positive_recall,
                      random_forest_positive_recall,
                      adaboost_positive_recall]

F_score = [logistic_regression_F_score,
                      k_NN_F_score,
                      classification_tree_F_score,
                      random_forest_F_score,
                      adaboost_F_score]
summary_data = ({'Model Names': model_names, 
                 'AUC in Validation Partition': AUC_in_validation_partition,
                'AUC in Test Partition': AUC_in_test_partition,
                'Positive Precision': positive_precision,
                'Positive Recall': positive_recall,
                'F-Score': F_score})

summary_df = pd.DataFrame(summary_data)

#Add one more column to compare validation and test partitions to guard against overfitting
summary_df['Ratio of AUC in Test Partition to AUC in Validation Partition'] = summary_df['AUC in Test Partition'] / summary_df['AUC in Validation Partition']

summary_df.head()

Unnamed: 0,Model Names,AUC in Validation Partition,AUC in Test Partition,Positive Precision,Positive Recall,F-Score,Ratio of AUC in Test Partition to AUC in Validation Partition
0,Logistic Regression,0.872461,0.879297,0.834252,0.864826,0.849264,1.007835
1,k-NN,0.892793,0.891378,0.858441,0.854355,0.856393,0.998415
2,Classification Tree,0.887292,0.877202,0.856874,0.866254,0.861538,0.988629
3,Random Forest,0.999281,0.991558,0.977539,0.994288,0.985842,0.992272
4,AdaBoost,1.0,1.0,1.0,1.0,1.0,1.0


# For final report purposes, bring relevant AdaBoost metrics to the bottom for easy finding

In [22]:
#AdaBoost accuracy
adaboost_accuracy = (adaboost_true_positive + adaboost_true_negative) / (adaboost_true_positive + adaboost_true_negative + adaboost_false_positive + adaboost_false_negative)
print('The Adaboost model accuracy on the original data in the validation partition is', adaboost_accuracy)

#Adaboost F-score
print('The F-score for the adaboost model on the orginal data in the validation partition is',adaboost_F_score)

#AdaBoost false negative rate
adaboost_false_negative_rate = adaboost_false_negative / (adaboost_false_negative + adaboost_true_positive)
print('The false negative rate for the adaboost model on the orginal data in the validation partition is',adaboost_false_negative_rate)




The Adaboost model accuracy on the original data in the validation partition is 1.0
The F-score for the adaboost model on the orginal data in the validation partition is 1.0
The false negative rate for the adaboost model on the orginal data in the validation partition is 0.0
