# **Libraries**


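We mount Google Drive and import all the libraries needed for the data preprocessing and for the model, a linear classifier trained with stochastic gradient descent (scikit-learn's `SGDClassifier`).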

In [None]:
# Mount Google Drive inside the Colab VM at /content/drive.
# This prompts for an authorization code and only works when run on Google Colab.
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [None]:
# ***********************
# *****| LIBRARIES |*****
# ***********************
import numpy as np
import os
import json
from google.colab import files
from joblib import dump, load

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import confusion_matrix

# **Global Variables**

In [None]:
# ******************************
# *****| GLOBAL VARIABLES |*****
# ******************************
# Variables for the current NN

# **Utility Functions**

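**GET FILES FUNCTION**: This function walks a sub-directory of the Drive recursively and returns the list of paths to every file with the given extension that sits directly inside a folder with the given name.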

In [None]:
# *****************
# *** GET FILES ***
# *****************
def getFiles( driverPath, directory, basename, extension):
    """Recursively collect file paths below ``driverPath/directory``.

    A file is selected when the folder that directly contains it is named
    ``basename`` and the file name ends with ``.<extension>``.

    Parameters:
        driverPath: root path (e.g. the mounted Drive folder).
        directory:  sub-directory of ``driverPath`` in which the search starts.
        basename:   required name of the folder directly containing the file.
        extension:  required file extension, given without the leading dot.

    Returns:
        A list of full paths, in the order ``os.walk`` yields them.
    """
    suffix = '.%s' % (extension)
    search_root = os.path.join(driverPath, directory)
    matches = []

    for current_dir, _subdirs, names in os.walk(search_root):
        # Folders with a different name contribute nothing; skip them as a whole
        if os.path.basename(current_dir) != basename:
            continue
        matches.extend(os.path.join(current_dir, name)
                       for name in names
                       if name.endswith(suffix))
    return matches

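**GET DATAFRAMES FUNCTION**: Despite its name, this function does not build a DataFrame. It reads the JSON files found at the given paths and returns a list of texts, a list of the matching labels, and the number of samples that were kept.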

In [None]:
# ***********************************************
# *** GET THE TEXTS AND LABELS INTO TWO LISTS ***
# ***********************************************
def getDataFrame( listFiles, maxFiles, minWords, limit):
    """Collect texts and labels from JSON files into two parallel lists.

    Each file must hold a JSON object whose 'data' entry is a list of objects
    carrying a 'text' string and a 'label'. Despite the function's name, no
    pandas DataFrame is built.

    Parameters:
        listFiles: paths of the JSON files, read in the given order.
        maxFiles:  maximum number of files to read. 0 reads nothing; a negative
                   value never reaches 0, so it places no cap on the file count.
        minWords:  minimum number of whitespace-separated words a text needs
                   in order to be kept.
        limit:     cap on the number of kept samples, expressed in thousands
                   (limit * 1000). 0 or a negative value disables the cap.

    Returns:
        (text_list, label_list, counter_real): the kept texts, their labels
        (same order, same length) and the number of kept samples.
    """
    counter_real, counter_max = 0, 0
    text_list, label_list = [], []
    # None means "no cap on the number of kept samples"
    max_samples = limit * 1000 if limit > 0 else None

    print("Word min set to: %i." % ( minWords))
    for path in listFiles:
        # Check both budgets BEFORE touching the file, so nothing is opened needlessly
        if maxFiles == 0:
            break
        if max_samples is not None and counter_real >= max_samples:
            break
        maxFiles -= 1

        # JSON is UTF-8 by specification; do not depend on the platform's default encoding
        with open(path, encoding="utf-8") as f:
            records = json.load(f)['data']

        for record in records:
            if max_samples is not None and counter_real >= max_samples:
                break
            if len(record['text'].split()) >= minWords:
                text_list.append(record['text'])
                label_list.append(record['label'])
                counter_real += 1
            # Counts every inspected object, kept or not
            counter_max += 1

    # Share of inspected objects that were kept, guarded against division by zero
    ratio = counter_real / counter_max * 100 if counter_max > 0 else 0

    # Print the final result
    print("Lists created with %i/%i (%.2f%%) data objects." % ( counter_real, counter_max, ratio))
    print("Rest ignored due to minimum words limit of %i or the limit of %i data objects maximum." % ( minWords, limit * 1000))
    return text_list, label_list, counter_real

In [None]:
# NLTK pieces used by the LemmaTokenizer class defined right after this setup.
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
import nltk
# Download the NLTK resources needed at runtime: the 'punkt' tokenizer data
# and the 'wordnet' corpus.
nltk.download('punkt')
nltk.download('wordnet')

class LemmaTokenizer:
    """Callable tokenizer: splits a document with NLTK and lemmatizes each token.

    An instance is passed as the ``tokenizer`` argument of the TF-IDF vectorizer.
    """

    def __init__(self):
        # One WordNet lemmatizer per tokenizer, reused for every document
        self.wnl = WordNetLemmatizer()

    def __call__(self, doc):
        """Return the lemmatized tokens of ``doc`` as a list, in their original order."""
        lemmatize = self.wnl.lemmatize
        return list(map(lemmatize, word_tokenize(doc)))

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


# **Get paths of the processed files**

This code collects the paths of the processed JSON files that hold the data for the classifier.

In [None]:
# ***********************************
# *** GET THE PATHS FOR THE FILES ***
# ***********************************

# Root folder of the mounted Google Drive
driverPath = "/content/drive/My Drive"

# Data folders to scan, relative to the Drive root
paths = ["processed/depression/submission",
         "processed/depression/comment",
         "processed/AskReddit/submission",
         "processed/AskReddit/comment"]

# One list of JSON file paths per entry of `paths`, in the same order.
# NOTE: from here on `files` no longer refers to the `files` module imported
# from google.colab in the imports cell; the name is kept because later cells index it.
files = []
for sub_dir in paths:
    found = getFiles( driverPath, sub_dir, "text", "json")
    files.append(found)
    print("Gathered %i files from %s." % ( len(found), sub_dir))

Gathered 750 files from processed/depression/submission.
Gathered 2892 files from processed/depression/comment.
Gathered 1311 files from processed/AskReddit/submission.
Gathered 5510 files from processed/AskReddit/comment.


# **Gather the data**

In [None]:
# ************************************
# *** GATHER THE DATA AND SPLIT IT ***
# ************************************
# Local variables
rand_state_splitter = 1000   # seed for the train/test shuffle
test_size = 0.2              # share of the samples held out for testing

# Per-source settings, in the same order as `files` / `message`.
# A file budget of 0 makes getDataFrame read nothing, i.e. it disables that source.
max_files_per_source = [ 750, 0, 1300, 0]   # passed as getDataFrame's maxFiles
min_words_per_text = [ 50, 0, 50, 0]        # passed as getDataFrame's minWords
limit_packets = [750, 0, 750, 0]            # passed as getDataFrame's limit (thousands of samples)
message = ["Depression submissions", "Depression comments", "AskReddit submissions", "AskReddit comments"]
text, label = [], []

# Every source needs exactly one entry in each settings list
assert len(files) == len(max_files_per_source) == len(min_words_per_text) \
       == len(limit_packets) == len(message), "per-source settings do not match the number of sources"

# Collect the texts and labels of each category
print("Build the Pandas DataFrames for each category.")
for i in range(len(files)):
    dummy_text, dummy_label, counter = getDataFrame( files[i],
                                                     max_files_per_source[i],
                                                     min_words_per_text[i],
                                                     limit_packets[i])
    if counter > 0:
        text += dummy_text
        label += dummy_label
        # Drop the references to the per-source lists before reading the next source
        dummy_text, dummy_label = None, None
        print("Added %i samples to data list: %s.\n" % ( counter ,message[i]) )

# Splitting the data
x_train, x_test, y_train, y_test = train_test_split(text,
                                                    label,
                                                    test_size = test_size,
                                                    shuffle = True,
                                                    random_state = rand_state_splitter)
print("Training data: %i samples." % ( len(y_train)) )
print("Testing data: %i samples." % ( len(y_test)) )

# Clear data no longer needed
del rand_state_splitter, max_files_per_source, min_words_per_text, message, dummy_label, dummy_text

Build the Pandas DataFrames for each category.
Word min set to: 50.
Lists created with 619489/750000 (82.60%) data objects.
Rest ignored due to minimum words limit of 50 or the limit of 750000 data objects maximum.
Added 619489 samples to data list: Depression submissions.

Word min set to: 0.
Lists created with 0/0 (0.00%) data objects.
Rest ignored due to minimum words limit of 0 or the limit of 0 data objects maximum.
Word min set to: 50.
Lists created with 651762/1300000 (50.14%) data objects.
Rest ignored due to minimum words limit of 50 or the limit of 750000 data objects maximum.
Added 651762 samples to data list: AskReddit submissions.

Word min set to: 0.
Lists created with 0/0 (0.00%) data objects.
Rest ignored due to minimum words limit of 0 or the limit of 0 data objects maximum.
Training data: 1017000 samples.
Testing data: 254251 samples.


# **TFIDF Vectorizer for the data**

This will transform the previously split texts into sparse TF-IDF matrices (lemmatized word uni- and bi-grams), which is the input format the SGD classifier is trained on.

In [None]:
import re

# Implement a TFIDF Vectorizer for words
print("Define the TFIDF Vectorizer and fit the data.")
vectorizer = TfidfVectorizer(   analyzer='word',
                                norm='l2',
                                tokenizer = LemmaTokenizer(),   # NLTK tokenization + WordNet lemmatization
                                min_df = 0.00016,               # a float min_df is a proportion of documents
                                smooth_idf = True,
                                ngram_range = (1,2)             # word uni-grams and bi-grams
                            )
print("Fit the data in the vectorizer.")
# Learn the vocabulary and the idf weights from the training texts only
vectorizer_fit = vectorizer.fit(x_train)
print("Vocabulary length: ", len( vectorizer_fit.vocabulary_))
# The value has to be formatted into the string; passing it as a second print()
# argument left a literal "%i:" in the output.
print("Words eliminated due to imposed minimum: %i." % len( vectorizer_fit.stop_words_))
print("Eliminating the stop words list in order to free memory.")
delattr(vectorizer_fit, 'stop_words_')

# Transform the data for training and testing into a sparse matrix.
# NOTE: x_train / x_test are rebound from raw texts to matrices here, so this cell
# cannot be run a second time without re-running the cell that splits the data.
print("Transform the data into a sparse matrix.")
x_train = vectorizer_fit.transform(x_train)
x_test = vectorizer_fit.transform(x_test)
print("Completed.")

Define the TFIDF Vectorizer and fit the data.
Fit the data in the vectorizer.




Vocabulary length:  112853
Words eliminated due to imposed minimum: %i: 8298417
Eliminating the stop words list in order to free memory.
Transform the data into a sparse matrix.
Completed.


In [None]:
# Descriptive labels for the fitted vectorizer's configuration. They are plain strings
# that repeat the settings used in earlier cells, so they must be kept in sync by hand.
vectorizerType = "TFIDF"
min_df = "0.00016"
n_grams = "(1,2)"
min_words = "50"
# NOTE(review): vectorizerName is composed here but is not used by the dump() calls below.
vectorizerName = ("%sVectorizer_minDf_%s_nGrams_%s_minWords_%s" % ( vectorizerType, min_df, n_grams, min_words) )
# Persist the fitted vectorizer to Google Drive: the same object, saved under two file extensions.
dump(vectorizer_fit, "/content/drive/My Drive/vector_final.pkl")
dump(vectorizer_fit, "/content/drive/My Drive/vector_final.joblib")

['/content/drive/My Drive/vector_final.joblib']

# **Build the SGD Classifier**

In [None]:
# ***************************************
# *** CROSS-VALIDATION SGD Classifier ***
# ***************************************
# Seed handed to SGDClassifier as its random_state
rand_state_nn = 1000

print("Build the classifier.")
# Hyper-parameter candidates for the grid search; every list holds a single value,
# so the search evaluates one configuration with 5-fold cross-validation.
grid = {
    'alpha': [1e-6],
    'max_iter': [1000],
    'warm_start': [True],
    'tol': [1e-6],
    'loss': ['modified_huber'],
}
# Wrap the base SGD estimator in the cross-validated grid search
classifier = GridSearchCV(
    estimator = SGDClassifier(random_state = rand_state_nn,
                              n_iter_no_change = 10,
                              penalty = 'l2',
                              verbose = 10),
    param_grid = grid,
    cv = 5)
print("Classifier built.")

Build the classifier.
Classifier built.


# **Train the SGD Classifier**

In [None]:
# ****************************
# *** TRAIN THE CLASSIFIER ***
# ****************************

# Fit the cross-validated grid search on the vectorized training data
print("Starting to train the Classifier.")
grid_result = classifier.fit( x_train, y_train)
print("Classifier trained.")

# Report what the grid search selected
for caption, value in (("Best parameters from GRID CV: ", grid_result.best_params_),
                       ("Best score from GRID CV: ", grid_result.best_score_)):
    print(caption, value)

# Persist the fitted search object to Google Drive
print("Attempting to save the model.")

dump(classifier, "/content/drive/My Drive/SGDClassifier_final.joblib")
print("Model saved.")

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
-- Epoch 27
Norm: 208.09, NNZs: 112851, Bias: -0.601751, T: 21967200, Avg. loss: 0.066377
Total training time: 26.38 seconds.
-- Epoch 28
Norm: 207.80, NNZs: 112851, Bias: -0.616435, T: 22780800, Avg. loss: 0.066253
Total training time: 27.37 seconds.
-- Epoch 29
Norm: 207.53, NNZs: 112851, Bias: -0.599513, T: 23594400, Avg. loss: 0.066115
Total training time: 28.36 seconds.
-- Epoch 30
Norm: 207.27, NNZs: 112851, Bias: -0.612806, T: 24408000, Avg. loss: 0.066021
Total training time: 29.37 seconds.
-- Epoch 31
Norm: 207.05, NNZs: 112851, Bias: -0.609785, T: 25221600, Avg. loss: 0.065913
Total training time: 30.47 seconds.
-- Epoch 32
Norm: 206.81, NNZs: 112851, Bias: -0.616245, T: 26035200, Avg. loss: 0.065821
Total training time: 31.44 seconds.
-- Epoch 33
Norm: 206.62, NNZs: 112851, Bias: -0.606141, T: 26848800, Avg. loss: 0.065741
Total training time: 32.43 seconds.
-- Epoch 34
Norm: 206.44, NNZs: 112851, Bias: -0.6105

# **Test the SGD Classifier**

In [None]:
# *******************************
# *** TEST THE SGD CLASSIFIER ***
# *******************************
# Predict the labels of the held-out test data
y_predict = classifier.predict( x_test)
# Accuracy on the test data, as reported by the fitted search object's score()
test_accuracy = classifier.score( x_test, y_test)

# Confusion matrix, flattened row by row into its four cells (two-class problem)
confMatrix = confusion_matrix(y_test, y_predict)
tn, fp, fn, tp = confMatrix.ravel()
# Per-class precision / recall / f1 report
classification_reports = classification_report( y_test,
                                                y_predict,
                                                target_names = ['Non-depressed', 'Depressed'],
                                                digits=3)

# *************************
# *** PRINT THE RESULTS ***
# *************************
print("Test accuracy from classifier score: ", test_accuracy)

# Confusion matrix first, then one explanatory line per cell, then the full report
print(confMatrix)
for template, count in (
        ("TP - Predicted that a man is depressive and he is: %i.", tp),
        ("TN - Predicted that a man is NOT depressive and he is NOT: %i.", tn),
        ("FP - Predicted that a man is depressive and he is NOT: %i.", fp),
        ("FN - Predicted that a man is not depressive and he is: %i.", fn)):
    print(template % ( count))
print(classification_reports)

Test accuracy from classifier score:  0.9614868771410928
[[125626   4531]
 [  5261 118833]]
TP - Predicted that a man is depressive and he is: 118833.
TN - Predicted that a man is NOT depressive and he is NOT: 125626.
FP - Predicted that a man is depressive and he is NOT: 4531.
FN - Predicted that a man is not depressive and he is: 5261.
               precision    recall  f1-score   support

Non-depressed      0.960     0.965     0.962    130157
    Depressed      0.963     0.958     0.960    124094

     accuracy                          0.961    254251
    macro avg      0.962     0.961     0.961    254251
 weighted avg      0.961     0.961     0.961    254251

