# **Libraries**

In [None]:
# Mount Google Drive at /content/drive (Colab-only); later cells read the dataset from this mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# ***********************
# *****| LIBRARIES |*****
# ***********************
%tensorflow_version 2.x
import pandas as pd
import numpy as np
import os
import json

from sklearn.model_selection import train_test_split
import tensorflow as tf
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Input, Embedding, Activation, Flatten, Dense
from keras.layers import Conv1D, MaxPooling1D, Dropout
from keras.models import Model
from keras.utils import to_categorical
from keras.optimizers import SGD
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

# Report whether TensorFlow can see the first GPU; training later pins itself to "/gpu:0".
device_name = tf.test.gpu_device_name()
if device_name == '/device:GPU:0':
    print('Found GPU at: {}'.format(device_name))
else:
    print("GPU not found")

Using TensorFlow backend.


Found GPU at: /device:GPU:0


In [None]:
# ******************************
# *****| GLOBAL VARIABLES |*****
# ******************************
# --- Data split ---
test_size = 0.2                       # fraction of the samples held out for testing

# --- Network dimensions ---
convsize = 256                        # number of filters in every convolution layer
convsize2 = 1024                      # number of units in every fully connected layer
embedding_size = 27                   # width of one character vector; the embedding is
                                      # initialised with one-hot rows, so this has to
                                      # match the vocabulary size
input_size = 1000                     # characters per sample after padding / truncation

# One entry per convolution layer: [filters, kernel size, pool size].
# A pool size of -1 means "no max-pooling after this layer".
conv_layers = [
    [convsize, kernel, pool]
    for kernel, pool in ((7, 3), (7, 3), (3, -1), (3, -1), (3, -1), (3, 3))
]

fully_connected_layers = [convsize2] * 2

# --- Training setup ---
num_of_classes = 2
dropout_p = 0.5
optimizer = 'adam'
batch = 128
loss = 'categorical_crossentropy'

# **Utility functions**

In [None]:
# *****************
# *** GET FILES ***
# *****************
def getFiles( driverPath, directory, basename, extension):
    """Collect the files with a given extension that sit directly in folders named `basename`.

    The tree below `driverPath/directory` is walked recursively. A file is
    kept when the folder that contains it is called `basename` and the file
    name ends with `.<extension>`.

    Args:
        driverPath: Root path the search directory is joined onto.
        directory:  Sub-directory (relative to `driverPath`) to search in.
        basename:   Name the containing folder must have.
        extension:  File extension to look for, without the leading dot.

    Returns:
        A list of matching file paths, in the order `os.walk` yields them.
        The list is empty when nothing matches or the directory is missing.
    """
    search_root = os.path.join( driverPath, directory)
    suffix = '.%s' % (extension)
    matches = []

    for current_dir, _subdirs, filenames in os.walk( search_root):
        # Only folders carrying the requested name contribute files
        if os.path.basename(current_dir) != basename:
            continue
        matches.extend(os.path.join(current_dir, name)
                       for name in filenames
                       if name.endswith(suffix))
    return matches

In [None]:
# ****************************************
# *** GET DATA INTO A PANDAS DATAFRAME ***
# ****************************************
def getDataFrame( listFiles, maxFiles, minWords, limit):
    """Read text/label samples from JSON files into two parallel lists.

    Despite its name, this function does not build a pandas DataFrame; it
    returns plain Python lists.

    Every file must contain a JSON object with a 'data' array whose items
    carry a 'text' string and a 'label' value. Files are read in the order
    given until either the file cap or the sample cap is hit. The stop
    conditions are checked BEFORE a file is opened, so files that will not
    be read are never touched.

    Args:
        listFiles: Iterable of paths to the JSON files.
        maxFiles:  Maximum number of files to read. 0 reads nothing; a
                   negative value never reaches 0 and so disables the cap.
        minWords:  Minimum number of whitespace-separated words a text needs
                   in order to be kept.
        limit:     Cap on kept samples, expressed in thousands
                   (`limit * 1000` samples). Values <= 0 disable the cap.

    Returns:
        A tuple `(text_list, label_list, counter_real)`: the kept texts,
        their labels, and the number of kept samples.
    """
    counter_real, counter_max = 0, 0
    text_list, label_list = [], []
    max_samples = limit * 1000
    limit_reached = False

    print("Word min set to: %i." % ( minWords))
    for file in listFiles:
        # Decide whether to stop before paying for (or failing on) an open()
        if limit_reached or maxFiles == 0:
            break
        maxFiles -= 1

        # JSON is read as UTF-8 explicitly instead of the platform default encoding
        with open(file, encoding='utf-8') as f:
            entries = json.load(f)['data']

        for entry in entries:
            if limit > 0 and counter_real >= max_samples:
                limit_reached = True
                break
            if len( entry['text'].split()) >= minWords:
                text_list.append(entry['text'])
                label_list.append(entry['label'])
                counter_real += 1
            # Counts every inspected entry, kept or not
            counter_max += 1

    if counter_real > 0 and counter_max > 0:
        ratio = counter_real / counter_max * 100
    else:
        ratio = 0

    # Print the final result
    print("Lists created with %i/%i (%.2f%%) data objects." % ( counter_real, counter_max, ratio))
    print("Rest ignored due to minimum words limit of %i or the limit of %i data objects maximum." % ( minWords, limit * 1000))
    return text_list, label_list, counter_real

# **Gather the paths to the files**

In [None]:
# ***********************************
# *** GET THE PATHS FOR THE FILES ***
# ***********************************

# Root of the mounted Google Drive inside the Colab runtime
driverPath = "/content/drive/My Drive"

# Dataset folders, relative to driverPath
paths = ["processed/depression/submission",
         "processed/depression/comment",
         "processed/AskReddit/submission",
         "processed/AskReddit/comment"]

# files[k] lists every .json file found inside a "text" folder below paths[k]
files = []
for sub_dir in paths:
    found = getFiles( driverPath, sub_dir, "text", "json")
    files.append(found)
    print("Gathered %i files from %s." % ( len(found), sub_dir))

Gathered 750 files from processed/depression/submission.
Gathered 2892 files from processed/depression/comment.
Gathered 1311 files from processed/AskReddit/submission.
Gathered 5510 files from processed/AskReddit/comment.


# **Gather the data from files**

In [None]:
# ************************************
# *** GATHER THE DATA AND SPLIT IT ***
# ************************************
# Split settings
rand_state_splitter = 1000
test_size = 0.2

# Per-source settings, listed in the same order as `files`
message = ["Depression submissions", "Depression comments", "AskReddit submissions", "AskReddit comments"]
max_files_per_source = [ 750, 0, 1300, 0]     # 0 -> read no file from that source
min_words_per_source = [ 50, 0, 50, 0]        # minimum words a text needs to be kept
limit_packets = [300, 0, 300, 0]              # cap on kept samples, in thousands
text, label = [], []

# Collect the texts and labels of every source.
# (The console message says "DataFrames"; getDataFrame actually returns plain lists.)
print("Build the Pandas DataFrames for each category.")
for source_files, file_cap, word_min, packet_cap, source_name in zip(
        files, max_files_per_source, min_words_per_source, limit_packets, message):
    source_text, source_label, counter = getDataFrame( source_files, file_cap, word_min, packet_cap)
    if counter > 0:
        text.extend(source_text)
        label.extend(source_label)
        print("Added %i samples to data list: %s.\n" % ( counter, source_name) )
    # Drop the per-source lists before the next source is loaded
    source_text = source_label = None

# Shuffled, seeded train/test split
x_train, x_test, y_train, y_test = train_test_split(
    text,
    label,
    test_size = test_size,
    shuffle = True,
    random_state = rand_state_splitter)
print("Training data: %i samples." % ( len(y_train)) )
print("Testing data: %i samples." % ( len(y_test)) )

# Clear data no longer needed
del rand_state_splitter, max_files_per_source, min_words_per_source, message
del source_files, file_cap, word_min, packet_cap, source_name, source_text, source_label

Build the Pandas DataFrames for each category.
Word min set to: 50.
Lists created with 300000/349305 (85.88%) data objects.
Rest ignored due to minimum words limit of 50 or the limit of 300000 data objects maximum.
Added 300000 samples to data list: Depression submissions.

Word min set to: 0.
Lists created with 0/0 (0.00%) data objects.
Rest ignored due to minimum words limit of 0 or the limit of 0 data objects maximum.
Word min set to: 50.
Lists created with 300000/554781 (54.08%) data objects.
Rest ignored due to minimum words limit of 50 or the limit of 300000 data objects maximum.
Added 300000 samples to data list: AskReddit submissions.

Word min set to: 0.
Lists created with 0/0 (0.00%) data objects.
Rest ignored due to minimum words limit of 0 or the limit of 0 data objects maximum.
Training data: 480000 samples.
Testing data: 120000 samples.


# **Process the data at a character-level**

In [None]:
# *******************************
# *** CONVERT STRING TO INDEX ***
# *******************************
print("Convert the strings to indexes.")
tk = Tokenizer(num_words = None, char_level = True, oov_token = 'UNK')
# NOTE(review): the vocabulary learned by this pass is replaced just below, so
# fitting on the whole training set may be unnecessary — confirm before removing.
tk.fit_on_texts(x_train)
print("Original:", x_train[0])

# --- Vocabulary: the letters a-z get indices 1..26, 'UNK' gets the next free index ---
print("Construct a new vocabulary")
alphabet = "abcdefghijklmnopqrstuvwxyz"
char_dict = {letter: position for position, letter in enumerate(alphabet, start = 1)}
print("dictionary")
tk.word_index = dict(char_dict)                     # the tokenizer now uses the fixed alphabet
print(tk.word_index)
# Everything outside the alphabet (spaces included, see the printed sequence) becomes 'UNK'
tk.word_index[tk.oov_token] = len(alphabet) + 1
print(tk.word_index)

# --- Characters -> integer indices ---
print("Text to sequence.")
x_train = tk.texts_to_sequences(x_train)
x_test = tk.texts_to_sequences(x_test)
print("After sequences:", x_train[0])

# --- Bring every sample to exactly input_size indices (padding added at the end) ---
print("Padding the sequences.")
x_train = pad_sequences( x_train, maxlen = input_size, padding = 'post')
x_test = pad_sequences( x_test, maxlen = input_size, padding = 'post')

# --- float32 copies of the padded index matrices ---
print("Convert to Numpy arrays")
x_train = x_train.astype('float32')
x_test = x_test.astype('float32')

# --- Labels: shift by -1, then one-hot encode over num_of_classes ---
# presumably the raw labels are 1-based (1..num_of_classes) — TODO confirm against the data files
y_test_copy = y_test
y_train_list = [raw_label - 1 for raw_label in y_train]
y_test_list = [raw_label - 1 for raw_label in y_test]

y_train = to_categorical( y_train_list, num_of_classes)
y_test = to_categorical( y_test_list, num_of_classes)

Convert the strings to indexes.
Original: i did not think i had have to post in this subreddit i just feel empty and completely alone i am hanging out with friends but nothing makes me feel happy as i used to be i know people generally have it worse i just want someone to talk to and just be silly with 
Construct a new vocabulary
dictionary
{'a': 1, 'b': 2, 'c': 3, 'd': 4, 'e': 5, 'f': 6, 'g': 7, 'h': 8, 'i': 9, 'j': 10, 'k': 11, 'l': 12, 'm': 13, 'n': 14, 'o': 15, 'p': 16, 'q': 17, 'r': 18, 's': 19, 't': 20, 'u': 21, 'v': 22, 'w': 23, 'x': 24, 'y': 25, 'z': 26}
{'a': 1, 'b': 2, 'c': 3, 'd': 4, 'e': 5, 'f': 6, 'g': 7, 'h': 8, 'i': 9, 'j': 10, 'k': 11, 'l': 12, 'm': 13, 'n': 14, 'o': 15, 'p': 16, 'q': 17, 'r': 18, 's': 19, 't': 20, 'u': 21, 'v': 22, 'w': 23, 'x': 24, 'y': 25, 'z': 26, 'UNK': 27}
Text to sequence.
After sequences: [9, 27, 4, 9, 4, 27, 14, 15, 20, 27, 20, 8, 9, 14, 11, 27, 9, 27, 8, 1, 4, 27, 8, 1, 22, 5, 27, 20, 15, 27, 16, 15, 19, 20, 27, 9, 14, 27, 20, 8, 9, 19, 27, 19

# **Build the character embedding weights**

In [None]:
# ***********************
# *** LOAD EMBEDDINGS ***
# ***********************
vocab_size = len(tk.word_index)

# (vocab_size + 1) x vocab_size matrix:
#   row 0 stays all-zero (presumably the row looked up for padding index 0 — TODO confirm),
#   the k-th vocabulary entry (k = 1, 2, ...) gets a one-hot row with the 1 at column index - 1.
embedding_weights = np.zeros((vocab_size + 1, vocab_size))
for row, index in enumerate(tk.word_index.values(), start = 1):
    embedding_weights[row, index - 1] = 1

print("Vocabulary size: ",vocab_size)
print("Embedding weights: ", embedding_weights)

Vocabulary size:  27
Embedding weights:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0.]
 [0. 0. 0. 0.

# **Build the CNN model**

In [None]:
def KerasModel():
    """Build and compile the character-level CNN classifier.

    The architecture is driven entirely by the module-level settings:
    `input_size`, `vocab_size`, `embedding_size`, `embedding_weights`,
    `conv_layers`, `fully_connected_layers`, `dropout_p`, `num_of_classes`,
    `optimizer` and `loss`.

    Layer order: input -> embedding (initialised with `embedding_weights`) ->
    [Conv1D -> ReLU -> optional MaxPooling1D] per `conv_layers` entry ->
    Flatten -> [Dense(ReLU) -> Dropout] per `fully_connected_layers` entry ->
    softmax output.

    The model summary is printed as a side effect.

    Returns:
        The compiled Keras `Model`, tracking the 'accuracy' metric.
    """
    # Sequences of input_size character indices
    inputs = Input(shape = (input_size,), name = 'input', dtype = 'int64')

    # Character indices -> character vectors
    features = Embedding(vocab_size + 1,
                         embedding_size,
                         input_length = input_size,
                         weights = [embedding_weights])(inputs)

    # Convolution stack; a pool size of -1 skips the pooling step
    for filters, kernel, pool in conv_layers:
        features = Conv1D(filters, kernel)(features)
        features = Activation('relu')(features)
        if pool == -1:
            continue
        features = MaxPooling1D(pool_size = pool)(features)
    features = Flatten()(features)

    # Fully connected stack, each layer followed by dropout
    for units in fully_connected_layers:
        features = Dense(units, activation = 'relu')(features)
        features = Dropout(dropout_p)(features)

    # One softmax probability per class
    predictions = Dense(num_of_classes, activation = 'softmax')(features)

    model = Model(inputs = inputs, outputs = predictions)
    model.compile(optimizer = optimizer, loss = loss, metrics = ['accuracy'])
    model.summary()

    return model

# **Train the CNN**

In [None]:
# Train the CNN through the scikit-learn wrapper, so the fitted object offers
# fit/predict and could later be handed to a scikit-learn search such as
# GridSearchCV.
#
# The configured `batch` size is passed explicitly: without it the wrapper
# falls back to the Keras default batch size and the setting from the
# configuration cell is silently ignored.
with tf.device("/gpu:0"):
    grid = KerasClassifier(build_fn = KerasModel,
                           epochs = 15,
                           batch_size = batch,
                           verbose = True)

    # `grid` is the fitted classifier used for prediction later on;
    # `grid_result` is whatever fit() returns (kept for inspection).
    grid_result = grid.fit(x_train, y_train)

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input (InputLayer)           (None, 1000)              0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 1000, 27)          756       
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 994, 256)          48640     
_________________________________________________________________
activation_1 (Activation)    (None, 994, 256)          0         
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 331, 256)          0         
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 325, 256)          459008    
_________________________________________________________________
activation_2 (Activation)    (None, 325, 256)          0   

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15

KeyboardInterrupt: ignored

# **Test the CNN**

In [None]:
#loss, accuracy = model.evaluate( x_train, y_train, verbose = True)
#print("Training Accuracy: {:.4f}".format( accuracy))
#loss, accuracy = model.evaluate( x_test, y_test, verbose = True)
#print("Testing Accuracy:  {:.4f}".format( accuracy))

from sklearn.metrics import classification_report, confusion_matrix

# Class predictions for the held-out samples
y_predict = grid.predict( x_test)

# Go from the one-hot test labels back to class indices (debug prints kept)
print( type(y_test))
print(y_test)
y_tested = y_test.argmax(axis = 1)
print(y_tested)

# Confusion matrix; unpacking into four cells relies on it being 2x2
confMatrix = confusion_matrix(y_tested, y_predict)
tn, fp, fn, tp = confMatrix.ravel()

# Per-class precision / recall / F1
# NOTE(review): target_names assumes class index 0 is 'Non-depressed' and 1 is 'Depressed' —
# confirm against the label values stored in the data files.
classification_reports = classification_report(
    y_tested,
    y_predict,
    target_names = ['Non-depressed', 'Depressed'],
    digits = 3)

print(confMatrix)
print(classification_reports)