In [21]:
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict
from nltk.corpus import wordnet as wn
from nltk.corpus import stopwords
from sklearn import model_selection
from sklearn.feature_extraction.text import TfidfVectorizer
import tensorflow as tf
import tensorflow_addons as tfa
from tensorflow import keras
from tensorflow.keras.layers import Dense, Flatten, MaxPooling1D, Dropout, Conv1D, Input, LSTM

In [2]:
df = pd.read_csv('combined-selftext.csv')
#df.head()

In [3]:
def str_join(df, sep, *cols):
   ...:     from functools import reduce
   ...:     return reduce(lambda x, y: x.astype(str).str.cat(y.astype(str), sep=sep), 
   ...:                   [df[col] for col in cols])
   ...: 

In [4]:
df['text'] = str_join(df," ", 'title', 'usertext')

In [5]:
del df['title']
del df['usertext']

In [6]:
import gensim
from gensim.parsing.preprocessing import remove_stopwords, STOPWORDS
STOPWORDS = STOPWORDS.union(set(['im', 'ive', 'ill', 'wa', 'ha', 'aint', 'thats', 'la', 'le', 'please', 'feel', 'rly', 'u', 'nan', 'emptypost']))

stop = STOPWORDS
df['text'] = df['text'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))

In [7]:
df['text'].dropna(inplace=True)
# 2. Changing all text to lowercase
df['text_original'] = df['text']
df['text'] = [entry.lower() for entry in df['text']]
# 3. Tokenization-In this each entry in the corpus will be broken into set of words
df['text']= [word_tokenize(entry) for entry in df['text']]
# 4. Remove Stop words, Non-Numeric and perfoming Word Stemming/Lemmenting.
# WordNetLemmatizer requires Pos tags to understand if the word is noun or verb or adjective etc. By default it is set to Noun
tag_map = defaultdict(lambda : wn.NOUN)
tag_map['J'] = wn.ADJ
tag_map['V'] = wn.VERB
tag_map['R'] = wn.ADV

df.head()

Unnamed: 0,y,text,text_original
0,0,"[need, help, hi, know, phrase, situation, try,...",need help hi know phrase situation try life go...
1,1,"[feeling, overwhelmed, hopeless, depressed, pa...",feeling overwhelmed hopeless depressed past co...
2,0,"[matter, anymore, getting, worse, hi, know, de...",matter anymore getting worse hi know devastate...
3,1,"[tired, hearing, bullshit, shit, like, better,...",tired hearing bullshit shit like better purpos...
4,0,"[wish, wish, prettier, wish, like, burden, wis...",wish wish prettier wish like burden wish broke...


In [10]:
for index,entry in enumerate(df['text']):
    # Declaring Empty List to store the words that follow the rules for this step
    Final_words = []
    # Initializing WordNetLemmatizer()
    word_Lemmatized = WordNetLemmatizer()
    # pos_tag function below will provide the 'tag' i.e if the word is Noun(N) or Verb(V) or something else.
    for word, tag in pos_tag(entry):
        # Below condition is to check for Stop words and consider only alphabets
        if word not in stopwords.words('english') and word.isalpha():
            word_Final = word_Lemmatized.lemmatize(word,tag_map[tag[0]])
            Final_words.append(word_Final)
    # The final processed set of words for each iteration will be stored in 'text_final'
    df.loc[index,'text_final'] = str(Final_words)

df.head()

Unnamed: 0,y,text,text_original,text_final
0,0,"[need, help, hi, know, phrase, situation, try,...",need help hi know phrase situation try life go...,"['need', 'help', 'hi', 'know', 'phrase', 'situ..."
1,1,"[feeling, overwhelmed, hopeless, depressed, pa...",feeling overwhelmed hopeless depressed past co...,"['feel', 'overwhelmed', 'hopeless', 'depress',..."
2,0,"[matter, anymore, getting, worse, hi, know, de...",matter anymore getting worse hi know devastate...,"['matter', 'anymore', 'get', 'bad', 'hi', 'kno..."
3,1,"[tired, hearing, bullshit, shit, like, better,...",tired hearing bullshit shit like better purpos...,"['tired', 'hearing', 'bullshit', 'shit', 'like..."
4,0,"[wish, wish, prettier, wish, like, burden, wis...",wish wish prettier wish like burden wish broke...,"['wish', 'wish', 'prettier', 'wish', 'like', '..."


In [13]:
X_train, X_test, y_train, y_test = model_selection.train_test_split(df['text_final'],df['y'],test_size=0.2, random_state=42)

In [14]:
Encoder = LabelEncoder()
y_train = Encoder.fit_transform(y_train)
y_test = Encoder.fit_transform(y_test)

In [15]:
Tfidf_vect = TfidfVectorizer()
Tfidf_vect.fit(df['text_final'])
X_train = Tfidf_vect.transform(X_train)
X_test = Tfidf_vect.transform(X_test)

In [16]:
X_train = X_train.todense()
X_test = X_test.todense()

In [15]:
print(X_train.shape, X_test.shape)

(1498, 8548) (375, 8548)


In [13]:
print(y_train.shape, y_test.shape)

(1498,) (375,)


In [14]:
X_train.shape[0]

1498

In [15]:
X_train.shape[1]

8547

In [24]:
#CNN

model4 = keras.Sequential()

model4.add(keras.layers.Input(shape=(X_train.shape[1], 1)))
model4.add(keras.layers.Conv1D(5, (2,), padding='same', activation='relu'))
model4.add(keras.layers.Dropout(0.5))
model4.add(keras.layers.Flatten())
model4.add(keras.layers.Dense(64, activation='relu', kernel_initializer='he_uniform'))
model4.add(keras.layers.Dense(1, activation='sigmoid'))
model4.compile(loss='binary_crossentropy', optimizer=keras.optimizers.Adam(), metrics=['accuracy',
                              tf.keras.metrics.Precision(),
                              tf.keras.metrics.Recall(),
                              tfa.metrics.F1Score(num_classes=1)])
model4.summary()

Model: "sequential_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv1d_3 (Conv1D)           (None, 8548, 5)           15        
                                                                 
 dropout_1 (Dropout)         (None, 8548, 5)           0         
                                                                 
 flatten_5 (Flatten)         (None, 42740)             0         
                                                                 
 dense_10 (Dense)            (None, 64)                2735424   
                                                                 
 dense_11 (Dense)            (None, 1)                 65        
                                                                 
Total params: 2,735,504
Trainable params: 2,735,504
Non-trainable params: 0
_________________________________________________________________


In [14]:
history4 = model4.fit(
    X_train, y_train, 
    epochs=10, 
    batch_size=32, 
    validation_split=0.2, 
    verbose=1, 
    shuffle=True
)
model4.evaluate(X_test, y_test)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


[0.8153498768806458,
 0.637333333492279,
 0.606217622756958,
 0.6610169410705566,
 array([0.6413044], dtype=float32)]

In [29]:
#LSTM

model6 = keras.Sequential()

model6.add(keras.layers.Input(shape=(X_train.shape[1], 1)))
model6.add(keras.layers.LSTM(100, return_sequences=True, dropout=0.5, recurrent_dropout=0.5))
model6.add(keras.layers.MaxPooling1D(pool_size = 2))
model6.add(keras.layers.Flatten())
model6.add(keras.layers.Dense(64, activation='relu', kernel_initializer='he_uniform'))
model6.add(keras.layers.Dense(1, activation='sigmoid'))
model6.compile(loss='binary_crossentropy', optimizer=keras.optimizers.Adam(), metrics=['accuracy',
                              tf.keras.metrics.Precision(),
                              tf.keras.metrics.Recall(),
                              tfa.metrics.F1Score(num_classes=1)])
model6.summary()

Model: "sequential_11"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_8 (LSTM)               (None, 8548, 100)         40800     
                                                                 
 max_pooling1d_7 (MaxPooling  (None, 4274, 100)        0         
 1D)                                                             
                                                                 
 flatten_8 (Flatten)         (None, 427400)            0         
                                                                 
 dense_16 (Dense)            (None, 64)                27353664  
                                                                 
 dense_17 (Dense)            (None, 1)                 65        
                                                                 
Total params: 27,394,529
Trainable params: 27,394,529
Non-trainable params: 0
_________________________________________

In [17]:
history6 = model6.fit(
    X_train, y_train, 
    epochs=10, 
    batch_size=32, 
    validation_split=0.2, 
    verbose=1, 
    shuffle=True
)
model6.evaluate(X_test, y_test)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


[0.6473361849784851,
 0.6293333172798157,
 0.6144578456878662,
 0.5762711763381958,
 array([0.6413044], dtype=float32)]

In [30]:
#LSTM + CNN

model7 = keras.Sequential()

model7.add(keras.layers.Input(shape=(X_train.shape[1], 1)))
model7.add(keras.layers.LSTM(100, return_sequences=True, dropout=0.5, recurrent_dropout=0.5))
model7.add(keras.layers.Conv1D(5, (2,), padding='same', activation='relu'))
model7.add(keras.layers.MaxPooling1D(pool_size = 2))
model7.add(keras.layers.Flatten())
model7.add(keras.layers.Dense(64, activation='relu', kernel_initializer='he_uniform'))
model7.add(keras.layers.Dense(1, activation='sigmoid'))
model7.compile(loss='binary_crossentropy', optimizer=keras.optimizers.Adam(), metrics=['accuracy',
                              tf.keras.metrics.Precision(),
                              tf.keras.metrics.Recall(),
                              tfa.metrics.F1Score(num_classes=1)])
model7.summary()

Model: "sequential_12"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_9 (LSTM)               (None, 8548, 100)         40800     
                                                                 
 conv1d_6 (Conv1D)           (None, 8548, 5)           1005      
                                                                 
 max_pooling1d_8 (MaxPooling  (None, 4274, 5)          0         
 1D)                                                             
                                                                 
 flatten_9 (Flatten)         (None, 21370)             0         
                                                                 
 dense_18 (Dense)            (None, 64)                1367744   
                                                                 
 dense_19 (Dense)            (None, 1)                 65        
                                                     

In [19]:
history7 = model7.fit(
    X_train, y_train, 
    epochs=10, 
    batch_size=32, 
    validation_split=0.2, 
    verbose=1, 
    shuffle=True
)
model7.evaluate(X_test, y_test)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


[0.6935282945632935,
 0.6426666378974915,
 0.6174863576889038,
 0.6384180784225464,
 array([0.6413044], dtype=float32)]

In [31]:
#BiLSTM

model8 = keras.Sequential()

model8.add(keras.layers.Input(shape=(X_train.shape[1], 1)))
model8.add(keras.layers.Bidirectional(LSTM(100, return_sequences=True, dropout=0.5, recurrent_dropout=0.5)))
model8.add(keras.layers.MaxPooling1D(pool_size = 2))
model8.add(keras.layers.Flatten())
model8.add(keras.layers.Dense(64, activation='relu', kernel_initializer='he_uniform'))
model8.add(keras.layers.Dense(1, activation='sigmoid'))
model8.compile(loss='binary_crossentropy', optimizer=keras.optimizers.Adam(), metrics=['accuracy',
                              tf.keras.metrics.Precision(),
                              tf.keras.metrics.Recall(),
                              tfa.metrics.F1Score(num_classes=1)])
model8.summary()

Model: "sequential_13"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bidirectional_2 (Bidirectio  (None, 8548, 200)        81600     
 nal)                                                            
                                                                 
 max_pooling1d_9 (MaxPooling  (None, 4274, 200)        0         
 1D)                                                             
                                                                 
 flatten_10 (Flatten)        (None, 854800)            0         
                                                                 
 dense_20 (Dense)            (None, 64)                54707264  
                                                                 
 dense_21 (Dense)            (None, 1)                 65        
                                                                 
Total params: 54,788,929
Trainable params: 54,788,929

In [15]:
history8 = model8.fit(
    X_train, y_train, 
    epochs=10, 
    batch_size=32, 
    validation_split=0.2, 
    verbose=1, 
    shuffle=True
)
model8.evaluate(X_test, y_test)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


[0.6947916150093079,
 0.47200000286102295,
 0.47200000286102295,
 1.0,
 array([0.6413044], dtype=float32)]

In [32]:
#BiLSTM + CNN

model9 = keras.Sequential()

model9.add(keras.layers.Input(shape=(X_train.shape[1], 1)))
model9.add(keras.layers.Bidirectional(LSTM(100, return_sequences=True, dropout=0.5, recurrent_dropout=0.5)))
model9.add(keras.layers.Conv1D(5, (2,), padding='same', activation='relu'))
model9.add(keras.layers.MaxPooling1D(pool_size = 2))
model9.add(keras.layers.Flatten())
model9.add(keras.layers.Dense(64, activation='relu', kernel_initializer='he_uniform'))
model9.add(keras.layers.Dense(1, activation='sigmoid'))
model9.compile(loss='binary_crossentropy', optimizer=keras.optimizers.Adam(), metrics=['accuracy',
                              tf.keras.metrics.Precision(),
                              tf.keras.metrics.Recall(),
                              tfa.metrics.F1Score(num_classes=1)])
model9.summary()

Model: "sequential_14"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bidirectional_3 (Bidirectio  (None, 8548, 200)        81600     
 nal)                                                            
                                                                 
 conv1d_7 (Conv1D)           (None, 8548, 5)           2005      
                                                                 
 max_pooling1d_10 (MaxPoolin  (None, 4274, 5)          0         
 g1D)                                                            
                                                                 
 flatten_11 (Flatten)        (None, 21370)             0         
                                                                 
 dense_22 (Dense)            (None, 64)                1367744   
                                                                 
 dense_23 (Dense)            (None, 1)               

In [17]:
history9 = model9.fit(
    X_train, y_train, 
    epochs=10, 
    batch_size=32, 
    validation_split=0.2, 
    verbose=1, 
    shuffle=True
)
model9.evaluate(X_test, y_test)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


[1.035083293914795,
 0.6346666812896729,
 0.6098901033401489,
 0.6271186470985413,
 array([0.6413044], dtype=float32)]