In [1]:
from __future__ import division, print_function
from gensim import models
from keras.callbacks import ModelCheckpoint
from keras.layers import Dense, Dropout, Reshape, Flatten, concatenate, Input, Embedding
from keras.layers.recurrent import LSTM
from keras.models import Sequential
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
import os
import collections
import re
import string

### Read data

In [2]:
from datetime import datetime
startTimeScript = datetime.now()

In [3]:
#normal_tweets_text_sample_df=pd.read_excel('normal_tweets_text_sample.xlsx')
normal_tweets_text_sample_df=pd.read_excel('normal_tweets_text_sample.xlsx')
depressive_tweets_text_sample_df=pd.read_excel('depressive_tweets_text_sample.xlsx')
also_depressive_tweets_text_sample_df=pd.read_excel('normal_tweets_text_sample_0only.xlsx')
tweets_text_sample_df=pd.concat([normal_tweets_text_sample_df,depressive_tweets_text_sample_df,also_depressive_tweets_text_sample_df])
tweets_text_sample_df=tweets_text_sample_df.iloc[:,1:6]
tweets_text_sample_df

Unnamed: 0,id,text,lable,normalized_text
0,1680093728,@Dreameress1 Hello!!! Hows it going?,0,hello going
1,2053040367,is on the train to london,0,train london
2,2060651098,@TheLadyJane hey i wrote you a message today ...,0,hey wrot mess today kind gam ar ref playin
3,2059350074,"@SuperRecords goodnight , im off to bed also ...",0,goodnight im bed also nic talk d
4,1960820806,@SilverSteer I am always wearing turquoise jew...,0,alway wear turquo jewel fav
...,...,...,...,...
7995,1833084157,I can't download K-lite codec pack,1,not download klite codec pack
7996,2214628609,@britishxo and u didnt come see me !,1,u didnt com see
7997,2245101277,looking forward to a great weekend of revising...,1,look forward gre weekend rev gcse monday pah ...
7998,1997866869,Heidi and spencer are michigamua-esque,1,heid spent ar michigamuaesque


In [4]:
tweets_text_sample_df=tweets_text_sample_df.dropna(axis=0,subset = ["normalized_text"])
cw = lambda x: x.split()
tweets_text_sample_df["normalized_text_words"]=tweets_text_sample_df["normalized_text"].apply(cw)
tweets_text_sample_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tweets_text_sample_df["normalized_text_words"]=tweets_text_sample_df["normalized_text"].apply(cw)


Unnamed: 0,id,text,lable,normalized_text,normalized_text_words
0,1680093728,@Dreameress1 Hello!!! Hows it going?,0,hello going,"[hello, going]"
1,2053040367,is on the train to london,0,train london,"[train, london]"
2,2060651098,@TheLadyJane hey i wrote you a message today ...,0,hey wrot mess today kind gam ar ref playin,"[hey, wrot, mess, today, kind, gam, ar, ref, p..."
3,2059350074,"@SuperRecords goodnight , im off to bed also ...",0,goodnight im bed also nic talk d,"[goodnight, im, bed, also, nic, talk, d]"
4,1960820806,@SilverSteer I am always wearing turquoise jew...,0,alway wear turquo jewel fav,"[alway, wear, turquo, jewel, fav]"
...,...,...,...,...,...
7995,1833084157,I can't download K-lite codec pack,1,not download klite codec pack,"[not, download, klite, codec, pack]"
7996,2214628609,@britishxo and u didnt come see me !,1,u didnt com see,"[u, didnt, com, see]"
7997,2245101277,looking forward to a great weekend of revising...,1,look forward gre weekend rev gcse monday pah ...,"[look, forward, gre, weekend, rev, gcse, monda..."
7998,1997866869,Heidi and spencer are michigamua-esque,1,heid spent ar michigamuaesque,"[heid, spent, ar, michigamuaesque]"


### Split data into test and train

In [5]:
np.random.seed(1234)
data_train, data_test = train_test_split(tweets_text_sample_df, test_size=0.20, random_state=42)

In [6]:
all_training_words = [word for tokens in data_train["normalized_text_words"] for word in tokens]
training_sentence_lengths = [len(tokens) for tokens in data_train["normalized_text_words"]]
TRAINING_VOCAB = sorted(list(set(all_training_words)))
print("%s words total, with a vocabulary size of %s" % (len(all_training_words), len(TRAINING_VOCAB)))
print("Max sentence length is %s" % max(training_sentence_lengths))

139694 words total, with a vocabulary size of 14025
Max sentence length is 64


In [7]:
all_test_words = [word for tokens in data_test["normalized_text_words"] for word in tokens]
test_sentence_lengths = [len(tokens) for tokens in data_test["normalized_text_words"]]
TEST_VOCAB = sorted(list(set(all_test_words)))
print("%s words total, with a vocabulary size of %s" % (len(all_test_words), len(TEST_VOCAB)))
print("Max sentence length is %s" % max(test_sentence_lengths))

34517 words total, with a vocabulary size of 6153
Max sentence length is 38


### Load Google News Word2Vec model

In [8]:
word2vec_path = 'GoogleNews-vectors-negative300.bin.gz'
word2vec = models.KeyedVectors.load_word2vec_format(word2vec_path, binary=True)

In [9]:
def get_average_word2vec(tokens_list, vector, generate_missing=False, k=300):
    if len(tokens_list)<1:
        return np.zeros(k)
    if generate_missing:
        vectorized = [vector[word] if word in vector else np.random.rand(k) for word in tokens_list]
    else:
        vectorized = [vector[word] if word in vector else np.zeros(k) for word in tokens_list]
    length = len(vectorized)
    summed = np.sum(vectorized, axis=0)
    averaged = np.divide(summed, length)
    return averaged

def get_word2vec_embeddings(vectors, clean_comments, generate_missing=False):
    embeddings = clean_comments['normalized_text_words'].apply(lambda x: get_average_word2vec(x, vectors, 
                                                                                generate_missing=generate_missing))
    return list(embeddings)

### Get Embeddings

In [10]:
training_embeddings = get_word2vec_embeddings(word2vec, data_train, generate_missing=True)

In [11]:
MAX_SEQUENCE_LENGTH = 38
EMBEDDING_DIM = 300

### Tokenize and Pad sequences

In [12]:
tokenizer = Tokenizer(num_words=len(TRAINING_VOCAB), lower=True, char_level=False)
tokenizer.fit_on_texts(data_train["normalized_text"].tolist())
training_sequences = tokenizer.texts_to_sequences(data_train["normalized_text"].tolist())

train_word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(train_word_index))

Found 13997 unique tokens.


In [13]:
train_cnn_data = pad_sequences(training_sequences, maxlen=MAX_SEQUENCE_LENGTH)

In [14]:
len(train_word_index)

13997

In [15]:
train_embedding_weights = np.zeros((len(train_word_index)+1, EMBEDDING_DIM))
for word,index in train_word_index.items():
    train_embedding_weights[index,:] = word2vec[word] if word in word2vec else np.random.rand(EMBEDDING_DIM)

In [16]:
test_sequences = tokenizer.texts_to_sequences(data_test["normalized_text"].tolist())
test_cnn_data = pad_sequences(test_sequences, maxlen=MAX_SEQUENCE_LENGTH)

### Define RNN-LSTM

In [17]:
label_names = ['lable']

In [18]:
y_train = data_train[label_names].values

In [19]:
x_train = train_cnn_data
y_tr = y_train

In [20]:
def recurrent_nn(embeddings, max_sequence_length, num_words, embedding_dim, labels_index):
    
    embedding_layer = Embedding(num_words,
                            embedding_dim,
                            weights=[embeddings],
                            input_length=max_sequence_length,
                            trainable=False)
    
    sequence_input = Input(shape=(max_sequence_length,), dtype='int32')
    print(sequence_input,"sequence_input")
    embedded_sequences = embedding_layer(sequence_input)
    print(embedded_sequences,"embedded_sequences")

    #lstm = LSTM(256, dropout=0.2, recurrent_dropout=0.2, return_sequences=True)(embedded_sequences)
    lstm = LSTM(256)(embedded_sequences)
    
    #x = Dense(128, activation='relu')(lstm)
    x = Dropout(0.4)(lstm)
    preds = Dense(labels_index, activation='sigmoid')(x)

    model = Model(sequence_input, preds)
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['acc'])
    model.summary()
    return model

In [21]:
model = recurrent_nn(train_embedding_weights, MAX_SEQUENCE_LENGTH, len(train_word_index)+1, EMBEDDING_DIM, 
                len(list(label_names)))

KerasTensor(type_spec=TensorSpec(shape=(None, 38), dtype=tf.int32, name='input_1'), name='input_1', description="created by layer 'input_1'") sequence_input
KerasTensor(type_spec=TensorSpec(shape=(None, 38, 300), dtype=tf.float32, name=None), name='embedding/embedding_lookup/Identity_1:0', description="created by layer 'embedding'") embedded_sequences
Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 38)]              0         
                                                                 
 embedding (Embedding)       (None, 38, 300)           4199400   
                                                                 
 lstm (LSTM)                 (None, 256)               570368    
                                                                 
 dropout (Dropout)           (None, 256)               0         
                                     

### Train RNN-LSTM

In [22]:
num_epochs = 5
batch_size = 34

In [23]:
hist = model.fit(x_train, y_tr, epochs=num_epochs, validation_split=0.1, shuffle=True, batch_size=batch_size)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [24]:
print ('Total time for the script:',(datetime.now() - startTimeScript))

Total time for the script: 0:04:23.578822


### Test RNN-LSTM

In [25]:
predictions = model.predict(test_cnn_data, batch_size=34, verbose=1)



In [26]:
prediction_labels=[]
for p in predictions:
    prediction_labels.append(round(p[0]))
    print(round(p[0]))

0
1
1
0
0
1
0
0
0
1
0
1
0
0
1
1
1
1
1
0
1
1
0
0
0
0
1
1
0
1
1
0
1
1
0
0
1
1
1
0
0
0
0
0
0
0
1
0
0
1
0
0
1
1
0
1
0
0
0
1
0
1
0
1
1
0
0
1
0
0
0
0
0
0
1
0
0
1
1
1
0
0
0
1
0
0
1
1
0
1
0
0
1
0
1
1
1
0
0
1
1
1
1
1
0
1
1
0
1
0
0
1
0
0
1
0
0
1
1
1
0
1
1
0
0
0
0
1
1
1
0
0
1
1
0
0
0
0
0
1
1
1
1
1
1
1
1
1
0
0
1
1
1
0
1
1
0
0
0
1
0
0
1
1
1
0
0
0
1
1
1
1
0
1
0
0
1
1
1
1
0
1
1
0
1
0
0
0
0
1
0
1
0
0
0
0
1
0
0
1
0
0
1
1
0
1
1
1
0
1
1
1
0
1
1
0
0
0
1
1
1
0
0
1
1
1
0
0
0
0
0
1
0
0
0
0
1
0
0
0
1
1
1
0
1
0
1
1
0
1
1
0
0
0
0
1
0
0
1
1
0
1
0
1
0
1
1
0
1
0
1
0
0
1
1
1
0
1
1
1
0
0
0
1
0
1
1
0
0
1
1
0
1
1
1
1
1
0
0
1
0
1
0
1
1
0
1
1
0
0
1
1
0
1
0
0
0
0
1
1
1
0
0
1
1
0
0
1
0
0
0
0
1
0
1
0
1
0
1
1
0
1
1
1
0
1
1
1
0
0
1
0
1
1
1
1
0
1
1
1
1
1
0
1
0
1
0
1
1
0
1
1
1
0
1
0
1
0
1
0
1
0
0
0
0
1
0
0
1
0
0
0
0
1
1
1
0
1
0
0
1
0
1
1
1
1
1
0
1
0
0
1
0
1
1
0
0
0
0
0
1
0
0
0
0
0
0
1
0
0
0
1
1
0
1
0
0
1
1
0
1
1
1
0
1
0
0
0
1
0
1
1
0
0
1
1
1
0
1
0
0
0
1
0
1
0
1
1
0
1
0
0
0
0
0
0
1
0
0
0
1
0
0
1
1
0
0
0
1
1
1
0
0
1
1
0
1
0
1
0


In [27]:
sum(data_test.lable==prediction_labels)/len(prediction_labels)

0.7811031412317586

In [28]:
result_test=pd.concat([pd.DataFrame(predictions),pd.DataFrame(prediction_labels)],axis=1)
result_test=pd.concat([result_test.reset_index(),pd.DataFrame(data_test[['lable','text','normalized_text']]).reset_index()],axis=1)
result_test.columns=[['index1','predictions','prediction_label','index2','test_label','text','normalized_text']]
result_test=result_test[['predictions','prediction_label','test_label','text','normalized_text']]
result_test

Unnamed: 0,predictions,prediction_label,test_label,text,normalized_text
0,0.431088,0,0,i think my mum's gonna let me join spcrew.,think mum gon na let join spcrew
1,0.998282,1,1,can someone take my mushy depression brain and...,someon tak mushy depress brain giv new thx
2,0.680830,1,1,@school boreeed.... I want Saturday back,boreee want saturday back
3,0.134667,0,0,@SilverDayDreams Ooh - dwarves? Where?,ooh dwarv wher
4,0.462802,0,0,@carmenego I have a vague recollection of that,hav vagu recollect
...,...,...,...,...,...
4038,0.812893,1,0,Got my suit sorted Just need to get my hair a...,got suit sort nee get hair fac sort hot hot b...
4039,0.584473,1,1,So lazy today.I could only see the Louver and ...,lazy todayi could see louv got back hotelmy fr...
4040,0.999642,1,1,When depression hits so does anxiety.,depress hit doe anxy
4041,0.062107,0,1,"Yea, thanks for the tip! Im taking it in today...",ye thank tip im tak today see anyth els tha...


In [29]:
result_test.isnull().sum()

predictions         0
prediction_label    0
test_label          0
text                1
normalized_text     0
dtype: int64

In [30]:
lstm_wrong_result=result_test[['text','normalized_text','prediction_label','test_label']]
lstm_wrong_result
#lstm_wrong_result.to_excel('lstm_wrong_result.xlsx',index=False)

Unnamed: 0,text,normalized_text,prediction_label,test_label
0,i think my mum's gonna let me join spcrew.,think mum gon na let join spcrew,0,0
1,can someone take my mushy depression brain and...,someon tak mushy depress brain giv new thx,1,1
2,@school boreeed.... I want Saturday back,boreee want saturday back,1,1
3,@SilverDayDreams Ooh - dwarves? Where?,ooh dwarv wher,0,0
4,@carmenego I have a vague recollection of that,hav vagu recollect,0,0
...,...,...,...,...
4038,Got my suit sorted Just need to get my hair a...,got suit sort nee get hair fac sort hot hot b...,1,0
4039,So lazy today.I could only see the Louver and ...,lazy todayi could see louv got back hotelmy fr...,1,1
4040,When depression hits so does anxiety.,depress hit doe anxy,1,1
4041,"Yea, thanks for the tip! Im taking it in today...",ye thank tip im tak today see anyth els tha...,0,1


In [31]:
result_test.to_csv("result_test.csv",index=False)

In [32]:
#Train accuracy
predictions_train = model.predict(train_cnn_data, batch_size=564, verbose=1)
prediction_labels_train=[]
for p in predictions_train:
    prediction_labels_train.append(round(p[0]))
sum(data_train.lable==prediction_labels_train)/len(prediction_labels_train)



0.8294885274290309