<a href="https://colab.research.google.com/github/Samar-Agarwal/Detecting-Depression-through-Tweets/blob/main/npl_wids_3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from keras.preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences
from keras.layers import Dense, Embedding, LSTM
from keras.utils.np_utils import to_categorical
import re

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
data =pd.read_csv("drive/MyDrive/WIDS_NLP_Project/dataset2.csv", on_bad_lines = 'skip')
# Keeping only the neccessary columns
data = data[['Sentiment', 'SentimentText']]

In [6]:
data['SentimentText'] = data['SentimentText'].apply(lambda x: x.lower()) # lowering all alphabets
data['SentimentText'] = data['SentimentText'].apply(lambda x: re.sub('[^a-zA-z0-9\s]','',x))  #using re module for stooping

print(data[ data['Sentiment'] == 1].size)
print(data[ data['Sentiment'] == 0].size)

1580354
1576870


In [7]:
max_fatures = 20000
tokenizer = Tokenizer(num_words=max_fatures, split=' ')
tokenizer.fit_on_texts(data['SentimentText'].values)
X = tokenizer.texts_to_sequences(data['SentimentText'].values)
X = pad_sequences(X)

In [8]:
import gensim

In [9]:
data.head()


Unnamed: 0,Sentiment,SentimentText
0,0,is so sad for my apl friend
1,0,i missed the new moon trailer
2,1,omg its already 730 o
3,0,omgaga im sooo im gunna cry ive be...
4,0,i think mi bf is cheating on me ...


In [10]:
documents = [_text.split() for _text in data.SentimentText]

In [11]:
data.head()

Unnamed: 0,Sentiment,SentimentText
0,0,is so sad for my apl friend
1,0,i missed the new moon trailer
2,1,omg its already 730 o
3,0,omgaga im sooo im gunna cry ive be...
4,0,i think mi bf is cheating on me ...


In [12]:
print(documents[0:10])

[['is', 'so', 'sad', 'for', 'my', 'apl', 'friend'], ['i', 'missed', 'the', 'new', 'moon', 'trailer'], ['omg', 'its', 'already', '730', 'o'], ['omgaga', 'im', 'sooo', 'im', 'gunna', 'cry', 'ive', 'been', 'at', 'this', 'dentist', 'since', '11', 'i', 'was', 'suposed', '2', 'just', 'get', 'a', 'crown', 'put', 'on', '30mins'], ['i', 'think', 'mi', 'bf', 'is', 'cheating', 'on', 'me', 't_t'], ['or', 'i', 'just', 'worry', 'too', 'much'], ['juuuuuuuuuuuuuuuuussssst', 'chillin'], ['sunny', 'again', 'work', 'tomorrow', 'tv', 'tonight'], ['handed', 'in', 'my', 'uniform', 'today', 'i', 'miss', 'you', 'already'], ['hmmmm', 'i', 'wonder', 'how', 'she', 'my', 'number']]


In [13]:
W2V_SIZE = 300
W2V_WINDOW = 7
W2V_EPOCH = 32
W2V_MIN_COUNT = 10
w2v_model = gensim.models.word2vec.Word2Vec(size=W2V_SIZE, 
                                            window=W2V_WINDOW, 
                                            min_count=W2V_MIN_COUNT, 
                                            workers=12)


w2v_model.build_vocab(documents)

In [14]:
words = w2v_model.wv.vocab.keys()
vocab_size = len(words)
print("Vocab size", vocab_size)

Vocab size 45138


In [15]:
import tensorflow as tf
tf.test.gpu_device_name()

'/device:GPU:0'

In [16]:
from tensorflow.python.client import device_lib
device_lib.list_local_devices()

[name: "/device:CPU:0"
 device_type: "CPU"
 memory_limit: 268435456
 locality {
 }
 incarnation: 1891920877169563921
 xla_global_id: -1, name: "/device:GPU:0"
 device_type: "GPU"
 memory_limit: 14415560704
 locality {
   bus_id: 1
   links {
   }
 }
 incarnation: 3432606693960804968
 physical_device_desc: "device: 0, name: Tesla T4, pci bus id: 0000:00:04.0, compute capability: 7.5"
 xla_global_id: 416903419]

In [17]:
%%time
w2v_model.train(documents, total_examples=len(documents), epochs=W2V_EPOCH)

CPU times: user 34min 3s, sys: 9.47 s, total: 34min 13s
Wall time: 17min 46s


(488054826, 663205824)

In [18]:
tokenizer = Tokenizer() 
tokenizer.fit_on_texts(data['SentimentText'].values) 
X = tokenizer.texts_to_sequences(data['SentimentText'].values) 
X = pad_sequences(X)

In [19]:
vocab_size = len(tokenizer.word_index)+1
print('Vocab Size is ',vocab_size)

Vocab Size is  820831


In [20]:
tokenizer.word_index['sam']

1582

In [21]:
print(X)


[[     0      0      0 ...      5 245990    256]
 [     0      0      0 ...     69    775   1257]
 [     0      0      0 ...    192   3506    408]
 ...
 [     0      0      0 ...      1     17    656]
 [     0      0      0 ...    171     29    105]
 [     0      0      0 ... 148510      1    110]]


In [22]:
w2v_model.wv['sad'].shape


(300,)

In [23]:

embedding_matrix = np.zeros(( vocab_size, W2V_SIZE))
for word , i in tokenizer.word_index.items():
    if word in w2v_model.wv:
        embedding_matrix[i] = w2v_model.wv[word]
print(embedding_matrix.shape)

(820831, 300)


In [24]:
from keras.models import Sequential
from keras.layers import SpatialDropout1D
from keras.callbacks import ReduceLROnPlateau, EarlyStopping

lstm_out = 53
model = Sequential()
model.add(Embedding(vocab_size, W2V_SIZE, weights = [ embedding_matrix], input_length = X.shape[1], trainable = False))
model.add(SpatialDropout1D(0.2))
model.add(LSTM(2*lstm_out, dropout=0.2, recurrent_dropout=0.2, return_sequences=True))
model.add(LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(2,activation='sigmoid'))
model.compile(loss = 'binary_crossentropy', optimizer='adam',metrics = ['accuracy'])
callbacks = [ ReduceLROnPlateau(monitor='val_loss', patience=5, cooldown=0),
              EarlyStopping(monitor='val_acc', min_delta=1e-4, patience=5)]
print(model.summary())



Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 41, 300)           246249300 
                                                                 
 spatial_dropout1d (SpatialD  (None, 41, 300)          0         
 ropout1D)                                                       
                                                                 
 lstm (LSTM)                 (None, 41, 106)           172568    
                                                                 
 lstm_1 (LSTM)               (None, 53)                33920     
                                                                 
 dense (Dense)               (None, 2)                 108       
                                                                 
Total params: 246,455,896
Trainable params: 206,596
Non-trainable params: 246,249,300
____________________________________

In [25]:
data.head()
from sklearn.model_selection import train_test_split

In [26]:
Y = pd.get_dummies(data['Sentiment']).values
X_new, X_del, Y_new, Y_del = train_test_split(X,Y, test_size = 0.5, random_state = 42)
X_train,X_test,Y_train, Y_test =train_test_split(X_new,Y_new, test_size = 0.3, random_state =14 )
print(X_train.shape,Y_train.shape)
print(X_test.shape,Y_test.shape)

(552514, 41) (552514, 2)
(236792, 41) (236792, 2)


In [27]:
batch_size = 256
epochs = 2
model.fit(X_train, Y_train, batch_size = batch_size, epochs = epochs, verbose=1)

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7f83f062c4f0>

In [28]:
score,acc = model.evaluate(X_test, Y_test, verbose = 1, batch_size = batch_size)
print("score: %.2f" % (score))
print("acc: %.2f" % (acc))

score: 0.39
acc: 0.82


In [43]:
model2 = Sequential((Dense(2,activation='sigmoid')))
#callbacks = [ ReduceLROnPlateau(monitor='val_loss', patience=5, cooldown=0),
 #             EarlyStopping(monitor='val_acc', min_delta=1e-4, patience=5)]
model2.compile(loss = 'binary_crossentropy', optimizer='adam',metrics = ['accuracy'])
# print(model2.summary())

In [44]:
print(model2.summary())

ValueError: ignored

In [46]:
from sklearn.model_selection import train_test_split
X_new1, X_del1, Y_new1, Y_del1 = train_test_split(X,Y, test_size = 0.2, random_state = 51)
X_train1,X_test1,Y_train1, Y_test1 =train_test_split(X_new1,Y_new1, test_size = 0.3, random_state =11 )

In [47]:
batch_size = 256
epochs = 6
model2.fit(X_train1, Y_train1, batch_size = batch_size, epochs = epochs, verbose=1)

Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


<keras.callbacks.History at 0x7f84d75d5070>

In [48]:
score,acc = model2.evaluate(X_test, Y_test, verbose = 1, batch_size = batch_size)
print("score: %.2f" % (score))
print("acc: %.2f" % (acc))

score: 34.69
acc: 0.52
