In [1]:
#Import all the libraries needed
import pandas as pd    # to load dataset
import numpy as np     # for mathematic equation
from nltk.corpus import stopwords   # to get collection of stopwords
from sklearn.model_selection import train_test_split       # for splitting dataset
from tensorflow.keras.preprocessing.text import Tokenizer  # to encode text to int
from tensorflow.keras.preprocessing.sequence import pad_sequences   # to do padding or truncating
from tensorflow.keras.models import Sequential     # the model
from tensorflow.keras.layers import Embedding, LSTM, Dense # layers of the architecture
from tensorflow.keras.layers import Bidirectional
from tensorflow.keras.callbacks import ModelCheckpoint   # save model
from tensorflow.keras.models import load_model   # load saved model
import re

In [2]:
data_train = pd.read_csv("/content/phm_train.csv")

In [3]:
data_test = pd.read_csv("/content/phm_test.csv")

In [4]:
print(data_train)

          tweet_id  label                                              tweet
0     6.430000e+17      0  user_mention all i can tell you is i have had ...
1     6.440000e+17      0  my doctor told me stop he gave me sum pop i mi...
2     8.150000e+17      1  i take tylenol and i wake up in the middle of ...
3     6.820000e+17      0  i got xans in an advil bottle i dont take them...
4     6.440000e+17      1  mom says i need to stop eating so much bc ive ...
...            ...    ...                                                ...
9986  6.480000e+17      1                          that vicodin messed me up
9987  5.710000e+17      0                  user_mention get some tylenol lol
9988  6.470000e+17      0                          like a walking tamiflu ad
9989  6.990000e+17      0                         klay and steph on steroids
9990  8.230000e+17      0                    horrible pops another xanax url

[9991 rows x 3 columns]


In [5]:
print(data_test)

          tweet_id  label                                              tweet
0     6.411550e+17      0  when you try to run away from the iv needle so...
1     6.425520e+17      1  i just knew i took an ambien for sleep too ear...
2     6.410410e+17      1  i mean i get that my celexa is the reason behi...
3     7.476620e+17      0  if you call me dumb or her dumb one more time ...
4     6.406830e+17      0  i do not want to go to the grocery store but i...
...            ...    ...                                                ...
3326  6.392340e+17      0                         fina take this xanax knock
3327  6.398700e+17      0                user_mention yr on citalopram right
3328  6.433340e+17      0                   user_mention yeah im going norco
3329  5.588580e+17      0                   user_mention tylenol w codin lol
3330  7.131560e+17      0                thats determination on steroids url

[3331 rows x 3 columns]


In [6]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
english_stops = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [7]:
def load_train_dataset():
    x_train = data_train['tweet']
    y_train = data_train['label']

    x_train = x_train.replace({'<.*?>': ''}, regex = True)          # remove html tag
    x_train = x_train.replace({'[^A-Za-z]': ' '}, regex = True)     # remove non alphabet
    x_train = x_train.apply(lambda tweet: [w for w in tweet.split() if w not in english_stops])  # remove stop words
    x_train = x_train.apply(lambda tweet: [w.lower() for w in tweet])   # lower case

    return x_train, y_train

x_train, y_train = load_train_dataset()

print('tweet')
print(x_train, '\n')
print('label')
print(y_train)

tweet
0       [user, mention, tell, relapses, cure, hear, do...
1       [doctor, told, stop, gave, sum, pop, mix, w, a...
2       [take, tylenol, wake, middle, night, put, ice,...
3       [got, xans, advil, bottle, dont, take, shits, ...
4       [mom, says, need, stop, eating, much, bc, ive,...
                              ...                        
9986                                    [vicodin, messed]
9987                   [user, mention, get, tylenol, lol]
9988                         [like, walking, tamiflu, ad]
9989                              [klay, steph, steroids]
9990                [horrible, pops, another, xanax, url]
Name: tweet, Length: 9991, dtype: object 

label
0       0
1       0
2       1
3       0
4       1
       ..
9986    1
9987    0
9988    0
9989    0
9990    0
Name: label, Length: 9991, dtype: int64


In [8]:
def load_test_dataset():
    x_test = data_test['tweet']
    y_test = data_test['label']

    x_test = x_test.replace({'<.*?>': ''}, regex = True)          # remove html tag
    x_test = x_test.replace({'[^A-Za-z]': ' '}, regex = True)     # remove non alphabet
    x_test = x_test.apply(lambda tweet: [w for w in tweet.split() if w not in english_stops])  # remove stop words
    x_test = x_test.apply(lambda tweet: [w.lower() for w in tweet])   # lower case

    return x_test, y_test

x_test, y_test = load_test_dataset()

print('tweet')
print(x_test, '\n')
print('label')
print(y_test)

tweet
0       [try, run, away, iv, needle, doctor, drug, w, ...
1       [knew, took, ambien, sleep, early, im, ready, ...
2       [mean, get, celexa, reason, behind, lot, weigh...
3       [call, dumb, dumb, one, time, dont, care, many...
4       [want, go, grocery, store, cant, pay, anyone, ...
                              ...                        
3326                           [fina, take, xanax, knock]
3327               [user, mention, yr, citalopram, right]
3328              [user, mention, yeah, im, going, norco]
3329              [user, mention, tylenol, w, codin, lol]
3330                [thats, determination, steroids, url]
Name: tweet, Length: 3331, dtype: object 

label
0       0
1       1
2       1
3       0
4       0
       ..
3326    0
3327    0
3328    0
3329    0
3330    0
Name: label, Length: 3331, dtype: int64


In [9]:
print('Train Set')
print(x_train, '\n')
print(x_test, '\n')
print('Test Set')
print(y_train, '\n')
print(y_test)

Train Set
0       [user, mention, tell, relapses, cure, hear, do...
1       [doctor, told, stop, gave, sum, pop, mix, w, a...
2       [take, tylenol, wake, middle, night, put, ice,...
3       [got, xans, advil, bottle, dont, take, shits, ...
4       [mom, says, need, stop, eating, much, bc, ive,...
                              ...                        
9986                                    [vicodin, messed]
9987                   [user, mention, get, tylenol, lol]
9988                         [like, walking, tamiflu, ad]
9989                              [klay, steph, steroids]
9990                [horrible, pops, another, xanax, url]
Name: tweet, Length: 9991, dtype: object 

0       [try, run, away, iv, needle, doctor, drug, w, ...
1       [knew, took, ambien, sleep, early, im, ready, ...
2       [mean, get, celexa, reason, behind, lot, weigh...
3       [call, dumb, dumb, one, time, dont, care, many...
4       [want, go, grocery, store, cant, pay, anyone, ...
                   

In [10]:
# ENCODE REVIEW
token = Tokenizer(lower=False)    # no need lower, because already lowered the data
token.fit_on_texts(x_train)
x_train = token.texts_to_sequences(x_train)
x_test = token.texts_to_sequences(x_test)

In [11]:
# Calculates the average tweet length (in number of words) and rounds it up to use as the maximum sequence length for padding.
def get_max_length():
    tweet_length = []
    for tweet in x_train:
        tweet_length.append(len(tweet))

    return int(np.ceil(np.mean(tweet_length)))

In [12]:
print(get_max_length())

10


In [13]:
max_length = get_max_length()

x_train = pad_sequences(x_train, maxlen=max_length, padding='post', truncating='post')
x_test = pad_sequences(x_test, maxlen=max_length, padding='post', truncating='post')

total_words = len(token.word_index) + 1   # add 1 because of 0 padding

print('Encoded X Train\n', x_train, '\n')
print('Encoded X Test\n', x_test, '\n')
print('Maximum tweet length: ', max_length)

Encoded X Train
 [[    2     1   200 ...   944  3624  1952]
 [  115   122   147 ...   193    40   322]
 [    6     3   330 ...   626  1710    29]
 ...
 [    7   529  1739 ...     0     0     0]
 [12658 12659     8 ...     0     0     0]
 [  645  1436   174 ...     0     0     0]] 

Encoded X Test
 [[  98  606  109 ...  193    4  318]
 [ 585   11   56 ...   16  707   55]
 [ 327   12 1209 ...  778    5   88]
 ...
 [   2    1  126 ...    0    0    0]
 [   2    1    3 ...    0    0    0]
 [  59    8    9 ...    0    0    0]] 

Maximum tweet length:  10


In [14]:
# model.build(input_shape=(None, max_length))
# model.summary()

In [15]:
print(total_words)

12660


**LSTM**

In [16]:
# # LSTM Model
# EMBED_DIM = 32
# LSTM_OUT = 64

# LSTM_model = Sequential()
# LSTM_model.add(Embedding(12660, 32, input_length = 10))
# LSTM_model.add(LSTM(64))
# LSTM_model.add(Dense(1, activation='sigmoid'))


# print(LSTM_model.summary())

In [37]:
# LSTM Model
EMBED_DIM = 100
LSTM_OUT = 128

LSTM_model = Sequential()
LSTM_model.add(Embedding(total_words, EMBED_DIM))  # No need for input_length
LSTM_model.add(LSTM(LSTM_OUT, return_sequences=True, dropout=0.3, recurrent_dropout=0.1))
LSTM_model.add(LSTM(80, dropout=0.3))
# LSTM_model.add(Dense(1, activation='sigmoid'))
LSTM_model.add(Dense(1, activation='relu'))

LSTM_model.build(input_shape=(None, 10))
LSTM_model.summary()

In [38]:
LSTM_model.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])

In [39]:
checkpoint = ModelCheckpoint(
    'models/LSTM.h5',
    monitor='accuracy',
    save_best_only=True,
    verbose=1
)

In [40]:
# LSTM Model Training
LSTM_model.fit(x_train, y_train, batch_size = 128, epochs = 5, callbacks=[checkpoint])

Epoch 1/5
[1m78/79[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 90ms/step - accuracy: 0.7158 - loss: 0.8152
Epoch 1: accuracy improved from -inf to 0.73806, saving model to models/LSTM.h5




[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 91ms/step - accuracy: 0.7163 - loss: 0.8103
Epoch 2/5
[1m78/79[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 101ms/step - accuracy: 0.7808 - loss: 0.4555
Epoch 2: accuracy improved from 0.73806 to 0.80312, saving model to models/LSTM.h5




[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 102ms/step - accuracy: 0.7814 - loss: 0.4553
Epoch 3/5
[1m78/79[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 108ms/step - accuracy: 0.8674 - loss: 0.3810
Epoch 3: accuracy improved from 0.80312 to 0.86408, saving model to models/LSTM.h5




[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 108ms/step - accuracy: 0.8673 - loss: 0.3811
Epoch 4/5
[1m78/79[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 108ms/step - accuracy: 0.8862 - loss: 0.3328
Epoch 4: accuracy improved from 0.86408 to 0.88570, saving model to models/LSTM.h5




[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 108ms/step - accuracy: 0.8861 - loss: 0.3330
Epoch 5/5
[1m78/79[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 89ms/step - accuracy: 0.8673 - loss: 0.3808
Epoch 5: accuracy did not improve from 0.88570
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 89ms/step - accuracy: 0.8674 - loss: 0.3800


<keras.src.callbacks.history.History at 0x79f9ee28a8d0>

In [41]:
# LSTM Model Testing
pred = LSTM_model.predict(x=x_test)
y_pred = (pred >= 0.5) * 1

true = 0
for i, y in enumerate(y_test):
    if y == y_pred[i]:
        true += 1

print('Correct Prediction: {}'.format(true))
print('Wrong Prediction: {}'.format(len(y_pred) - true))
print('Accuracy: {}'.format(true/len(y_pred)*100))

[1m105/105[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 13ms/step
Correct Prediction: 2726
Wrong Prediction: 605
Accuracy: 81.83728610027019


**Bi-LSTM**

In [32]:
# Bi-LSTM Model
EMBED_DIM = 100
BILSTM_OUT = 128

BILSTM_model = Sequential()
BILSTM_model.add(Embedding(total_words, EMBED_DIM))  # No need for input_length
BILSTM_model.add(Bidirectional(LSTM(BILSTM_OUT, return_sequences=True, dropout=0.3, recurrent_dropout=0.1)))
BILSTM_model.add(LSTM(80, dropout=0.3))
# BILSTM_model.add(Dense(1, activation='sigmoid'))
BILSTM_model.add(Dense(1, activation='relu'))


BILSTM_model.build(input_shape=(None, 10))
BILSTM_model.summary()

In [33]:
BILSTM_model.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])

In [34]:
checkpoint_bilstm = ModelCheckpoint(
    'models/BiLSTM.h5',   # <-- different file name
    monitor='accuracy',
    save_best_only=True,
    verbose=1
)

In [35]:
# Bi-LSTM Model Training
BILSTM_model.fit(x_train, y_train, batch_size = 128, epochs = 5, callbacks=[checkpoint])

Epoch 1/5
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 170ms/step - accuracy: 0.7224 - loss: 0.8730
Epoch 1: accuracy did not improve from 0.91032
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 170ms/step - accuracy: 0.7228 - loss: 0.8700
Epoch 2/5
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 171ms/step - accuracy: 0.8504 - loss: 0.4146
Epoch 2: accuracy did not improve from 0.91032
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 172ms/step - accuracy: 0.8503 - loss: 0.4149
Epoch 3/5
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 172ms/step - accuracy: 0.7729 - loss: 0.5770
Epoch 3: accuracy did not improve from 0.91032
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 173ms/step - accuracy: 0.7731 - loss: 0.5770
Epoch 4/5
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 169ms/step - accuracy: 0.8696 - loss: 0.3570
Epoch 4: accuracy did not improve from 0.91032
[1m79

<keras.src.callbacks.history.History at 0x79f9f57f3690>

In [36]:
# Bi-LSTM Model Testing
pred1 = BILSTM_model.predict(x=x_test)
y_pred1 = (pred1 >= 0.5) * 1

true = 0
for i, y in enumerate(y_test):
    if y == y_pred1[i]:
        true += 1

print('Correct Prediction: {}'.format(true))
print('Wrong Prediction: {}'.format(len(y_pred1) - true))
print('Accuracy: {}'.format(true/len(y_pred1)*100))

[1m105/105[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 22ms/step
Correct Prediction: 2734
Wrong Prediction: 597
Accuracy: 82.07745421795258
