## 1. Standard imports

In [6]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
import numpy as np

In [7]:
print("Hello world")

Hello world


In [None]:
# Below code will run on tensorflow-gpu enabled system.

# gpus = tf.config.experimental.list_physical_devices('GPU')

# if gpus:
#     try:
#         tf.config.experimental.set_memory_growth(gpus[0], True)
#         tf.config.set_visible_devices(gpus[0], 'GPU')
#         print("GPU is set for TensorFlow!")
#     except RuntimeError as e:
#         print(e)

AttributeError: module 'tensorflow' has no attribute 'config'

## 2. Get the data

### (i) Reading imbalance data

In [10]:
imbalanced_data = pd.read_csv("../data/dataset/imbalanced_data.csv",) 

In [11]:
imbalanced_data.head(3)

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty


In [16]:
imbalanced_data.iloc[3]['tweet']

'#model   i love u take with u all the time in urð\x9f\x93±!!! ð\x9f\x98\x99ð\x9f\x98\x8eð\x9f\x91\x84ð\x9f\x91\x85ð\x9f\x92¦ð\x9f\x92¦ð\x9f\x92¦  '

In [19]:
import emoji

# Example text with encoding issue
text = "#model   i love u take with u all the time in urð\x9f\x93±!!! ð\x9f\x98\x99ð\x9f\x98\x8eð\x9f\x91\x84ð\x9f\x91\x85ð\x9f\x92¦ð\x9f\x92¦ð\x9f\x92¦"

# Fix the double encoding issue
decoded_text = text.encode('latin-1').decode('utf-8')  # Convert back to readable text

# Convert emojis to text representation
final_text = emoji.demojize(decoded_text)

print(final_text)

#model   i love u take with u all the time in ur:mobile_phone:!!! :kissing_face_with_smiling_eyes::smiling_face_with_sunglasses::mouth::tongue::sweat_droplets::sweat_droplets::sweat_droplets:


In [5]:
imbalanced_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31962 entries, 0 to 31961
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      31962 non-null  int64 
 1   label   31962 non-null  int64 
 2   tweet   31962 non-null  object
dtypes: int64(2), object(1)
memory usage: 749.2+ KB


In [6]:
# another way of checking null values
imbalanced_data.isnull().sum()

id       0
label    0
tweet    0
dtype: int64

In [7]:
# To find the duplicate rows
imbalanced_data[imbalanced_data.duplicated()]

Unnamed: 0,id,label,tweet


In [8]:
# Check whether data is imbalance or not
imbalanced_data['label'].value_counts()

# Total rows : 31962 

label
0    29720
1     2242
Name: count, dtype: int64

#### Conclusion : 
- Highly imbalanced data
- 0--> No hate
- 1 --> hate

In [9]:
# removing the unwanted column id
imbalanced_data.drop(labels='id', axis=1, inplace=True)

In [10]:
imbalanced_data.head(2)

Unnamed: 0,label,tweet
0,0,@user when a father is dysfunctional and is s...
1,0,@user @user thanks for #lyft credit i can't us...


In [11]:
imbalanced_data['tweet'].iloc[11]

'we are so selfish. #orlando #standwithorlando #pulseshooting #orlandoshooting #biggerproblems #selfish #heabreaking   #values #love #'

### (ii) Reading raw data file

In [12]:
raw_data = pd.read_csv("raw_data.csv", index_col=0)

In [13]:
raw_data.head(3)

Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet
0,3,0,0,3,2,!!! RT @mayasolovely: As a woman you shouldn't...
1,3,0,3,0,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...
2,3,0,3,0,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...


In [14]:
raw_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 24783 entries, 0 to 25296
Data columns (total 6 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   count               24783 non-null  int64 
 1   hate_speech         24783 non-null  int64 
 2   offensive_language  24783 non-null  int64 
 3   neither             24783 non-null  int64 
 4   class               24783 non-null  int64 
 5   tweet               24783 non-null  object
dtypes: int64(5), object(1)
memory usage: 1.3+ MB


In [15]:
# To find the duplicate rows
raw_data[raw_data.duplicated()]

Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet


In [16]:
raw_data.columns

Index(['count', 'hate_speech', 'offensive_language', 'neither', 'class',
       'tweet'],
      dtype='object')

In [17]:
# Removing extra columns
raw_data.drop(labels=['count', 'hate_speech', 'offensive_language', 'neither'], axis=1, inplace=True )

In [18]:
raw_data.head(2)

Unnamed: 0,class,tweet
0,2,!!! RT @mayasolovely: As a woman you shouldn't...
1,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...


In [19]:
# lets find-out the classes 
raw_data['class'].value_counts()

class
1    19190
2     4163
0     1430
Name: count, dtype: int64

- class 0: hate
- class 1: abusive
- class 2: no hate

In [20]:
raw_data['tweet'].iloc[4]

'!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you hear about me might be true or it might be faker than the bitch who told it to ya &#57361;'

## 3. Data preprocessing

#### (a) raw_data class merging

- class 2 --> class 0 <br>
- class 0 --> class 1

In [21]:
# Let's copy the valus of the class 1 into class 0.
raw_data[raw_data['class']==0]['class']=1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  raw_data[raw_data['class']==0]['class']=1


In [22]:
raw_data['class'].value_counts()

class
1    19190
2     4163
0     1430
Name: count, dtype: int64

In [23]:
# replace the value of 0 to 1
raw_data["class"].replace({0:1},inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  raw_data["class"].replace({0:1},inplace=True)


In [24]:
raw_data['class'].value_counts()

class
1    20620
2     4163
Name: count, dtype: int64

In [25]:
# Let's replace the value of 2 to 0.
raw_data["class"].replace({2:0}, inplace = True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  raw_data["class"].replace({2:0}, inplace = True)


In [26]:
raw_data['class'].value_counts()

class
1    20620
0     4163
Name: count, dtype: int64

In [27]:
# Let's change the name of the 'class' to label
raw_data.rename(columns={'class':'label'},inplace =True)

In [28]:
raw_data.head(3)

Unnamed: 0,label,tweet
0,0,!!! RT @mayasolovely: As a woman you shouldn't...
1,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...
2,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...


In [29]:
raw_data['label'].value_counts()

label
1    20620
0     4163
Name: count, dtype: int64

#### (b) Merging two DFs

In [30]:
# Let's concatinate both the data into a single data frame.
frame = [imbalanced_data, raw_data]
df = pd.concat(frame)

In [31]:
df.index

Index([    0,     1,     2,     3,     4,     5,     6,     7,     8,     9,
       ...
       25286, 25287, 25288, 25289, 25290, 25291, 25292, 25294, 25295, 25296],
      dtype='int64', length=56745)

In [32]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 56745 entries, 0 to 25296
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   label   56745 non-null  int64 
 1   tweet   56745 non-null  object
dtypes: int64(1), object(1)
memory usage: 1.3+ MB


In [33]:
df['label'].value_counts()

label
0    33883
1    22862
Name: count, dtype: int64

In [34]:
# class imbalance ratio
22862/33883

0.6747336422394711

#### (c) Text Preprocessing

In [35]:
import re
import nltk
import string
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/rvalinux/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [36]:
# Let's apply stemming and stopwords on the data
stemmer = nltk.SnowballStemmer("english")

# The set() function converts the stopwords list into a set for faster lookup.
# Sets are more efficient for checking membership (word in stopword) than lists.
stopword = set(stopwords.words('english'))

In [37]:
# Let's apply regex and do cleaning.
def data_cleaning(words):

    # lowercasing operation
    words = str(words).lower()
    
    words = re.sub('\[.*?\]', '', words)
    # Removing http(s) links
    words = re.sub('https?://\S+|www\.\S+', '', words)
    words = re.sub('<.*?>+', '', words)
    
    # remving all the standard string punctuation characters
    words = re.sub(f"[{re.escape(string.punctuation)}]", '', words)
    
    words = re.sub('\n', '', words)
    words = re.sub('\w*\d\w*', '', words)

    #print(words,"\n\n\n") # for debugging purpose
    
    words = [word for word in words.split(' ') if words not in stopword]
    words=" ".join(words)

    #print(words,"\n\n\n")  : for debugging purpose
    #print(words.split(' '))
    
    # Stemming operation (bring every word to its root format)
    #words = [stemmer.stem(word) for word in words.split(' ')]
    # print(words,"\n\n\n") :  for debugging purpose
    #words=" ".join(words)

    # words.strip() : remove the leading and trailing whitespace
    # To replace multiple spaces in the middle of a string with a single space
    #cleaned_text = re.sub(r'\s+', ' ', words.strip())
    return words 

In [38]:
# Returns row number 10
df['tweet'][9].iloc[0]

" @user @user welcome here !  i'm   it's so #gr8 ! "

In [39]:
data_cleaning(words=df['tweet'][9])
# " @user @user welcome here !  i'm   it's so #gr8 ! "

'     user user welcome here   im   its so      rhythmixx hobbies include fighting marianame tweet dtype object'

#### Apply data cleaning to the DF

In [40]:
# Apply the data_cleaning step
df['clean_tweet']=df['tweet'].apply(data_cleaning)

In [41]:
df['clean_tweet'].head(4)

0     user when a father is dysfunctional and is so...
1    user user thanks for lyft credit i cant use ca...
2                                  bihday your majesty
3    model   i love u take with u all the time in u...
Name: clean_tweet, dtype: object

## 4. Train-test split

- using sklearn library for this

In [1]:
from sklearn.model_selection import train_test_split

In [43]:
x = df['clean_tweet']
y = df['label']

In [44]:
# Let's split the data into train and test
x_train,x_test,y_train,y_test = train_test_split(x, y, test_size=.20, random_state = 42)

print(len(x_train),len(y_train))
print(len(x_test),len(y_test))

45396 45396
11349 11349


## 4. Text to vector representation

In [3]:
import keras

In [2]:
from tensorflow.keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences

In [47]:
int(df['clean_tweet'].apply(len).max())

268

In [48]:
max_words = 50000 # vocab size
max_len = 300     

tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(x_train)

sequences_train = tokenizer.texts_to_sequences(x_train)
sequences_matrix_train = pad_sequences(sequences_train, maxlen=max_len)

In [49]:
sequences_matrix_train  # left padded by default

array([[    0,     0,     0, ...,    41,    17,   157],
       [    0,     0,     0, ...,     4,  4430,   166],
       [    0,     0,     0, ...,    17, 19325, 19326],
       ...,
       [    0,     0,     0, ...,   174,   186,     2],
       [    0,     0,     0, ...,   977, 15809,   784],
       [    0,     0,     0, ...,   424,   403,     8]], dtype=int32)

In [50]:
len(sequences_matrix_train)

45396

## 5. Model Building

In [4]:
from keras.models import Sequential
from keras.layers import LSTM, Activation, Dense, Dropout, Input, Embedding, SpatialDropout1D
from keras.optimizers import RMSprop, Adam

### (i) Model1

In [52]:
# Creating model architecture.
model1 = Sequential()
#model1.add(Embedding(max_words,100,input_length=max_len))
model1.add( Embedding(max_words, 100) )
model1.add( SpatialDropout1D(0.2) )
model1.add( LSTM(100,dropout=0.2,) )
model1.add( Dense(1,activation='sigmoid') )
model1.summary()

I0000 00:00:1742301538.539067    7488 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 5563 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 4060 Laptop GPU, pci bus id: 0000:01:00.0, compute capability: 8.9


In [53]:
# Model compilation
model1.compile(loss='binary_crossentropy',optimizer=RMSprop(),metrics=['accuracy'])

In [54]:
# Model fitting
history1 = model1.fit(sequences_matrix_train,
                      y_train,
                      batch_size=128,
                      epochs = 10,
                      validation_split=0.2)

Epoch 1/10


I0000 00:00:1742301541.229531    7608 cuda_dnn.cc:529] Loaded cuDNN version 90300


[1m284/284[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 94ms/step - accuracy: 0.7730 - loss: 0.4493 - val_accuracy: 0.9312 - val_loss: 0.1928
Epoch 2/10
[1m284/284[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 55ms/step - accuracy: 0.9426 - loss: 0.1637 - val_accuracy: 0.9422 - val_loss: 0.1641
Epoch 3/10
[1m284/284[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 17ms/step - accuracy: 0.9551 - loss: 0.1304 - val_accuracy: 0.9452 - val_loss: 0.1775
Epoch 4/10
[1m284/284[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 25ms/step - accuracy: 0.9616 - loss: 0.1155 - val_accuracy: 0.9432 - val_loss: 0.1621
Epoch 5/10
[1m284/284[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 26ms/step - accuracy: 0.9680 - loss: 0.1006 - val_accuracy: 0.9372 - val_loss: 0.1688
Epoch 6/10
[1m284/284[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 24ms/step - accuracy: 0.9743 - loss: 0.0827 - val_accuracy: 0.9444 - val_loss: 0.1748
Epoch 7/10
[1m284/284[0m [32

In [55]:
test_sequences = tokenizer.texts_to_sequences(x_test)
test_sequences_matrix = pad_sequences(test_sequences,
                                      maxlen=max_len)

In [56]:
model1.evaluate(test_sequences_matrix, y_test)

[1m355/355[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 11ms/step - accuracy: 0.9366 - loss: 0.1969


[0.20072534680366516, 0.9344435930252075]

In [57]:
lstm_prediction1 = model1.predict(test_sequences_matrix)

[1m355/355[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 8ms/step


In [58]:
pred1 = []
for prediction in lstm_prediction1:
    if prediction[0] < 0.5:
        pred1.append(0)
    else:
        pred1.append(1)

In [59]:
from sklearn.metrics import confusion_matrix, classification_report

In [60]:
print(confusion_matrix(y_test,pred1))

[[6351  385]
 [ 369 4244]]


In [61]:
print(classification_report(y_test,pred1))

              precision    recall  f1-score   support

           0       0.95      0.94      0.94      6736
           1       0.92      0.92      0.92      4613

    accuracy                           0.93     11349
   macro avg       0.93      0.93      0.93     11349
weighted avg       0.93      0.93      0.93     11349



### (ii) Model2

In [62]:
# Creating model architecture.
model2 = Sequential()
model2.add( Embedding(max_words, 100) )
model2.add( LSTM(100,dropout=0.2,) )
model2.add( Dense(1,activation='sigmoid') )
model2.summary()

In [63]:
# Model compilation
model2.compile(loss='binary_crossentropy',optimizer=Adam(),metrics=['accuracy'])

In [64]:
# Model fitting
history2 = model2.fit(sequences_matrix_train,
                      y_train,
                      batch_size=128,
                      epochs = 10,
                      validation_split=0.2)

# Model2 is trying to overfit the training data...not so good

Epoch 1/10
[1m284/284[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 18ms/step - accuracy: 0.8092 - loss: 0.3893 - val_accuracy: 0.9428 - val_loss: 0.1565
Epoch 2/10
[1m284/284[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 29ms/step - accuracy: 0.9647 - loss: 0.0992 - val_accuracy: 0.9445 - val_loss: 0.1588
Epoch 3/10
[1m284/284[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 29ms/step - accuracy: 0.9839 - loss: 0.0514 - val_accuracy: 0.9420 - val_loss: 0.1789
Epoch 4/10
[1m284/284[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 29ms/step - accuracy: 0.9907 - loss: 0.0315 - val_accuracy: 0.9337 - val_loss: 0.2237
Epoch 5/10
[1m284/284[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 21ms/step - accuracy: 0.9942 - loss: 0.0209 - val_accuracy: 0.9270 - val_loss: 0.2401
Epoch 6/10
[1m284/284[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 26ms/step - accuracy: 0.9944 - loss: 0.0183 - val_accuracy: 0.9308 - val_loss: 0.2679
Epoch 7/10
[1m284/284

In [65]:
model2.evaluate(test_sequences_matrix, y_test)

[1m355/355[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 13ms/step - accuracy: 0.9286 - loss: 0.3415


[0.34529274702072144, 0.9238699674606323]

In [66]:
lstm_prediction2 = model2.predict(test_sequences_matrix)

[1m355/355[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 11ms/step


In [67]:
pred2 = []
for prediction in lstm_prediction2:
    if prediction[0] < 0.5:
        pred2.append(0)
    else:
        pred2.append(1)

In [68]:
print(confusion_matrix(y_test,pred2))

[[6286  450]
 [ 398 4215]]


In [69]:
print(classification_report(y_test,pred2))

              precision    recall  f1-score   support

           0       0.94      0.93      0.94      6736
           1       0.90      0.91      0.91      4613

    accuracy                           0.93     11349
   macro avg       0.92      0.92      0.92     11349
weighted avg       0.93      0.93      0.93     11349



### (iii) Model3 

In [70]:
# Creating model architecture.
model3 = Sequential()
model3.add( Embedding(max_words, 100) )
model3.add( SpatialDropout1D(0.2) )
model3.add( LSTM(10,dropout=0.2,) )
model3.add( Dense(1,activation='sigmoid') )
model3.summary()

In [71]:
# Model compilation
model3.compile(loss='binary_crossentropy',optimizer=Adam(),metrics=['accuracy'])

In [72]:
# Model fitting
history3 = model3.fit(sequences_matrix_train,
                      y_train,
                      batch_size=64,
                      epochs = 10,
                      validation_split=0.2)

Epoch 1/10
[1m568/568[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 22ms/step - accuracy: 0.8103 - loss: 0.4111 - val_accuracy: 0.9434 - val_loss: 0.1599
Epoch 2/10
[1m568/568[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 26ms/step - accuracy: 0.9632 - loss: 0.1098 - val_accuracy: 0.9437 - val_loss: 0.1538
Epoch 3/10
[1m568/568[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 22ms/step - accuracy: 0.9821 - loss: 0.0592 - val_accuracy: 0.9384 - val_loss: 0.1765
Epoch 4/10
[1m568/568[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 25ms/step - accuracy: 0.9886 - loss: 0.0369 - val_accuracy: 0.9396 - val_loss: 0.2068
Epoch 5/10
[1m568/568[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 26ms/step - accuracy: 0.9934 - loss: 0.0226 - val_accuracy: 0.9394 - val_loss: 0.2277
Epoch 6/10
[1m568/568[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 22ms/step - accuracy: 0.9939 - loss: 0.0183 - val_accuracy: 0.9350 - val_loss: 0.2400
Epoch 7/10
[1m5

In [73]:
model3.evaluate(test_sequences_matrix, y_test)

[1m355/355[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 10ms/step - accuracy: 0.9278 - loss: 0.3486


[0.3501318097114563, 0.9267776608467102]

In [74]:
lstm_prediction3 = model3.predict(test_sequences_matrix)

[1m355/355[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 8ms/step


In [75]:
pred3 = []
for prediction in lstm_prediction3:
    if prediction[0] < 0.5:
        pred3.append(0)
    else:
        pred3.append(1)

In [76]:
print(classification_report(y_test,pred3))

              precision    recall  f1-score   support

           0       0.93      0.94      0.94      6736
           1       0.91      0.90      0.91      4613

    accuracy                           0.93     11349
   macro avg       0.92      0.92      0.92     11349
weighted avg       0.93      0.93      0.93     11349



### (iv) Model4

In [78]:
from tensorflow.keras.layers import LSTM, Dropout

In [79]:
# Creating model architecture.
model4 = Sequential()
model4.add( Embedding(max_words, 100) )
model4.add( SpatialDropout1D(0.2) )
model4.add( LSTM(10,) )
model4.add(Dropout(0.2) )
model4.add( Dense(1,activation='sigmoid') )
model4.summary()

In [80]:
# Model compilation
model4.compile(loss='binary_crossentropy',optimizer=Adam(),metrics=['accuracy'])

In [81]:
# Model fitting
history4 = model4.fit(sequences_matrix_train,
                      y_train,
                      batch_size=64,
                      epochs = 10,
                      validation_split=0.2)

Epoch 1/10
[1m568/568[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 26ms/step - accuracy: 0.8320 - loss: 0.3964 - val_accuracy: 0.9443 - val_loss: 0.1585
Epoch 2/10
[1m568/568[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 19ms/step - accuracy: 0.9645 - loss: 0.1111 - val_accuracy: 0.9355 - val_loss: 0.1684
Epoch 3/10
[1m568/568[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 21ms/step - accuracy: 0.9840 - loss: 0.0550 - val_accuracy: 0.9401 - val_loss: 0.1770
Epoch 4/10
[1m568/568[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 21ms/step - accuracy: 0.9903 - loss: 0.0333 - val_accuracy: 0.9385 - val_loss: 0.2113
Epoch 5/10
[1m568/568[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 17ms/step - accuracy: 0.9941 - loss: 0.0227 - val_accuracy: 0.9298 - val_loss: 0.2489
Epoch 6/10
[1m568/568[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 21ms/step - accuracy: 0.9949 - loss: 0.0193 - val_accuracy: 0.9320 - val_loss: 0.2654
Epoch 7/10
[1m5

In [82]:
model4.evaluate(test_sequences_matrix, y_test)

[1m355/355[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.9281 - loss: 0.3946


[0.39810535311698914, 0.9256322383880615]

In [83]:
lstm_prediction4 = model4.predict(test_sequences_matrix)

[1m355/355[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 6ms/step


In [85]:
pred4 = []
for prediction in lstm_prediction4:
    if prediction[0] < 0.5:
        pred4.append(0)
    else:
        pred4.append(1)

In [86]:
print(classification_report(y_test,pred3))

              precision    recall  f1-score   support

           0       0.93      0.94      0.94      6736
           1       0.91      0.90      0.91      4613

    accuracy                           0.93     11349
   macro avg       0.92      0.92      0.92     11349
weighted avg       0.93      0.93      0.93     11349



### (v) Model5

In [98]:
from keras.layers import LSTM, Activation, Dense, Dropout, Input, Embedding, SpatialDropout1D, GRU

In [88]:
# Creating model architecture : Slows down the training
# model5 = Sequential()
# model5.add( Embedding(max_words,100) )
# model5.add(SpatialDropout1D(0.2))
# model5.add( LSTM(25,dropout=0.2,recurrent_dropout=0.2) )
# model5.add( Dense(1,activation='sigmoid') )
# model5.summary()

In [102]:
# Creating model architecture.
model5 = Sequential()
model5.add( Embedding(max_words, 100) )
model5.add( SpatialDropout1D(0.25) )
model5.add( GRU(50, dropout=.25, return_sequences=True) )
model5.add( GRU(25, dropout=.25) )
model5.add(Dropout(0.2) )
model5.add( Dense(1,activation='sigmoid') )
model5.summary()

In [103]:
# Model compilation
model5.compile(loss='binary_crossentropy',optimizer=Adam(),metrics=['accuracy'])

In [104]:
# Model fitting
history5 = model5.fit(sequences_matrix_train,
                      y_train,
                      batch_size=128,
                      epochs = 20,
                      validation_split=0.2)

Epoch 1/20
[1m284/284[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 39ms/step - accuracy: 0.7979 - loss: 0.3936 - val_accuracy: 0.9422 - val_loss: 0.1564
Epoch 2/20
[1m284/284[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 39ms/step - accuracy: 0.9604 - loss: 0.1142 - val_accuracy: 0.9433 - val_loss: 0.1545
Epoch 3/20
[1m284/284[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 38ms/step - accuracy: 0.9787 - loss: 0.0683 - val_accuracy: 0.9427 - val_loss: 0.1842
Epoch 4/20
[1m284/284[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 32ms/step - accuracy: 0.9875 - loss: 0.0424 - val_accuracy: 0.9399 - val_loss: 0.2107
Epoch 5/20
[1m284/284[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 37ms/step - accuracy: 0.9911 - loss: 0.0318 - val_accuracy: 0.9339 - val_loss: 0.2415
Epoch 6/20
[1m284/284[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 37ms/step - accuracy: 0.9939 - loss: 0.0232 - val_accuracy: 0.9336 - val_loss: 0.2474
Epoch 7/20
[1m28