In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
import tensorflow
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

In [5]:
df = pd.read_csv("artifacts/final_data.csv")
df.head(3)

Unnamed: 0,Content,Label,num_words
0,retweet to the rejects who constantly call my ...,1,24
1,i purpose that whatever attack everyone who ca...,1,16
2,are you fucking kidding me you deserve to fuck...,1,10


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 120000 entries, 0 to 119999
Data columns (total 3 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   Content    120000 non-null  object
 1   Label      120000 non-null  int64 
 2   num_words  120000 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 2.7+ MB


In [10]:
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

In [None]:
from nltk.stem import WordNetLemmatizer

def lemmatization(text):
    lemmtizer = WordNetLemmatizer()
    text_words = text.split()
    text = [lemmtizer.lemmatize(word) for word in text_words]

    return " ".join(text)

df['Content'] = df['Content'].apply(lemmatization)

In [3]:
max_vocab = 7000
max_seq_len = 100

In [4]:
tokenizer = Tokenizer(num_words=max_vocab, oov_token="<OOV>")
tokenizer.fit_on_texts(df['Content'])

In [5]:
sequences = tokenizer.texts_to_sequences(df['Content'])

In [6]:
padded_seq = pad_sequences(sequences, maxlen=max_seq_len, padding='post')

In [7]:
padded_seq

array([[   3,  256,   18, ...,  164,   40,   16],
       [ 318,  977,    0, ...,    0,    0,    0],
       [   1,  236,   45, ...,    0,    0,    0],
       ...,
       [  43,  108,   41, ...,    0,    0,    0],
       [ 181,   23, 1089, ...,   22,  145,    1],
       [  49,   12, 1516, ...,    0,    0,    0]])

In [8]:
from sklearn.model_selection import train_test_split

X = padded_seq
y = df['Label'].values

X1, X_test, y1, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X1, y1, test_size=0.1/0.9, random_state=42)

In [9]:
X_train.shape, X_val.shape

((96000, 100), (12000, 100))

In [10]:
vocab_size = len(tokenizer.word_index)

In [11]:
vocab_size

66595

#### Training LSTM model - v1

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, BatchNormalization

vocab_size = len(tokenizer.word_index)

model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=128, input_length=max_seq_len, mask_zero=True),
    LSTM(64, return_sequences=True),
    BatchNormalization(),
    LSTM(32,return_sequences=False),
    BatchNormalization(),
    Dense(16, activation='relu'),
    Dropout(0.2),
    Dense(1, activation='sigmoid')
])

In [None]:
model.fit(
    X_train, y_train,
    epochs=6,
    batch_size=32,
    validation_data=(X_val, y_val)
)

#### Training LSTM model - v2

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, LayerNormalization, BatchNormalization
from keras.callbacks import ModelCheckpoint

vocab_size = len(tokenizer.word_index)

model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=64, input_length=max_seq_len, mask_zero=True),
    LSTM(16, return_sequences=True),
    LayerNormalization(),
    LSTM(8, dropout=0.2, recurrent_dropout=0.2 ,return_sequences=False),
    LayerNormalization(),
    Dense(8, activation='relu'),
    BatchNormalization(),
    Dropout(0.2),
    Dense(1, activation='sigmoid')
])

In [13]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 100, 64)           4262080   
                                                                 
 lstm (LSTM)                 (None, 100, 16)           5184      
                                                                 
 layer_normalization (Layer  (None, 100, 16)           32        
 Normalization)                                                  
                                                                 
 lstm_1 (LSTM)               (None, 8)                 800       
                                                                 
 layer_normalization_1 (Lay  (None, 8)                 16        
 erNormalization)                                                
                                                                 
 dense (Dense)               (None, 8)                 7

In [14]:
checkpoint_callback = ModelCheckpoint(
    filepath='models/lstm-v2.h5',
    save_best_only=True,
    monitor='val_accuracy',
    mode='max',
    verbose=1
)

In [15]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
history = model.fit(
    X_train, y_train,
    epochs=5,
    batch_size=64,
    validation_data=(X_val, y_val),
    callbacks=[checkpoint_callback]
)

#### Training Bi-LSTM model

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import (Embedding, Bidirectional, LSTM, Dense,
                                     Dropout, BatchNormalization, GlobalMaxPooling1D)
from keras.callbacks import ModelCheckpoint

vocab_size = len(tokenizer.word_index)

model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=64, input_length=max_seq_len, mask_zero=True),
    Bidirectional(LSTM(16, dropout=0.2, recurrent_dropout=0.2, return_sequences=True)),
    GlobalMaxPooling1D(),
    Dense(16, activation='relu'),
    BatchNormalization(),
    Dropout(0.2),
    Dense(1, activation='sigmoid')
])

In [26]:
model.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (None, 100, 64)           4262080   
                                                                 
 bidirectional_2 (Bidirecti  (None, 100, 32)           10368     
 onal)                                                           
                                                                 
 global_max_pooling1d_2 (Gl  (None, 32)                0         
 obalMaxPooling1D)                                               
                                                                 
 dense_6 (Dense)             (None, 16)                528       
                                                                 
 batch_normalization_3 (Bat  (None, 16)                64        
 chNormalization)                                                
                                                      

In [27]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
checkpoint_callback1 = ModelCheckpoint(
    filepath='models/bilstm.h5',
    save_best_only=True,
    monitor='val_loss',
    mode='min',
    verbose=1
)

In [None]:
history = model.fit(
    X_train, y_train,
    epochs=5,
    batch_size=64,
    validation_data=(X_val, y_val),
    callbacks=[checkpoint_callback1]
)

In [None]:
# Saving tokenizer
import pickle
pickle.dump(tokenizer, open('models/tokenizer.pkl', 'wb'))

### Evaluating Models

In [None]:
import pandas as pd

df = pd.read_csv("artifacts/final_data.csv")
df.drop(columns='Content_int', axis=1, inplace=True)

In [17]:
test_df = df.sample(n=200, random_state=60)

test_df['Label'].value_counts()

Label
0    174
1     26
Name: count, dtype: int64

In [18]:
test_df.isnull().sum()

Content    0
Label      0
dtype: int64

In [19]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Removing Stopwords from texts
def remove_stopwords(text):
    stop_words = set(stopwords.words('english'))
    text = [word for word in text.split() if word not in stop_words]
    
    return " ".join(text)

# Lemmatizing words
def lemmatization(text):
    lemmtizer = WordNetLemmatizer()
    text_words = text.split()
    text = [lemmtizer.lemmatize(word) for word in text_words]

    return " ".join(text)

# Preprocessing function
from tqdm import tqdm
def preprocess(df):
    try:
        tqdm.pandas()
        print("Removing stopwords...")
        df['Content'] = df['Content'].progress_apply(remove_stopwords)
        
        print("\n Performing lemmatization")
        df['Content'] = df['Content'].progress_apply(lemmatization)

        print("\n Finished preprocessing successfully")
        return df
    except Exception as e:
        print(f"Error during preprocessing: {e}")
        raise

In [20]:
norm_testdf = preprocess(test_df)
norm_testdf.head()

Removing stopwords...


100%|██████████| 200/200 [00:00<00:00, 1054.83it/s]



 Performing lemmatization


100%|██████████| 200/200 [00:06<00:00, 32.37it/s]


 Finished preprocessing successfully





Unnamed: 0,Content,Label
237813,white council see remove stuff white counsil a...,0
202190,result terror result violence result demonstra...,0
423720,utc concerned edits mirach didnt bother correc...,0
338957,even wikipedia agrees,0
136158,retweet ferguson crisis slut manufactured open...,1


In [21]:
X1_test = tokenizer.texts_to_sequences(test_df['Content'])
X1_padded_test = pad_sequences(X1_test, maxlen=max_seq_len, padding='post')

In [None]:
y1_test = test_df['Label'].astype(int)

In [None]:
# model paths
bilstm_model_path = "models/bi-lstm.h5"
lstmv1_model_path = "models/lstm-v1.h5"
lstmv2_model_path = "models/lstm-v2.h5"

In [None]:
# Load models
from tensorflow.keras.models import load_model

bilstm = load_model(bilstm_model_path)
lstmv1 = load_model(lstmv1_model_path)
lstmv2 = load_model(lstmv2_model_path)

In [23]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score

#### RESULTS ...

In [None]:
# testing bilstm model

y_prob = bilstm.predict(X1_padded_test)
y_pred = (y_prob > 0.5).astype(int)

print("accuracy: ", accuracy_score(y1_test, y_pred))
print("precision: ", precision_score(y1_test, y_pred))
print("recall: ", recall_score(y1_test, y_pred))
print("roc_auc: ", roc_auc_score(y1_test, y_pred))

accuracy:  0.79
precision:  0.3620689655172414
recall:  0.8076923076923077
roc_auc:  0.7975243147656941


In [None]:
# testing lstm-v1 model

y_prob = lstmv1.predict(X1_padded_test)
y_pred = (y_prob > 0.5).astype(int)

print("accuracy: ", accuracy_score(y1_test, y_pred))
print("precision: ", precision_score(y1_test, y_pred))
print("recall: ", recall_score(y1_test, y_pred))
print("roc_auc: ", roc_auc_score(y1_test, y_pred))

accuracy:  0.76
precision:  0.328125
recall:  0.8076923076923077
roc_auc:  0.7802829354553493


In [None]:
# testing lstm-v2 model

y_prob = lstmv2.predict(X1_padded_test)
y_pred = (y_prob > 0.5).astype(int)

print("accuracy: ", accuracy_score(y1_test, y_pred))
print("precision: ", precision_score(y1_test, y_pred))
print("recall: ", recall_score(y1_test, y_pred))
print("roc_auc: ", roc_auc_score(y1_test, y_pred))

accuracy:  0.82
precision:  0.4074074074074074
recall:  0.8461538461538461
roc_auc:  0.8311229000884174
