## Model Training

#### Import Required Packages

In [31]:
import numpy as np
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import warnings

## Data preparation

#### Load the data

In [2]:
# Load the cleaned dataset and drop rows with missing values right away, so that
# everything derived from `df` further down (X, y, the train/test split) is
# null-free when the notebook is run top to bottom on a fresh kernel.
df = pd.read_csv('data/cleaned.csv').dropna()

#### Preparing X and Y variables

In [6]:
# Model input is the `tweet` column; the target is the `label` column.
X, y = df['tweet'], df['label']

#### Split the dataset into training and testing sets

In [32]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [5]:
# Drop every row of `df` that contains a missing value (modifies `df` in place).
# NOTE(review): in top-to-bottom order this cell sits after X, y and the
# train/test split have already been taken from `df`, so it does not change
# them. Its execution count (5) suggests it was originally run before those
# cells -- confirm, and make sure nulls are removed ahead of the split.
df.dropna(inplace=True)

#### Feature Engineering 

In [7]:
# Vocabulary cap handed to the tokenizer (and reused as the embedding input size).
max_words = 50000

# Build the word index from the training tweets only.
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X_train)

# Encode each training tweet as a list of word indices, then pad the lists
# to a common length of 300 so they stack into one integer matrix.
sequences = tokenizer.texts_to_sequences(X_train)
sequence_matrix = pad_sequences(sequences, maxlen=300)

In [8]:
# Display the padded training matrix (NumPy abbreviates large arrays with "...").
sequence_matrix

array([[    0,     0,     0, ...,   129,  7184,   184],
       [    0,     0,     0, ...,  8550, 17176, 11012],
       [    0,     0,     0, ...,     0,    50,  3692],
       ...,
       [    0,     0,     0, ...,    37,   256,   484],
       [    0,     0,     0, ...,     2,  3293,   295],
       [    0,     0,     0, ...,    12,    85, 16295]], dtype=int32)

In [26]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SpatialDropout1D, LSTM, Dense
from tensorflow.keras.optimizers import RMSprop

## Creating model architecture
1. Initialize a Sequential model
2. Add an Embedding layer
    - Converts input sequences (word indices) into dense vectors (embeddings).
    - max_words: Total vocabulary size.
    - 100: Size of embedding vectors.
    - input_length: Length of input sequences.

3. Add a SpatialDropout1D layer
    - Drops entire embedding channels (the same feature across all timesteps) rather than individual values, which helps reduce overfitting.
    - 0.2: Dropout rate.

4. Add an LSTM layer
    - Captures long-term dependencies in sequential data, like text.
    - 100: Number of units (neurons) in the LSTM layer.
    - dropout: Fraction of input units dropped during training.
    - recurrent_dropout: Fraction of recurrent units dropped during training.
5. Add a Dense layer
    - Outputs a single probability for binary classification tasks.
    - 1: Single output neuron for classification.
    - activation: Sigmoid function used to return probabilities (values between 0 and 1).


6. Display the model's summary (Prints the architecture of the model with layer details.)

In [27]:
# Build the network in one go by handing the layer stack to Sequential:
# embedding -> spatial dropout -> LSTM -> single sigmoid output unit.
model = Sequential([
    Embedding(max_words, 100, input_length=300),
    SpatialDropout1D(0.2),
    LSTM(100, dropout=0.2, recurrent_dropout=0.2),
    Dense(1, activation='sigmoid'),
])

# Print the layer-by-layer architecture.
model.summary()

In [28]:
# Binary cross-entropy loss, RMSprop with its default settings, accuracy as the tracked metric.
model.compile(
    loss='binary_crossentropy',
    optimizer=RMSprop(),
    metrics=['accuracy'],
)

### Start the model training

In [29]:
# One epoch over the padded training matrix in batches of 128,
# with 20% of the training rows set aside for validation.
history = model.fit(
    sequence_matrix,
    y_train,
    batch_size=128,
    epochs=1,
    validation_split=0.2,
)

[1m340/340[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1229s[0m 4s/step - accuracy: 0.8177 - loss: 0.4106 - val_accuracy: 0.9284 - val_loss: 0.1678


## Evaluation 

In [33]:
# Encode the held-out tweets with the tokenizer that was fitted on the training
# set, then pad to the same length (300) used for the training matrix.
test_sequences = tokenizer.texts_to_sequences(X_test)
test_sequences_matrix = pad_sequences(test_sequences, maxlen=300)

#### Model evaluation

In [34]:
# Evaluate on the held-out test matrix. Because the model was compiled with
# metrics=['accuracy'], `acc` holds [loss, accuracy], not a single accuracy value.
acc = model.evaluate(test_sequences_matrix, y_test)

[1m340/340[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 112ms/step - accuracy: 0.9406 - loss: 0.1705


#### Make prediction

In [35]:
# Sigmoid outputs of the model for every row of the test matrix.
lstm_prediction = model.predict(test_sequences_matrix)

[1m340/340[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 102ms/step


In [36]:
# Threshold each predicted probability at 0.5: below -> class 0, otherwise class 1.
res = [0 if pred[0] < 0.5 else 1 for pred in lstm_prediction]


In [37]:
from sklearn.metrics import confusion_matrix

In [38]:
# Confusion matrix of true test labels (y_test) against thresholded predictions (res);
# per scikit-learn's convention rows are true classes and columns are predicted classes.
print(confusion_matrix(y_test, res))

[[6167  190]
 [ 472 4030]]


#### Save the tokenizer for later use

In [39]:
import pickle
# Persist the fitted tokenizer so the same word index can be reused at inference time.
with open('data/tokenizer.pickle', 'wb') as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file)

#### Save the model

In [40]:
# Serialise the trained model object with pickle.
# NOTE(review): despite the .h5 extension, the file written here is a pickle,
# not an HDF5 file, so it can only be read back with pickle.load (as the next
# cell does). Consider Keras' own model.save / load_model with a matching
# extension -- but check every consumer of data/model.h5 before changing the format.
with open('data/model.h5', 'wb') as file:
    pickle.dump(model, file)

In [41]:
# Round-trip check: read both artifacts back from disk.
# pickle.load can execute arbitrary code, so only use it on files this notebook wrote.
with open('data/tokenizer.pickle', 'rb') as tokenizer_file:
    load_tokenizer = pickle.load(tokenizer_file)

# `load_model` here is the restored model object, not a loader function.
with open('data/model.h5', 'rb') as model_file:
    load_model = pickle.load(model_file)


In [54]:
# Encode a single sample tweet with the reloaded tokenizer and pad it to the
# same length (300) that was used for the training and test matrices.
s = load_tokenizer.texts_to_sequences(
    ['whatev good one abraham lincoln quot yall hoe']
)
sm = pad_sequences(s, maxlen=300)

In [59]:
# predict() returns one row per input with a single sigmoid output, so [0][0]
# is the scalar probability for the one sample in `sm`.
# Use >= 0.5 so the decision boundary matches the thresholding applied to the
# test-set predictions (values below 0.5 -> class 0, otherwise class 1).
ress = 'Hate' if load_model.predict(sm)[0][0] >= 0.5 else "No Hate"
ress

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 85ms/step


'Hate'

In [60]:
# Print the architecture of the in-memory `model` again.
model.summary()

In [61]:
# Show the stored result of model.evaluate on the test set: [loss, accuracy].
acc

[0.1695365607738495, 0.9390367269515991]

array([[0.9521498 ],
       [0.20814975],
       [0.9031292 ],
       ...,
       [0.98211473],
       [0.99574697],
       [0.0021294 ]], dtype=float32)