# **Word level Embedding**

In [None]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')
import pandas as pd
import pickle
import numpy as np
from sklearn.preprocessing import LabelEncoder as le
import tensorflow as tf
from sklearn.metrics import confusion_matrix,classification_report
from tensorflow.keras.models import load_model
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras import layers
from gensim.models import KeyedVectors

Mounted at /content/drive


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# Load the training set and encode the string labels as integers
# (hate -> 0, nothate -> 1).
df = pd.read_csv('/Data/train_data.csv')
label_to_id = {"hate": 0, "nothate": 1}
# Map explicitly and cast to int64 instead of relying on `replace` to
# down-cast the object column: that implicit down-casting is deprecated in
# pandas, and an object-dtype label column cannot be fed to Keras.
# Values that are not keys of `label_to_id` are passed through unchanged, so
# an unexpected label fails loudly in `astype` rather than during training.
df['label'] = df['label'].map(lambda value: label_to_id.get(value, value)).astype('int64')

In [None]:
def preprocess_text(df):
  """Normalise the ``text`` column of ``df`` and return the cleaned strings.

  Per-row pipeline: lower-case -> NLTK word tokenisation -> keep only
  alphanumeric tokens -> drop English stop words -> Porter stemming ->
  re-join the surviving stems with single spaces.

  Side effects (kept so existing callers keep working): ``df`` is modified
  in place. ``text`` is overwritten with its lower-cased form, and the
  ``tokens`` and ``processed_text`` columns are added or overwritten.

  Args:
    df: DataFrame with a ``text`` column. Missing values are treated as
      empty strings and other non-string values are converted with ``str``,
      so a stray NaN no longer raises ``AttributeError``.

  Returns:
    The ``processed_text`` column of ``df``.
  """
  # Build these once per call rather than once per row.
  stop_words = set(nltk.corpus.stopwords.words('english'))
  stemmer = nltk.stem.PorterStemmer()

  def _clean_tokens(text):
    # Single pass: tokenise, filter punctuation/stop words, then stem.
    return [
        stemmer.stem(word)
        for word in nltk.word_tokenize(text)
        if word.isalnum() and word not in stop_words
    ]

  df['text'] = df['text'].fillna('').astype(str).str.lower()
  df['tokens'] = df['text'].apply(_clean_tokens)
  df['processed_text'] = df['tokens'].apply(' '.join)
  return df['processed_text']

In [None]:
# Length that every integer sequence is padded/truncated to by `pad_sequences`;
# also passed to the Embedding layer as `input_length`.
max_length = 50
# Number of columns in the embedding matrix. The GloVe file name used later
# (`glove.6B.50d.txt`) suggests 50-dimensional vectors - keep the two in sync.
embedding_dim = 50
# Initial vocabulary size. This value is overwritten with
# `len(word_index) + 1` once the tokenizer has been fitted.
num_words = 2624



The code cell below performs the following steps:

1. It loads the word vectors from the GloVe file using the `KeyedVectors` class from the `gensim` library. The GloVe file path is provided as `glove_file`.
2. It preprocesses the text in the `df` DataFrame and stores the processed text in a new column named `'processed_text'`.
3. It initializes a `Tokenizer` object `tokenizer`.
4. It fits the tokenizer on the `'processed_text'` column values of the `df` DataFrame.
5. It converts the text sequences in the `'processed_text'` column of the `df` DataFrame to sequences of integers using `texts_to_sequences`.
6. It pads the sequences in `sequences` using `pad_sequences`, specifying `maxlen`, `padding`, and `truncating` parameters. The resulting padded sequences are stored in `padded_sequences`.
7. It saves the `tokenizer` object to a pickle file named `'word_level_tokenizer.pickle'` using `pickle.dump`.
8. It retrieves the word index from the `tokenizer` object.
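9. It calculates the number of words by adding 1 to the length of the `word_index`. The extra row is needed because the tokenizer's word indices start at 1, with index 0 reserved for padding.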
10. It initializes an embedding matrix with zeros of shape `(num_words, embedding_dim)`.
11. It iterates over the word index items.
   - If a word is present in the loaded word vectors, it assigns the corresponding word vector to the corresponding row in the embedding matrix; words that are not found keep an all-zero row.
12. The resulting embedding matrix is now ready to be used in the model.




In [None]:
# --- Pre-trained word vectors -------------------------------------------
# NOTE(review): `load_word2vec_format` is called without `no_header=True`, so
# this presumably expects a GloVe file already converted to word2vec text
# format (with a header line) - confirm against the file on disk.
glove_file = '/Models/glove.6B.50d.txt'
word_vectors = KeyedVectors.load_word2vec_format(glove_file, binary=False)

# --- Text -> padded integer sequences ------------------------------------
df['processed_text'] = preprocess_text(df)

tokenizer = tf.keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(df['processed_text'])
sequences = tokenizer.texts_to_sequences(df['processed_text'])
padded_sequences = tf.keras.preprocessing.sequence.pad_sequences(
    sequences,
    maxlen=max_length,
    padding='post',
    truncating='post',
)

# Persist the fitted tokenizer so inference uses the same word -> id mapping.
with open('/Models/WordLevel/word_level_tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

# --- Embedding matrix ----------------------------------------------------
# Row `i` holds the vector of the word whose tokenizer id is `i`; one extra
# row is allocated so the largest id is still a valid row number.
word_index = tokenizer.word_index
num_words = 1 + len(word_index)
embedding_matrix = np.zeros((num_words, embedding_dim))
for word, i in word_index.items():
    if word not in word_vectors:
        # No pre-trained vector: the row keeps its all-zero initial value.
        continue
    embedding_matrix[i] = word_vectors[word]

In [None]:
# Length of every processed sentence, measured in characters.
# The previous version rebuilt the full list of lengths once per row (O(n^2))
# only to average identical values; `.str.len()` gives the same mean in O(n).
sentence_lengths = df['processed_text'].str.len()
avg_sentence_length = sentence_lengths.mean()
print("Average sentence length:", avg_sentence_length)

# `max_length` is applied to token sequences, so the token count is the figure
# to compare it against.
token_counts = df['processed_text'].str.split().str.len()
print("Average tokens per sentence:", token_counts.mean())

Average sentence length: 53.789280958721704


In [None]:
# Write the model to disk whenever validation accuracy reaches a new maximum,
# so the file always holds the best epoch seen so far.
checkpoint_filepath = '/Models/WordLevel/wordlevel_3000.h5'
checkpoint_callback = ModelCheckpoint(
    filepath=checkpoint_filepath,
    save_best_only=True,
    monitor='val_accuracy',
    mode='max',
    verbose=1,
)

In [None]:
# Frozen pre-trained embedding -> two Conv1D/MaxPooling1D stages -> two
# L2-regularised dense layers with dropout -> one sigmoid output unit.
model = tf.keras.models.Sequential([
    layers.Embedding(
        num_words,
        embedding_dim,
        embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix),
        input_length=max_length,
        trainable=False,  # keep the pre-trained vectors fixed during training
    ),
    layers.Conv1D(filters=128, kernel_size=5, activation='relu'),
    layers.MaxPooling1D(pool_size=4),
    layers.Conv1D(filters=256, kernel_size=5, activation='relu'),
    layers.MaxPooling1D(pool_size=4),
    layers.Flatten(),
    layers.Dense(
        units=512,
        activation='relu',
        kernel_regularizer=tf.keras.regularizers.l2(0.01),
    ),
    layers.Dropout(rate=0.5),
    layers.Dense(
        units=256,
        activation='relu',
        kernel_regularizer=tf.keras.regularizers.l2(0.01),
    ),
    layers.Dropout(rate=0.5),
    layers.Dense(units=1, activation='sigmoid'),
])
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
# The last 10% of the rows is held out for validation; the checkpoint callback
# monitors `val_accuracy` on that split.
model.fit(
    x=padded_sequences,
    y=df['label'],
    epochs=100,
    validation_split=0.1,
    callbacks=[checkpoint_callback],
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.callbacks.History at 0x7f367813a380>

In [None]:
# Reload the artefacts written during training: the fitted tokenizer and the
# best checkpointed model.
# NOTE: unpickling executes arbitrary code - only load pickle files that this
# notebook (or another trusted source) produced.
with open('/Models/WordLevel/word_level_tokenizer.pickle', 'rb') as tokenizer_file:
    loaded_tokenizer = pickle.load(tokenizer_file)

loaded_model = load_model('/Models/WordLevel/wordlevel_3000.h5')



The model-training cell above defines and trains a sequential model for text classification (the cell below this description evaluates the saved model on the test set):

1. It initializes a sequential model using `tf.keras.models.Sequential()`.
2. It adds an embedding layer to the model using `layers.Embedding` with the following parameters:
   - `num_words`: The number of words in the vocabulary.
   - `embedding_dim`: The dimensionality of the word embeddings.
   - `embeddings_initializer`: The initializer for the embedding matrix, which is set to the pre-trained `embedding_matrix`.
   - `input_length`: The length of input sequences, set to `max_length`.
   - `trainable`: The embedding layer is set to be non-trainable by setting `trainable=False`.
3. It adds a 1D convolutional layer with 128 filters and a kernel size of 5, followed by a ReLU activation function.
4. It adds a max pooling layer with a pool size of 4.
5. It adds another 1D convolutional layer with 256 filters and a kernel size of 5, followed by a ReLU activation function.
6. It adds another max pooling layer with a pool size of 4.
7. It adds a flatten layer to convert the 3D tensor to a 2D tensor.
8. It adds a dense layer with 512 units, a ReLU activation function, and L2 regularization with a coefficient of 0.01.
9. It adds a dropout layer with a rate of 0.5 to prevent overfitting.
10. It adds another dense layer with 256 units, a ReLU activation function, and L2 regularization with a coefficient of 0.01.
11. It adds another dropout layer with a rate of 0.5.
12. It adds a final dense layer with 1 unit and a sigmoid activation function for binary classification.
13. It compiles the model with the Adam optimizer, binary cross-entropy loss function, and accuracy metric.
14. It trains the model on the `padded_sequences` input data and `df['label']` target labels for 100 epochs, with a validation split of 0.1.
15. It uses a checkpoint callback for model saving during training.




In [None]:
# Load the held-out test set and encode labels exactly as for training
# (hate -> 0, nothate -> 1).
df2 = pd.read_csv('/Data/test_data.csv')
label_to_id = {"hate": 0, "nothate": 1}
# Explicit mapping + cast instead of `replace`, whose implicit down-casting of
# object columns is deprecated in pandas; unknown labels fail loudly here.
df2['label'] = df2['label'].map(lambda value: label_to_id.get(value, value)).astype('int64')

df2['processed_text'] = preprocess_text(df2)

# Reuse the tokenizer fitted on the training data so word ids line up with the
# embedding matrix the model was trained with.
test_sequences = loaded_tokenizer.texts_to_sequences(df2['processed_text'])
test_data_padded = tf.keras.preprocessing.sequence.pad_sequences(
    test_sequences, maxlen=max_length, padding='post', truncating='post'
)

# Keep the raw sigmoid outputs; they are thresholded into `pred` below.
pred_prob = loaded_model.predict(test_data_padded)
loss, acc = loaded_model.evaluate(test_data_padded, df2['label'], batch_size=32)

print('Test loss:', loss)
print('Test accuracy:', acc, end='\n\n')

# Threshold at 0.5 (hate = 0, nothate = 1). Vectorised, and flattened to 1-D
# so the metrics receive plain integer label vectors.
pred = (pred_prob >= 0.5).astype(int).ravel()
cm = confusion_matrix(df2['label'], pred)
print(cm)
cr = classification_report(df2['label'], pred)
print(cr)

Test loss: 0.6659772992134094
Test accuracy: 0.8892215490341187

[[167  22]
 [ 15 130]]
              precision    recall  f1-score   support

           0       0.92      0.88      0.90       189
           1       0.86      0.90      0.88       145

    accuracy                           0.89       334
   macro avg       0.89      0.89      0.89       334
weighted avg       0.89      0.89      0.89       334

