<a href="https://colab.research.google.com/github/VellummyilumVinoth/Toxic_Comment_Classification/blob/main/LSTM_with_subword.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/drive; the dataset is read from, and the model
# and prediction CSV are written to, paths under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Dropout, Embedding
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.models import Model
from keras.callbacks import EarlyStopping
import pandas as pd
import numpy as np
import fasttext.util
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score

# Load the preprocessed training data from Google Drive.
dataset = pd.read_csv('/content/drive/MyDrive/Dats/Kaggle/pre_data_train.csv')

# Hold out 20% of the rows for evaluation. The fixed seed keeps the split, and
# therefore every downstream metric, reproducible across kernel restarts.
dataset, test = train_test_split(dataset, test_size=0.2, random_state=42)

# The six binary label columns; their order fixes the column order of `y`.
y_list = ["toxic","severe_toxic", "obscene", "threat", "insult", "identity_hate"]
y = dataset[y_list].values

# Cast the text columns to string arrays. Missing entries (NaN floats) are
# replaced with '' first: a plain dtype=str cast would turn them into the
# literal token "nan", which the tokenizer would then treat as a real word.
sentiment = np.array(dataset['preprocessed_text'].fillna('').values, dtype=str)
predict_data = np.array(test['preprocessed_text'].fillna('').values, dtype=str)

# Tokenize the data: the vocabulary is fitted on the training texts only, then
# both splits are mapped to integer ids and padded/truncated to 100 tokens.
tokenizer = Tokenizer(num_words=20000)
tokenizer.fit_on_texts(list(sentiment))
seq = tokenizer.texts_to_sequences(sentiment)
pad = pad_sequences(seq, maxlen=100)
test_seq = tokenizer.texts_to_sequences(predict_data)
test_pad = pad_sequences(test_seq, maxlen=100)

# Load FastText pre-trained subword embeddings.
# download_model() takes a language id ('en'), not a file name: it derives
# 'cc.en.300.bin' itself and rejects anything that is not a known language id.
# if_exists='ignore' skips the download when the file is already present.
fasttext.util.download_model('en', if_exists='ignore')
ft = fasttext.load_model('cc.en.300.bin')

# Create subword embeddings for the vocabulary
embedding_dim = 300  # Dimensionality of FastText subword embeddings

# Row i holds the vector for token id i. Row 0 is never written by the loop
# below (word_index ids start at 1) and stays all-zero for the padding id.
embedding_matrix = np.zeros((len(tokenizer.word_index) + 1, embedding_dim))

# get_word_vector() always returns a vector (out-of-vocabulary words are
# composed from character n-grams), so no None check is needed.
for word, i in tokenizer.word_index.items():
    embedding_matrix[i] = ft.get_word_vector(word)

# Define the model architecture with subword embeddings
def model_with_subword_embedding():
    """Build and compile the LSTM classifier on top of frozen FastText vectors.

    Reads the module-level ``tokenizer``, ``embedding_dim`` and
    ``embedding_matrix``. The embedding layer is initialised from
    ``embedding_matrix`` and is not trainable.

    Returns:
        A compiled Keras ``Model`` that maps a length-100 sequence of token
        ids to six sigmoid outputs, using binary cross-entropy loss and the
        Adam optimizer.
    """
    vocab_size = len(tokenizer.word_index) + 1

    token_ids = Input(shape=(100,))
    embedded = Embedding(vocab_size, embedding_dim, weights=[embedding_matrix], trainable=False)(token_ids)

    # Sequence encoder followed by a small dense head with light dropout.
    features = LSTM(50)(embedded)
    features = Dropout(0.1)(features)
    features = Dense(50, activation="relu")(features)
    features = Dropout(0.1)(features)

    # One independent sigmoid per label.
    probabilities = Dense(6, activation="sigmoid")(features)

    classifier = Model(inputs=token_ids, outputs=probabilities)
    classifier.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return classifier

model_with_subword = model_with_subword_embedding()
# summary() prints the layer table itself and returns None, so wrapping it in
# print() only adds a stray "None" line to the output.
model_with_subword.summary()

# Train the model with subword embeddings
# NOTE: with epochs=2 a patience of 20 can never stop training early; the
# callback only takes effect if the epoch count is raised.
early = EarlyStopping(monitor="val_loss", mode="min", patience=20)
model_with_subword.fit(pad, y, batch_size=32, epochs=2, validation_split=0.1, callbacks=[early])

# Persist the trained model to Google Drive.
model_with_subword.save('/content/drive/MyDrive/LSTM_with_subword')

# Make predictions on the test data with subword embeddings
# (per-label sigmoid outputs for every held-out text).
y_test_with_subword = model_with_subword.predict([test_pad], batch_size=1024, verbose=1)

# Apply threshold and convert to 0 or 1
threshold = 0.16  # Adjust this threshold as needed
# A single vectorised comparison replaces the two in-place masked assignments;
# values >= threshold become 1, everything else (including NaN) becomes 0.
y_test_with_subword = (y_test_with_subword >= threshold).astype(int)

# Predicted labels, one column per entry of y_list.
predict_labels_df_with_subword = pd.DataFrame(y_test_with_subword, columns=y_list)

# The held-out input texts, in the same row order as the predictions.
predict_data_df = pd.DataFrame(predict_data, columns=["Title"])

# Place each text next to its predicted labels.
predict_df_with_subword = pd.concat([predict_data_df, predict_labels_df_with_subword], axis=1)

# Save the DataFrame to a CSV file
predict_df_with_subword.to_csv('/content/drive/MyDrive/predict_lstm_with_subword.csv', index=False)

# Ground-truth label matrix for the held-out rows, in y_list column order.
y_test_true = test[y_list].values

# Binarise the ground truth with the same cut-off that was applied to the
# predictions, so both arrays are 0/1 integer matrices.
y_test_true = (y_test_true >= threshold).astype(int)

# Both scores are reported as percentages. For multilabel input scikit-learn's
# accuracy_score is exact-match accuracy over all six labels of a row;
# the F1 score is micro-averaged across labels.
accuracy_with_subword = 100 * accuracy_score(y_test_true, y_test_with_subword)
f1_score_with_subword = 100 * f1_score(y_test_true, y_test_with_subword, average='micro')

# Report the results.
print("Accuracy (with subword embeddings):", accuracy_with_subword)
print("F1 Score (with subword embeddings):", f1_score_with_subword)


In [1]:
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.models import Model
from keras.callbacks import EarlyStopping
import pandas as pd

In [2]:
# Mount Google Drive at /content/drive; later cells read the dataset from, and
# write the model and prediction CSV to, paths under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Load and preprocess the data
dataset = pd.read_csv('/content/drive/MyDrive/Dats/Kaggle/pre_data_train.csv')

from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation. The fixed seed makes the split
# identical on every run, so metrics from different sessions are comparable.
dataset, test = train_test_split(dataset, test_size=0.2, random_state=42)

# Raw training texts (still may contain NaN for empty comments at this point).
sentiment = dataset['preprocessed_text'].values

In [4]:
# Names of the six label columns; their order fixes the column order of `y`.
y_list = ["toxic","severe_toxic", "obscene", "threat", "insult", "identity_hate"]
# Label matrix of the training split: one row per sample, one column per entry of y_list.
y = dataset[y_list].values

In [5]:
# Text column of the held-out split; it is tokenized into `test_pad` further down.
predict_data = test['preprocessed_text'].values

In [6]:
import numpy as np

# Cast both text arrays to string arrays. Missing entries (NaN floats) are
# replaced with '' first: a plain dtype=str cast would turn them into the
# literal token "nan", which the tokenizer would then treat as a real word.
sentiment = np.array(pd.Series(sentiment).fillna('').values, dtype=str)
predict_data = np.array(pd.Series(predict_data).fillna('').values, dtype=str)

# Tokenize the data: the vocabulary is fitted on the training texts only, then
# both splits are mapped to integer ids and padded/truncated to 100 tokens.
tokenizer = Tokenizer(num_words=20000)
tokenizer.fit_on_texts(list(sentiment))
seq = tokenizer.texts_to_sequences(sentiment)
pad = pad_sequences(seq, maxlen=100)
test_seq = tokenizer.texts_to_sequences(predict_data)
test_pad = pad_sequences(test_seq, maxlen=100)


In [8]:
!pip install fasttext


Collecting fasttext
  Downloading fasttext-0.9.2.tar.gz (68 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/68.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m68.8/68.8 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pybind11>=2.2 (from fasttext)
  Using cached pybind11-2.10.4-py3-none-any.whl (222 kB)
Building wheels for collected packages: fasttext
  Building wheel for fasttext (setup.py) ... [?25l[?25hdone
  Created wheel for fasttext: filename=fasttext-0.9.2-cp310-cp310-linux_x86_64.whl size=4393174 sha256=7d7f89b4a1026d6fff67c62d0b7d712924c901f7cd5611c04d92dc0d8437d829
  Stored in directory: /root/.cache/pip/wheels/a5/13/75/f811c84a8ab36eedbaef977a6a58a98990e8e0f1967f98f394
Successfully built fasttext
Installing collected packages: pybind11, fasttext
Successfully installed fasttext-0.9.2 pybind11-2.10.4


In [9]:
import fasttext.util

# Download and load FastText subword embeddings
# (if_exists='ignore' skips the download when cc.en.300.bin is already present).
fasttext.util.download_model('en', if_exists='ignore')
ft = fasttext.load_model('cc.en.300.bin')

# Convert tokens to FastText subword embeddings:
# one list per sentence, one embedding vector per whitespace-separated token.
subword_sequences = [
    [ft.get_word_vector(token) for token in sentence.split()]
    for sentence in sentiment
]

# dtype must be a float type here: pad_sequences defaults to int32, which would
# truncate the real-valued embedding components to integers.
# NOTE(review): this materialises a dense (len(sentiment), 100, 300) float32
# array, which may not fit in Colab memory for the full dataset; confirm.
# NOTE(review): the model defined further down takes Input(shape=(100,)) token
# ids, so it cannot consume this 3-D array as-is.
subword_pad = pad_sequences(subword_sequences, maxlen=100, padding='post', truncating='post', dtype='float32')


Downloading https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.bin.gz


KeyboardInterrupt: ignored

In [None]:
# Define the model architecture
def model_with_subword_embedding():
    """Build and compile a bidirectional-LSTM multi-label classifier.

    Despite its name, this variant passes no pre-trained weights: it uses a
    trainable ``Embedding(20000, 128)`` layer whose vectors are learned during
    training.

    Returns:
        A compiled Keras ``Model`` that maps a length-100 sequence of token
        ids to six sigmoid outputs, using binary cross-entropy loss and the
        Adam optimizer.
    """
    token_ids = Input(shape=(100, ))
    embedded = Embedding(20000, 128)(token_ids)

    # Bidirectional sequence encoder followed by a small dense head.
    features = Bidirectional(LSTM(50))(embedded)
    features = Dropout(0.1)(features)
    features = Dense(50, activation="relu")(features)
    features = Dropout(0.1)(features)

    # One independent sigmoid per label.
    probabilities = Dense(6, activation="sigmoid")(features)

    classifier = Model(inputs=token_ids, outputs=probabilities)
    classifier.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return classifier

# The training and prediction cells refer to this model as `model_with_subword`,
# while the stand-alone save cell uses `model`; bind both names to the same
# object so every later cell operates on the model built here.
model_with_subword = model_with_subword_embedding()
model = model_with_subword
# summary() prints the layer table itself and returns None, so it is not wrapped in print().
model.summary()

In [None]:
# Train the model
# The network takes integer token ids of shape (100,), so it is fitted on `pad`,
# the training-side counterpart of the `test_pad` array used for prediction,
# rather than on the 3-D `subword_pad` array, whose shape it cannot accept.
# NOTE: with epochs=2 a patience of 20 can never stop training early.
early = EarlyStopping(monitor="val_loss", mode="min", patience=20)
model_with_subword.fit(pad, y, batch_size=32, epochs=2, validation_split=0.1, callbacks=[early])
model_with_subword.save('/content/drive/MyDrive/LSTM_with_subword')


In [None]:
# Save the model that was actually fitted (`model_with_subword`); no cell ever
# calls fit() on the name `model`, so saving it could overwrite the trained
# model at this path with untrained weights.
model_with_subword.save('/content/drive/MyDrive/LSTM_with_subword')

In [None]:
# Make predictions on the test data
# (per-label sigmoid outputs for every held-out text).
y_test_with_subword = model_with_subword.predict([test_pad], batch_size=1024, verbose=1)

# Apply threshold and convert to 0 or 1
threshold = 0.16  # Adjust this threshold as needed
# A single vectorised comparison replaces the two in-place masked assignments;
# values >= threshold become 1, everything else (including NaN) becomes 0.
y_test_with_subword = (y_test_with_subword >= threshold).astype(int)

# Predicted labels, one column per entry of y_list.
predict_labels_df_with_subword = pd.DataFrame(y_test_with_subword, columns=y_list)

# The held-out input texts, in the same row order as the predictions.
predict_data_df = pd.DataFrame(predict_data, columns=["Title"])

# Place each text next to its predicted labels.
predict_df_with_subword = pd.concat([predict_data_df, predict_labels_df_with_subword], axis=1)

# Save the DataFrame to a CSV file
predict_df_with_subword.to_csv('/content/drive/MyDrive/predict_lstm_with_subword.csv', index=False)

In [None]:
# Show the combined text/prediction table as this cell's output.
predict_df_with_subword

In [None]:
import numpy as np
from sklearn.metrics import accuracy_score, f1_score

# Ground-truth label matrix for the held-out rows, in y_list column order.
y_test_true = test[y_list].values

# Binarise the ground truth with the same cut-off that was applied to the
# predictions, so both arrays are 0/1 integer matrices.
y_test_true = (y_test_true >= threshold).astype(int)

# Both scores are reported as percentages. For multilabel input scikit-learn's
# accuracy_score is exact-match accuracy over all six labels of a row;
# the F1 score is micro-averaged across labels.
accuracy_with_subword = 100 * accuracy_score(y_test_true, y_test_with_subword)
f1_score_with_subword = 100 * f1_score(y_test_true, y_test_with_subword, average='micro')

# Report the results.
print("Accuracy (with subword embeddings):", accuracy_with_subword)
print("F1 Score (with subword embeddings):", f1_score_with_subword)
