<a href="https://colab.research.google.com/github/VellummyilumVinoth/Toxic_Comment_Classification/blob/main/LSTM_with_subword.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.models import Model
from keras.callbacks import EarlyStopping
import pandas as pd

In [2]:
# Mount Google Drive so that paths under the mount point can be read and written.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [3]:
# Load the preprocessed training data and carve out a held-out split.
from sklearn.model_selection import train_test_split

# Keep the full frame under its own name so re-running this cell never splits
# an already-split frame.
full_data = pd.read_csv('/content/drive/MyDrive/Dats/Kaggle/pre_data_train.csv')

# Fixed seed: without it the 80/20 split -- and therefore every metric reported
# further down -- changes on each kernel restart.
dataset, test = train_test_split(full_data, test_size=0.2, random_state=42)

# Comment text of the training split; `dataset` also carries the label columns.
sentiment = dataset['preprocessed_text'].values

In [4]:
# Target columns, in the order later used to label the model's output columns.
y_list = [
    "toxic",
    "severe_toxic",
    "obscene",
    "threat",
    "insult",
    "identity_hate",
]
# Label matrix for the training split: one row per comment, one column per tag.
y = dataset.loc[:, y_list].values

In [5]:
# Comment text of the held-out split, used later for prediction and export.
predict_data = test.loc[:, 'preprocessed_text'].values

In [7]:
import numpy as np

VOCAB_SIZE = 20000  # the tokenizer keeps only word indices below this value
MAX_LEN = 100       # every comment is padded/truncated to this many tokens


def _as_text_array(values):
    """Return `values` as an object array of Python strings.

    Missing entries (NaN/None) become empty strings rather than the literal
    word "nan", which would otherwise be tokenized like a real word. An object
    array is used instead of ``dtype=str`` because a fixed-width unicode array
    reserves the width of the longest comment for every single row.
    """
    return np.array(["" if pd.isna(v) else str(v) for v in values], dtype=object)


# Non-string cells (e.g. NaN read from the CSV) would break the tokenizer.
sentiment = _as_text_array(sentiment)
predict_data = _as_text_array(predict_data)

# Fit the vocabulary on the training text only, then encode both splits with it.
tokenizer = Tokenizer(num_words=VOCAB_SIZE)
tokenizer.fit_on_texts(list(sentiment))
seq = tokenizer.texts_to_sequences(sentiment)
pad = pad_sequences(seq, maxlen=MAX_LEN)
test_seq = tokenizer.texts_to_sequences(predict_data)
test_pad = pad_sequences(test_seq, maxlen=MAX_LEN)


In [8]:
# Define the model architecture
def model_with_subword_embedding(vocab_size=20000, max_len=100, embed_dim=128,
                                 lstm_units=50, dense_units=50,
                                 dropout_rate=0.1, num_labels=6):
    """Build and compile a bidirectional-LSTM multi-label classifier.

    The network is: integer token ids -> trainable Embedding -> bidirectional
    LSTM (final state only) -> Dropout -> ReLU Dense -> Dropout -> sigmoid
    Dense with one independent output per label.

    NOTE(review): the embedding here is a plain lookup over whatever integer
    ids the caller feeds in; nothing in this function itself is subword
    specific, despite the name.

    Args:
        vocab_size: Size of the embedding table; must exceed the largest
            token id present in the input.
        max_len: Length of each (padded) input sequence.
        embed_dim: Dimensionality of the token embeddings.
        lstm_units: Units per LSTM direction (the output is twice this wide).
        dense_units: Width of the hidden dense layer.
        dropout_rate: Dropout rate applied after the LSTM and the dense layer.
        num_labels: Number of sigmoid outputs.

    Returns:
        A compiled Keras ``Model`` using binary cross-entropy loss and Adam.
        With the default arguments it is identical to the original
        hard-coded architecture.
    """
    inputs = Input(shape=(max_len, ))
    x = Embedding(vocab_size, embed_dim)(inputs)
    x = Bidirectional(LSTM(lstm_units))(x)
    x = Dropout(dropout_rate)(x)
    x = Dense(dense_units, activation="relu")(x)
    x = Dropout(dropout_rate)(x)
    # Sigmoid (not softmax): a comment may carry several labels at once.
    outputs = Dense(num_labels, activation="sigmoid")(x)
    model = Model(inputs=inputs, outputs=outputs)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

# Build one instance purely to inspect the architecture.
model = model_with_subword_embedding()
# summary() prints the table itself and returns None, so wrapping it in
# print() would append a stray "None" line to the output.
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 100)]             0         
                                                                 
 embedding (Embedding)       (None, 100, 128)          2560000   
                                                                 
 bidirectional (Bidirectiona  (None, 100)              71600     
 l)                                                              
                                                                 
 dropout (Dropout)           (None, 100)               0         
                                                                 
 dense (Dense)               (None, 50)                5050      
                                                                 
 dropout_1 (Dropout)         (None, 50)                0         
                                                             

In [9]:
# Fit a freshly built network on the padded training sequences.
model_with_subword = model_with_subword_embedding()

# NOTE(review): patience (20) is larger than the number of epochs (2), so this
# callback cannot stop training early with the current settings.
early = EarlyStopping(
    monitor="val_loss",
    mode="min",
    patience=20,
)

# The last 10% of the training rows are held back for validation by Keras.
model_with_subword.fit(
    pad,
    y,
    batch_size=32,
    epochs=2,
    validation_split=0.1,
    callbacks=[early],
)

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7f804bff2560>

In [10]:
# Persist the *trained* network. `model` is the separate, never-fitted instance
# that was built only to print the architecture summary; saving it would write
# randomly initialised weights to Drive.
model_with_subword.save('/content/drive/MyDrive/LSTM_with_subword')



In [14]:
# Predict per-label probabilities for the held-out comments
y_prob_with_subword = model_with_subword.predict(test_pad, batch_size=1024, verbose=1)

# Apply threshold and convert to 0 or 1.
# The probabilities are kept in `y_prob_with_subword` instead of being
# overwritten in place, so the threshold can be re-tuned without re-running
# the prediction step.
threshold = 0.16  # Adjust this threshold as needed
y_test_with_subword = (y_prob_with_subword >= threshold).astype(int)

# Predicted labels, one column per entry of y_list
predict_labels_df_with_subword = pd.DataFrame(y_test_with_subword, columns=y_list)

# The comment text the predictions belong to
predict_data_df = pd.DataFrame(predict_data, columns=["Title"])

# Put text and predicted labels side by side; both frames have a fresh 0..n-1
# index, so the rows line up positionally
predict_df_with_subword = pd.concat([predict_data_df, predict_labels_df_with_subword], axis=1)

# Save the DataFrame to a CSV file
predict_df_with_subword.to_csv('/content/drive/MyDrive/predict_lstm_with_subword.csv', index=False)



In [15]:
# Preview the combined frame of comment text and predicted labels.
predict_df_with_subword

Unnamed: 0,Title,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,quite thing firstly write reply secondly parti...,0,0,0,0,0,0
1,people difference armenian society armenian na...,0,0,0,0,0,0
2,gosh id half see first second wish sew could s...,0,0,0,0,0,0
3,uh speedy delete wikipediagnaa vote deletion p...,0,0,0,0,0,0
4,love tell block good job youre,1,0,0,0,0,0
...,...,...,...,...,...,...,...
31910,problem remove criticism personl attack need a...,0,0,0,0,0,0
31911,archival see wparchive free bird,0,0,0,0,0,0
31912,aluminum come even aluminum safe overly empowe...,0,0,0,0,0,0
31913,please write issue article talk page personal ...,0,0,0,0,0,0


In [16]:
import numpy as np
# Calculate accuracy and F1 score for the model with subword embeddings
from sklearn.metrics import accuracy_score, f1_score

# Ground-truth labels of the held-out split. They are deliberately NOT passed
# through `threshold`: that value is a decision cut-off for predicted
# probabilities, and the truth must not change when it is tuned.
y_test_true = test[y_list].values.astype(int)

# For multi-label targets accuracy_score is the exact-match (subset) accuracy:
# a comment counts as correct only if all of its labels are right.
accuracy_with_subword = accuracy_score(y_test_true, y_test_with_subword)*100
# Micro-averaging pools every (comment, label) decision before computing F1.
f1_score_with_subword = f1_score(y_test_true, y_test_with_subword, average='micro')*100

# Print accuracy and F1 score
print("Accuracy (with subword embeddings):", accuracy_with_subword)
print("F1 Score (with subword embeddings):", f1_score_with_subword)


Accuracy (with subword embeddings): 87.68290772364092
F1 Score (with subword embeddings): 68.15093908774334
