<a href="https://colab.research.google.com/github/VellummyilumVinoth/Toxic_Comment_Classification/blob/main/LSTM_without_subword.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Mount Google Drive at /content/drive; the CSV read and the model/CSV writes
# further down all use paths under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Dropout
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.models import Model
from keras.callbacks import EarlyStopping
import pandas as pd

In [3]:
# Load the Kaggle training CSV and hold out 20% of the rows for evaluation.
from sklearn.model_selection import train_test_split

dataset = pd.read_csv('/content/drive/MyDrive/Dats/Kaggle/train.csv')

# Fix the seed so the train/test partition -- and therefore every prediction and
# metric computed from it -- is identical on each Restart & Run All.
dataset, test = train_test_split(dataset, test_size=0.2, random_state=42)


In [4]:
# Per-column count of missing values in the training split.
dataset.isnull().sum()

id               0
comment_text     0
toxic            0
severe_toxic     0
obscene          0
threat           0
insult           0
identity_hate    0
dtype: int64

In [5]:
# Label columns, in the order they appear in the target matrix `y`.
y_list = [
    "toxic",
    "severe_toxic",
    "obscene",
    "threat",
    "insult",
    "identity_hate",
]

# Model input is the raw comment text; the target has one column per entry of y_list.
sentiment, y = dataset['comment_text'].values, dataset[y_list].values

# Echo the text array as a quick visual check.
sentiment

array(['I do not understand! \n\nWhat is it that is going on here? I am trying to collaborate in writing the story! Who is this any why would it be? Help!',
       '"\nWell our fine friend Fountains has finally managed to write something more or less NPOV. Round 19:00 Wikipedia time, matter of fact. He was being a massive jerk to other editors, and there does come a time when you have to say ""stop the bullshit now."" People had been asking nicely for years. I think my work here may be done, but I\'ll check in now and again to make sure he\'s not turning the page back into edisontechcenter.com.  "',
       "BC, AD, BCE, CE, AH, etc \nThere has been lots of debate on what date format to use, and Wikipedia has been unable to agree on a standard. Because of this, new articles can use whatever date format is appropriate, but they ask that no one go through and change a bunch of articles. \n\nAbout Arabic -> English translators, I've wanted one of those for a long time. Apparently there isn

In [6]:
# Raw comment text of the held-out split; this is what the model is asked to score later.
predict_data = test['comment_text'].values

In [7]:
# Build the vocabulary from the training comments only (capped at 20000 words);
# the held-out comments are encoded with that same fitted tokenizer.
tokenizer = Tokenizer(num_words=20000)
tokenizer.fit_on_texts(list(sentiment))

# Encode both splits as integer sequences.
seq = tokenizer.texts_to_sequences(sentiment)
test_seq = tokenizer.texts_to_sequences(predict_data)

# Bring every sequence to a fixed length of 100 so it matches the model input.
pad = pad_sequences(seq, maxlen=100)
test_pad = pad_sequences(test_seq, maxlen=100)

In [8]:
from keras.layers import Reshape

# Define the model architecture without subword embeddings
def model_without_subword_embedding(max_len=100, lstm_units=50, dense_units=50,
                                    dropout_rate=0.1, num_labels=6):
    """Build and compile the LSTM classifier that has no embedding layer.

    Called with no arguments, this produces exactly the original architecture:
    Input(100) -> Reshape(100, 1) -> LSTM(50) -> Dropout(0.1) -> Dense(50, relu)
    -> Dropout(0.1) -> Dense(6, sigmoid).

    Args:
        max_len: Length of each padded input sequence (must match the
            ``maxlen`` used when padding the data).
        lstm_units: Number of units in the LSTM layer.
        dense_units: Number of units in the hidden Dense layer.
        dropout_rate: Dropout rate applied after the LSTM and after the hidden
            Dense layer.
        num_labels: Number of independent sigmoid outputs (one per label).

    Returns:
        A compiled Keras ``Model`` using binary cross-entropy loss, the Adam
        optimizer and the ``accuracy`` metric.

    NOTE(review): the integer token ids are fed to the LSTM directly as one
    scalar feature per time step, so their numeric magnitude is treated as
    signal. Confirm this is the intended baseline rather than a missing
    word-level Embedding layer.
    """
    inputs = Input(shape=(max_len,))
    # Add a trailing feature axis: the LSTM expects (timesteps, features).
    x = Reshape((max_len, 1))(inputs)
    x = LSTM(lstm_units)(x)
    x = Dropout(dropout_rate)(x)
    x = Dense(dense_units, activation="relu")(x)
    x = Dropout(dropout_rate)(x)
    # Independent sigmoid per label: the task is multi-label, not multi-class.
    outputs = Dense(num_labels, activation="sigmoid")(x)
    model = Model(inputs=inputs, outputs=outputs)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

# Build one instance purely to inspect the layer stack. `summary()` prints the
# table itself and returns None, so wrapping it in print() only adds a stray "None".
model = model_without_subword_embedding()
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 100)]             0         
                                                                 
 reshape (Reshape)           (None, 100, 1)            0         
                                                                 
 lstm (LSTM)                 (None, 50)                10400     
                                                                 
 dropout (Dropout)           (None, 50)                0         
                                                                 
 dense (Dense)               (None, 50)                2550      
                                                                 
 dropout_1 (Dropout)         (None, 50)                0         
                                                                 
 dense_1 (Dense)             (None, 6)                 306   

In [9]:
# Fit a freshly built network on the padded training sequences, keeping 10% of
# them aside for validation.
model_without_subword = model_without_subword_embedding()

# Stop when validation loss stops improving. Note that patience (20) exceeds the
# number of epochs requested below (2), so this callback cannot fire in this run.
early = EarlyStopping(monitor="val_loss", mode="min", patience=20)

model_without_subword.fit(
    pad,
    y,
    batch_size=32,
    epochs=2,
    validation_split=0.1,
    callbacks=[early],
)


Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7fa3f8d43b50>

In [10]:
# Persist the network that was actually fitted. `model` is a separate,
# never-trained instance that was built only to print the architecture summary,
# so saving it would store random initial weights.
model_without_subword.save('/content/drive/MyDrive/LSTM_without_subword')



In [11]:
# Score the held-out comments: one probability per label per comment.
y_prob_without_subword = model_without_subword.predict([test_pad], batch_size=1024, verbose=1)

# Binarise in one vectorised step (probability >= threshold -> 1, otherwise 0).
# This keeps the raw probabilities intact instead of overwriting them in place.
threshold = 0.5  # Adjust this threshold as needed
y_test_without_subword = (y_prob_without_subword >= threshold).astype(int)

# Predicted labels, one column per entry of y_list.
predict_labels_df_without_subword = pd.DataFrame(y_test_without_subword, columns=y_list)

# The comment text that was scored. The column name "Title" is kept as-is so the
# layout of the exported CSV does not change.
predict_data_df = pd.DataFrame(predict_data, columns=["Title"])

# Both frames were built from plain arrays and share the default 0..n-1 index,
# so a column-wise concat lines each comment up with its own predictions.
predict_df_without_subword = pd.concat([predict_data_df, predict_labels_df_without_subword], axis=1)

# Save the combined frame to Drive.
predict_df_without_subword.to_csv('/content/drive/MyDrive/predict_lstm_without_subword.csv', index=False)





In [12]:
# Display the scored comments alongside their predicted labels.
predict_df_without_subword

Unnamed: 0,Title,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,"""\n\nThanks Severa. Due largely to your patien...",0,0,0,0,0,0
1,"I agree with r-b j, they actually believe them...",0,0,0,0,0,0
2,I am being slandered and defamed by Delicious ...,0,0,0,0,0,0
3,"""\nYou haven't been paying attention. I don't ...",0,0,0,0,0,0
4,The laws given on this page are false.\n\nRayl...,0,0,0,0,0,0
...,...,...,...,...,...,...,...
31910,"""::::::::::::I would suppose Kumioko does not ...",0,0,0,0,0,0
31911,"LoL Breein1007 you little troll. Tell me, wit...",0,0,0,0,0,0
31912,"Me to, it's so sad though.\n\nIt would be nice...",0,0,0,0,0,0
31913,I'm taking that as a compliment.,0,0,0,0,0,0


In [14]:
from sklearn.metrics import accuracy_score, f1_score, hamming_loss
import numpy as np

# Ground-truth labels for the held-out split, in the same column order as the predictions.
y_test_true = test[y_list].values

# Run the labels through the same threshold as the predictions so both sides are 0/1 integers.
y_test_true = np.where(y_test_true >= threshold, 1, 0)

# For 2-D multilabel input, accuracy_score is subset accuracy: a comment counts as
# correct only when all of its labels match. The micro-averaged F1 pools every
# (comment, label) decision instead. Both are reported as percentages.
accuracy_without_subword = 100 * accuracy_score(y_test_true, y_test_without_subword)
f1_score_without_subword = 100 * f1_score(
    y_test_true,
    y_test_without_subword,
    average='micro',
)

# Report both metrics.
print("Accuracy (without subword embeddings):", accuracy_without_subword)
print("F1 Score (without subword embeddings):", f1_score_without_subword)


Accuracy (without subword embeddings): 89.96083346388846
F1 Score (without subword embeddings): 2.8899277518062045
