<a href="https://colab.research.google.com/github/VellummyilumVinoth/Toxic_Comment_Classification/blob/main/LSTM_without_subword.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Attach Google Drive to the Colab runtime so that the dataset and the output
# paths under /content/drive can be read and written by the later cells.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [2]:
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Dropout
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.models import Model
from keras.callbacks import EarlyStopping
import pandas as pd

In [14]:
# Load the already-preprocessed training CSV and hold out 20% of the rows for
# evaluation. No further preprocessing happens in this cell.
from sklearn.model_selection import train_test_split

# Fixed seed: without it every run produces a different split, so the null
# counts, the fitted vocabulary and the reported metrics are not reproducible.
SEED = 42

full_dataset = pd.read_csv('/content/drive/MyDrive/Dats/Kaggle/pre_data_train.csv')

# `dataset` is the 80% training portion, `test` the 20% hold-out.
dataset, test = train_test_split(full_dataset, test_size=0.2, random_state=SEED)


In [15]:
# Per-column count of missing values in the training split.
dataset.isna().sum()

id                    0
comment_text          0
toxic                 0
severe_toxic          0
obscene               0
threat                0
insult                0
identity_hate         0
preprocessed_text    41
dtype: int64

In [19]:
# Target columns, in the order used for the label matrix and for every
# per-label output further down.
y_list = ["toxic","severe_toxic", "obscene", "threat", "insult", "identity_hate"]

# Model input: the cleaned comment text of the training split.
sentiment = dataset['preprocessed_text'].to_numpy()

# Multi-label targets: one column per entry of y_list.
y = dataset[y_list].to_numpy()

sentiment

array(['never delete bike bite like',
       'username contact dewiki maybe steward let know steal username question please tell contact thank',
       'finally encyclopedically relevant look something actually good like edward morgan article scottish literature midthcentury britannicas example would conclusion superb essay intellectual level seldom scale wikipedia presumably lack aspiration rather competence problem english scot one still work difficulty part whole cultural situation scottish sense separateness real swamp english dominant prose write educate speech scottish poet th century therefore like shaman must try many disguise find one best prophesy persuade',
       ...,
       'support least get rid flash funk part title he special case he wwf use flash funk moniker cold scorpio scorpio far well know ring name likely never use cold scorpio majority career wcw ecw japan run still think change cold scorpio regardless name take make onscreen appearance wwe',
       'server go te

In [20]:
# Cleaned comment text of the held-out split, used later for prediction.
predict_data = test['preprocessed_text'].to_numpy()

In [23]:
# Peek at the held-out texts; NumPy abbreviates the middle of a long array.
predict_data

array(['ill look also could get picture cross section soil horizon eh',
       'svg issue quite welcome friend',
       'tan wikipedias worker whatever tan go hell grow ball get girlfriend rather spend life wikipedia',
       ...,
       'thank reply original comment certainly label anyone thats write hint prejudice however doesnt much appeal block base assumption intentionally try manipulate post criticize edit true browse different website time write one sentence wrong place switch browser tab sentence make sense comment',
       'cfm nomination u state subcategories categorymembers constitution party unite state consider merge parent category encourage join discussion category discussion page',
       'gay person earth boing say zebedee'], dtype=object)

In [25]:
# Normalise the text arrays before tokenising.
# Missing `preprocessed_text` entries arrive as float NaN. Map them to the
# empty string instead of the literal word "nan", which would otherwise be
# learned as a vocabulary token. Going through pandas keeps ordinary Python
# strings (object dtype) rather than a fixed-width unicode array padded to the
# longest comment, and avoids relying on `np`, which is only imported in a
# later cell of this notebook.
sentiment = pd.Series(sentiment, dtype=object).fillna("").astype(str).to_numpy()
predict_data = pd.Series(predict_data, dtype=object).fillna("").astype(str).to_numpy()

VOCAB_SIZE = 20000  # keep only the most frequent words
MAX_LEN = 100       # padded sequence length; must match the model's input length (100)

# Fit the vocabulary on the training texts only, then encode both splits with
# that same vocabulary so the held-out data does not leak into it.
tokenizer = Tokenizer(num_words=VOCAB_SIZE)
tokenizer.fit_on_texts(list(sentiment))

seq = tokenizer.texts_to_sequences(sentiment)
pad = pad_sequences(seq, maxlen=MAX_LEN)

test_seq = tokenizer.texts_to_sequences(predict_data)
test_pad = pad_sequences(test_seq, maxlen=MAX_LEN)

In [26]:
from keras.layers import Reshape

# Define the model architecture without subword embeddings
def model_without_subword_embedding(max_len=100, num_labels=6, lstm_units=50,
                                    dense_units=50, dropout_rate=0.1):
    """Build and compile the LSTM multi-label classifier.

    The network takes a padded sequence of token ids, reshapes it to
    ``(max_len, 1)`` so every id is a single scalar feature per time step,
    runs one LSTM layer, then a ReLU dense layer, and finishes with one
    independent sigmoid unit per label.

    Calling the function with no arguments builds exactly the original
    architecture (input length 100, 50 LSTM units, 50 dense units, dropout
    0.1, 6 outputs).

    NOTE(review): there is no Embedding layer, so the LSTM receives the raw
    vocabulary indices as numeric magnitudes. Confirm this is the intended
    baseline before comparing it against embedding-based variants.

    Args:
        max_len: Length of the padded input sequences.
        num_labels: Number of sigmoid outputs (one per target label).
        lstm_units: Size of the LSTM hidden state.
        dense_units: Width of the hidden dense layer.
        dropout_rate: Dropout applied after the LSTM and after the dense layer.

    Returns:
        A compiled Keras ``Model`` using binary cross-entropy loss, the Adam
        optimizer and the ``accuracy`` metric.
    """
    inputs = Input(shape=(max_len,))
    # The LSTM expects (timesteps, features); give each token id its own
    # one-dimensional feature vector.
    x = Reshape((max_len, 1))(inputs)
    x = LSTM(lstm_units)(x)
    x = Dropout(dropout_rate)(x)
    x = Dense(dense_units, activation="relu")(x)
    x = Dropout(dropout_rate)(x)
    # Sigmoid (not softmax): the labels are not mutually exclusive.
    outputs = Dense(num_labels, activation="sigmoid")(x)
    model = Model(inputs=inputs, outputs=outputs)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

# Build one instance to inspect layer shapes and parameter counts. Note that
# this instance is distinct from `model_without_subword`, which is the one
# that actually gets trained.
model = model_without_subword_embedding()
# summary() prints the table itself and returns None, so wrapping it in
# print() only adds a stray "None" line to the output.
model.summary()

Model: "model_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_3 (InputLayer)        [(None, 100)]             0         
                                                                 
 reshape_2 (Reshape)         (None, 100, 1)            0         
                                                                 
 lstm_2 (LSTM)               (None, 50)                10400     
                                                                 
 dropout_4 (Dropout)         (None, 50)                0         
                                                                 
 dense_4 (Dense)             (None, 50)                2550      
                                                                 
 dropout_5 (Dropout)         (None, 50)                0         
                                                                 
 dense_5 (Dense)             (None, 6)                 306 

In [27]:
# Fit a fresh instance of the network on the padded training sequences,
# holding back the last 10% of the rows for validation.
model_without_subword = model_without_subword_embedding()

# Early stopping watches the validation loss.
# NOTE(review): patience (20) is larger than the number of epochs (2), so this
# callback cannot trigger in the run configured below.
early = EarlyStopping(patience=20, mode="min", monitor="val_loss")

model_without_subword.fit(
    pad,
    y,
    epochs=2,
    batch_size=32,
    validation_split=0.1,
    callbacks=[early],
)


Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7f61efc4f4c0>

In [28]:
# Persist the *trained* network. `model` is the separate, never-fitted
# instance that was built only for the summary; saving it would store
# randomly initialised weights.
model_without_subword.save('/content/drive/MyDrive/LSTM_without_subword')



In [62]:
# Make predictions on the test data without subword embeddings.
# The raw per-label probabilities are kept in their own variable so that the
# threshold can be changed without running the model again.
y_prob_without_subword = model_without_subword.predict(test_pad, batch_size=1024, verbose=1)

# Apply threshold and convert to 0 or 1
threshold = 0.16  # Adjust this threshold as needed
# A single comparison gives the 0/1 matrix directly; any NaN probability
# becomes 0 instead of going through an undefined float-to-int cast.
y_test_without_subword = (y_prob_without_subword >= threshold).astype(int)

# Predicted labels, one column per entry of y_list
predict_labels_df_without_subword = pd.DataFrame(y_test_without_subword, columns=y_list)

# The input texts that were scored (stored under the column name "Title")
predict_data_df = pd.DataFrame(predict_data, columns=["Title"])

# Put each text next to its predicted labels; both frames use the default
# 0..n-1 index, so the column-wise concat lines the rows up by position.
predict_df_without_subword = pd.concat([predict_data_df, predict_labels_df_without_subword], axis=1)

# Save the DataFrame to a CSV file
predict_df_without_subword.to_csv('/content/drive/MyDrive/predict_lstm_without_subword.csv', index=False)





In [63]:
# Display the combined text + predicted-label table built in the previous cell.
predict_df_without_subword

Unnamed: 0,Title,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,ill look also could get picture cross section ...,1,0,0,0,0,0
1,svg issue quite welcome friend,1,0,0,0,0,0
2,tan wikipedias worker whatever tan go hell gro...,0,0,0,0,0,0
3,hmm doesnt seem mark top box could swear one d...,0,0,0,0,0,0
4,dubious claim claim russia invent lightning ro...,0,0,0,0,0,0
...,...,...,...,...,...,...,...
31910,agree problem holyroman keep remove bosniak hi...,1,0,0,0,0,0
31911,hey buddy haha like keep wikipedia nasty gram ...,0,0,0,0,0,0
31912,thank reply original comment certainly label a...,0,0,0,0,0,0
31913,cfm nomination u state subcategories categorym...,0,0,0,0,0,0


In [64]:
from sklearn.metrics import accuracy_score, f1_score, hamming_loss
import numpy as np

# Ground-truth label matrix for the held-out split, columns ordered as in
# y_list. The labels are used as they are: they must NOT be passed through the
# prediction threshold, because tuning that cut-off to <= 0 or > 1 would
# silently rewrite the truth to all ones or all zeros.
# Assumes the label columns already hold 0/1 values - TODO confirm.
y_test_true = test[y_list].values.astype(int)

# Calculate accuracy and F1 score for the model without subword embeddings.
# For multi-label input accuracy_score is *subset* accuracy: a row counts as
# correct only when all six labels match. That explains why it can look high
# (most comments carry no label at all) while the micro-averaged F1, which is
# computed over the individual positive label decisions, stays low.
accuracy_without_subword = accuracy_score(y_test_true, y_test_without_subword) * 100
f1_score_without_subword = f1_score(y_test_true, y_test_without_subword, average='micro') * 100

# Print accuracy and F1 score
print("Accuracy (without subword embeddings):", accuracy_without_subword)
print("F1 Score (without subword embeddings):", f1_score_without_subword)


Accuracy (without subword embeddings): 75.14021619927934
F1 Score (without subword embeddings): 17.110820981491432
