<a href="https://colab.research.google.com/github/Palak2506/CommentClassificationModel/blob/main/harmful_comment_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [12]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Bidirectional, Dense, Dropout
from sklearn.model_selection import train_test_split

In [15]:
# Import the dataset.
# Each CSV is bound to its own name so that no frame is thrown away by a later read.
test_df = pd.read_csv('test.csv')
test_labels_df = pd.read_csv('test_labels.csv')
train_df = pd.read_csv('train1.csv')
# `df` used to end up holding the contents of train1.csv; keep that binding as an alias.
df = train_df

In [16]:
# Read the three CSV files, in this order, into separately named frames.
test_df, test_labels_df, train_df = (
    pd.read_csv(csv_name)
    for csv_name in ("test.csv", "test_labels.csv", "train1.csv")
)

# Quick sanity check: show the first rows of the training frame.
print(train_df.head())

# Model input: the comment text, with missing values replaced by a single space.
X = train_df["comment_text"].fillna(" ")
# Model targets: every column from the third one onwards.
y = train_df.iloc[:, 2:]

# Report the shapes of the input and target objects.
print(f"Input Shape: {X.shape}")
print(f"Output Shape: {y.shape}")


                 id                                       comment_text  toxic  \
0  0000997932d777bf  Explanation\nWhy the edits made under my usern...      0   
1  000103f0d9cfb60f  D'aww! He matches this background colour I'm s...      0   
2  000113f07ec002fd  Hey man, I'm really not trying to edit war. It...      0   
3  0001b41b1c6bb37e  "\nMore\nI can't make any real suggestions on ...      0   
4  0001d958c54c6e35  You, sir, are my hero. Any chance you remember...      0   

   severe_toxic  obscene  threat  insult  identity_hate  
0             0        0       0       0              0  
1             0        0       0       0              0  
2             0        0       0       0              0  
3             0        0       0       0              0  
4             0        0       0       0              0  
Input Shape: (159571,)
Output Shape: (159571, 6)


In [17]:
# Define input and output columns
# Targets are selected by name rather than by position, so an unexpected extra
# or reordered column in the CSV cannot silently end up among the labels.
# A missing label column raises KeyError here instead of surfacing later.
LABEL_COLUMNS = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]

# Missing comments are replaced by a single space so every row is a string.
X = train_df["comment_text"].fillna(" ")
y = train_df[LABEL_COLUMNS]


In [18]:
# Hold out 20% of the rows for validation; the fixed random_state makes the
# partition repeatable between runs.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = split_parts

In [19]:
# Text preprocessing settings:
#   MAX_NUM_WORDS       - vocabulary size handed to the tokenizer
#   MAX_SEQUENCE_LENGTH - length every token sequence is padded to
MAX_NUM_WORDS, MAX_SEQUENCE_LENGTH = 20_000, 200


In [20]:
# Tokenization
# The vocabulary is fitted on the training comments only; the validation and
# test comments are then encoded with that same vocabulary.
tokenizer = Tokenizer(oov_token="<OOV>", num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(X_train)

X_train_seq, X_val_seq, X_test_seq = (
    tokenizer.texts_to_sequences(texts)
    for texts in (X_train, X_val, test_df["comment_text"].fillna(" "))
)

In [21]:
# Padding
def _pad_to_max_length(sequences):
    """Pad `sequences` at the end ('post') to MAX_SEQUENCE_LENGTH entries each."""
    return pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH, padding='post')


X_train_pad = _pad_to_max_length(X_train_seq)
X_val_pad = _pad_to_max_length(X_val_seq)
X_test_pad = _pad_to_max_length(X_test_seq)

In [22]:
# Define the LSTM model
# The input shape is declared with an explicit Input layer: the `input_length`
# argument of Embedding is deprecated in Keras 3 and is no longer passed.
model = Sequential([
    tf.keras.Input(shape=(MAX_SEQUENCE_LENGTH,)),
    Embedding(MAX_NUM_WORDS, 128),
    # return_sequences=True keeps the per-timestep outputs for the next recurrent layer
    Bidirectional(LSTM(64, return_sequences=True)),
    Dropout(0.5),
    # The second recurrent layer returns only its final output (one vector per comment)
    Bidirectional(LSTM(32)),
    Dense(128, activation='relu'),
    Dropout(0.5),
    Dense(6, activation='sigmoid')  # Multi-label classification: one sigmoid output per label
])



In [23]:
# Compile the model
# The string 'accuracy' is resolved by Keras from the output shape; with a
# 6-wide output it becomes categorical (argmax) accuracy, which is meaningless
# for independent multi-label targets. BinaryAccuracy scores each label
# separately (the metric keeps the name 'accuracy' so history keys are
# unchanged), and a per-label AUC is added because the labels are rare.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=[
        tf.keras.metrics.BinaryAccuracy(name='accuracy'),
        tf.keras.metrics.AUC(multi_label=True, name='auc'),
    ],
)

In [25]:
# Fit the network on the padded training sequences; the padded validation
# split is supplied as validation_data.
training_options = {
    "validation_data": (X_val_pad, y_val),
    "epochs": 5,
    "batch_size": 64,
    "verbose": 1,
}
history = model.fit(X_train_pad, y_train, **training_options)

Epoch 1/5
[1m 324/1995[0m [32m━━━[0m[37m━━━━━━━━━━━━━━━━━[0m [1m23:01[0m 827ms/step - accuracy: 0.9760 - loss: 0.0306

KeyboardInterrupt: 

In [26]:
# Evaluate on test set with available labels
# Rows that contain -1 in any label column are excluded.
test_labels_filtered = test_labels_df[(test_labels_df.iloc[:, 1:] != -1).all(axis=1)]
test_comments_filtered = test_df[test_df['id'].isin(test_labels_filtered['id'])]
X_test_filtered_seq = tokenizer.texts_to_sequences(test_comments_filtered["comment_text"].fillna(" "))
X_test_filtered_pad = pad_sequences(X_test_filtered_seq, maxlen=MAX_SEQUENCE_LENGTH, padding='post')

# Line the ground-truth labels up with the filtered comments by id, so that
# row i of the labels belongs to row i of the padded inputs regardless of the
# row order in the two CSV files.
# NOTE(review): assumes ids are unique in both files; the assert below trips otherwise.
y_test_filtered = (
    test_labels_filtered.set_index('id')
    .loc[test_comments_filtered['id'], test_labels_df.columns[1:]]
    .to_numpy()
)
assert len(y_test_filtered) == len(X_test_filtered_pad), "test labels and comments are misaligned"

# return_dict=True keeps this call valid however many metrics the model was compiled with.
test_metrics = model.evaluate(
    X_test_filtered_pad, y_test_filtered, batch_size=64, verbose=1, return_dict=True
)
print(test_metrics)


In [27]:
# Run the model over the padded, filtered test comments.
test_predictions = model.predict(x=X_test_filtered_pad)

[1m2000/2000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m223s[0m 111ms/step


In [28]:
# Binarise the predictions: values strictly above the cut-off become 1,
# everything else becomes 0.
threshold = 0.5
test_predictions_binary = np.greater(test_predictions, threshold).astype(int)

In [30]:
# Build the submission table: one column per label, named after the training targets.
submission_df = pd.DataFrame(test_predictions_binary, columns=y.columns)

# Every prediction row must carry its comment id. Fail loudly instead of
# writing an id-less file and then reporting success.
if 'id' not in test_comments_filtered.columns:
    raise KeyError("'id' column not found in test_comments_filtered")
submission_df.insert(0, 'id', test_comments_filtered['id'].values)

# Save predictions to CSV (relative path, i.e. the current working directory)
submission_file_path = "toxic_comment_predictions.csv"
submission_df.to_csv(submission_file_path, index=False)

print(f"Predictions saved successfully as {submission_file_path}")


Predictions saved successfully as toxic_comment_predictions.csv


In [31]:
# Final status line for the notebook run.
completion_message = "Model training and testing complete. Predictions saved."
print(completion_message)

Model training and testing complete. Predictions saved.


In [36]:
# Persist the model to disk under the file name below.
model.save(filepath="my_model.keras")