In [1]:
# ========================
# 1. Imports and Setup
# ========================
import pandas as pd, numpy as np, re, html, unicodedata
from pathlib import Path
import matplotlib.pyplot as plt

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import layers, models, callbacks

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, f1_score
import joblib

# Record the TensorFlow version in the cell output so the run can be reproduced later.
print("TensorFlow:", tf.__version__)


TensorFlow: 2.19.0


In [2]:
# ========================
# 2. Load Data
# ========================
train = pd.read_csv("train.csv")
test  = pd.read_csv("test.csv")


def _first_present(columns, candidates, role):
    """Return the first name in `candidates` that occurs in `columns`.

    Raises:
        KeyError: if none of the candidates is present. Failing here gives a
            clear message instead of a bare KeyError when the column is first
            indexed in a later cell.
    """
    for name in candidates:
        if name in columns:
            return name
    raise KeyError(
        f"No {role} column found in train.csv: expected one of "
        f"{list(candidates)}, got {list(columns)}"
    )


# Detect text/label columns automatically (candidates are tried in order)
TEXT_COL = _first_present(train.columns, ("text", "comment_text"), "text")
LABEL_COL = _first_present(train.columns, ("label", "toxic"), "label")

print("Text column:", TEXT_COL)
print("Label column:", LABEL_COL)
train.head()


Text column: comment_text
Label column: toxic


Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [3]:
# ========================
# 3. Preprocess Text
# ========================
def normalize_text(s):
    """Normalize a raw comment into lowercase tokens separated by single spaces.

    Steps, in order:
      1. Missing values (None / float NaN) become the empty string.
      2. HTML entities are unescaped and the text is NFKC-normalized.
      3. The text is lowercased *after* NFKC, because compatibility characters
         can expand to uppercase ASCII (e.g. the trademark sign becomes "TM"),
         which the character filter below would otherwise discard.
      4. Typographic apostrophes are mapped to "'" so contractions stay intact.
      5. URLs are replaced by the token "url" and @mentions by "user".
      6. Everything outside [a-z0-9' ] is replaced by a space and runs of
         whitespace are collapsed.

    Args:
        s: Any value; non-strings are converted with str().

    Returns:
        The normalized string (possibly empty).
    """
    # NaN is the only float that is not equal to itself
    if s is None or (isinstance(s, float) and s != s):
        return ""
    s = html.unescape(str(s))
    s = unicodedata.normalize("NFKC", s).lower()
    # NFKC leaves curly quotes untouched; fold them so "you’re" == "you're"
    s = s.replace("\u2019", "'").replace("\u2018", "'")
    # Require "://" or a leading word boundary + "www." so that ordinary words
    # containing "www" (e.g. "awwww") are not mistaken for URLs
    s = re.sub(r"https?://\S+|\bwww\.\S+", " url ", s)
    s = re.sub(r"@\w+", " user ", s)
    s = re.sub(r"[^a-z0-9' ]+", " ", s)
    return re.sub(r"\s+", " ", s).strip()

print(normalize_text("You're THE WORST!!! Visit http://abc.com"))


you're the worst visit url


In [4]:
# ========================
# 4. Train/Val Split
# ========================
# Missing comments are filled with "" first; a bare .astype(str) would turn
# NaN into the literal string "nan", which would then be learned as a word.
X = train[TEXT_COL].fillna("").astype(str).apply(normalize_text)
y = train[LABEL_COL]

# Stratify on the label so both splits keep the same class ratio
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

print("Train size:", len(X_train), "Val size:", len(X_val))


Train size: 127656 Val size: 31915


In [5]:
# ========================
# 5. Tokenization
# ========================
MAX_WORDS = 20000  # passed to the tokenizer as num_words and to the embedding as its input size
MAX_LEN = 120      # width of every padded sequence

# The vocabulary is fitted on the training split only
tok = Tokenizer(oov_token="<OOV>", num_words=MAX_WORDS)
tok.fit_on_texts(X_train)


def to_seq(texts):
    """Map texts to integer id sequences with the fitted tokenizer, post-padded to MAX_LEN."""
    token_ids = tok.texts_to_sequences(texts)
    return pad_sequences(token_ids, maxlen=MAX_LEN, padding="post")


Xtr_seq = to_seq(X_train)
Xval_seq = to_seq(X_val)


In [6]:
# ========================
# 6. Model (BiLSTM)
# ========================
# Seed the Python, NumPy and TensorFlow RNGs before any weights are created so
# that initialization and dropout are repeatable across kernel restarts
# (42 matches the random_state used for the train/validation split).
tf.keras.utils.set_random_seed(42)

model = models.Sequential([
    # Explicit input spec: replaces Embedding's deprecated `input_length`
    # argument and builds the model so summary() can report shapes and
    # parameter counts.
    layers.Input(shape=(MAX_LEN,)),
    layers.Embedding(MAX_WORDS, 128),
    # return_sequences=True keeps one output per timestep for the pooling layer
    layers.Bidirectional(layers.LSTM(64, return_sequences=True)),
    layers.GlobalMaxPool1D(),
    layers.Dense(64, activation="relu"),
    layers.Dropout(0.3),
    # Single sigmoid unit: binary classification probability
    layers.Dense(1, activation="sigmoid")
])

model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])
model.summary()




In [7]:
# ========================
# 7. Train
# ========================
# Early stopping with a patience of 2 epochs; the best weights are restored when it triggers
es = callbacks.EarlyStopping(restore_best_weights=True, patience=2)
# Checkpoint that only overwrites the file when the monitored metric improves
ck = callbacks.ModelCheckpoint(filepath="model_bilstm.keras", save_best_only=True)

hist = model.fit(
    x=Xtr_seq,
    y=y_train,
    batch_size=128,
    epochs=5,
    validation_data=(Xval_seq, y_val),
    callbacks=[es, ck],
)


Epoch 1/5
[1m998/998[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m135s[0m 133ms/step - accuracy: 0.9367 - loss: 0.1927 - val_accuracy: 0.9639 - val_loss: 0.0977
Epoch 2/5
[1m998/998[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m139s[0m 140ms/step - accuracy: 0.9685 - loss: 0.0850 - val_accuracy: 0.9638 - val_loss: 0.1007
Epoch 3/5
[1m998/998[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m140s[0m 141ms/step - accuracy: 0.9750 - loss: 0.0660 - val_accuracy: 0.9622 - val_loss: 0.1138


In [8]:
# ========================
# 8. Evaluation
# ========================
# Reload the checkpoint written during training before scoring
model.load_weights("model_bilstm.keras")

# Predicted probabilities on the validation split, thresholded at 0.5 into 0/1 labels
val_probs = model.predict(Xval_seq)
preds = (val_probs >= 0.5).astype(int)

report = classification_report(y_val, preds)
print(report)
print("F1 Score:", f1_score(y_val, preds))


[1m998/998[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 11ms/step
              precision    recall  f1-score   support

           0       0.97      0.99      0.98     28856
           1       0.88      0.72      0.79      3059

    accuracy                           0.96     31915
   macro avg       0.93      0.85      0.89     31915
weighted avg       0.96      0.96      0.96     31915

F1 Score: 0.7922827262892175


In [9]:
# ========================
# 9. Save Artifacts
# ========================
# Settings stored next to the model: text column name, sequence length and decision threshold
meta = dict(text_col=TEXT_COL, max_len=MAX_LEN, threshold=0.5)

# Persist the fitted tokenizer, the metadata and the trained model
joblib.dump(tok, "tokenizer.joblib")
joblib.dump(meta, "meta.joblib")
model.save("model_bilstm.keras")

print("Artifacts saved: model_bilstm.keras, tokenizer.joblib, meta.joblib")


Artifacts saved: model_bilstm.keras, tokenizer.joblib, meta.joblib
