<a href="https://colab.research.google.com/github/VaishnaviBairagoni/Natural-Language-Processing-NLP-/blob/main/(NLP-T-11-9-2025).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
# Preprocessing
import re
import nltk
import string
import nltk
nltk.download('punkt_tab')

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Download resources (only once)
nltk.download("punkt")
nltk.download("stopwords")
nltk.download("wordnet")

# Initialize tools
stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

def clean_text(text):
    # 1. Lowercase
    text = text.lower()

    # 2. Remove URLs
    text = re.sub(r"http\S+|www\S+|https\S+", "", text, flags=re.MULTILINE)

    # 3. Remove numbers
    text = re.sub(r"\d+", "", text)

    # 4. Remove punctuation
    text = text.translate(str.maketrans("", "", string.punctuation))

    # 5. Tokenization
    tokens = word_tokenize(text)

    # 6. Remove stopwords and lemmatize
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]

    # 7. Join back into a single string
    return " ".join(tokens)

# Example usage with your dataset
import pandas as pd

# Load dataset (adjust path if needed)
df = pd.read_csv("/content/tweets.csv")

# Apply preprocessing
df["clean"] = df["text"].apply(clean_text)

print(df[["text", "clean"]].head())


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


                                                text  \
0  Communal violence in Bhainsa, Telangana. "Ston...   
1  Telangana: Section 144 has been imposed in Bha...   
2  Arsonist sets cars ablaze at dealership https:...   
3  Arsonist sets cars ablaze at dealership https:...   
4  "Lord Jesus, your love brings freedom and pard...   

                                               clean  
0  communal violence bhainsa telangana stone pelt...  
1  telangana section imposed bhainsa january clas...  
2                 arsonist set car ablaze dealership  
3                 arsonist set car ablaze dealership  
4  lord jesus love brings freedom pardon fill hol...  


In [4]:
# Feature Extraction: TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

# Split into train and test
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    df["clean"].values,
    df["target"].values,
    test_size=0.2,
    stratify=df["target"].values,
    random_state=42
)

# TF-IDF Vectorizer with unigrams + bigrams
tfidf_vect = TfidfVectorizer(
    max_features=10000,      # keep top 10k features
    ngram_range=(1, 2),      # unigrams + bigrams
    min_df=2,                # ignore rare terms
    norm="l2",
    sublinear_tf=True
)

# Fit on training data, transform train and test
X_train_tfidf = tfidf_vect.fit_transform(X_train_raw)
X_test_tfidf = tfidf_vect.transform(X_test_raw)

print("TF-IDF train shape:", X_train_tfidf.shape)
print("TF-IDF test shape :", X_test_tfidf.shape)


TF-IDF train shape: (9096, 10000)
TF-IDF test shape : (2274, 10000)


In [5]:
#  Deep Learning Models: MLP, 1D-CNN, Bi-LSTM
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import (Input, Embedding, GlobalAveragePooling1D, Dense, Dropout,
                                     Conv1D, GlobalMaxPooling1D, SpatialDropout1D,
                                     LSTM, Bidirectional)
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
import pandas as pd
import joblib
import os

# Reproducibility
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)
tf.random.set_seed(RANDOM_STATE)

# Hyperparameters (tweak if needed)
MAX_NUM_WORDS = 20000
MAX_SEQUENCE_LENGTH = 60   # tweets are short
EMBEDDING_DIM = 100
BATCH_SIZE = 128
EPOCHS = 6

#Tokenize + Pad
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train_raw)   # X_train_raw should be list/array of cleaned strings

X_train_seq = tokenizer.texts_to_sequences(X_train_raw)
X_test_seq  = tokenizer.texts_to_sequences(X_test_raw)

X_train_pad = pad_sequences(X_train_seq, maxlen=MAX_SEQUENCE_LENGTH, padding="post", truncating="post")
X_test_pad  = pad_sequences(X_test_seq,  maxlen=MAX_SEQUENCE_LENGTH, padding="post", truncating="post")

vocab_size = min(MAX_NUM_WORDS, len(tokenizer.word_index) + 1)
print("Vocab size:", vocab_size)

# Save tokenizer for later use
joblib.dump(tokenizer, "tokenizer.joblib")

# Early stopping
es = EarlyStopping(monitor="val_loss", patience=2, restore_best_weights=True, verbose=1)

results = []

def eval_and_store(name, y_true, y_pred):
    acc = accuracy_score(y_true, y_pred)
    prec = precision_score(y_true, y_pred, zero_division=0)
    rec = recall_score(y_true, y_pred, zero_division=0)
    f1 = f1_score(y_true, y_pred, zero_division=0)
    print("\n" + "="*60)
    print(f"Model: {name}")
    print(f"Accuracy : {acc:.4f}")
    print(f"Precision: {prec:.4f}")
    print(f"Recall   : {rec:.4f}")
    print(f"F1-score : {f1:.4f}")
    print("\nClassification report:\n", classification_report(y_true, y_pred, digits=4))
    results.append({"model": name, "accuracy": acc, "precision": prec, "recall": rec, "f1": f1})

# Model 1: MLP on averaged embeddings
print("\nBuilding / training: MLP (avg embeddings)")
inputs = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype="int32")
emb = Embedding(vocab_size, EMBEDDING_DIM, input_length=MAX_SEQUENCE_LENGTH)(inputs)
x = GlobalAveragePooling1D()(emb)
x = Dense(64, activation="relu")(x)
x = Dropout(0.3)(x)
outputs = Dense(1, activation="sigmoid")(x)
mlp_model = Model(inputs, outputs)
mlp_model.compile(optimizer=Adam(1e-3), loss="binary_crossentropy", metrics=["accuracy"])
mlp_model.summary()
mlp_model.fit(X_train_pad, y_train, validation_split=0.1, epochs=EPOCHS, batch_size=BATCH_SIZE, callbacks=[es], verbose=1)

y_prob = mlp_model.predict(X_test_pad, verbose=0).ravel()
y_pred = (y_prob >= 0.5).astype(int)
eval_and_store("MLP (avg embeddings)", y_test, y_pred)

# Save model
mlp_model.save("mlp_avg_embeddings.h5")

# Model 2: 1D-CNN
print("\nBuilding / training: 1D-CNN")
cnn = Sequential()
cnn.add(Embedding(vocab_size, EMBEDDING_DIM, input_length=MAX_SEQUENCE_LENGTH))
cnn.add(Conv1D(128, kernel_size=5, activation="relu"))
cnn.add(GlobalMaxPooling1D())
cnn.add(Dense(64, activation="relu"))
cnn.add(Dropout(0.3))
cnn.add(Dense(1, activation="sigmoid"))
cnn.compile(optimizer=Adam(1e-3), loss="binary_crossentropy", metrics=["accuracy"])
cnn.summary()
cnn.fit(X_train_pad, y_train, validation_split=0.1, epochs=EPOCHS, batch_size=BATCH_SIZE, callbacks=[es], verbose=1)

y_prob = cnn.predict(X_test_pad, verbose=0).ravel()
y_pred = (y_prob >= 0.5).astype(int)
eval_and_store("1D-CNN", y_test, y_pred)

# Save model
cnn.save("cnn_text.h5")

# Model 3: Bidirectional LSTM
print("\nBuilding / training: Bidirectional LSTM")
lstm_model = Sequential()
lstm_model.add(Embedding(vocab_size, EMBEDDING_DIM, input_length=MAX_SEQUENCE_LENGTH))
lstm_model.add(SpatialDropout1D(0.2))
lstm_model.add(Bidirectional(LSTM(64, dropout=0.2, recurrent_dropout=0.2)))
lstm_model.add(Dense(64, activation="relu"))
lstm_model.add(Dropout(0.3))
lstm_model.add(Dense(1, activation="sigmoid"))
lstm_model.compile(optimizer=Adam(1e-3), loss="binary_crossentropy", metrics=["accuracy"])
lstm_model.summary()
lstm_model.fit(X_train_pad, y_train, validation_split=0.1, epochs=EPOCHS, batch_size=BATCH_SIZE, callbacks=[es], verbose=1)

y_prob = lstm_model.predict(X_test_pad, verbose=0).ravel()
y_pred = (y_prob >= 0.5).astype(int)
eval_and_store("Bidirectional LSTM", y_test, y_pred)

# Save model
lstm_model.save("bilstm_text.h5")

# Save results
res_df = pd.DataFrame(results).sort_values("f1", ascending=False).reset_index(drop=True)
res_df.to_csv("dl_results.csv", index=False)
print("\nSaved dl_results.csv")


Vocab size: 19399

Building / training: MLP (avg embeddings)




Epoch 1/6
[1m64/64[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 33ms/step - accuracy: 0.7644 - loss: 0.5270 - val_accuracy: 0.7912 - val_loss: 0.5078
Epoch 2/6
[1m64/64[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 57ms/step - accuracy: 0.8125 - loss: 0.4813 - val_accuracy: 0.7912 - val_loss: 0.5048
Epoch 3/6
[1m64/64[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 22ms/step - accuracy: 0.8125 - loss: 0.4749 - val_accuracy: 0.7912 - val_loss: 0.4988
Epoch 4/6
[1m64/64[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 22ms/step - accuracy: 0.8125 - loss: 0.4691 - val_accuracy: 0.7912 - val_loss: 0.4842
Epoch 5/6
[1m64/64[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 29ms/step - accuracy: 0.8125 - loss: 0.4478 - val_accuracy: 0.7912 - val_loss: 0.4494
Epoch 6/6
[1m64/64[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 22ms/step - accuracy: 0.8148 - loss: 0.4006 - val_accuracy: 0.7956 - val_loss: 0.4259
Restoring model weights from the end of 




Model: MLP (avg embeddings)
Accuracy : 0.8162
Precision: 1.0000
Recall   : 0.0118
F1-score : 0.0234

Classification report:
               precision    recall  f1-score   support

           0     0.8158    1.0000    0.8985      1851
           1     1.0000    0.0118    0.0234       423

    accuracy                         0.8162      2274
   macro avg     0.9079    0.5059    0.4610      2274
weighted avg     0.8500    0.8162    0.7357      2274


Building / training: 1D-CNN




Epoch 1/6
[1m64/64[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 89ms/step - accuracy: 0.8108 - loss: 0.5183 - val_accuracy: 0.7912 - val_loss: 0.4519
Epoch 2/6
[1m64/64[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 116ms/step - accuracy: 0.8448 - loss: 0.3361 - val_accuracy: 0.8923 - val_loss: 0.3210
Epoch 3/6
[1m64/64[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 108ms/step - accuracy: 0.9585 - loss: 0.1185 - val_accuracy: 0.8758 - val_loss: 0.4103
Epoch 4/6
[1m64/64[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 74ms/step - accuracy: 0.9880 - loss: 0.0416 - val_accuracy: 0.8857 - val_loss: 0.5223
Epoch 4: early stopping
Restoring model weights from the end of the best epoch: 2.





Model: 1D-CNN
Accuracy : 0.8786
Precision: 0.7094
Recall   : 0.5887
F1-score : 0.6434

Classification report:
               precision    recall  f1-score   support

           0     0.9095    0.9449    0.9269      1851
           1     0.7094    0.5887    0.6434       423

    accuracy                         0.8786      2274
   macro avg     0.8095    0.7668    0.7851      2274
weighted avg     0.8723    0.8786    0.8741      2274






Building / training: Bidirectional LSTM


Epoch 1/6
[1m64/64[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 257ms/step - accuracy: 0.8103 - loss: 0.5168 - val_accuracy: 0.8308 - val_loss: 0.3821
Epoch 2/6
[1m64/64[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 249ms/step - accuracy: 0.8895 - loss: 0.2826 - val_accuracy: 0.8901 - val_loss: 0.3098
Epoch 3/6
[1m64/64[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 264ms/step - accuracy: 0.9493 - loss: 0.1372 - val_accuracy: 0.8879 - val_loss: 0.3717
Epoch 4/6
[1m64/64[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 246ms/step - accuracy: 0.9741 - loss: 0.0737 - val_accuracy: 0.8714 - val_loss: 0.4709
Epoch 4: early stopping
Restoring model weights from the end of the best epoch: 2.





Model: Bidirectional LSTM
Accuracy : 0.8795
Precision: 0.7159
Recall   : 0.5839
F1-score : 0.6432

Classification report:
               precision    recall  f1-score   support

           0     0.9088    0.9471    0.9275      1851
           1     0.7159    0.5839    0.6432       423

    accuracy                         0.8795      2274
   macro avg     0.8124    0.7655    0.7854      2274
weighted avg     0.8729    0.8795    0.8746      2274


Saved dl_results.csv


In [6]:
# ---------------- Evaluation Section ----------------
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

all_results = []

def evaluate_model(name, y_true, y_pred):
    """Compute metrics, print report, and store in all_results list."""
    acc = accuracy_score(y_true, y_pred)
    prec = precision_score(y_true, y_pred, zero_division=0)
    rec = recall_score(y_true, y_pred, zero_division=0)
    f1 = f1_score(y_true, y_pred, zero_division=0)

    print("\n" + "="*60)
    print(f"Model: {name}")
    print(f"Accuracy : {acc:.4f}")
    print(f"Precision: {prec:.4f}")
    print(f"Recall   : {rec:.4f}")
    print(f"F1-score : {f1:.4f}")
    print("\nClassification Report:\n", classification_report(y_true, y_pred, digits=4))

    all_results.append({
        "model": name,
        "accuracy": acc,
        "precision": prec,
        "recall": rec,
        "f1": f1
    })

# -------- Logistic Regression (TF-IDF) --------
if "lr" in globals():
    y_pred_lr = lr.predict(X_test_tfidf)
    evaluate_model("Logistic Regression (TF-IDF)", y_test, y_pred_lr)

# -------- SVM (TF-IDF) --------
if "svc" in globals():
    y_pred_svc = svc.predict(X_test_tfidf)
    evaluate_model("SVM (TF-IDF)", y_test, y_pred_svc)

# -------- MLP --------
if "mlp_model" in globals():
    y_pred_mlp = (mlp_model.predict(X_test_pad, verbose=0).ravel() >= 0.5).astype(int)
    evaluate_model("MLP (avg embeddings)", y_test, y_pred_mlp)

# -------- 1D-CNN --------
if "cnn" in globals():
    y_pred_cnn = (cnn.predict(X_test_pad, verbose=0).ravel() >= 0.5).astype(int)
    evaluate_model("1D-CNN", y_test, y_pred_cnn)

# -------- Bi-LSTM --------
if "lstm_model" in globals():
    y_pred_lstm = (lstm_model.predict(X_test_pad, verbose=0).ravel() >= 0.5).astype(int)
    evaluate_model("Bi-LSTM", y_test, y_pred_lstm)

# -------- Save comparison summary --------
results_df = pd.DataFrame(all_results).sort_values("f1", ascending=False).reset_index(drop=True)
results_df.to_csv("evaluation_summary.csv", index=False)

print("\nSaved evaluation_summary.csv with all model metrics")
print("\nFinal Results Table:\n", results_df)



Model: MLP (avg embeddings)
Accuracy : 0.8162
Precision: 1.0000
Recall   : 0.0118
F1-score : 0.0234

Classification Report:
               precision    recall  f1-score   support

           0     0.8158    1.0000    0.8985      1851
           1     1.0000    0.0118    0.0234       423

    accuracy                         0.8162      2274
   macro avg     0.9079    0.5059    0.4610      2274
weighted avg     0.8500    0.8162    0.7357      2274


Model: 1D-CNN
Accuracy : 0.8786
Precision: 0.7094
Recall   : 0.5887
F1-score : 0.6434

Classification Report:
               precision    recall  f1-score   support

           0     0.9095    0.9449    0.9269      1851
           1     0.7094    0.5887    0.6434       423

    accuracy                         0.8786      2274
   macro avg     0.8095    0.7668    0.7851      2274
weighted avg     0.8723    0.8786    0.8741      2274


Model: Bi-LSTM
Accuracy : 0.8795
Precision: 0.7159
Recall   : 0.5839
F1-score : 0.6432

Classification Repo

In [7]:
# ---------------- Brief Analysis ----------------
import pandas as pd

# Load results
results_df = pd.read_csv("evaluation_summary.csv")

print("\n=== Brief Analysis ===")
print(results_df)

# Find best model by F1
best_model = results_df.loc[results_df["f1"].idxmax()]
print(f"\nBest model overall: {best_model['model']} with F1 = {best_model['f1']:.4f}")

# Separate classical ML and deep learning
ml_models = ["Logistic Regression (TF-IDF)", "SVM (TF-IDF)"]
dl_models = [m for m in results_df["model"].tolist() if m not in ml_models]

ml_f1 = results_df[results_df["model"].isin(ml_models)]["f1"].mean()
dl_f1 = results_df[results_df["model"].isin(dl_models)]["f1"].mean()

print(f"\nAverage F1 - Classical ML (TF-IDF): {ml_f1:.4f}")
print(f"Average F1 - Deep Learning (embeddings): {dl_f1:.4f}")

if dl_f1 > ml_f1:
    print("=> Deep learning outperformed classical ML overall.")
else:
    print("=> Classical ML held its ground against deep learning.")

# Compare architectures inside DL
if "Bi-LSTM" in results_df["model"].values:
    bilstm_f1 = results_df.loc[results_df["model"] == "Bi-LSTM", "f1"].values[0]
    cnn_f1 = results_df.loc[results_df["model"] == "1D-CNN", "f1"].values[0]
    mlp_f1 = results_df.loc[results_df["model"] == "MLP (avg embeddings)", "f1"].values[0]

    print(f"\nBi-LSTM F1: {bilstm_f1:.4f}")
    print(f"1D-CNN F1 : {cnn_f1:.4f}")
    print(f"MLP F1    : {mlp_f1:.4f}")

    if bilstm_f1 > max(cnn_f1, mlp_f1):
        print("=> Bi-LSTM captured sequential patterns better than CNN/MLP.")
    else:
        print("=> Bi-LSTM did not clearly outperform CNN/MLP.")



=== Brief Analysis ===
                  model  accuracy  precision    recall        f1
0                1D-CNN  0.878628   0.709402  0.588652  0.643411
1               Bi-LSTM  0.879507   0.715942  0.583924  0.643229
2  MLP (avg embeddings)  0.816183   1.000000  0.011820  0.023364

Best model overall: 1D-CNN with F1 = 0.6434

Average F1 - Classical ML (TF-IDF): nan
Average F1 - Deep Learning (embeddings): 0.4367
=> Classical ML held its ground against deep learning.

Bi-LSTM F1: 0.6432
1D-CNN F1 : 0.6434
MLP F1    : 0.0234
=> Bi-LSTM did not clearly outperform CNN/MLP.
