In [2]:
!pip install transformers datasets --quiet
!pip install torchtext --quiet


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/491.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.5/491.5 kB[0m [31m21.7 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/116.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/193.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m17.2 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/143.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m9.9 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [3]:
import pandas as pd
import numpy as np
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from transformers import BertTokenizer, BertModel, AutoTokenizer, AutoModel


In [9]:
import re

data = []
with open("/content/urdu-sentiment-corpus-v1.tsv.txt", encoding="utf-8") as f:
    lines = f.readlines()[1:]  # Skip header

    for line in lines:
        line = line.strip()
        if not line:
            continue
        # Match label as the last token: P or N
        match = re.match(r"^(.*)\s([PN])$", line)
        if match:
            text = match.group(1).strip()
            label = 1 if match.group(2) == 'P' else 0
            data.append((text, label))

# Create DataFrame
import pandas as pd
df = pd.DataFrame(data, columns=["text", "label"])
print("Number of samples:", len(df))
df.head()



Number of samples: 980


Unnamed: 0,text,label
0,میں نے ایٹم بم بنایا ھے ۔۔۔۔او بھائی ایٹم بمب ...,1
1,چندے سے انقلاب اور عمران خان وزیر اعظم نہیں بن...,0
2,"سرچ انجن گوگل کے نائب صدر نے فضا میں ، 130,000...",1
3,ابھی تک اسکی لہریں کبھی کبھی آ جاتی ہیں یار :أْ,1
4,گندی زبان اور گٹر جیسے دماغ والے جاهل جیالے هو...,0


In [10]:
X = df['text'].values
y = df['label'].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)


In [19]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from torch.utils.data import Dataset, DataLoader
import torch
import torch.nn as nn
import numpy as np

# Tokenize
tokenizer = Tokenizer(num_words=5000, oov_token='<OOV>')
tokenizer.fit_on_texts(X_train)

max_len = 50
vocab_size = len(tokenizer.word_index) + 1

# Dataset class
class UrduDataset(Dataset):
    def __init__(self, texts, labels):
        sequences = tokenizer.texts_to_sequences(texts)
        self.X = pad_sequences(sequences, maxlen=max_len, padding='post')
        self.y = labels

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return torch.tensor(self.X[idx], dtype=torch.long), torch.tensor(self.y[idx], dtype=torch.float)


In [20]:
train_data = UrduDataset(X_train, y_train)
test_data = UrduDataset(X_test, y_test)

train_loader = DataLoader(train_data, batch_size=32, shuffle=True)
test_loader = DataLoader(test_data, batch_size=32)


In [21]:
class RNNModel(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, output_dim, num_layers=1, dropout=0.3):
        super(RNNModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.rnn = nn.RNN(embed_dim, hidden_dim, num_layers=num_layers, batch_first=True, nonlinearity='tanh')
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        embedded = self.embedding(x)
        output, hidden = self.rnn(embedded)
        out = self.dropout(hidden[-1])  # Last hidden state
        out = self.fc(out)
        return self.sigmoid(out)


In [22]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = RNNModel(vocab_size, embed_dim=128, hidden_dim=64, output_dim=1).to(device)
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)


In [23]:
def train_model(model, train_loader, criterion, optimizer, device, epochs=10):
    model.train()
    for epoch in range(epochs):
        total_loss = 0
        for inputs, labels in train_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = model(inputs).squeeze()
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f"Epoch {epoch+1}/{epochs}, Loss: {total_loss:.4f}")


In [24]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def evaluate_model(model, test_loader, device):
    model.eval()
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for inputs, labels in test_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs).squeeze()
            probs = outputs.cpu().numpy()
            preds = (probs > 0.5).astype(int)
            all_preds.extend(preds)
            all_labels.extend(labels.cpu().numpy())

    acc = accuracy_score(all_labels, all_preds)
    prec = precision_score(all_labels, all_preds)
    rec = recall_score(all_labels, all_preds)
    f1 = f1_score(all_labels, all_preds)

    print(f"Accuracy: {acc:.4f}")
    print(f"Precision: {prec:.4f}")
    print(f"Recall: {rec:.4f}")
    print(f"F1 Score: {f1:.4f}")

    return acc, prec, rec, f1

train_model(model, train_loader, criterion, optimizer, device, epochs=10)
evaluate_model(model, test_loader, device)


Epoch 1/10, Loss: 15.9787
Epoch 2/10, Loss: 15.9843
Epoch 3/10, Loss: 15.9257
Epoch 4/10, Loss: 15.9551
Epoch 5/10, Loss: 15.9707
Epoch 6/10, Loss: 15.9572
Epoch 7/10, Loss: 15.9798
Epoch 8/10, Loss: 15.9583
Epoch 9/10, Loss: 15.9774
Epoch 10/10, Loss: 15.9451
Accuracy: 0.5143
Precision: 0.0000
Recall: 0.0000
F1 Score: 0.0000


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


(0.5142857142857142, 0.0, 0.0, 0.0)

In [2]:
#GRU
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GRU, Dense, Dropout, Bidirectional
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam

# Load and preprocess data
def load_data(file_path):
    data = []
    with open(file_path, encoding="utf-8") as f:
        lines = f.readlines()[1:]  # Skip header

        for line in lines:
            line = line.strip()
            if not line:
                continue
            # Match label as the last token: P or N
            match = re.match(r"^(.*)\s([PN])$", line)
            if match:
                text = match.group(1).strip()
                label = 1 if match.group(2) == 'P' else 0
                data.append((text, label))

    return pd.DataFrame(data, columns=["text", "label"])

# Load data
df = load_data("urdu-sentiment-corpus-v1.tsv.txt")

# Split data
X_train, X_test, y_train, y_test = train_test_split(
    df['text'], df['label'], test_size=0.25, random_state=42)

# Tokenization
max_words = 10000  # Maximum number of words to keep
max_len = 100      # Maximum sequence length

tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X_train)

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

X_train_pad = pad_sequences(X_train_seq, maxlen=max_len)
X_test_pad = pad_sequences(X_test_seq, maxlen=max_len)

# GRU Model
embedding_dim = 128
gru_units = 64

model = Sequential([
    Embedding(input_dim=max_words, output_dim=embedding_dim, input_length=max_len),
    GRU(units=gru_units, dropout=0.2, recurrent_dropout=0.2),
    Dense(1, activation='sigmoid')
])

# Compile model
model.compile(
    optimizer=Adam(learning_rate=0.001),
    loss='binary_crossentropy',
    metrics=['accuracy']
)

# Early stopping
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Train model
history = model.fit(
    X_train_pad, y_train,
    validation_split=0.2,
    epochs=10,
    batch_size=32,
    callbacks=[early_stopping]
)

# Evaluate
y_pred = (model.predict(X_test_pad) > 0.5).astype("int32")

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-score: {f1:.4f}")

# Save model for later use
model.save('urdu_sentiment_gru.h5')

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Accuracy: 0.5429
Precision: 0.5347
Recall: 0.4538
F1-score: 0.4909


In [4]:
# LSTM

import tensorflow as tf
from tensorflow.keras.layers import LSTM

# Reusing  the same preprocessing from GRU (X_train_pad, X_test_pad, y_train, y_test)
# Load and preprocess data
def load_data(file_path):
    data = []
    with open(file_path, encoding="utf-8") as f:
        lines = f.readlines()[1:]  # Skip header

        for line in lines:
            line = line.strip()
            if not line:
                continue
            # Match label as the last token: P or N
            match = re.match(r"^(.*)\s([PN])$", line)
            if match:
                text = match.group(1).strip()
                label = 1 if match.group(2) == 'P' else 0
                data.append((text, label))

    return pd.DataFrame(data, columns=["text", "label"])

# Load data
df = load_data("urdu-sentiment-corpus-v1.tsv.txt")

# Split data
X_train, X_test, y_train, y_test = train_test_split(
    df['text'], df['label'], test_size=0.25, random_state=42)

# Tokenization
max_words = 10000  # Maximum number of words to keep
max_len = 100      # Maximum sequence length

tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X_train)

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

X_train_pad = pad_sequences(X_train_seq, maxlen=max_len)
X_test_pad = pad_sequences(X_test_seq, maxlen=max_len)

# LSTM Model
lstm_model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=max_words, output_dim=256, input_length=max_len),
    tf.keras.layers.LSTM(128, dropout=0.3, recurrent_dropout=0.3),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

lstm_model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.0001),
    loss='binary_crossentropy',
    metrics=['accuracy']
)

history = lstm_model.fit(
    X_train_pad, y_train,
    validation_split=0.2,
    epochs=15,
    batch_size=32,
    callbacks=[tf.keras.callbacks.EarlyStopping(patience=3)]
)

# Evaluate
y_pred = (lstm_model.predict(X_test_pad) > 0.5).astype("int32")
print("LSTM Metrics:")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(f"F1-score: {f1_score(y_test, y_pred):.4f}")
print(f"Precision: {precision_score(y_test, y_pred):.4f}")
print(f"recall_score: {recall_score(y_test, y_pred):.4f}")


Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
LSTM Metrics:
Accuracy: 0.5959
F1-score: 0.4975
Precision: 0.6282
recall_score: 0.4118


In [5]:
# BiLSTM Model
bilstm_model = tf.keras.Sequential([
    tf.keras.layers.Embedding(max_words, 256, input_length=max_len),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(128, return_sequences=True, dropout=0.3)),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, dropout=0.3)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

bilstm_model.compile(
    optimizer=tf.keras.optimizers.Adam(0.0001),
    loss='binary_crossentropy',
    metrics=['accuracy']
)

history = bilstm_model.fit(
    X_train_pad, y_train,
    validation_split=0.2,
    epochs=15,
    batch_size=32,
    callbacks=[tf.keras.callbacks.EarlyStopping(patience=3)]
)

# Evaluate
y_pred = (bilstm_model.predict(X_test_pad) > 0.5).astype("int32")
print("BiLSTM Metrics:")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(f"F1-score: {f1_score(y_test, y_pred):.4f}")
print(f"Precision: {precision_score(y_test, y_pred):.4f}")
print(f"recall_score: {recall_score(y_test, y_pred):.4f}")

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
BiLSTM Metrics:
Accuracy: 0.6449
F1-score: 0.6926
Precision: 0.5976
recall_score: 0.8235


In [1]:
!pip install --upgrade tensorflow==2.12.0 keras==2.12.0 transformers==4.30.0



In [3]:
!pip install --upgrade tensorflow==2.12.0 keras==2.12.0 transformers==4.30.0

import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from transformers import BertTokenizer, TFBertForSequenceClassification
import tensorflow as tf

# 1. Data Loading and Preprocessing
def load_data(file_path):
    data = []
    with open(file_path, encoding="utf-8") as f:
        lines = f.readlines()[1:]  # Skip header

        for line in lines:
            line = line.strip()
            if not line:
                continue
            match = re.match(r"^(.*)\s([PN])$", line)
            if match:
                text = match.group(1).strip()
                label = 1 if match.group(2) == 'P' else 0
                data.append((text, label))

    return pd.DataFrame(data, columns=["text", "label"])

# Load data
df = load_data("urdu-sentiment-corpus-v1.tsv.txt")

# Split data (THIS IS WHAT WAS MISSING)
X_train, X_test, y_train, y_test = train_test_split(
    df['text'], df['label'], test_size=0.25, random_state=42)

# 2. mBERT Implementation
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

def encode_texts(texts):
    return tokenizer(
        texts.tolist(),
        truncation=True,
        padding='max_length',
        max_length=128,
        return_tensors='tf'
    )

X_train_enc = encode_texts(X_train)
X_test_enc = encode_texts(X_test)

# Convert to TensorFlow datasets
train_dataset = tf.data.Dataset.from_tensor_slices((
    {
        'input_ids': X_train_enc['input_ids'],
        'attention_mask': X_train_enc['attention_mask']
    },
    y_train
)).shuffle(1000).batch(8)

test_dataset = tf.data.Dataset.from_tensor_slices((
    {
        'input_ids': X_test_enc['input_ids'],
        'attention_mask': X_test_enc['attention_mask']
    },
    y_test
)).batch(8)

# Load model
model = TFBertForSequenceClassification.from_pretrained(
    'bert-base-multilingual-cased',
    num_labels=2
)

# Compile
model.compile(
    optimizer=tf.keras.optimizers.Adam(2e-5),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=['accuracy']
)

# Train
history = model.fit(
    train_dataset,
    validation_data=test_dataset,
    epochs=3
)

# Evaluate
logits = model.predict(test_dataset).logits
y_pred = tf.argmax(logits, axis=1)

print("\n=== mBERT Results ===")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(f"Precision: {precision_score(y_test, y_pred):.4f}")
print(f"Recall: {recall_score(y_test, y_pred):.4f}")
print(f"F1-score: {f1_score(y_test, y_pred):.4f}")



All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3
Epoch 2/3
Epoch 3/3

=== mBERT Results ===
Accuracy: 0.6041
Precision: 0.5687
Recall: 0.7647
F1-score: 0.6523


In [1]:
from transformers import XLMRobertaTokenizer, TFXLMRobertaForSequenceClassification

# Load XLM-R tokenizer
tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-base')

# Tokenize (same as mBERT)
X_train_enc = encode_texts(X_train)  # Reuse the same function
X_test_enc = encode_texts(X_test)

# Datasets
train_dataset = tf.data.Dataset.from_tensor_slices((
    dict(X_train_enc),
    y_train
)).batch(8)

test_dataset = tf.data.Dataset.from_tensor_slices((
    dict(X_test_enc),
    y_test
)).batch(8)

# Load XLM-R model
model = TFXLMRobertaForSequenceClassification.from_pretrained('xlm-roberta-base', num_labels=2)

# Compile and train (same as mBERT)
model.compile(
    optimizer=tf.keras.optimizers.Adam(2e-5),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=['accuracy']
)

history = model.fit(
    train_dataset,
    validation_data=test_dataset,
    epochs=3
)

# Evaluate
logits = model.predict(test_dataset).logits
y_pred = tf.argmax(logits, axis=1)
print("XLM-RoBERTa Metrics:")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(f"F1-score: {f1_score(y_test, y_pred):.4f}")
print(f"Precision: {precision_score(y_test, y_pred):.4f}")
print(f"recall_score: {recall_score(y_test, y_pred):.4f}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

NameError: name 'encode_texts' is not defined

In [7]:
!pip install --upgrade numpy
!pip install --force-reinstall gensim

Collecting numpy
  Downloading numpy-2.2.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (62 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.0/62.0 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading numpy-2.2.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (16.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.4/16.4 MB[0m [31m42.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: numpy
  Attempting uninstall: numpy
    Found existing installation: numpy 1.26.4
    Uninstalling numpy-1.26.4:
      Successfully uninstalled numpy-1.26.4
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
gensim 4.3.3 requires numpy<2.0,>=1.18.5, but you have numpy 2.2.5 which is incompatible.
numba 0.60.0 requires numpy<2.1,>=1.22, but you have numpy 2.2.5 which is incom

In [None]:
####PART 1
######Q2
import pandas as pd
import numpy as np
import re  # Added missing import
from gensim.models import Word2Vec, FastText, KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import tensorflow as tf
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow_hub as hub
import os

# --- 1. Load and Prepare Data ---
def load_data(file_path):
    data = []
    with open(file_path, encoding="utf-8") as f:
        lines = f.readlines()[1:]  # Skip header

        for line in lines:
            line = line.strip()
            if not line:
                continue
            match = re.match(r"^(.*)\s([PN])$", line)
            if match:
                text = match.group(1).strip()
                label = 1 if match.group(2) == 'P' else 0
                data.append((text, label))

    return pd.DataFrame(data, columns=["text", "label"])

# Load data
df = load_data("urdu-sentiment-corpus-v1.tsv.txt")

# --- 2. Text Preprocessing ---
# Basic text cleaning
def clean_text(text):
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    text = re.sub(r'\s+', ' ', text)  # Remove extra spaces
    return text.strip()

df['text'] = df['text'].apply(clean_text)

# Tokenization
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df['text'])
sequences = tokenizer.texts_to_sequences(df['text'])

# Padding
max_len = max(len(seq) for seq in sequences)
X = pad_sequences(sequences, maxlen=max_len, padding='post')
y = df['label'].values

# Train-test split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Vocabulary size
max_words = len(tokenizer.word_index) + 1

# --- 3. Define Base BiLSTM Model ---
def build_bilstm(embedding_matrix=None):
    model = tf.keras.Sequential()

    if embedding_matrix is not None:
        model.add(Embedding(
            input_dim=embedding_matrix.shape[0],
            output_dim=embedding_matrix.shape[1],
            weights=[embedding_matrix],
            input_length=max_len,
            trainable=False))
    else:
        model.add(Embedding(
            input_dim=max_words,
            output_dim=256,
            input_length=max_len))

    model.add(Bidirectional(LSTM(128, dropout=0.3, recurrent_dropout=0.3)))
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(1, activation='sigmoid'))

    model.compile(
        optimizer='adam',
        loss='binary_crossentropy',
        metrics=['accuracy'])
    return model

# --- 4. Train/Prepare All Embeddings ---

# Word2Vec
print("Training Word2Vec...")
tokenized_texts = [text.split() for text in df['text']]
w2v_model = Word2Vec(
    sentences=tokenized_texts,
    vector_size=300,
    window=5,
    min_count=2,
    workers=4)

w2v_embedding = np.zeros((max_words, 300))
for word, i in tokenizer.word_index.items():
    if i < max_words and word in w2v_model.wv:
        w2v_embedding[i] = w2v_model.wv[word]

# FastText
print("Training FastText...")
ft_model = FastText(
    sentences=tokenized_texts,
    vector_size=300,
    window=5,
    min_count=2,
    workers=4)

ft_embedding = np.zeros((max_words, 300))
for word, i in tokenizer.word_index.items():
    if i < max_words and word in ft_model.wv:
        ft_embedding[i] = ft_model.wv[word]

# GloVe (Pretrained)
print("Preparing GloVe...")
# First download glove.6B.300d.txt from https://nlp.stanford.edu/projects/glove/
# Then convert to word2vec format
if not os.path.exists('glove_word2vec_format.txt'):
    glove2word2vec('glove.6B.300d.txt', 'glove_word2vec_format.txt')

glove_model = KeyedVectors.load_word2vec_format('glove_word2vec_format.txt', binary=False)
glove_embedding = np.zeros((max_words, 300))
for word, i in tokenizer.word_index.items():
    if i < max_words and word in glove_model:
        glove_embedding[i] = glove_model[word]

# ELMo (Using TFHub)
print("Preparing ELMo...")
elmo = hub.load("https://tfhub.dev/google/elmo/3")
def elmo_embed(texts):
    return elmo.signatures["default"](tf.constant(texts))["default"]

print("Creating ELMo embeddings for training set...")
batch_size = 32
X_train_elmo = []

# First, verify all indices are valid
if max(X_train) >= len(df):
    raise ValueError("X_train contains indices larger than the DataFrame")

for i in range(0, len(X_train), batch_size):
    batch_indices = X_train[i:i+batch_size].tolist()
    # Skip empty batches
    if not batch_indices:
        continue
    batch = df['text'].iloc[batch_indices]
    embeddings = elmo_embed(batch)
    X_train_elmo.append(embeddings)

X_train_elmo = np.concatenate(X_train_elmo) if X_train_elmo else np.array([])

# Prepare ELMo embeddings for test set
print("Creating ELMo embeddings for test set...")
X_test_elmo = []
for i in range(0, len(X_test), batch_size):
    batch = df['text'].iloc[X_test[i:i+batch_size].tolist()]
    X_test_elmo.append(elmo_embed(batch))
X_test_elmo = np.concatenate(X_test_elmo)

# --- 5. Train and Evaluate All Models ---
results = []

# 1. Baseline (No Pretrained Embeddings)
print("\nEvaluating Baseline BiLSTM...")
model = build_bilstm()
history = model.fit(
    X_train, y_train,
    epochs=10,
    batch_size=32,
    validation_split=0.2,
    verbose=1)

y_pred = (model.predict(X_test) > 0.5).astype(int)
results.append({
    'Model': 'BiLSTM (No Embedding)',
    'Accuracy': accuracy_score(y_test, y_pred),
    'Precision': precision_score(y_test, y_pred),
    'Recall': recall_score(y_test, y_pred),
    'F1': f1_score(y_test, y_pred)
})

# 2. Word2Vec
print("\nEvaluating Word2Vec BiLSTM...")
model = build_bilstm(w2v_embedding)
history = model.fit(
    X_train, y_train,
    epochs=10,
    batch_size=32,
    validation_split=0.2,
    verbose=1)

y_pred = (model.predict(X_test) > 0.5).astype(int)
results.append({
    'Model': 'BiLSTM + Word2Vec',
    'Accuracy': accuracy_score(y_test, y_pred),
    'Precision': precision_score(y_test, y_pred),
    'Recall': recall_score(y_test, y_pred),
    'F1': f1_score(y_test, y_pred)
})

# 3. FastText
print("\nEvaluating FastText BiLSTM...")
model = build_bilstm(ft_embedding)
history = model.fit(
    X_train, y_train,
    epochs=10,
    batch_size=32,
    validation_split=0.2,
    verbose=1)

y_pred = (model.predict(X_test) > 0.5).astype(int)
results.append({
    'Model': 'BiLSTM + FastText',
    'Accuracy': accuracy_score(y_test, y_pred),
    'Precision': precision_score(y_test, y_pred),
    'Recall': recall_score(y_test, y_pred),
    'F1': f1_score(y_test, y_pred)
})

# 4. GloVe
print("\nEvaluating GloVe BiLSTM...")
model = build_bilstm(glove_embedding)
history = model.fit(
    X_train, y_train,
    epochs=10,
    batch_size=32,
    validation_split=0.2,
    verbose=1)

y_pred = (model.predict(X_test) > 0.5).astype(int)
results.append({
    'Model': 'BiLSTM + GloVe',
    'Accuracy': accuracy_score(y_test, y_pred),
    'Precision': precision_score(y_test, y_pred),
    'Recall': recall_score(y_test, y_pred),
    'F1': f1_score(y_test, y_pred)
})

# 5. ELMo
print("\nEvaluating ELMo Model...")
elmo_model = tf.keras.Sequential([
    tf.keras.layers.Dense(256, activation='relu', input_shape=(1024,)),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(1, activation='sigmoid')
])
elmo_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
elmo_model.fit(
    X_train_elmo, y_train,
    epochs=10,
    batch_size=32,
    validation_split=0.2,
    verbose=1)

y_pred = (elmo_model.predict(X_test_elmo) > 0.5).astype(int)
results.append({
    'Model': 'BiLSTM + ELMo',
    'Accuracy': accuracy_score(y_test, y_pred),
    'Precision': precision_score(y_test, y_pred),
    'Recall': recall_score(y_test, y_pred),
    'F1': f1_score(y_test, y_pred)
})

# --- 6. Present Results ---
results_df = pd.DataFrame(results)
print("\n=== Final Results ===")
print(results_df.to_markdown(index=False))

# Save to CSV
results_df.to_csv("embedding_results.csv", index=False)

In [None]:
####PART 2
#### Q3

!pip install tensorflow nltk pandas scikit-learn
import nltk
nltk.download('punkt')

import pandas as pd

# Load dataset
with open("/content/english-corpus.txt", encoding='utf-8') as f:
    english = f.read().splitlines()

# Load Urdu sentences
with open("/content/urdu-corpus.txt", encoding='utf-8') as f:
    urdu = f.read().splitlines()

# Clean and normalize (optional, but useful)
eng_sentences = [s.strip().lower() for s in english]
urdu_sentences = [s.strip().lower() for s in urdu]


# Optional: Lowercase
eng_sentences = [s.lower() for s in eng_sentences]
urdu_sentences = [s.lower() for s in urdu_sentences]


from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Tokenize English
eng_tokenizer = Tokenizer()
eng_tokenizer.fit_on_texts(eng_sentences)
eng_seq = eng_tokenizer.texts_to_sequences(eng_sentences)
eng_pad = pad_sequences(eng_seq, padding='post')

# Tokenize Urdu
urdu_tokenizer = Tokenizer()
urdu_tokenizer.fit_on_texts(urdu_sentences)
urdu_seq = urdu_tokenizer.texts_to_sequences(urdu_sentences)
urdu_pad = pad_sequences(urdu_seq, padding='post')

# Vocab sizes
eng_vocab_size = len(eng_tokenizer.word_index) + 1
urdu_vocab_size = len(urdu_tokenizer.word_index) + 1

# Pad lengths
eng_max_len = eng_pad.shape[1]
urdu_max_len = urdu_pad.shape[1]

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(eng_pad, urdu_pad, test_size=0.1)


####MODEL RNN TO SEQTOSEQ
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, SimpleRNN, Dense, LSTM, Bidirectional
from tensorflow.keras.optimizers import Adam

def build_rnn_model():
    encoder_inputs = Input(shape=(eng_max_len,))
    x = Embedding(eng_vocab_size, 256, mask_zero=True)(encoder_inputs)
    encoder = SimpleRNN(256, return_state=True)
    encoder_outputs, state_h = encoder(x)

    decoder_inputs = Input(shape=(urdu_max_len,))
    y = Embedding(urdu_vocab_size, 256, mask_zero=True)(decoder_inputs)
    decoder_rnn = SimpleRNN(256, return_sequences=True)
    decoder_outputs = decoder_rnn(y, initial_state=state_h)
    decoder_dense = Dense(urdu_vocab_size, activation='softmax')
    output = decoder_dense(decoder_outputs)

    model = Model([encoder_inputs, decoder_inputs], output)
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    return model

#######Model 2: Bi-RNN Seq2Seq
def build_birnn_model():
    encoder_inputs = Input(shape=(eng_max_len,))
    x = Embedding(eng_vocab_size, 256, mask_zero=True)(encoder_inputs)
    encoder = Bidirectional(SimpleRNN(256, return_state=True))
    encoder_outputs, forward_h, backward_h = encoder(x)
    state_h = tf.keras.layers.Concatenate()([forward_h, backward_h])

    decoder_inputs = Input(shape=(urdu_max_len,))
    y = Embedding(urdu_vocab_size, 512, mask_zero=True)(decoder_inputs)
    decoder_rnn = SimpleRNN(512, return_sequences=True)
    decoder_outputs = decoder_rnn(y, initial_state=state_h)
    decoder_dense = Dense(urdu_vocab_size, activation='softmax')
    output = decoder_dense(decoder_outputs)

    model = Model([encoder_inputs, decoder_inputs], output)
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    return model


#Model 3: LSTM Seq2Seq

def build_lstm_model():
    encoder_inputs = Input(shape=(eng_max_len,))
    x = Embedding(eng_vocab_size, 256, mask_zero=True)(encoder_inputs)
    encoder_lstm, state_h, state_c = LSTM(256, return_state=True)(x)

    decoder_inputs = Input(shape=(urdu_max_len,))
    y = Embedding(urdu_vocab_size, 256, mask_zero=True)(decoder_inputs)
    decoder_lstm = LSTM(256, return_sequences=True)
    decoder_outputs = decoder_lstm(y, initial_state=[state_h, state_c])
    decoder_dense = Dense(urdu_vocab_size, activation='softmax')
    output = decoder_dense(decoder_outputs)

    model = Model([encoder_inputs, decoder_inputs], output)
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    return model

##Model 4: Transformer Model (Simple)
from tensorflow.keras.layers import MultiHeadAttention, LayerNormalization, Dropout

def build_transformer_model():
    inputs = Input(shape=(eng_max_len,))
    x = Embedding(eng_vocab_size, 256)(inputs)

    attn = MultiHeadAttention(num_heads=4, key_dim=64)(x, x)
    attn = Dropout(0.1)(attn)
    x = LayerNormalization()(x + attn)

    ffn = Dense(512, activation='relu')(x)
    ffn = Dense(256)(ffn)
    x = LayerNormalization()(x + ffn)

    outputs = Dense(urdu_vocab_size, activation='softmax')(x)

    model = Model(inputs, outputs)
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    return model
###Training Function (Fair for All Models

###evaluation with bleu score
def train_model(model, name):
    history = model.fit(
        [X_train, y_train], y_train.reshape(y_train.shape[0], y_train.shape[1], 1),
        validation_split=0.1,
        epochs=2,
        batch_size=64
    )
    model.save(f"{name}_model.h5")
    return history

###Final BLEU Score Table and Inference

models = {
    "RNN": build_rnn_model(),
    "BiRNN": build_birnn_model(),
    "LSTM": build_lstm_model(),
    "Transformer": build_transformer_model()
}

import numpy as np
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from tensorflow.keras.preprocessing.text import Tokenizer
tokenizer_urdu = Tokenizer()
tokenizer_urdu.fit_on_texts(urdu_sentences)  # urdu_sentences = list of Urdu strings

# Convert text to sequences
urdu_sequences = tokenizer_urdu.texts_to_sequences(urdu_sentences)


def evaluate_model(model, X_test, y_test, tokenizer_urdu, max_len_urdu):
    smooth_fn = SmoothingFunction().method1
    bleu_scores = []

    for i in range(len(X_test)):
        # Predict translation
        prediction = model.predict(np.array([X_test[i]]))
        predicted_seq = np.argmax(prediction[0], axis=-1)

        # Convert indices to words
        pred_tokens = [tokenizer_urdu.index_word.get(idx, '') for idx in predicted_seq if idx != 0]
        ref_tokens = [tokenizer_urdu.index_word.get(idx, '') for idx in y_test[i] if idx != 0]

        # Calculate BLEU score
        bleu = sentence_bleu([ref_tokens], pred_tokens, smoothing_function=smooth_fn)
        bleu_scores.append(bleu)

    return np.mean(bleu_scores)

bleu_scores = {}

for name, model in models.items():
    print(f"Training {name}...")
    train_model(model, name)
    bleu = evaluate_model(model, X_test[:100], y_test[:100], tokenizer_urdu, max_len_urdu)
    bleu_scores[name] = bleu
    print(f"{name} BLEU Score: {bleu}")




[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Training RNN...
Epoch 1/2
[1m311/311[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m353s[0m 1s/step - accuracy: 0.0705 - loss: 5.5428 - val_accuracy: 0.1979 - val_loss: 1.9873
Epoch 2/2
[1m279/311[0m [32m━━━━━━━━━━━━━━━━━[0m[37m━━━[0m [1m34s[0m 1s/step - accuracy: 0.2086 - loss: 1.6157

In [None]:
###PART2
##Q4
def load_glove_embeddings(glove_file_path):
    embeddings_index = {}
    with open(glove_file_path, encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            coeffs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coeffs
    return embeddings_index

glove_embeddings = load_glove_embeddings('/content/glove.6B.300d.txt')

###Create Embedding Matrix (matching your tokenizer)
tokenizer_eng = Tokenizer()
tokenizer_eng.fit_on_texts(eng_sentences)
vocab_size_eng = len(tokenizer_eng.word_index) + 1

embedding_dim = 100
embedding_matrix = np.zeros((vocab_size_eng, embedding_dim))

for word, i in tokenizer_eng.word_index.items():
    embedding_vector = glove_embeddings.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

def build_rnn_model_random(vocab_size_eng, vocab_size_urdu, max_len_eng, max_len_urdu, embedding_dim=100):
    encoder_inputs = Input(shape=(max_len_eng,))
    x = Embedding(vocab_size_eng, embedding_dim)(encoder_inputs)
    encoder = SimpleRNN(256, return_sequences=False, return_state=True)
    encoder_output, state_h = encoder(x)

    decoder_inputs = Input(shape=(max_len_urdu,))
    decoder_embedding = Embedding(vocab_size_urdu, embedding_dim)(decoder_inputs)
    decoder_rnn = SimpleRNN(256, return_sequences=True)
    decoder_output = decoder_rnn(decoder_embedding, initial_state=state_h)

    output = TimeDistributed(Dense(vocab_size_urdu, activation='softmax'))(decoder_output)

    model = Model([encoder_inputs, decoder_inputs], output)
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    return model

def build_rnn_model_glove(vocab_size_eng, vocab_size_urdu, max_len_eng, max_len_urdu, embedding_matrix):
    encoder_inputs = Input(shape=(max_len_eng,))
    x = Embedding(vocab_size_eng, embedding_matrix.shape[1],
                  weights=[embedding_matrix], trainable=False)(encoder_inputs)
    encoder = SimpleRNN(256, return_sequences=False, return_state=True)
    encoder_output, state_h = encoder(x)

    decoder_inputs = Input(shape=(max_len_urdu,))
    decoder_embedding = Embedding(vocab_size_urdu, embedding_matrix.shape[1])(decoder_inputs)
    decoder_rnn = SimpleRNN(256, return_sequences=True)
    decoder_output = decoder_rnn(decoder_embedding, initial_state=state_h)

    output = TimeDistributed(Dense(vocab_size_urdu, activation='softmax'))(decoder_output)

    model = Model([encoder_inputs, decoder_inputs], output)
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    return model

##trainning both models

# Prepare input/output sequences
# Assume X_train_eng, X_train_urdu_input, and y_train_urdu_output are ready

model_random = build_rnn_model_random(vocab_size_eng, vocab_size_urdu, max_len_eng, max_len_urdu)
model_random.fit([X_train_eng, X_train_urdu_input], y_train_urdu_output, epochs=50, batch_size=64)

model_glove = build_rnn_model_glove(vocab_size_eng, vocab_size_urdu, max_len_eng, max_len_urdu, embedding_matrix)
model_glove.fit([X_train_eng, X_train_urdu_input], y_train_urdu_output, epochs=50, batch_size=64)


bleu_random = evaluate_model(model_random, X_test_eng, y_test_urdu, tokenizer_urdu, max_len_urdu)
bleu_glove = evaluate_model(model_glove, X_test_eng, y_test_urdu, tokenizer_urdu, max_len_urdu)

print(f"Random Embedding BLEU Score: {bleu_random:.4f}")
print(f"GloVe Embedding BLEU Score: {bleu_glove:.4f}")

