In [1]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
import re

In [2]:
train_df = pd.read_csv(r'D:/NITW_PROJECT/data/train.csv')
val_df = pd.read_csv(r'D:/NITW_PROJECT/data/validation.csv')
test_df = pd.read_csv(r'D:/NITW_PROJECT/data/test.csv')

In [3]:
def simple_tokenizer(text):
    if isinstance(text, str):
        return re.findall(r"\b\w+\b", text.lower())
    return []  

word2idx = {"<PAD>": 0, "<OOV>": 1}
idx = 2
all_text = pd.concat([train_df["Headline"], val_df["Headline"], test_df["Headline"]]).fillna("")

for text in all_text:
    for word in simple_tokenizer(text):
        if word not in word2idx:
            word2idx[word] = idx
            idx += 1

def encode_text(text):
    return [word2idx.get(word, 1) for word in simple_tokenizer(str(text))]

# Encode labels
label_encoder = LabelEncoder()
label_encoder.fit(train_df["Language"].astype(str))
num_classes = len(label_encoder.classes_)

In [4]:
class HeadlineDataset(Dataset):
    def __init__(self, df):
        self.sequences = [torch.tensor(encode_text(text), dtype=torch.long) for text in df["Headline"]]
        self.labels = torch.tensor(label_encoder.transform(df["Language"]), dtype=torch.long)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return self.sequences[idx], self.labels[idx]

def collate_batch(batch):
    sequences, labels = zip(*batch)
    padded_seqs = pad_sequence(sequences, batch_first=True, padding_value=0)
    return padded_seqs, torch.tensor(labels)

batch_size = 32
train_loader = DataLoader(HeadlineDataset(train_df), batch_size=batch_size, shuffle=True, collate_fn=collate_batch)
test_loader = DataLoader(HeadlineDataset(test_df), batch_size=batch_size, shuffle=False, collate_fn=collate_batch)

In [5]:
class LSTMClassifier(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, output_dim):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        x = self.embedding(x)
        _, (hn, _) = self.lstm(x)
        return self.fc(hn[-1])

class GRUClassifier(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, output_dim):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.gru = nn.GRU(embed_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        x = self.embedding(x)
        _, hn = self.gru(x)
        return self.fc(hn[-1])

class CNNClassifier(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_classes):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.conv = nn.Conv1d(in_channels=embed_dim, out_channels=128, kernel_size=5)
        self.pool = nn.AdaptiveMaxPool1d(1)
        self.fc = nn.Linear(128, num_classes)

    def forward(self, x):
        x = self.embedding(x).transpose(1, 2)
        x = torch.relu(self.conv(x))
        x = self.pool(x).squeeze(-1)
        return self.fc(x)


In [6]:
def train_model(model, train_loader, num_epochs=5):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=1e-3)

    for epoch in range(num_epochs):
        model.train()
        for x_batch, y_batch in train_loader:
            x_batch, y_batch = x_batch.to(device), y_batch.to(device)
            optimizer.zero_grad()
            outputs = model(x_batch)
            loss = criterion(outputs, y_batch)
            loss.backward()
            optimizer.step()
    return model

def evaluate_model(model, test_loader):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.eval()
    all_preds, all_labels = [], []
    with torch.no_grad():
        for x_batch, y_batch in test_loader:
            x_batch = x_batch.to(device)
            outputs = model(x_batch)
            preds = torch.argmax(outputs, dim=1).cpu().numpy()
            all_preds.extend(preds)
            all_labels.extend(y_batch.numpy())
    report = classification_report(all_labels, all_preds, target_names=label_encoder.classes_, digits=4)
    print(report)


In [7]:
class CBHG(nn.Module):
    def __init__(self, input_dim, hidden_dim, num_classes):
        super(CBHG, self).__init__()
        self.embedding = nn.Embedding(128, input_dim)
        self.conv1 = nn.Conv1d(input_dim, hidden_dim, kernel_size=3, padding=1)
        self.relu = nn.ReLU()
        self.gru = nn.GRU(hidden_dim, hidden_dim, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_dim * 2, num_classes)

    def forward(self, x):
        x = self.embedding(x)  # (B, T, D)
        x = x.transpose(1, 2)  # (B, D, T)
        x = self.relu(self.conv1(x))  # (B, H, T)
        x = x.transpose(1, 2)  # (B, T, H)
        output, _ = self.gru(x)
        x = output[:, -1, :]  # take last time step
        return self.fc(x)

In [8]:
class LanguageDataset(Dataset):
    def __init__(self, df):
        self.texts = df["Headline"].tolist()
        self.labels = df["label"].tolist()

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        seq = text_to_sequence(self.texts[idx])
        padded_seq = seq + [0]*(100 - len(seq))  # pad to 100
        return torch.tensor(padded_seq, dtype=torch.long), torch.tensor(self.labels[idx], dtype=torch.long)

In [9]:
label_encoder = LabelEncoder()
train_df["label"] = label_encoder.fit_transform(train_df["Language"])
val_df["label"] = label_encoder.transform(val_df["Language"])
test_df["label"] = label_encoder.transform(test_df["Language"])
num_classes = len(label_encoder.classes_)

In [10]:
train_loader = DataLoader(LanguageDataset(train_df), batch_size=32, shuffle=True)
val_loader = DataLoader(LanguageDataset(val_df), batch_size=32)
test_loader = DataLoader(LanguageDataset(test_df), batch_size=32)

In [11]:
def build_vocab(dataframes):
    vocab = set()
    for df in dataframes:
        df["Headline"] = df["Headline"].fillna("")  
        for text in df["Headline"]:
            vocab.update(set(str(text)))  
    char2idx = {c: i + 1 for i, c in enumerate(sorted(vocab))}  
    return char2idx

char2idx = build_vocab([train_df, val_df, test_df])
vocab_size = len(char2idx) + 1  

def text_to_sequence(text, max_len=100):
    text = str(text)  
    seq = [char2idx.get(c, 0) for c in text]
    if len(seq) < max_len:
        seq += [0] * (max_len - len(seq))  
    return seq[:max_len]  

train_df["input"] = train_df["Headline"].apply(text_to_sequence)
val_df["input"] = val_df["Headline"].apply(text_to_sequence)
test_df["input"] = test_df["Headline"].apply(text_to_sequence)

class CBHG(nn.Module):
    def __init__(self, input_dim, hidden_dim, num_classes):
        super(CBHG, self).__init__()
        self.embedding = nn.Embedding(vocab_size, input_dim, padding_idx=0)
        self.conv1 = nn.Conv1d(input_dim, hidden_dim, kernel_size=3, padding=1)
        self.relu = nn.ReLU()
        self.gru = nn.GRU(hidden_dim, hidden_dim, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_dim * 2, num_classes)

    def forward(self, x):
        x = self.embedding(x)        
        x = x.transpose(1, 2)        
        x = self.relu(self.conv1(x))
        x = x.transpose(1, 2)        
        output, _ = self.gru(x)
        x = output[:, -1, :]         
        return self.fc(x)


In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = CBHG(input_dim=64, hidden_dim=128, num_classes=num_classes).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

for epoch in range(3):
    model.train()
    total_loss = 0
    for inputs, labels in train_loader:
        inputs, labels = inputs.to(device), labels.to(device)

        # DEBUG: Check input range
        max_index = inputs.max().item()
        vocab_limit = model.embedding.num_embeddings
        if max_index >= vocab_limit:
            print(f"Skipping batch - Max index {max_index} >= vocab size {vocab_limit}")
            continue  # Or clamp below

        # Optional: Clamp to avoid crash
        inputs = torch.clamp(inputs, max=vocab_limit - 1)

        outputs = model(inputs)
        loss = criterion(outputs, labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    print(f"Epoch {epoch+1}, Loss: {total_loss:.4f}")


Epoch 1, Loss: 122.4745
Epoch 2, Loss: 2.0239
Epoch 3, Loss: 0.9899


In [13]:
from sklearn.metrics import classification_report, accuracy_score
model.eval()
all_preds, all_labels = [], []

with torch.no_grad():
    for inputs, labels in test_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        outputs = model(inputs)
        _, predicted = torch.max(outputs, 1)
        all_preds.extend(predicted.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

acc = accuracy_score(all_labels, all_preds)
report = classification_report(all_labels, all_preds, target_names=label_encoder.classes_)

print(f"\nAccuracy: {acc:.4f}")
print("Classification Report:\n", report)


Accuracy: 0.9998
Classification Report:
               precision    recall  f1-score   support

     English       1.00      1.00      1.00      1473
       Hindi       1.00      1.00      1.00      2021
     Kannada       1.00      1.00      1.00       398
       Tamil       1.00      1.00      1.00       479
      Telugu       1.00      1.00      1.00       535

    accuracy                           1.00      4906
   macro avg       1.00      1.00      1.00      4906
weighted avg       1.00      1.00      1.00      4906



In [14]:
vocab_size = len(word2idx)
embed_dim = 100
hidden_dim = 128

models = {
    "LSTM": LSTMClassifier(vocab_size, embed_dim, hidden_dim, num_classes),
    "GRU": GRUClassifier(vocab_size, embed_dim, hidden_dim, num_classes),
    "CNN": CNNClassifier(vocab_size, embed_dim, num_classes),
}

for name, model in models.items():
    print(f"\nTraining {name} model...")
    trained = train_model(model, train_loader)
    print(f"Performance of {name}:")
    evaluate_model(trained, test_loader)



Training LSTM model...
Performance of LSTM:
              precision    recall  f1-score   support

     English     0.9993    0.9939    0.9966      1473
       Hindi     1.0000    0.9792    0.9895      2021
     Kannada     1.0000    0.9975    0.9987       398
       Tamil     1.0000    0.9979    0.9990       479
      Telugu     0.9097    0.9981    0.9519       535

    accuracy                         0.9890      4906
   macro avg     0.9818    0.9933    0.9871      4906
weighted avg     0.9899    0.9890    0.9892      4906


Training GRU model...
Performance of GRU:
              precision    recall  f1-score   support

     English     1.0000    1.0000    1.0000      1473
       Hindi     1.0000    1.0000    1.0000      2021
     Kannada     1.0000    0.9975    0.9987       398
       Tamil     0.9979    1.0000    0.9990       479
      Telugu     0.9981    0.9981    0.9981       535

    accuracy                         0.9996      4906
   macro avg     0.9992    0.9991    0.9992

In [15]:
import torch.nn as nn

# Basic Decoder class
class Decoder(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(Decoder, self).__init__()
        self.linear = nn.Linear(input_dim, output_dim)

    def forward(self, x):
        return self.linear(x)


In [16]:
input_dim = 128      
hidden_dim = 64
num_classes = 5        
output_dim = num_classes
vocab_size = 10000     
embed_dim = 128
# CNN
cnn_encoder = CNNClassifier(input_dim, hidden_dim, output_dim)
cnn_decoder = Decoder(hidden_dim, output_dim)

# GRU
gru_encoder = GRUClassifier(vocab_size, embed_dim, hidden_dim, output_dim)
gru_decoder = Decoder(hidden_dim, output_dim)

# LSTM
lstm_encoder = LSTMClassifier(vocab_size, embed_dim, hidden_dim, output_dim)
lstm_decoder = Decoder(hidden_dim, output_dim)

# Save models
torch.save(cnn_encoder.state_dict(), "cnn_encoder.pth")
torch.save(cnn_decoder.state_dict(), "cnn_decoder.pth")

torch.save(gru_encoder.state_dict(), "gru_encoder.pth")
torch.save(gru_decoder.state_dict(), "gru_decoder.pth")

torch.save(lstm_encoder.state_dict(), "lstm_encoder.pth")
torch.save(lstm_decoder.state_dict(), "lstm_decoder.pth")


In [17]:
# Load CNN
cnn_encoder_loaded = CNNClassifier(input_dim, hidden_dim, output_dim)
cnn_encoder_loaded.load_state_dict(torch.load("cnn_encoder.pth"))
cnn_encoder_loaded.eval()

cnn_decoder_loaded = Decoder(hidden_dim, output_dim)
cnn_decoder_loaded.load_state_dict(torch.load("cnn_decoder.pth"))
cnn_decoder_loaded.eval()

Decoder(
  (linear): Linear(in_features=64, out_features=5, bias=True)
)

In [18]:
# Load GRU
gru_encoder_loaded = GRUClassifier(vocab_size, embed_dim, hidden_dim, output_dim)
gru_encoder_loaded.load_state_dict(torch.load("gru_encoder.pth"))
gru_encoder_loaded.eval()

gru_decoder_loaded = Decoder(hidden_dim, output_dim)
gru_decoder_loaded.load_state_dict(torch.load("gru_decoder.pth"))
gru_decoder_loaded.eval()

Decoder(
  (linear): Linear(in_features=64, out_features=5, bias=True)
)

In [19]:
# Load LSTM
lstm_encoder_loaded = LSTMClassifier(vocab_size, embed_dim, hidden_dim, output_dim)
lstm_encoder_loaded.load_state_dict(torch.load("lstm_encoder.pth"))
lstm_encoder_loaded.eval()

lstm_decoder_loaded = Decoder(hidden_dim, output_dim)
lstm_decoder_loaded.load_state_dict(torch.load("lstm_decoder.pth"))
lstm_decoder_loaded.eval()

Decoder(
  (linear): Linear(in_features=64, out_features=5, bias=True)
)

In [20]:
import torch
import torch.nn as nn
import numpy as np
import librosa
from langdetect import detect
from googletrans import Translator
from gtts import gTTS
from pydub import AudioSegment
import pygame
import os
from scipy.io.wavfile import write as wav_write
import soundfile as sf



pygame 2.6.1 (SDL 2.28.4, Python 3.9.7)
Hello from the pygame community. https://www.pygame.org/contribute.html


In [21]:
# --- Text to sequence ---
def text_to_sequence(text):
    return [ord(c) for c in text.lower() if c.isalnum() or c == ' ']

In [22]:
# --- Encoder Models ---
class LSTMEncoder(nn.Module):
    def __init__(self, input_dim, hidden_dim=256):
        super().__init__()
        self.lstm = nn.LSTM(input_dim, hidden_dim, batch_first=True)
    def forward(self, x):
        output, _ = self.lstm(x)
        return output


In [23]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class CBHG(nn.Module):
    def __init__(self, input_dim, hidden_dim, K=16):
        super(CBHG, self).__init__()
        self.conv1d_banks = nn.ModuleList(
            [nn.Conv1d(input_dim, hidden_dim, kernel_size=k, padding=k // 2) for k in range(1, K+1)]
        )
        self.batch_norms = nn.ModuleList(
            [nn.BatchNorm1d(hidden_dim) for _ in range(K)]
        )
        self.max_pool = nn.MaxPool1d(kernel_size=2, stride=1, padding=1)
        self.projection1 = nn.Conv1d(K * hidden_dim, hidden_dim, kernel_size=3, padding=1)
        self.projection2 = nn.Conv1d(hidden_dim, input_dim, kernel_size=3, padding=1)
        self.highway = nn.Linear(input_dim, hidden_dim)
        self.rnn = nn.GRU(hidden_dim, hidden_dim, batch_first=True, bidirectional=True)

    def forward(self, x):
        x = x.transpose(1, 2) 
        conv_outputs = []
        for conv, bn in zip(self.conv1d_banks, self.batch_norms):
            c = F.relu(bn(conv(x)))
            conv_outputs.append(c)
        x = torch.cat(conv_outputs, dim=1)
        x = self.max_pool(x)[:, :, :-1]  
        x = F.relu(self.projection1(x))
        x = self.projection2(x)
        x = x.transpose(1, 2)  
        highway = self.highway(x)
        out, _ = self.rnn(highway)
        return out


In [24]:
class GRUEncoder(nn.Module):
    def __init__(self, input_dim, hidden_dim=256):
        super().__init__()
        self.gru = nn.GRU(input_dim, hidden_dim, batch_first=True)
    def forward(self, x):
        output, _ = self.gru(x)
        return output


In [25]:
class CNNEncoder(nn.Module):
    def __init__(self, input_dim, hidden_dim=256):
        super().__init__()
        self.conv = nn.Sequential(
            nn.Conv1d(input_dim, hidden_dim, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.Conv1d(hidden_dim, hidden_dim, kernel_size=3, padding=1)
        )
    def forward(self, x):
        x = x.transpose(1, 2)
        x = self.conv(x)
        return x.transpose(1, 2)


In [26]:

def hmm_synthesis(text):
    print("Generating speech using HMM model (simulated)")
    duration = max(len(text) * 80, 4000)
    waveform = np.sin(np.linspace(0, duration * np.pi / 100, duration))
    return waveform

In [27]:
def gtts_speak(text, lang_code='en'):
    import time
    import os
    from gtts import gTTS

    filename = f"gtts_output_{int(time.time())}.mp3"
    try:
        tts = gTTS(text=text, lang=lang_code)
        tts.save(filename)
        print(f"gTTS saved as {filename}")
        os.system(f'start {filename}')  # For Windows
    except PermissionError as e:
        print(f"Permission error: {e}")
    except Exception as e:
        print(f"Failed to generate gTTS audio: {e}")


In [28]:

# --- Language & Model Mapping ---
translator = Translator()
LANG_MODEL_MAP = {
    'en': 'lstm',
    'hi': 'gru',
    'te': 'cnn',
    'ta': 'cnn',
    'kn': 'hmm'
}
GTTS_LANG_CODES = {
    'en': 'en',
    'hi': 'hi',
    'te': 'te',
    'ta': 'ta',
    'kn': 'kn'
}

In [29]:

# --- Main Function ---
def multilingual_tts_interactive():
    num_classes=1
    input_text = input("Enter a sentence: ").strip()
    if not input_text:
        print("No input provided.")
        return

    target_lang = input("Enter target language code (en, hi, te, ta, kn): ").strip().lower()
    if target_lang not in LANG_MODEL_MAP:
        print("Invalid language code.")
        return

    device = torch.device("cpu")
    detected_lang = detect(input_text)
    print(f"Detected input language: {detected_lang}")

    try:
        translated = translator.translate(input_text, src=detected_lang, dest=target_lang).text
        print(f"Translated ({detected_lang} → {target_lang}): {translated}")
    except Exception as e:
        print(f"Translation failed: {e}")
        translated = input_text

    model_choice = LANG_MODEL_MAP[target_lang]
    print(f"Selected model: {model_choice.upper()}")

    if model_choice == 'hmm':
        waveform = hmm_synthesis(translated)
    else:
        seq = text_to_sequence(translated)
        if len(seq) < 10:
            print("Input text too short, padding")
            seq += [32] * (10 - len(seq))
        input_tensor = torch.tensor(seq, dtype=torch.float32).unsqueeze(0).unsqueeze(-1).to(device)

        hidden_dim = 64        
        output_dim = 80       

        hidden_dim = 64
        output_dim = 80  

        if model_choice == 'lstm':
            encoder = LSTMEncoder(input_dim=1, hidden_dim=hidden_dim).to(device)
        elif model_choice == 'gru':
            encoder = GRUEncoder(input_dim=1, hidden_dim=hidden_dim).to(device)
        elif model_choice == 'cnn':
            encoder = CNNEncoder(input_dim=1, hidden_dim=hidden_dim).to(device)
        else:
            print("Unsupported model")
            return

        decoder = Decoder(hidden_dim, output_dim).to(device)


        with torch.no_grad():
            encoded = encoder(input_tensor)
            mel = decoder(encoded).squeeze(0).transpose(0, 1)
            mel_np = mel.cpu().numpy()
            mel_np = np.clip(mel_np, a_min=1e-5, a_max=None)
            waveform = librosa.feature.inverse.griffinlim(mel_np, n_iter=60)

    if waveform is not None and np.max(np.abs(waveform)) > 0:
        waveform = waveform / np.max(np.abs(waveform))
        import time
        output_file = f"output_{int(time.time())}.wav"
        sf.write(output_file, waveform, 22050)

        print(f"Audio saved as '{output_file}'")

        pygame.mixer.init()
        pygame.mixer.music.load(output_file)

        while pygame.mixer.music.get_busy():
            pass
    else:
        print("No valid waveform generated.")

    print("Also generating gTTS speech for clarity:")
    gtts_speak(translated, GTTS_LANG_CODES.get(target_lang, 'en'))

# --- Run ---
if __name__ == "__main__":
    multilingual_tts_interactive()


Detected input language: en
Translated (en → hi): नमस्ते, आप कैसे हैं
Selected model: GRU
Audio saved as 'output_1749779997.wav'
Also generating gTTS speech for clarity:
gTTS saved as gtts_output_1749779998.mp3
