In [24]:
!pip install spacy torch==2.0.0 torchtext==0.15.1 indic-nlp-library



In [2]:
from google.colab import drive
import json

drive.mount('/content/drive')



Mounted at /content/drive


In [3]:
import math
import time
import spacy
import numpy as np
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchtext
from torchtext.data.functional import to_map_style_dataset
from torch.nn.functional import pad
from torchtext.vocab import build_vocab_from_iterator
import torchtext.datasets as datasets
from torchtext.data.utils import get_tokenizer
from torch.utils.data import DataLoader,Dataset,random_split



In [4]:
import os
import torch
import torch.nn.functional as F
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data import DataLoader, Dataset, random_split
from indicnlp.tokenize import indic_tokenize  # Telugu tokenizer from indic-nlp-library
import random
random.seed(42)
torch.manual_seed(42)

# Function to read your custom dataset
def read_telugu_english_data(file_paths):
    with open(file_paths, 'r', encoding='utf-8') as file:
        raw_datas = []
        for line in file:
            telugu_sentence, english_sentence = line.strip().split('>>')
            raw_datas.append((telugu_sentence, english_sentence))
    return raw_datas

# Splitting the dataset
def split_datasets(data, train_split=0.7, val_split=0.15, test_split=0.15):
    total_sizes = len(data)
    train_sizes = int(total_sizes * train_split)
    val_sizes = int(total_sizes * val_split)
    test_sizes = total_sizes - train_sizes - val_sizes
    train_data, remaining_data = random_split(data, [train_sizes, total_sizes - train_sizes])
    val_data, test_data = random_split(remaining_data, [val_sizes, test_sizes])
    return list(train_data), list(val_data), list(test_data)

# Custom Dataset class
class CustomDataset(Dataset):
    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]

    def get_raw_textss(self):
        return [(srcs, trgs) for srcs, trgs in self.data]

tokenizer_te = spacy.blank("xx").tokenizer  # Telugu tokenizer
tokenizer_ens = spacy.blank("en").tokenizer  # English tokenizer

# Build vocabulary function
def build_vocabularys(tokenizer, dataset, min_freq=2):
    def yield_tokenss(data):
        for srcs, trgs in data:
            yield [token.text for token in tokenizer(srcs)]
            yield [token.text for token in tokenizer(trgs)]

    vocabs = build_vocab_from_iterator(yield_tokenss(dataset.get_raw_textss()), specials=["<unk>", "<pad>", "<bos>", "<eos>"], min_freq=min_freq)
    vocabs.set_default_index(vocabs['<unk>'])  # Set default index for unknown tokenss
    return vocabs

# Read the dataset
file_paths = '/content/drive/MyDrive/eng-tel.txt'
raw_datas = read_telugu_english_data(file_paths)
train_data_raw, val_data_raw, test_data_raw = split_datasets(raw_datas)

# Create datasets
train_datasets = CustomDataset(train_data_raw)
valid_datasets = CustomDataset(val_data_raw)
test_datasets = CustomDataset(test_data_raw)

# Load vocabularies
vocab_src = build_vocabularys(tokenizer_ens, train_datasets)
vocab_trg = build_vocabularys(tokenizer_te, train_datasets)
# Build vocabs
vocab_en = build_vocabularys(tokenizer_ens, train_datasets)
vocab_te = build_vocabularys(tokenizer_te, train_datasets)
import pickle

# # Save the vocabs to your Google Drive
# with open('/content/drive/MyDrive/vocabs-eng.pkl', 'wb') as f:
#     pickle.dump(vocab_en, f)
# with open('/content/drive/MyDrive/vocabs-tel.pkl', 'wb') as f:
#     pickle.dump(vocab_te, f)
# Batch generation function
def generate_batchs(data_batch):
    de_batch, en_batch = [], []
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    for (de_item, en_item) in data_batch:
        # Convert list of indices into tensors
        de_indices = torch.tensor([vocab_src[token.text] for token in tokenizer_te(de_item)], dtype=torch.long)
        en_indices = torch.tensor([vocab_trg[token.text] for token in tokenizer_ens(en_item)], dtype=torch.long)

        # Concatenate BOS, indices, EOS
        de_temp = torch.cat([torch.tensor([vocab_src['<bos>']], dtype=torch.long), de_indices, torch.tensor([vocab_src['<eos>']], dtype=torch.long)], dim=0).to(device)
        en_temp = torch.cat([torch.tensor([vocab_trg['<bos>']], dtype=torch.long), en_indices, torch.tensor([vocab_trg['<eos>']], dtype=torch.long)], dim=0).to(device)

        # Pad sequences to ensure consistent length
        padded_de = F.pad(de_temp, (0, MAX_PADDING - len(de_temp)), value=vocab_src['<pad>'])
        padded_en = F.pad(en_temp, (0, MAX_PADDING - len(en_temp)), value=vocab_trg['<pad>'])

        de_batch.append(padded_de)
        en_batch.append(padded_en)

    return torch.stack(de_batch), torch.stack(en_batch)

# DataLoader setup
BATCH_SIZE = 32
train_iters = DataLoader(train_datasets, batch_size=BATCH_SIZE, shuffle=True, drop_last=True, collate_fn=generate_batchs)
valid_iters = DataLoader(valid_datasets, batch_size=BATCH_SIZE, shuffle=True, drop_last=True, collate_fn=generate_batchs)
test_iters = DataLoader(test_datasets, batch_size=BATCH_SIZE, shuffle=True, drop_last=True, collate_fn=generate_batchs)

BOS_IDX = vocab_trg['<bos>']
EOS_IDX = vocab_trg['<eos>']
PAD_IDX = vocab_trg['<pad>']
MAX_PADDING = 150
BATCH_SIZE= 32

class Embeddingss(nn.Module):
  def __init__(self, vocab_size: int, d_model: int):
    """
    Args:
      vocab_size:    size of vocabulary
      d_model:       dimension of embeddings
    """
    super().__init__()
    self.lut = nn.Embedding(vocab_size, d_model)
    self.d_model = d_model

  def forward(self, xs):
    """
    Args:
      xs:        input tensor (batch_sizes, sseq_lenght)

      returns:  embedding vector
    """
    # xs = xs.to(self.lut.weight.devices)
    return (self.lut(xs) * math.sqrt(self.d_model))


class PositionalEncodings(nn.Module):
  def __init__(self, d_model: int, dropout: float = 0.1, max_length: int = 5000):
    """
    Args:
      d_model:     dimension of embeddings
      dropout:     randomly zeroes-out some of the input
      max_lenght:  amx sequence length
    """

    super().__init__()
    self.dropout=nn.Dropout(p=dropout)
    pes=torch.zeros(max_length,d_model)
    for k in np.arange(max_length):
      for i in np.arange(d_model//2):
        thetas = k / (100** ((2*i)/d_model))


        pes[k, 2*i] = math.sin(thetas)


        pes[k, 2*i+1] = math.cos(thetas)
        self.register_buffer("pes",pes)

  def forward(self, xs):
    """
    Args:
      xs:        embeddings (batch_sizes, seq_lenght, d_model)
      returns:  embeddings + positonal encodings (batch_sizes, seq_lengths, d_model)
    """
    self.pes=self.pes.to(xs.device)
    xs = xs + self.pes[:xs.size(1), :].requires_grad_(False)
    return self.dropout(xs)


class MultiHeadAttentions(nn.Module):
  def __init__(self, d_model, n_heads, dropout: float = 0.1):
    """
    Args:
      d_model:      dimension of embeddings
      n_heads:      number of self attention heads
      dropout:      probability of dropout occuring
    """
    super().__init__()
    self.d_model = d_model
    self.n_heads = n_heads
    self.d_key = d_model // n_heads

    # create query, key, value, outputs weights
    self.Wq = nn.Linear(d_model, d_model)
    self.Wk = nn.Linear(d_model,d_model)
    self.Wv = nn.Linear(d_model,d_model)
    self.Wo = nn.Linear(d_model,d_model)

    self.dropout = nn.Dropout(p = dropout)

  def forward(self, query, key, value, masks = None):
    """
    Args:
      query:    query vector (batch_sizes, q_length, d_model)
      key:      key vector (batch_sizes, k_length, d_model)
      value:    value vector (batch_sizes, s_length, d_model)
      masks:     masks for decoders

    Returns:
      outputs:    attention values (batch_sizes, q_lenght, d_model)
      attn_probss:  softmax scoress (batchsize, n_heads, q_length, k_length)
    """
    batch_sizes = query.shape[0]

    # calculate query, key, and value tensors
    Qs = self.Wq(query)
    Ks = self.Wk(key)          # (32, 10, 512) xs (512, 512) = (32, 10, 512)
    Vs = self.Wv(value)

    # split each tensor into n_heads to compute attention

    # query tensor
    Qs = Qs.view(batch_sizes,
               -1,                                    # (32, 10, 512) -> (32, 10, 8 ,64)
               self.n_heads,                          # -1 = q_lenght
               self.d_key).permute(0, 2, 1, 3)        # (32, 10, 8, 64) -> (32, 8, 10, 64)

    # key tensor
    Ks = Ks.view(batch_sizes,
               -1,
               self.n_heads,
               self.d_key).permute(0, 2, 1, 3)

    # value tensor
    Vs = Vs.view(batch_sizes,
               -1,
               self.n_heads,
               self.d_key).permute(0, 2, 1, 3)

    # computes attention
    # scalled dot product -> QK^{T}
    scaled_dot_prod = torch.matmul(Qs, Ks.permute(0, 1, 3, 2)) / math.sqrt(self.d_key)

    # scaled_dot_prod = scaled_dot_prod.to(masks.devices)

    # fill thoes positions of product as (-1e10) where masks positions are 0
    if masks is not None:
      scaled_dot_prod = scaled_dot_prod.masked_fill(masks == 0, -1e10)

    attn_probss = torch.softmax(scaled_dot_prod, dim = -1)

    # attn_probss = attn_probss.to(Vs.devices)

    # multiply by values to get attention
    As = torch.matmul(self.dropout(attn_probss), Vs)


    # reshape attention back to (32, 10, 512)
    As = As.permute(0,2,1,3).contiguous()               # (32, 8, 10, 64) -> (32, 10, 8 ,64)
    As = As.view(batch_sizes, -1, self.n_heads*self.d_key)     # (32, 10, 8, 64) -> (32, 10, 8*64) = (32, 10, 512)

    outputs = self.Wo(As)

    return outputs, attn_probss


class PositionwiseFeedForwards(nn.Module):
  def __init__(self, d_model: int, d_ffn: int, dropout: float = 0.1):
    """
    Args:
      d_model:      dimension of embeddings
      d_ffn:        dimension of feed-forwards network
      dropout:      probability of dropout occuring
    """

    super().__init__()
    self.linear_layer_1 = nn.Linear(d_model, d_ffn)
    self.linear_layer_2 = nn.Linear(d_ffn, d_model)
    self.dropout = nn.Dropout(p=dropout)

  def forward(self, xs):
    """
    Args:
      xs:        outputs from attention (batch_sizes, seq_lengths, d_model)

    Returns:
      expanded-and-contracted representation (batch_sizes, seq_lengths, d_model)
    """

    return self.linear_layer_2(self.dropout(self.linear_layer_1(xs).relu()))



class EncoderLayers(nn.Module):
  def __init__(self, d_model: int, n_heads: int, d_ffn: int, dropout: float):
    """
    Args:
      d_model:      dimension of embeddings
      n_heads:      number of heads
      d_ffn:        dimension of feed-forwards network
      dropout:      probability of dropout ocurring
    """
    super().__init__()
    self.attention = MultiHeadAttentions(d_model, n_heads, dropout)
    self.attn_layer_norm = nn.LayerNorm(d_model)
    self.positionwise_fnn = PositionwiseFeedForwards(d_model, d_ffn, dropout)
    self.fnn_layer_norm = nn.LayerNorm(d_model)
    self.dropout = nn.Dropout(dropout)

  def forward(self, srcs, src_masks):
    """
    Args:
      srcs:      positionally embedded sequences (batch_sizes, seq_lengths, d_model)
      src_masks: masks for the sequences (batch_sizes, 1, 1, seq_lenght)
    Returns:
      srcs:      Sequences after self-attention (batch_sizes, seq_lengths, d_model)
    """

    _srcs, attn_probss = self.attention(srcs, srcs, srcs, src_masks)

    srcs = self.attn_layer_norm(srcs + self.dropout(_srcs))

    _srcs = self.positionwise_fnn(srcs)

    srcs = self.fnn_layer_norm(srcs + self.dropout(_srcs))

    return srcs, attn_probss


class Encoders(nn.Module):
  def __init__(self, d_model: int, n_layers: int, n_heads: int, d_ffn: int, dropout: float = 0.1):
    """
    Args:
      d_model:      dimension of embeddings
      n_layers:     number of encoders layers
      n_heads:      number of heads
      d_ffn:        dimension of feed-forwards network
      dropout:      probability of dropout occuring
    """
    super().__init__()

    # create n_layers encoders
    self.layers = nn.ModuleList([EncoderLayers(d_model, n_heads, d_ffn, dropout) for _ in range(n_layers)])

    self.dropout = nn.Dropout(dropout)

  def forward(self, srcs, src_masks):
    """
    Args:
      srcs:      positionally embedded sequences (batch_sizes, seq_lengths, d_model)
      src_masks: masks for the sequences (batch_sizes, 1, 1, seq_lenght)
    Returns:
      srcs:      Sequences after self-attention (batch_sizes, seq_lengths, d_model)
    """

    # Pass the sequence through each encoders
    for layer in self.layers:
      srcs, attn_probss = layer(srcs, src_masks)

    self.attn_probss = attn_probss
    return srcs


class DecoderLayers(nn.Module):
  def __init__(self, d_model: int, n_heads: int, d_ffn: int, dropout: float):
    """
    Args:
      d_model:      dimension of embeddings
      n_heads:      number of heads
      d_ffn:        dimension of feed-forwards network
      dropout:      probability of dropout occuring
    """
    super().__init__()
    self.masked_attention = MultiHeadAttentions(d_model, n_heads, dropout)
    self.masked_attn_layer_norm = nn.LayerNorm(d_model)
    self.attention = MultiHeadAttentions(d_model, n_heads, dropout)
    self.attn_layer_norm = nn.LayerNorm(d_model)
    self.positionwise_fnn = PositionwiseFeedForwards(d_model, d_ffn, dropout)
    self.fnn_layer_norm = nn.LayerNorm(d_model)
    self.dropout = nn.Dropout(dropout)

  def forward(self, trgs, srcs, trg_masks, src_masks):
    """
    Args:
      trgs:          embedded sequences (batch_sizes, trg_seq_length, d_model)
      srcs:          embedded sequences (batch_sizes, src_seq_length, d_model)
      trg_masks:     masks for the sequences (batch_sizes, 1, trg_seq_length, trg_seq_lengt
      src_masks:     masks for the sequences (batch_sizes, 1, 1, src_seq_length)

    Returns:
      trgs: sequences after self-attention (batch_sizes, trg_seq_length, d_model)
      attn_probss: self-attention softmax scoress (batch_sizes, n_heads, trg_seq_length, src_seq_lenght)
    """

    _trgs, attn_probss = self.masked_attention(trgs, trgs, trgs, trg_masks)

    trgs = self.masked_attn_layer_norm(trgs + self.dropout(_trgs))

    _trgs, attn_probss = self.attention(trgs, srcs, srcs, src_masks)

    trgs = self.attn_layer_norm(trgs + self.dropout(_trgs))

    _trgs = self.positionwise_fnn(trgs)

    trgs = self.fnn_layer_norm(trgs + self.dropout(_trgs))

    return trgs, attn_probss

class Decoders(nn.Module):
  def __init__(self, vocab_size: int, d_model: int, n_layers: int, n_heads: int, d_ffn: int, dropout: float = 0.1):
    """
    Args:
      vocab_size:     size of the target vocabulary
      d_model:        dimension of embeddings
      n_layers:       number of encoders layers
      n_heads:        number of heads
      d_ffn:          dimension of feed-forwards network
      dropout:        probability of dropout occurring
    """
    super().__init__()

    # create n_layers encoders
    self.layers = nn.ModuleList([DecoderLayers(d_model, n_heads, d_ffn, dropout) for _ in range(n_layers)])

    self.dropout = nn.Dropout(dropout)

    # set outputs layer
    self.Wo = nn.Linear(d_model, vocab_size)

  def forward(self, trgs, srcs, trg_masks, src_masks):
      """
      Args:
        trgs:          embedded sequences (batch_sizes, trg_seq_length, d_model)
        srcs:          embedded sequences (batch_sizes, src_seq_length, d_model)
        trg_masks:     masks for the sequences (batch_sizes, 1, trg_seq_length, trg_seq_lengt
        src_masks:     masks for the sequences (batch_sizes, 1, 1, src_seq_length)

      Returns:
        outputs:       sequences after decoders (batch_sizes, trg_seq_length, vocab_size)
        attn_probss:   self-attention softmax scoress (batch_sizes, n_heads, trg_seq_length, src_seq

      """

      # pass the sequences through each decoders
      for layer in self.layers:
        trgs , attn_probss = layer(trgs, srcs, trg_masks, src_masks)

      self.attn_probss = attn_probss
      return self.Wo(trgs)


class Transformers(nn.Module):
  def __init__(self, encoders: Encoders, decoders: Decoders, src_embeds: Embeddingss,
               trg_embeds: Embeddingss, src_pad_idx: int, trg_pad_idx: int, device):
    """
    Args:
      encoders:        encoders stack
      decoders:        decoders stack
      src_embeds:      source embeddings and encodings
      trg_embeds:      target embeddings and encodings
      src_pad_idx:    padding index

      trg_pad_idx:    padding index
      devices:         cpu or gpu

    Returns:
      outputs:         sequences after decoders (batch_sizes, trg_seq_length, vocab_size)
    """
    super().__init__()

    self.encoders = encoders
    self.decoders = decoders
    self.src_embeds = src_embeds
    self.trg_embeds = trg_embeds
    self.device = device
    self.src_pad_idx = src_pad_idx
    self.trg_pad_idx = trg_pad_idx

  def make_src_masks(self, srcs):
    """
    Args:
      srcs:        raws sequence with padding     (batch_sizes, seq_lengths)

    Returns:
      src_masks:   masks for each sequence        (batch_sizes, 1, 1, seq_lenght)
    """
    # assign 1 to tokenss that need attended to and 0 to padding tokenss, then add 2 dimensions
    src_masks = (srcs != self.src_pad_idx).unsqueeze(1).unsqueeze(2)
    return src_masks

  def make_trg_masks(self, trgs):
    """
    Args:
      trgs:        raws sequence with padding     (batch_sizes, seq_lengths)

    Returns:
      trg_masks:   masks for each sequence        (batch_sizes, 1, seq_lengths, seq_lenght)
    """
    seq_lengths = trgs.shape[1]

    # assign True to tokenss that need attended to and False to padding tokenss, then add 2 dimensions
    trg_masks = (trgs != self.trg_pad_idx).unsqueeze(1).unsqueeze(2)

    # generate subsequent masks
    trg_sub_masks = torch.tril(torch.ones((seq_lengths, seq_lengths), device = self.device )).bool()

    # bitwise "and" operator
    trg_masks = trg_masks & trg_sub_masks
    return trg_masks

  def forward(self, srcs, trgs):
    """
    Args
      trgs:        raws target sequence (batch_sizes, trg_seq_length)
      srcs:        raws srcs sequences (batch_sizes, src_seq_length)

    Returns:
      outputs:     sequences after decoders   (batch_sizes, trg_seq_length, output_dim)
    """

    # create source and target masks
    src_masks = self.make_src_masks(srcs)    #(batch_sizes, 1, 1, src_seq_length)
    trg_masks = self.make_trg_masks(trgs)    #(batch_sizes, 1, trg_seq_length, trg_seq_length)

    # push the srcs through the encoders layers
    srcs = self.encoders(self.src_embeds(srcs), src_masks)   # (batch_sizes, src_seq_length, d_model)

    # decoders outputs and attention probabilities
    outputs = self.decoders(self.trg_embeds(trgs), srcs, trg_masks, src_masks)

    return outputs


def make_models(device, src_vocab, trg_vocab, n_layers: int = 3, d_model: int = 128,
               d_ffn: int = 256, n_heads: int = 8, dropout: float = 0.1,
               max_length : int = 80):
  """
    Construct a models when provided parameters.

    Args:
      src_vocab:      source vocubulary
      trg_vocab:      target vocubulary
      n_layers:       Number of encoders and decoders
      d_model:        dimension of embeddinsg
      d_ffn:          dimension of feed-forwaed network
      n_heads:        number of heads
      dropout:        probability of dropout ocurring
      max_length:     maximum sequence length for positional encodings

    Returns:
      Transformers models based on hyperparameters
  """
  encoders = Encoders(d_model, n_layers, n_heads, d_ffn, dropout)
  decoders = Decoders(len(trg_vocab), d_model, n_layers, n_heads, d_ffn, dropout)
  pos_enc_src = PositionalEncodings(d_model, dropout, max_length)
  pos_enc_trg = PositionalEncodings(d_model, dropout, max_length)

  src_embeds = nn.Sequential(Embeddingss(len(src_vocab), d_model), pos_enc_src)
  trg_embeds = nn.Sequential(Embeddingss(len(trg_vocab), d_model), pos_enc_trg)


  # create the Transformers models
  models = Transformers(encoders, decoders, src_embeds, trg_embeds,
                    src_pad_idx = src_vocab['<pad>'],
                    trg_pad_idx = trg_vocab['<pad>'],
                    device = device)

   # initialize parameters with Xavier/Glorot
  for p in models.parameters():
    if p.dim() > 1:
      nn.init.xavier_uniform_(p)

  # models.to(devices)
  return models


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.0.2 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "/usr/local/lib/python3.11/dist-packages/colab_kernel_launcher.py", line 37, in <module>
    ColabKernelApp.launch_instance()
  File "/usr/local/lib/python3.11/dist-packages/traitlets/config/application.py", line 992, in launch_instance
    app.start()
  File "/usr/local/lib/python3.11/dist-packages/ipykernel/kernelapp.py", line 712, in start
    self.io_loop.start()
  File "/usr/local/lib/python3.11/dist-package

In [5]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

models = make_models(device, vocab_src, vocab_trg,
                   n_layers=3, n_heads=8, d_model=256,
                   d_ffn=512, max_length=150)

models.to(device)

Transformers(
  (encoders): Encoders(
    (layers): ModuleList(
      (0-2): 3 x EncoderLayers(
        (attention): MultiHeadAttentions(
          (Wq): Linear(in_features=256, out_features=256, bias=True)
          (Wk): Linear(in_features=256, out_features=256, bias=True)
          (Wv): Linear(in_features=256, out_features=256, bias=True)
          (Wo): Linear(in_features=256, out_features=256, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (attn_layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
        (positionwise_fnn): PositionwiseFeedForwards(
          (linear_layer_1): Linear(in_features=256, out_features=512, bias=True)
          (linear_layer_2): Linear(in_features=512, out_features=256, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (fnn_layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
    )
    (dropout): Dropout(p

In [7]:
checkpoints = torch.load('/content/drive/MyDrive/transformer-model_tel_checkpoint.pt', map_location=device)

# Resize position embeddings
checkpoints['model_state_dict']['src_embed.1.pe'] = checkpoints['model_state_dict']['src_embed.1.pe'][:300]
checkpoints['model_state_dict']['trg_embed.1.pe'] = checkpoints['model_state_dict']['trg_embed.1.pe'][:300]

models.load_state_dict(checkpoints['model_state_dict'])


KeyError: 'src_embed.1.pe'

In [8]:
import random
import torch
import pickle
from torchtext.data.utils import get_tokenizer

# Set seeds
random.seed(42)
torch.manual_seed(42)

# Load your parallel data (modify this based on your custom loading function)
# Example: each line is "english \t telugu"
def read_parallel_datass(path):
    with open(path, encoding='utf-8') as f:
        lines = f.readlines()
    datass = []
    for line in lines:
        parts = line.strip().split('\t')
        if len(parts) == 2:
            src, tgt = parts
            datass.append((src, tgt))
    return datass

# Load parallel data
raw_datass = read_parallel_datass('/content/drive/MyDrive/eng-tel.txt')

# Assume you have a split function if needed
train_data_raws, val_data_raws, test_data_raws = split_datasets(raw_datass)

# Define a TranslationDataset (or modify your QADataset accordingly)
train_dataset = CustomDataset(train_data_raws)  # Use the renamed dataset class

# Load source (English) and target (Telugu) vocabs
with open('/content/drive/MyDrive/vocabs-eng.pkl', 'rb') as f:
    vocab_srcs = pickle.load(f)

with open('/content/drive/MyDrive/vocabs-tel.pkl', 'rb') as f:
    vocab_tgts = pickle.load(f)

# Tokenizers
tokenizer_srcs = get_tokenizer('basic_english')  # For English
tokenizer_tgts = lambda x: list(x.strip())       # For Telugu (character-level or use custom)

# Special tokens
PAD_IDX = vocab_tgts['<pad>']
SOS_IDX = vocab_tgts['<sos>']
EOS_IDX = vocab_tgts['<eos>']

# Setup device and model
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
models = make_models(device, vocab_srcs, vocab_tgts,
                       n_layers=3, n_heads=8, d_model=256,
                       d_ffn=512, max_length=150)

# Load model weights from checkpoint
checkpointss = torch.load('/content/drive/MyDrive/transformer-model_tel_checkpoint.pt', map_location=device)
models.load_state_dict(checkpoints['model_state_dict'])

# Final setup
models.to(device)
models.eval()


Transformers(
  (encoders): Encoders(
    (layers): ModuleList(
      (0-2): 3 x EncoderLayers(
        (attention): MultiHeadAttentions(
          (Wq): Linear(in_features=256, out_features=256, bias=True)
          (Wk): Linear(in_features=256, out_features=256, bias=True)
          (Wv): Linear(in_features=256, out_features=256, bias=True)
          (Wo): Linear(in_features=256, out_features=256, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (attn_layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
        (positionwise_fnn): PositionwiseFeedForwards(
          (linear_layer_1): Linear(in_features=256, out_features=512, bias=True)
          (linear_layer_2): Linear(in_features=512, out_features=256, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (fnn_layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
    )
    (dropout): Dropout(p

In [9]:
def translate_multiline(text, models, devices, vocab_src, vocab_trg, tokenizer_te, max_length=150):
    lines = text.strip().split('\n')  # Split the input text into individual lines
    all_translations = []

    for line in lines:
        if line.strip():  # Skip empty lines
            srcs, trg_tokenss = translate_sentence(line.strip(), models, devices, vocab_src, vocab_trg, tokenizer_te, max_length)
            translated_line = " ".join(trg_tokenss)
            all_translations.append(translated_line)

            # Optional: print source and target for each line
            #print(f"source --> {' '.join(srcs[1:-1])}")  # remove <bos> and <eos>
            #print(f"target translation --> {translated_line}\n")

    return "\n".join(all_translations)


In [10]:

import torch

def translate_sentence(sentence, models, devices, vocab_src, vocab_trg, tokenizer_te, max_length=150):
    models.eval()

    # Check if the input is a string and tokenize accordingly

    if isinstance(sentence, str):
        # Tokenize the sentence using the Telugu tokenizer
        tokenss = tokenizer_te(sentence)
        srcs = ['<bos>'] + [token.text.lower() for token in tokenss] + ['<eos>']
    else:
        srcs = ['<bos>'] + sentence + ['<eos>']

    # Map the tokenss to their respective indices in the source vocabulary
    src_indexess = [vocab_src[token] if token in vocab_src else vocab_src['<unk>'] for token in srcs]

    # Convert the list of indices to a tensor and add a batch dimension
    src_tensors = torch.tensor(src_indexess, dtype=torch.long).unsqueeze(0).to(devices)

    # Initialize the list of target indices with the index of ''
    trg_indexess = [vocab_trg['<bos>']]

    # Initialize the loop to generate tokenss up to a maximum length
    for i in range(max_length):
        # Convert the current list of target indices to a tensor and add a batch dimension
        trg_tensors = torch.tensor(trg_indexess, dtype=torch.long).unsqueeze(0).to(devices)

        with torch.no_grad():
            # Feed the source and target tensors to the models to get the logitss
            outputs = models(src_tensors, trg_tensors)
            pred_tokens = outputs.argmax(2)[:, -1].item()

            # Check if the predicted token is '' or the maximum length is reached
            if pred_tokens == vocab_trg['<eos>'] or i == (max_length - 1):
                # Convert indices to tokenss
                trg_tokenss = [vocab_trg.lookup_token(index) for index in trg_indexess[1:]]  # Skip ''
                return srcs, trg_tokenss

            # Append the predicted token to the list of target indices
            trg_indexess.append(pred_tokens)


src_text = "how was it?"
models = models  # Replace with your actual models
devices = 'cuda' if torch.cuda.is_available() else 'cpu'  # Assuming CUDA is available and appropriate
srcs, trg_tokenss = translate_sentence(src_text, models, devices, vocab_src, vocab_trg, tokenizer_te)
print(f'source --> {" ".join(srcs[1:-1])}')
print(f'target translation --> {" ".join(trg_tokenss)}')

source --> how was it ?
target translation -->   ఎలా <unk> ?


In [11]:
import re

def prettify_text(text):
    # 1. Fix spacing issues around punctuation
    text = re.sub(r"\s+([.,!?'])", r"\1", text)  # remove spaces before punctuation
    text = re.sub(r"([.,!?])([^\s])", r"\1 \2", text)  # ensure space after punctuation

    # 2. Prevent breaking after short exclamations
    text = re.sub(r"([a-zA-Z]+[!?])\s*", r"\1", text)

    # ✅ NEW: Add line breaks before numbered steps (e.g., "1.", "2." etc.)
    text = re.sub(r"\b(\d)\s*\.\s*", r"\n\1. ", text)

    # 3. Break sentences after '.', '!', '?', except for numbers
    text = re.sub(r"(?<!\d)\. ", ".\n", text)
    text = re.sub(r"([!?])\s+", r"\1\n", text)

    # 4. Remove extra blank lines (optional)
    text = re.sub(r"\n{2,}", "\n", text)

    # 5. Strip leading/trailing spaces
    text = text.strip()

    return text

In [12]:

multiline_input = """
I'm here to assist you in finding the nearest ATM in your area. To help me provide accurate information.
i ' ll be glad to assist you with that . to initiate the dispute process , please follow these steps.

"""
prettified_input = prettify_text(multiline_input)
translated_text = translate_multiline(prettified_input, models, device, vocab_src, vocab_trg, tokenizer_te)
#translated_text = translate_multiline(multiline_input, model, device, vocab_src, vocab_trg, tokenizer_te)
print("Full Translation:\n", translated_text)


Full Translation:
   మీ ప్రాంతంలో సమీప <unk> కనుగొనడంలో మీకు సహాయం చేయడానికి నేను ఇక్కడ ఉన్నాను .
  నాకు ఖచ్చితమైన సమాచారాన్ని అందించడంలో సహాయం చేయడానికి నేను సంతోషిస్తున్నాను .
  నేను మీకు సహాయం చేయడానికి సంతోషిస్తాను .
  వివాద ప్రక్రియను ప్రారంభించడానికి , దయచేసి ఈ దశలను అనుసరించండి :


In [13]:

multiline_input = """
absolutely ! i ' m sorry to hear that you ' re having trouble blocking your card . i ' m here to assist you with that . to block your card , please follow these steps 1 . contact our customer support team at { { customer support phone number } } or visit our website at { { company website url } } . 2 . provide them with the necessary details , such as your card number , expiration date , and any other required information , and any additional information , and any other relevant information   expiration date of the card

"""
prettified_input = prettify_text(multiline_input)
translated_text = translate_multiline(prettified_input, models, device, vocab_src, vocab_trg, tokenizer_te)

print("Full Translation:\n", translated_text)



Full Translation:
   మీ కార్డును తిరిగి పొందడంలో మీరు ఎదుర్కొన్న ఏ ' ఎటిఎం <unk> ద్వారా మేము మిమ్మల్ని నవీకరిస్తాము .
  దానికి మీకు సహాయం చేయడానికి మేము ఇక్కడ ఉన్నాము .
  మీ కార్డును నిరోధించడానికి , దయచేసి ఈ దశలను అనుసరించండి :
  1 . { { కస్టమర్ సపోర్ట్ ఫోన్ నంబర్ } at వద్ద మా కస్టమర్ మద్దతు బృందాన్ని సంప్రదించండి లేదా { { కంపెనీ వెబ్‌సైట్ url } at వద్ద మా వెబ్‌సైట్‌ను సందర్శించండి .
  2 . మీ కార్డ్ నంబర్ , గడువు తేదీ మరియు అవసరమైన ఇతర సమాచారం వంటి అవసరమైన వివరాలను వారికి అందించండి , ఇవి కార్డు సంఖ్య మరియు కార్డు యొక్క తేదీ , మొత్తం మరియు ఇతర సంబంధిత సమాచారం .


In [14]:
import math
import time
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from collections import Counter


# Read QA dataset (JSON format)
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data import DataLoader, Dataset, random_split
from collections import Counter
import json

import random
random.seed(42)
torch.manual_seed(42)


# Read QA dataset (JSON format)
def read_qa_data(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        raw = json.load(f)
    raw_data = []
    for item in raw:
        question = item["question"].strip()
        context = item["context"].strip()
        answer = item["answers"]["text"][0].strip()
        input_text = question + " [SEP] " + context
        raw_data.append((input_text, answer))
    return raw_data

# Splitting the dataset
def split_dataset(data, train_split=0.7, val_split=0.15, test_split=0.15):
    total_size = len(data)
    train_size = int(total_size * train_split)
    val_size = int(total_size * val_split)
    test_size = total_size - train_size - val_size
    train_data, remaining_data = random_split(data, [train_size, total_size - train_size])
    val_data, test_data = random_split(remaining_data, [val_size, test_size])
    return list(train_data), list(val_data), list(test_data)

# Custom Dataset class
class QADataset(Dataset):
    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]

    def get_raw_texts(self):
        return [(src, trg) for src, trg in self.data]

# Use basic English tokenizer
tokenizer_en = get_tokenizer('basic_english')

# Build vocabulary
def build_vocabulary(tokenizer, dataset, min_freq=2):
    def yield_tokens(data):
        for src, tgt in data:
            yield tokenizer(src)
            yield tokenizer(tgt)

    vocab = build_vocab_from_iterator(
        yield_tokens(dataset.get_raw_texts()),
        specials=["<pad>", "<unk>", "<sos>", "<eos>"],
        min_freq=min_freq
    )
    vocab.set_default_index(vocab["<unk>"])
    return vocab

# Constants
MAX_PADDING = 300
BATCH_SIZE = 64

# Read and split data
file_path = "/content/drive/MyDrive/dataset.json"
raw_data = read_qa_data(file_path)
train_data_raw, val_data_raw, test_data_raw = split_dataset(raw_data)

# Create datasets
train_dataset = QADataset(train_data_raw)
valid_dataset = QADataset(val_data_raw)
test_dataset = QADataset(test_data_raw)

# Build vocab
vocab = build_vocabulary(tokenizer_en, train_dataset)
import pickle

# # Save the vocab to your Google Drive
# with open('/content/drive/MyDrive/vocab.pkl', 'wb') as f:
#     pickle.dump(vocab, f)


PAD_IDX = vocab['<pad>']
SOS_IDX = vocab['<sos>']
EOS_IDX = vocab['<eos>']

# Batch generation function
def generate_batch(data_batch):
    src_batch, tgt_batch = [], []
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    for (src_text, tgt_text) in data_batch:
        src_indices = torch.tensor([vocab[token] for token in tokenizer_en(src_text)], dtype=torch.long)
        tgt_indices = torch.tensor([vocab[token] for token in tokenizer_en(tgt_text)], dtype=torch.long)

        src_tensor = torch.cat([torch.tensor([SOS_IDX]), src_indices, torch.tensor([EOS_IDX])]).to(device)
        tgt_tensor = torch.cat([torch.tensor([SOS_IDX]), tgt_indices, torch.tensor([EOS_IDX])]).to(device)

        src_padded = F.pad(src_tensor, (0, MAX_PADDING - len(src_tensor)), value=PAD_IDX)
        tgt_padded = F.pad(tgt_tensor, (0, MAX_PADDING - len(tgt_tensor)), value=PAD_IDX)

        src_batch.append(src_padded)
        tgt_batch.append(tgt_padded)

    return torch.stack(src_batch), torch.stack(tgt_batch)

# DataLoader setup
train_iter = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, drop_last=True, collate_fn=generate_batch)
valid_iter = DataLoader(valid_dataset, batch_size=BATCH_SIZE, shuffle=True, drop_last=True, collate_fn=generate_batch)
test_iter = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=True, drop_last=True, collate_fn=generate_batch)
print(len(vocab))

class Embeddings(nn.Module):
    def __init__(self, vocab_size: int, d_model: int):
        """
        Args:
          vocab_size:    size of vocabulary
          d_model:       dimension of embeddings
        """
        super().__init__()
        self.lut = nn.Embedding(vocab_size, d_model)
        self.d_model = d_model

    def forward(self, x):
        """
        Args:
          x:        input tensor (batch_size, seq_length)

        Returns:  embedding vector
        """
        return self.lut(x) * math.sqrt(self.d_model)

class PositionalEncoding(nn.Module):
    def __init__(self, d_model: int, dropout: float = 0.1, max_length: int = 5000):
        """
        Args:
            d_model:     dimension of embeddings
            dropout:     dropout probability
            max_length:  max sequence length for positional encoding
        """
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_length, d_model)
        for pos in range(max_length):
            for i in range(0, d_model // 2):
                theta = pos / (100 ** ((2 * i) / d_model))
                pe[pos, 2 * i] = math.sin(theta)
                pe[pos, 2 * i + 1] = math.cos(theta)

        self.register_buffer("pe", pe)

    def forward(self, x):
        """
        Args:
            x: embeddings (batch_size, seq_length, d_model)
        Returns:
            embeddings + positional encodings (batch_size, seq_length, d_model)
        """
        x = x + self.pe[:x.size(1)].unsqueeze(0).requires_grad_(False)
        return self.dropout(x)

import torch
import torch.nn as nn
import torch.nn.functional as F
import math

class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, n_heads, dropout: float = 0.1):
        """
        Args:
            d_model:      dimension of embeddings
            n_heads:      number of attention heads
            dropout:      dropout probability
        """
        super().__init__()
        self.d_model = d_model
        self.n_heads = n_heads
        self.d_key = d_model // n_heads

        self.Wq = nn.Linear(d_model, d_model)
        self.Wk = nn.Linear(d_model, d_model)
        self.Wv = nn.Linear(d_model, d_model)
        self.Wo = nn.Linear(d_model, d_model)

        self.dropout = nn.Dropout(p=dropout)

    def forward(self, query, key, value, mask=None):
        """
        Args:
            query: (batch_size, q_len, d_model)
            key:   (batch_size, k_len, d_model)
            value: (batch_size, v_len, d_model)
            mask:  optional mask (batch_size, 1, 1, k_len)  or (batch_size, 1, q_len, k_len)

        Returns:
            output:     (batch_size, q_len, d_model)
            attn_probs: (batch_size, n_heads, q_len, k_len)
        """
        batch_size = query.size(0)

        Q = self.Wq(query)  # (batch_size, q_len, d_model)
        K = self.Wk(key)    # (batch_size, k_len, d_model)
        V = self.Wv(value)  # (batch_size, v_len, d_model)

        # Split heads
        Q = Q.view(batch_size, -1, self.n_heads, self.d_key).transpose(1, 2)  # (batch_size, n_heads, q_len, d_key)
        K = K.view(batch_size, -1, self.n_heads, self.d_key).transpose(1, 2)  # (batch_size, n_heads, k_len, d_key)
        V = V.view(batch_size, -1, self.n_heads, self.d_key).transpose(1, 2)  # (batch_size, n_heads, v_len, d_key)

        # Attention scores
        scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.d_key)  # (batch_size, n_heads, q_len, k_len)

        if mask is not None:
            # mask shape must broadcast to (batch_size, n_heads, q_len, k_len)
            scores = scores.masked_fill(mask == 0, float('-inf'))

        attn_probs = F.softmax(scores, dim=-1)
        attn_probs = self.dropout(attn_probs)

        A = torch.matmul(attn_probs, V)  # (batch_size, n_heads, q_len, d_key)

        A = A.transpose(1, 2).contiguous().view(batch_size, -1, self.n_heads * self.d_key)  # (batch_size, q_len, d_model)
        output = self.Wo(A)

        return output, attn_probs


def create_padding_mask(seq, pad_idx):
    """
    Args:
      seq: tensor (batch_size, seq_len)
      pad_idx: padding token index

    Returns:
      mask: (batch_size, 1, 1, seq_len)
    """
    mask = (seq != pad_idx).unsqueeze(1).unsqueeze(2)  # (batch_size, 1, 1, seq_len)
    return mask


class PositionwiseFeedForward(nn.Module):
    def __init__(self, d_model: int, d_ffn: int, dropout: float = 0.1):
        """
        Args:
            d_model: embedding dimension
            d_ffn: hidden dimension of feed-forward layer
            dropout: dropout probability
        """
        super().__init__()
        self.linear1 = nn.Linear(d_model, d_ffn)
        self.linear2 = nn.Linear(d_ffn, d_model)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, x):
        """
        Args:
            x: input tensor (batch_size, seq_length, d_model)

        Returns:
            transformed tensor (batch_size, seq_length, d_model)
        """
        return self.linear2(self.dropout(F.relu(self.linear1(x))))


class EncoderLayer(nn.Module):
      def __init__(self, d_model: int, n_heads: int, d_ffn: int, dropout: float):
          """
          Single encoder block for answer generation (same as translation)
          """
          super().__init__()
          self.attention = MultiHeadAttention(d_model, n_heads, dropout)
          self.attn_layer_norm = nn.LayerNorm(d_model)
          self.positionwise_fnn = PositionwiseFeedForward(d_model, d_ffn, dropout)
          self.fnn_layer_norm = nn.LayerNorm(d_model)
          self.dropout = nn.Dropout(dropout)

      def forward(self, src, src_mask):
          """
          src: (batch_size, seq_len, d_model)
          src_mask: optional padding mask
          """
          _src, attn_probs = self.attention(src, src, src, src_mask)
          src = self.attn_layer_norm(src + self.dropout(_src))

          _src = self.positionwise_fnn(src)
          src = self.fnn_layer_norm(src + self.dropout(_src))

          return src, attn_probs


class Encoder(nn.Module):
      def __init__(self, d_model: int, n_layers: int, n_heads: int, d_ffn: int, dropout: float = 0.1):
          """
          Stack of EncoderLayers
          """
          super().__init__()
          self.layers = nn.ModuleList([
              EncoderLayer(d_model, n_heads, d_ffn, dropout)
              for _ in range(n_layers)
          ])
          self.dropout = nn.Dropout(dropout)

      def forward(self, src, src_mask):
          """
          src: embedded input sequence (batch, seq_len, d_model)
          src_mask: padding mask
          """
          for layer in self.layers:
              src, attn_probs = layer(src, src_mask)

          self.attn_probs = attn_probs  # Save last attention map if needed
          return src

class DecoderLayer(nn.Module):
    def __init__(self, d_model: int, n_heads: int, d_ffn: int, dropout: float):
        super().__init__()
        self.masked_attention = MultiHeadAttention(d_model, n_heads, dropout)
        self.masked_attn_layer_norm = nn.LayerNorm(d_model)

        self.attention = MultiHeadAttention(d_model, n_heads, dropout)
        self.attn_layer_norm = nn.LayerNorm(d_model)

        self.positionwise_fnn = PositionwiseFeedForward(d_model, d_ffn, dropout)
        self.fnn_layer_norm = nn.LayerNorm(d_model)

        self.dropout = nn.Dropout(dropout)

    def forward(self, trg, src, trg_mask, src_mask):
        """
        trg: (batch, tgt_len, d_model) - decoder input
        src: (batch, src_len, d_model) - encoder output
        trg_mask: (batch, 1, tgt_len, tgt_len) - causal mask
        src_mask: (batch, 1, 1, src_len) - padding mask
        """
        _trg, attn_probs = self.masked_attention(trg, trg, trg, trg_mask)
        trg = self.masked_attn_layer_norm(trg + self.dropout(_trg))

        _trg, attn_probs = self.attention(trg, src, src, src_mask)
        trg = self.attn_layer_norm(trg + self.dropout(_trg))

        _trg = self.positionwise_fnn(trg)
        trg = self.fnn_layer_norm(trg + self.dropout(_trg))

        return trg, attn_probs


class Decoder(nn.Module):
    def __init__(self, vocab_size: int, d_model: int, n_layers: int, n_heads: int, d_ffn: int, dropout: float = 0.1):
        super().__init__()
        self.layers = nn.ModuleList([
            DecoderLayer(d_model, n_heads, d_ffn, dropout)
            for _ in range(n_layers)
        ])
        self.dropout = nn.Dropout(dropout)
        self.Wo = nn.Linear(d_model, vocab_size)

    def forward(self, trg, src, trg_mask, src_mask):
        """
        trg: embedded target tokens (batch, tgt_len, d_model)
        src: encoder output (batch, src_len, d_model)
        trg_mask: causal mask for target (batch, 1, tgt_len, tgt_len)
        src_mask: padding mask for encoder output (batch, 1, 1, src_len)
        """
        for layer in self.layers:
            trg, attn_probs = layer(trg, src, trg_mask, src_mask)

        self.attn_probs = attn_probs
        return self.Wo(trg)


class Transformer(nn.Module):
    def __init__(self, encoder: Encoder, decoder: Decoder, src_embed: Embeddings,
                 trg_embed: Embeddings, src_pad_idx: int, trg_pad_idx: int, device):
        """
        Args:
            encoder:        encoder stack
            decoder:        decoder stack
            src_embed:      question/context embeddings
            trg_embed:      answer embeddings
            src_pad_idx:    padding index for input
            trg_pad_idx:    padding index for output
            device:         cpu or gpu
        """
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.src_embed = src_embed
        self.trg_embed = trg_embed
        self.device = device
        self.src_pad_idx = src_pad_idx
        self.trg_pad_idx = trg_pad_idx

    def make_src_mask(self, src):
        """
        Creates padding mask for encoder input
        """
        return (src != self.src_pad_idx).unsqueeze(1).unsqueeze(2)

    def make_trg_mask(self, trg):
        """
        Creates padding + causal mask for decoder input
        """
        seq_length = trg.size(1)
        trg_pad_mask = (trg != self.trg_pad_idx).unsqueeze(1).unsqueeze(2)
        trg_sub_mask = torch.tril(torch.ones((seq_length, seq_length), device=self.device)).bool()
        return trg_pad_mask & trg_sub_mask

    def forward(self, src, trg):
        """
        src: (batch_size, src_seq_length) - question + context
        trg: (batch_size, trg_seq_length) - answer (shifted)
        Returns:
            logits: (batch_size, trg_seq_length, vocab_size)
        """
        src_mask = self.make_src_mask(src)
        trg_mask = self.make_trg_mask(trg)

        memory = self.encoder(self.src_embed(src), src_mask)
        output = self.decoder(self.trg_embed(trg), memory, trg_mask, src_mask)

        return output




def make_model(device, src_vocab, trg_vocab, n_layers: int = 3, d_model: int = 256,
               d_ffn: int = 2048, n_heads: int = 8, dropout: float = 0.1,
               max_length: int = 5000):
    """
    Constructs a Transformer model for answer generation.

    Args:
        src_vocab: source vocabulary (e.g. for question + context)
        trg_vocab: target vocabulary (e.g. for answer)
        device: torch device
        n_layers: number of encoder & decoder layers
        d_model: embedding dimension
        d_ffn: feed-forward hidden dimension
        n_heads: number of attention heads
        dropout: dropout probability
        max_length: max sequence length for positional encoding

    Returns:
        A full Transformer model instance
    """
    encoder = Encoder(d_model, n_layers, n_heads, d_ffn, dropout)
    decoder = Decoder(len(trg_vocab), d_model, n_layers, n_heads, d_ffn, dropout)

    src_embed = Embeddings(len(src_vocab), d_model)
    trg_embed = Embeddings(len(trg_vocab), d_model)
    pos_enc = PositionalEncoding(d_model, dropout, max_length)

    model = Transformer(
        encoder,
        decoder,
        nn.Sequential(src_embed, pos_enc),
        nn.Sequential(trg_embed, pos_enc),
        src_pad_idx=src_vocab.get_stoi().get("<pad>", 0),
        trg_pad_idx=trg_vocab.get_stoi().get("<pad>", 0),
        device=device
    )

    for p in model.parameters():
        if p.dim() > 1:
            nn.init.xavier_uniform_(p)

    return model


import math
# Set device for model (GPU if available, otherwise CPU)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Build vocab from source and target datasets
# Assuming you have already built vocab_src and vocab_trg using the build_vocabulary function.
# If not, you need to do that before calling make_model. Here is how to get them:
# vocab_src = build_vocabulary(tokenizer_en, train_dataset)  # For question + context
# vocab_trg = build_vocabulary(tokenizer_en, train_dataset)  # For answer (it might differ in some cases)

# Create the model using make_model function
model = make_model(device, vocab, vocab,  # Assuming vocab_src and vocab_trg are the same for now
                   n_layers=3, n_heads=8, d_model=256,
                   d_ffn=512, max_length=300)

# Move model to the correct device (GPU or CPU)
model.to(device)

6681


Transformer(
  (encoder): Encoder(
    (layers): ModuleList(
      (0-2): 3 x EncoderLayer(
        (attention): MultiHeadAttention(
          (Wq): Linear(in_features=256, out_features=256, bias=True)
          (Wk): Linear(in_features=256, out_features=256, bias=True)
          (Wv): Linear(in_features=256, out_features=256, bias=True)
          (Wo): Linear(in_features=256, out_features=256, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (attn_layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
        (positionwise_fnn): PositionwiseFeedForward(
          (linear1): Linear(in_features=256, out_features=512, bias=True)
          (linear2): Linear(in_features=512, out_features=256, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (fnn_layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
    )
    (dropout): Dropout(p=0.1, inplace=False)

In [15]:
# Set seeds
import random
import torch
import pickle
from torchtext.data.utils import get_tokenizer

random.seed(42)
torch.manual_seed(42)

# Load your raw data
raw_data = read_qa_data('/content/drive/MyDrive/dataset.json')

# Recreate train/val/test splits
train_data_raw, val_data_raw, test_data_raw = split_dataset(raw_data)

# Recreate datasets
train_dataset = QADataset(train_data_raw)

# Load saved vocab instead of rebuilding
with open('/content/drive/MyDrive/vocab.pkl', 'rb') as f:
    vocab = pickle.load(f)

# Tokenizer (should be the same as used when saving vocab)
tokenizer_en = get_tokenizer('basic_english')

# Special token indices
PAD_IDX = vocab['<pad>']
SOS_IDX = vocab['<sos>']
EOS_IDX = vocab['<eos>']

# Recreate model
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = make_model(device, vocab, vocab,  # vocab_src and vocab_tgt are the same
                   n_layers=3, n_heads=8, d_model=256,
                   d_ffn=512, max_length=300)

# Load trained weights
model.load_state_dict(torch.load('/content/drive/MyDrive/transformer1_model.pth', map_location=device))
model.to(device)
model.eval()


Transformer(
  (encoder): Encoder(
    (layers): ModuleList(
      (0-2): 3 x EncoderLayer(
        (attention): MultiHeadAttention(
          (Wq): Linear(in_features=256, out_features=256, bias=True)
          (Wk): Linear(in_features=256, out_features=256, bias=True)
          (Wv): Linear(in_features=256, out_features=256, bias=True)
          (Wo): Linear(in_features=256, out_features=256, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (attn_layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
        (positionwise_fnn): PositionwiseFeedForward(
          (linear1): Linear(in_features=256, out_features=512, bias=True)
          (linear2): Linear(in_features=512, out_features=256, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (fnn_layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
    )
    (dropout): Dropout(p=0.1, inplace=False)

In [16]:
def generate_answer(model, src_sentence, src_vocab, trg_vocab, tokenizer, max_len=300, device='cuda'):
    """
    Generate an answer from a given input sentence using the trained Transformer model.

    Args:
        model: Trained Transformer model.
        src_sentence: Source sentence (question + context) as a string.
        src_vocab: Vocabulary object for source side.
        trg_vocab: Vocabulary object for target side.
        tokenizer: Tokenizer function to split sentence into tokens.
        max_len: Maximum length of generated output.
        device: CPU or GPU.

    Returns:
        generated_answer: The generated answer as a string.
    """
    model.eval()

    # Tokenize and numericalize
    tokens = tokenizer(src_sentence.lower())
    tokens = ['<sos>'] + tokens + ['<eos>']  # adding special tokens if needed

    src_indexes = [src_vocab.get_stoi().get(token, src_vocab.get_stoi()['<unk>']) for token in tokens]
    src_tensor = torch.LongTensor(src_indexes).unsqueeze(0).to(device)  # shape (1, src_len)

    src_mask = model.make_src_mask(src_tensor)

    # Encode the source
    with torch.no_grad():
        memory = model.encoder(model.src_embed(src_tensor), src_mask)

    # Start decoding
    trg_indexes = [trg_vocab.get_stoi()['<sos>']]  # start with <sos>

    for i in range(max_len):
        trg_tensor = torch.LongTensor(trg_indexes).unsqueeze(0).to(device)  # shape (1, len_so_far)
        trg_mask = model.make_trg_mask(trg_tensor)

        with torch.no_grad():
            output = model.decoder(model.trg_embed(trg_tensor), memory, trg_mask, src_mask)

        pred_token = output.argmax(-1)[:, -1].item()  # last token prediction

        trg_indexes.append(pred_token)

        if pred_token == trg_vocab.get_stoi()['<eos>']:
            break

    # Convert generated indexes back to tokens
    trg_tokens = [trg_vocab.get_itos()[i] for i in trg_indexes]

    # Remove special tokens
    generated_answer = trg_tokens[1:-1]  # skip <sos> and <eos>

    return ' '.join(generated_answer)

# Example usage
src_sentence = "how do i activate my card?"
print(src_sentence)
generated = generate_answer(model, src_sentence, vocab, vocab, tokenizer_en, device=device)
print("Generated Answer:", generated)


src_sentence = "how to apply for a loan?"
print(src_sentence)
generated = generate_answer(model, src_sentence, vocab, vocab, tokenizer_en, device=device)
print("Generated Answer:", generated)


how do i activate my card?
Generated Answer: sure ! i ' m here to assist you with activating your card for international usage . to activate your card , please follow these steps 1 . contact our customer support team at {{customer support phone number}} or visit our website at {{company website url}} . 2 . provide them with your card details , such as the card number , expiration date , and cvv code . 3 . they will guide you through the activation process and ensure that your card is activated for international transactions . if you have any further questions or need additional assistance , feel free to let me know . i ' m here to help ! let me know if you need any more details .
how to apply for a loan?
Generated Answer: absolutely ! i ' m here to assist you with applying for a loan . applying for a loan can be a significant decision , and i ' ll guide you through the process . here ' s what you need to do 1 . gather your financial information this includes your income statements , id

In [17]:
src_sentence = "how to apply for a loan?"
print(src_sentence)
generated = generate_answer(model, src_sentence, vocab, vocab, tokenizer_en, device=device)
#print("Generated Answer:", generated)

prettified=prettify_text(generated)
#print(prettified)
translated_text = translate_multiline(prettified, models, device, vocab_src, vocab_trg, tokenizer_te)
print(translated_text)

how to apply for a loan?
  రుణం కోసం దరఖాస్తు చేయడంలో మీకు సహాయం చేయడానికి మేము ఇక్కడ ఉన్నాము .
  రుణం కోసం దరఖాస్తు చేసుకోవడం ఒక ముఖ్యమైన నిర్ణయం , మరియు నేను ఈ ప్రక్రియ ద్వారా మీకు మార్గనిర్దేశం చేస్తాను .
  మీరు ఏమి చేయాలో ఇక్కడ ఉంది :
  1 . మీ ఆదాయ ప్రకటనలు , గుర్తింపు పత్రాలు మరియు ఇతర సంబంధిత ఆర్థిక సమాచారాన్ని సేకరించండి .
  2 . వేర్వేరు రుణదాతలను పరిశోధించడానికి మరియు వారి వడ్డీ రేట్లు , నిబంధనలు మరియు అర్హత ప్రమాణాలను పోల్చడానికి కొంత సమయం కేటాయించండి .
  ఇది మీ అవసరాలకు ఉత్తమంగా సరిపోయేలా కనుగొనడంలో మీకు సహాయపడుతుంది .
  3 . మీ అవసరాలు మరియు ఆర్థిక పరిస్థితులకు అనుగుణంగా చర్చించడానికి రుణదాతలు లేదా రుణ సంఘాలను సంప్రదించండి .
  వారు మీకు అవసరమైన ఫారాలను అందిస్తారు మరియు అప్లికేషన్ ప్రాసెస్ ద్వారా మీకు మార్గనిర్దేశం చేస్తారు .
  4 . దరఖాస్తు ఫారమ్‌ను ఖచ్చితంగా నింపండి మరియు అవసరమైన అన్ని సమాచారాన్ని అందించండి .
  మొత్తం సమాచారాన్ని సమర్పించే ముందు రెండుసార్లు తనిఖీ చేయాలని నిర్ధారించుకోండి .
  5 . మీ దరఖాస్తును సమర్పించిన తర్వాత , రుణదాత దానిని సమీక్షిస్తాడు మరియు మీ అర్హతను అం

In [18]:
!pip install fastapi uvicorn nest-asyncio pyngrok


Collecting fastapi
  Downloading fastapi-0.115.12-py3-none-any.whl.metadata (27 kB)
Collecting uvicorn
  Downloading uvicorn-0.34.2-py3-none-any.whl.metadata (6.5 kB)
Collecting pyngrok
  Downloading pyngrok-7.2.9-py3-none-any.whl.metadata (9.3 kB)
Collecting starlette<0.47.0,>=0.40.0 (from fastapi)
  Downloading starlette-0.46.2-py3-none-any.whl.metadata (6.2 kB)
Downloading fastapi-0.115.12-py3-none-any.whl (95 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m95.2/95.2 kB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading uvicorn-0.34.2-py3-none-any.whl (62 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.5/62.5 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyngrok-7.2.9-py3-none-any.whl (25 kB)
Downloading starlette-0.46.2-py3-none-any.whl (72 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m72.0/72.0 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: uvicorn, pyngrok, s

In [19]:
!ngrok config add-authtoken 2wZz95J3CJIULHdK7wQoAh6H7oA_749tnWhD5qsfHSwzUqgmu

Authtoken saved to configuration file: /root/.config/ngrok/ngrok.yml


In [None]:
from fastapi import FastAPI, Request
from pydantic import BaseModel
import uvicorn
import nest_asyncio
from pyngrok import ngrok

from fastapi.middleware.cors import CORSMiddleware

# Apply Colab compatibility
nest_asyncio.apply()

# Define FastAPI app
app = FastAPI()

app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Request body structure
class QueryInput(BaseModel):
    question: str

# Route
@app.post("/generate")
async def generate_route(query: QueryInput):
    src_sentence = query.question
    answer = generate_answer(model, src_sentence, vocab, vocab, tokenizer_en, device=device)
    answer = generate_answer(model, src_sentence, vocab, vocab, tokenizer_en, device=device)
    prettified = prettify_text(answer)
    translated_text = translate_multiline(prettified, models, device, vocab_src, vocab_trg, tokenizer_te)
    return {"question": src_sentence, "answer": translated_text}

# Start server via ngrok
public_url = ngrok.connect(8000)
print("🚀 FastAPI app running at:", public_url)

uvicorn.run(app, port=8000)

In [20]:
!pip install gTTS

Collecting gTTS
  Downloading gTTS-2.5.4-py3-none-any.whl.metadata (4.1 kB)
Collecting click<8.2,>=7.1 (from gTTS)
  Downloading click-8.1.8-py3-none-any.whl.metadata (2.3 kB)
Downloading gTTS-2.5.4-py3-none-any.whl (29 kB)
Downloading click-8.1.8-py3-none-any.whl (98 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.2/98.2 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: click, gTTS
  Attempting uninstall: click
    Found existing installation: click 8.2.1
    Uninstalling click-8.2.1:
      Successfully uninstalled click-8.2.1
Successfully installed click-8.1.8 gTTS-2.5.4


In [21]:
from gtts import gTTS
from IPython.display import Audio
import tempfile

def speak_telugu(text):
    # Step 1: Convert text to Telugu TTS
    tts = gTTS(text=text, lang='te')

    # Step 2: Save to a temporary file (auto-cleaned, no manual handling)
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as fp:
        tts.save(fp.name)
        return Audio(fp.name, autoplay=True)

In [22]:
speak_telugu(translated_text)

In [23]:
from fastapi import FastAPI
from pydantic import BaseModel
import uvicorn
import nest_asyncio
from pyngrok import ngrok
from fastapi.middleware.cors import CORSMiddleware
from gtts import gTTS
from IPython.display import Audio
import tempfile
import base64
import io

nest_asyncio.apply()

app = FastAPI()

app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

class QueryInput(BaseModel):
    question: str

# Your existing TTS function adapted to return base64 audio string (better for API response)
def speak_telugu(text):
    tts = gTTS(text=text, lang='te')
    audio_fp = io.BytesIO()
    tts.write_to_fp(audio_fp)
    audio_fp.seek(0)
    audio_base64 = base64.b64encode(audio_fp.read()).decode('utf-8')
    return audio_base64

@app.post("/generate")
async def generate_route(query: QueryInput):
    src_sentence = query.question

    # Your existing answer generation pipeline (placeholder)
    answer = generate_answer(model, src_sentence, vocab, vocab, tokenizer_en, device=device)
    prettified = prettify_text(answer)
    translated_text = translate_multiline(prettified, models, device, vocab_src, vocab_trg, tokenizer_te)

    # Get audio in base64
    audio_base64 = speak_telugu(translated_text)

    return {
        "question": src_sentence,
        "answer": translated_text,
        "audio_base64": audio_base64,
    }

# Start ngrok tunnel and Uvicorn server
public_url = ngrok.connect(8000)
print("🚀 FastAPI app running at:", public_url)

uvicorn.run(app, port=8000)


INFO:     Started server process [2008]
INFO:     Waiting for application startup.
INFO:     Application startup complete.
INFO:     Uvicorn running on http://127.0.0.1:8000 (Press CTRL+C to quit)


🚀 FastAPI app running at: NgrokTunnel: "https://fb25-35-240-245-249.ngrok-free.app" -> "http://localhost:8000"
INFO:     2409:40f0:f:ade5:a994:d0bc:af15:cf2f:0 - "OPTIONS /generate HTTP/1.1" 200 OK
INFO:     2409:40f0:f:ade5:a994:d0bc:af15:cf2f:0 - "POST /generate HTTP/1.1" 200 OK


INFO:     Shutting down
INFO:     Waiting for application shutdown.
INFO:     Application shutdown complete.
INFO:     Finished server process [2008]
