#### Connect Google Drive

In [None]:
import os
from google.colab import drive
# Mount Google Drive inside the Colab VM so the project files become reachable.
drive.mount('/content/drive/')
# Work from the project folder so the relative paths used later in the notebook
# ('embeddings/...', 'data/...', 'models/...') resolve against it.
os.chdir('/content/drive/MyDrive/Colab/bangLaSTM')

# Last expression of the cell: displays the resolved working directory.
os.getcwd()


Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


'/content/drive/.shortcut-targets-by-id/1PnMuZB2WBL9gApz52tmg-tlFJkJ8O4vn/bangLaSTM'

#### Imports

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.nn.utils.rnn import pack_padded_sequence, pad_sequence
from torch.utils.data import Dataset, DataLoader
from gensim.models import FastText
import numpy as np
import pandas as pd
import re
from tqdm.auto import tqdm
import torch.nn.functional as F
import random
from collections import Counter

Choose model

In [3]:
# Exactly one pretrained FastText model is loaded; the commented-out lines are the
# alternative model files that can be swapped in instead.
#chosen_embeddings = FastText.load("embeddings/cc.bn.300.model")
#chosen_embeddings = FastText.load("embeddings/ai4b_subset_sg.model")
chosen_embeddings = FastText.load("embeddings/ai4b_subset_fair.model")

Create embedding matrix

In [4]:
def build_simple_embedding(gensim_model:FastText, keep_n = 150000):
    """Turn a gensim FastText model into a trainable embedding layer and a vocabulary map.

    Args:
        gensim_model: loaded gensim FastText model; only its ``wv`` keyed vectors are read.
        keep_n: number of leading vocabulary entries to keep. This relies on the
            gensim vocabulary being ordered most-frequent-first, so the leading
            slice keeps the most common words.

    Returns:
        (embedding_layer, word2idx):
            embedding_layer: ``nn.Embedding`` with ``4 + kept words`` rows, not frozen,
                with ``padding_idx=0``.
            word2idx: dict mapping each token (special or pretrained) to its row index.
    """
    keyed_vectors = gensim_model.wv
    dim = keyed_vectors.vector_size

    # Rows 0-3 are reserved for the special tokens:
    #   <PAD> is all zeros; <BOS>, <EOS>, <UNK> start as Gaussian noise scaled by 0.1
    #   so that their magnitude stays small.
    pad_row = torch.zeros(1, dim)
    special_rows = torch.randn(3, dim) * 0.1

    # Rows 4.. are the first keep_n pretrained vectors.
    pretrained_rows = torch.FloatTensor(keyed_vectors.vectors[:keep_n])

    weight_matrix = torch.cat([pad_row, special_rows, pretrained_rows], dim=0)
    embedding_layer = nn.Embedding.from_pretrained(weight_matrix, freeze=False, padding_idx=0)

    # Token -> row index; pretrained words are shifted by the 4 special rows.
    word2idx = {'<PAD>': 0, '<BOS>': 1, '<EOS>': 2, '<UNK>': 3}
    word2idx.update(
        (token, row)
        for row, token in enumerate(keyed_vectors.index_to_key[:keep_n], start=4)
    )

    return embedding_layer, word2idx

In [5]:
# Build the embedding layer (shared later by encoder and decoder) and the
# token -> index vocabulary from the chosen FastText model.
embedding_layer, word_to_index = build_simple_embedding(chosen_embeddings)
# Size = 4 special tokens + the kept pretrained words.
print('Vocabulary size:', len(word_to_index))

Vocabulary size: 150004


(free up RAM)

In [6]:
# The kept vectors now live in embedding_layer (torch.cat built a new tensor),
# so the gensim model name can be dropped to let its memory be reclaimed.
del chosen_embeddings

Setup tokenization

In [7]:
def tokenize_bangla(text):
    """Split ``text`` into a list of tokens.

    Runs of Bengali-block characters (U+0980-U+09FF), ZWNJ/ZWJ and ASCII digits
    stay together as one token; every other non-whitespace character becomes a
    token of its own. ``text`` is passed through ``str()`` first, so non-string
    inputs are tokenised by their string form.
    """
    # Surround each "foreign" character with spaces so that split() isolates it.
    isolate_symbols = re.compile(r'([^\u0980-\u09FF\u200C\u200D0-9\s])')
    padded = isolate_symbols.sub(r' \1 ', str(text))
    return padded.split()

def detokenize_bangla(word_list):
    """Join tokens back into a single string, separated by single spaces."""
    # Punctuation spacing is deliberately NOT restored: after tokenisation there is
    # no way to tell an opening from a closing parenthesis, or a symmetric 'quote'
    # from an apostrophe, so every token is simply space-separated.
    separator = " "
    return separator.join(word_list)

### Encoder - BiLSTM

In [8]:
class BiLSTMEncoder(nn.Module):
    """Single-layer bidirectional LSTM that summarises a padded batch of token ids.

    Each sequence is compressed into two context vectors (final hidden state and
    final cell state), each of size ``2 * hidden_size`` (forward and backward
    directions concatenated).
    """

    def __init__(self, embedding_layer, hidden_size):
        """
        Args:
            embedding_layer: ``nn.Embedding`` used to look up token vectors.
            hidden_size: hidden size of EACH LSTM direction.
        """
        super().__init__()

        self.embedding = embedding_layer
        self.hidden_size = hidden_size

        self.lstm = nn.LSTM(
            embedding_layer.embedding_dim,  # input size = embedding width
            hidden_size,
            num_layers=1,
            batch_first=True,
            bidirectional=True,
        )

    @staticmethod
    def _join_directions(state):
        """Concatenate forward and backward final states along the feature axis.

        ``state`` has shape (num_layers * num_directions, batch, hidden) = (2, batch, hidden);
        index 0 is the forward direction and index 1 the backward direction, each of
        shape (batch, hidden). The result has shape (batch, 2 * hidden).
        """
        forward_state, backward_state = state[0], state[1]
        return torch.cat((forward_state, backward_state), dim=1)

    def forward(self, x, lengths):
        """Encode a batch.

        Args:
            x: token ids, shape (batch_size, longest_seq_len), padded with <PAD>.
               Example (batch of 4, longest length 8):
                   [ 1, 45,  89,  12,  56,  90,  34,   2]
                   [ 1, 19, 102,  77, 210,  14,   2,   0]
                   [ 1, 65,  23,  11,   2,   0,   0,   0]
                   [ 1, 99,  41,   2,   0,   0,   0,   0]
            lengths: true (unpadded) length of each row, shape (batch_size,),
               e.g. [8, 7, 5, 4].

        Returns:
            (h_context, c_context), each of shape (batch_size, 2 * hidden_size).
        """
        # (batch, seq) -> (batch, seq, embed_dim)
        token_vectors = self.embedding(x)

        # Packing makes the LSTM skip the padded positions entirely, so <PAD> never
        # influences the final states. Lengths must live on the CPU for packing;
        # enforce_sorted=False means the batch does NOT need to be pre-sorted by length.
        packed = pack_padded_sequence(
            token_vectors,
            lengths.cpu(),
            batch_first=True,
            enforce_sorted=False,
        )

        # Only the final (hidden, cell) states are needed; per-step outputs are dropped.
        _, (hidden, cell) = self.lstm(packed)

        return self._join_directions(hidden), self._join_directions(cell)

### Decoder - LSTM

In [9]:
class LSTMDecoder(nn.Module):
    """Single-layer unidirectional LSTM decoder that predicts one token per call."""

    def __init__(self, embedding_layer, hidden_size, vocab_size):
        """
        Args:
            embedding_layer: ``nn.Embedding`` used to look up the input token vectors.
            hidden_size: decoder hidden size. To be initialised from the BiLSTM
                encoder's concatenated states it must be twice the encoder's
                per-direction hidden size.
            vocab_size: number of output classes (one logit per vocabulary entry).
        """
        super().__init__()

        self.embedding = embedding_layer
        self.hidden_size = hidden_size

        self.lstm = nn.LSTM(
            embedding_layer.embedding_dim,
            hidden_size,
            num_layers=1,
            batch_first=True,
        )

        # Projects the LSTM output onto vocabulary logits.
        self.fc = nn.Linear(hidden_size, vocab_size)

    def forward(self, x, hidden, cell):
        """Run one decoding step.

        Args:
            x: current input token ids, shape (batch_size,).
            hidden, cell: LSTM states, each of shape (1, batch_size, hidden_size).

        Returns:
            (prediction, hidden, cell): ``prediction`` holds unnormalised logits of
            shape (batch_size, vocab_size); ``hidden``/``cell`` are the updated states.
        """
        # The LSTM wants (batch, seq_len, embed_dim); one token per call => seq_len 1.
        step_input = self.embedding(x.unsqueeze(1))          # (batch, 1, embed_dim)

        step_output, (hidden, cell) = self.lstm(step_input, (hidden, cell))
        # step_output: (batch, 1, hidden_size); drop the length-1 sequence axis
        # before projecting to the vocabulary.
        prediction = self.fc(step_output.squeeze(1))         # (batch, vocab_size)

        return prediction, hidden, cell

### Seq2Seq setup with Teacher Forcing

In [10]:
class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper that decodes token by token with optional teacher forcing."""

    def __init__(self, encoder, decoder, device):
        """
        Args:
            encoder: module returning ``(h_context, c_context)`` for ``(src, src_lengths)``.
            decoder: module with an ``fc`` linear layer and a
                ``forward(token, hidden, cell)`` step function.
            device: device on which the output buffer is allocated.
        """
        super().__init__()

        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, src, src_lengths, trg, teach_prob=0.5):
        """Produce logits for every target position.

        Args:
            src: padded source token ids, shape (batch_size, max_src_len).
            src_lengths: true source lengths, shape (batch_size,).
            trg: padded ground-truth target ids, shape (batch_size, max_trg_len);
                column 0 is expected to hold <BOS>.
            teach_prob: probability, drawn once per time step for the whole batch,
                of feeding the ground-truth token instead of the model's own
                prediction as the next input.

        Returns:
            Tensor of shape (batch_size, max_trg_len, vocab_size). Position 0 is
            never written and stays all zeros; position t holds the logits
            predicted for target token t.
        """
        batch_size, max_trg_len = trg.shape[0], trg.shape[1]
        vocab_size = self.decoder.fc.out_features

        # Buffer for the per-step logits.
        outputs = torch.zeros(batch_size, max_trg_len, vocab_size, device=self.device)

        # Encode the source, then add the leading num_layers axis the LSTM expects:
        # (batch, hidden) -> (1, batch, hidden).
        h_context, c_context = self.encoder(src, src_lengths)
        hidden, cell = h_context.unsqueeze(0), c_context.unsqueeze(0)

        # Decoding always starts from the first target column (<BOS>).
        input_token = trg[:, 0]

        for t in range(1, max_trg_len):
            output, hidden, cell = self.decoder(input_token, hidden, cell)
            outputs[:, t, :] = output

            # Teacher forcing: with probability teach_prob the model's guess is
            # ignored and the true token is fed as the next input.
            if random.random() < teach_prob:
                input_token = trg[:, t]
            else:
                input_token = output.argmax(1)

        return outputs

Data wrappers

In [11]:
class BanglaQADataset(Dataset):
    """Question/answer pairs from a DataFrame, served as tensors of vocabulary indices."""

    def __init__(self, dataframe, word2idx):
        """
        Args:
            dataframe: object with 'question' and 'answer' columns.
            word2idx: token -> index map containing '<UNK>', '<BOS>' and '<EOS>'.
        """
        self.questions = dataframe['question'].tolist()
        self.answers = dataframe['answer'].tolist()
        self.word2idx = word2idx

        # Cache the special-token ids used on every lookup.
        self.unk_idx = word2idx['<UNK>']
        self.bos_idx = word2idx['<BOS>']
        self.eos_idx = word2idx['<EOS>']

    def tokenize_and_map(self, sentence):
        """Tokenise ``sentence`` and return ``[<BOS>, ids..., <EOS>]`` as a long tensor.

        Tokens missing from the vocabulary map to <UNK>. The same tokeniser is used
        as everywhere else in the notebook, so lookups match the vocabulary.
        """
        ids = [self.bos_idx]
        ids.extend(
            self.word2idx.get(token, self.unk_idx)
            for token in tokenize_bangla(sentence)
        )
        ids.append(self.eos_idx)
        return torch.tensor(ids, dtype=torch.long)

    def __len__(self):
        """Number of question/answer pairs."""
        return len(self.questions)

    def __getitem__(self, idx):
        """Return ``(question_tensor, answer_tensor)`` for row ``idx``."""
        return (
            self.tokenize_and_map(self.questions[idx]),
            self.tokenize_and_map(self.answers[idx]),
        )

def pad_collate_fn(batch):
    """Collate ``(question, answer)`` tensor pairs into padded batch tensors.

    Args:
        batch: list of ``(question_tensor, answer_tensor)`` pairs of 1D long tensors.

    Returns:
        (padded_questions, lengths_tensor, padded_answers):
            padded_questions: (batch_size, max_question_len), padded with 0 (<PAD>).
            lengths_tensor: (batch_size,) long tensor of the unpadded question lengths.
            padded_answers: (batch_size, max_answer_len), padded with 0 (<PAD>).
    """
    questions = [question for question, _ in batch]
    answers = [answer for _, answer in batch]

    # batch_first=True gives (batch_size, max_seq_length); index 0 is <PAD>.
    padded_questions = pad_sequence(questions, batch_first=True, padding_value=0)
    padded_answers = pad_sequence(answers, batch_first=True, padding_value=0)

    # Only question lengths are needed (the encoder packs its input with them).
    lengths_tensor = torch.tensor([len(question) for question in questions], dtype=torch.long)

    return padded_questions, lengths_tensor, padded_answers

### Training and Evaluation

Setup

In [12]:
# Dedicated RNG handed to the training DataLoader so its shuffle order is repeatable.
# NOTE(review): only this generator is seeded here; torch's global RNG and Python's
# `random` module (used for teacher forcing in Seq2Seq.forward) are not seeded in this cell.
seed_generator = torch.Generator()
seed_generator.manual_seed(42)

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Lets cuDNN benchmark and pick convolution/RNN algorithms; only relevant on CUDA.
torch.backends.cudnn.benchmark = True

BATCH_SIZE = 128
# Hidden size of EACH direction of the BiLSTM encoder.
ENC_HIDDEN_DIM = 256
# Must equal 2 * ENC_HIDDEN_DIM: the decoder is initialised with the encoder's
# concatenated forward+backward states.
DEC_HIDDEN_DIM = 512
VOCAB_SIZE = len(word_to_index)

Using device: cpu


In [None]:
# Training split: read the parquet file and wrap it in the Dataset defined above.
df_train = pd.read_parquet('data/question_answer/subsets/bn_train.parquet')
train_dataset = BanglaQADataset(df_train, word_to_index)

# Shuffled every epoch using the seeded generator; pad_collate_fn pads each batch
# to its own longest sequence. pin_memory=True is intended to speed up host-to-GPU
# copies; persistent_workers keeps the 2 worker processes alive between epochs.
train_loader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    collate_fn=pad_collate_fn,
    generator=seed_generator,
    num_workers=2,
    pin_memory=True,
    persistent_workers=True
)

# Validation split: same wrapping, but iterated in file order (no shuffling).
df_val = pd.read_parquet('data/question_answer/subsets/bn_val.parquet')
val_dataset = BanglaQADataset(df_val, word_to_index)

val_loader = DataLoader(
    val_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    collate_fn=pad_collate_fn,
    num_workers=2,
    pin_memory=True,
    persistent_workers=True
)

In [None]:
# Encoder and decoder are given the SAME embedding module, so its weights are
# shared and updated by both during training.
enc = BiLSTMEncoder(embedding_layer, ENC_HIDDEN_DIM)
dec = LSTMDecoder(embedding_layer, DEC_HIDDEN_DIM, VOCAB_SIZE)
seq2seq_model = Seq2Seq(enc, dec, device).to(device)

optimizer = optim.Adam(seq2seq_model.parameters(), lr=0.001, weight_decay=5e-5)
# Gradient scaling is only needed for float16 autocast on CUDA. Enabling it
# explicitly by device keeps CPU runs free of the "CUDA is not available"
# warning; when disabled, the scaler's methods simply pass through.
scaler = torch.amp.GradScaler('cuda', enabled=(device.type == 'cuda'))

# <PAD> positions in the target must not contribute to the loss.
pad_idx = word_to_index['<PAD>']
loss_func = nn.CrossEntropyLoss(ignore_index=pad_idx)

Train and Eval functions

In [None]:
def train_epoch(seq2seq_model, iterator, optimizer, loss_func, scaler, clip=1.0):
    """Train the model for one pass over ``iterator`` and return the mean batch loss.

    Uses the notebook-level ``device`` for tensor placement. Mixed precision
    (float16 autocast) is enabled only when that device is CUDA, so CPU runs
    execute in full precision without autocast warnings.

    Args:
        seq2seq_model: model called as ``model(src, src_lengths, trg, teach_prob=...)``.
        iterator: loader yielding ``(src, src_lengths, trg)`` batches.
        optimizer: optimizer over the model's parameters.
        loss_func: loss taking ``(logits[N, vocab], targets[N])``.
        scaler: ``GradScaler`` used for loss scaling and the optimizer step.
        clip: maximum gradient norm.

    Returns:
        Sum of per-batch losses divided by the number of batches.
    """
    seq2seq_model.train()
    epoch_loss = 0
    use_amp = device.type == 'cuda'

    for src, src_lengths, trg in tqdm(iterator, desc="Training"):
        src, src_lengths, trg = src.to(device), src_lengths.to(device), trg.to(device)

        optimizer.zero_grad()

        with torch.autocast(device_type='cuda', dtype=torch.float16, enabled=use_amp):
            # Partial teacher forcing: 50% chance per step of feeding the true token.
            output = seq2seq_model(src, src_lengths, trg, teach_prob = 0.5)

            # CrossEntropyLoss wants (N, vocab) logits and (N,) targets.
            # output is (batch_size, seq_len, vocab_size): drop position 0 (the <BOS>
            # slot, which is never predicted) and flatten batch and time together.
            output_dim = output.shape[-1]
            output = output[:, 1:, :].reshape(-1, output_dim)
            # Same slicing for the ground truth so rows line up with the logits.
            trg = trg[:, 1:].reshape(-1)

            loss = loss_func(output, trg)

        scaler.scale(loss).backward()

        # Gradients must be unscaled before clipping so `clip` applies to true norms.
        scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(seq2seq_model.parameters(), clip)

        scaler.step(optimizer)
        scaler.update()

        epoch_loss += loss.item()

    return epoch_loss / len(iterator)

def evaluate_epoch(seq2seq_model, iterator, loss_func):
    """Compute the mean batch loss over ``iterator`` without updating the model.

    Teacher forcing is disabled (``teach_prob=0``), so the decoder always consumes
    its own previous prediction. Uses the notebook-level ``device``; float16
    autocast is enabled only when that device is CUDA.

    Args:
        seq2seq_model: model called as ``model(src, src_lengths, trg, teach_prob=...)``.
        iterator: loader yielding ``(src, src_lengths, trg)`` batches.
        loss_func: loss taking ``(logits[N, vocab], targets[N])``.

    Returns:
        Sum of per-batch losses divided by the number of batches.
    """
    seq2seq_model.eval()
    epoch_loss = 0
    use_amp = device.type == 'cuda'

    with torch.no_grad():  # no gradients needed: saves memory and time
        for src, src_lengths, trg in tqdm(iterator, desc='Validating'):
            src, src_lengths, trg = src.to(device), src_lengths.to(device), trg.to(device)

            with torch.autocast(device_type='cuda', dtype=torch.float16, enabled=use_amp):
                output = seq2seq_model(src, src_lengths, trg, teach_prob = 0)

                # Drop the <BOS> slot and flatten to (N, vocab) / (N,) for the loss.
                output_dim = output.shape[-1]
                output = output[:, 1:, :].reshape(-1, output_dim)
                trg = trg[:, 1:].reshape(-1)

                loss = loss_func(output, trg)
            epoch_loss += loss.item()

    return epoch_loss / len(iterator)

In [None]:
EPOCHS = 5
# NOTE(review): best_valid_loss is initialised but never updated or read in this
# cell, so no "best epoch" tracking or checkpointing happens here.
best_valid_loss = float('inf')

# One training pass followed by one validation pass per epoch; losses are only printed.
for epoch in range(EPOCHS):
    print(f'\n--- Epoch {epoch+1} ---')

    train_loss = train_epoch(seq2seq_model, train_loader, optimizer, loss_func, scaler)
    valid_loss = evaluate_epoch(seq2seq_model, val_loader, loss_func)

    print(f'Train Loss: {train_loss:.4f}')
    print(f'Validation Loss: {valid_loss:.4f}')

Save!

In [None]:
# Persist everything the inference section needs to rebuild the model from disk.
# The loading cell reads checkpoint['word_to_index'] and checkpoint['embedding_layer']
# in addition to the weights, so all three entries must be saved here.
checkpoint = {
    'model_state_dict': seq2seq_model.state_dict(),
    'word_to_index': word_to_index,
    'embedding_layer': embedding_layer,
}

torch.save(checkpoint, 'models/ai4b_qna_sg_model.pt')

### Metrics & Inference

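If you are starting from a fresh kernel, first rerun the cells this section depends on: the imports, the tokenization helpers, the encoder/decoder/Seq2Seq classes, the data wrappers, and the setup cell that defines `BATCH_SIZE`, `ENC_HIDDEN_DIM` and `DEC_HIDDEN_DIM`. Also run the checkpoint-loading cell before building the test loader, because the loader needs `word_to_index`.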

In [14]:
# Re-derive the device so the inference section can run without the training setup cell.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)

Using device: cpu


In [15]:
# Test split, wrapped the same way as the train/val splits.
# Requires word_to_index and BATCH_SIZE to be defined already.
df_test = pd.read_parquet('data/question_answer/subsets/bn_test.parquet')

test_dataset = BanglaQADataset(df_test, word_to_index)

# shuffle=False keeps batches in DataFrame row order, which the comparison cell
# relies on when it zips predictions with df_test['answer'].
test_loader = DataLoader(
    test_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    collate_fn=pad_collate_fn,
    num_workers=2,
    pin_memory=True,
    persistent_workers=True
)

In [13]:
# SECURITY: weights_only=False unpickles arbitrary Python objects (this cell expects
# the file to hold a dict and an nn.Embedding module, not just tensors).
# Only load checkpoint files you trust.
checkpoint = torch.load('models/ai4b_qna_sg_model.pt', map_location=device, weights_only=False)

# These keys must have been written at save time; a checkpoint that contains only
# 'model_state_dict' raises KeyError on the next two lines.
word_to_index = checkpoint['word_to_index']
embedding_layer = checkpoint['embedding_layer']
vocab_size = len(word_to_index)
# Reverse map used to turn predicted indices back into tokens.
index_to_word = {idx: word for word, idx in word_to_index.items()}

# Rebuild the same architecture as in training (ENC_HIDDEN_DIM / DEC_HIDDEN_DIM come
# from the setup cell), then load the trained weights into it.
model = Seq2Seq(
    BiLSTMEncoder(embedding_layer, ENC_HIDDEN_DIM),
    LSTMDecoder(embedding_layer, DEC_HIDDEN_DIM, vocab_size),
    device
).to(device)

model.load_state_dict(checkpoint['model_state_dict'])
# eval() returns the module, so the cell output is the model summary.
model.eval()

Seq2Seq(
  (encoder): BiLSTMEncoder(
    (embedding): Embedding(150004, 300, padding_idx=0)
    (lstm): LSTM(300, 256, batch_first=True, bidirectional=True)
  )
  (decoder): LSTMDecoder(
    (embedding): Embedding(150004, 300, padding_idx=0)
    (lstm): LSTM(300, 512, batch_first=True)
    (fc): Linear(in_features=512, out_features=150004, bias=True)
  )
)

Inference

In [16]:
def generate_answer(model, question, word_to_index, index_to_word, device, max_len=50):
    """Greedy-decode an answer string for a single question.

    Args:
        model: Seq2Seq-style object exposing ``encoder`` and ``decoder``.
        question: raw question text.
        word_to_index: token -> index map containing '<BOS>', '<EOS>', '<UNK>'.
        index_to_word: index -> token map (unknown indices render as '<UNK>').
        device: device the model lives on.
        max_len: maximum number of decoding steps.

    Returns:
        The predicted tokens joined by spaces; decoding stops at the first <EOS>
        (not included) or after ``max_len`` tokens.
    """
    model.eval()

    # Encode the question as [<BOS>, token ids..., <EOS>]; unknown tokens map to <UNK>.
    question_tokens = tokenize_bangla(question)
    bos_idx = word_to_index['<BOS>']
    token_ids = [bos_idx]
    token_ids.extend(
        word_to_index.get(token, word_to_index['<UNK>']) for token in question_tokens
    )
    eos_idx = word_to_index['<EOS>']
    token_ids.append(eos_idx)

    # Batch of one: (1, seq_len). Lengths stay on the CPU (the encoder packs with them).
    src_tensor = torch.tensor(token_ids, dtype=torch.long).unsqueeze(0).to(device)
    src_len = torch.tensor([len(token_ids)], dtype=torch.long)

    with torch.no_grad():
        h, c = model.encoder(src_tensor, src_len)
        hidden, cell = h.unsqueeze(0), c.unsqueeze(0)

        input_token = torch.tensor([bos_idx], dtype=torch.long).to(device)
        answer_tokens = []

        for _ in range(max_len):
            output, hidden, cell = model.decoder(input_token, hidden, cell)

            # Greedy choice; it is also the next decoder input.
            input_token = output.argmax(1)
            predicted_id = input_token.item()

            if predicted_id == eos_idx:
                break

            answer_tokens.append(index_to_word.get(predicted_id, '<UNK>'))

    return detokenize_bangla(answer_tokens)


def generate_answer_batched(model, src_tensor, src_len, word_to_index, index_to_word, device, max_len=50):
    """Greedy-decode answers for a whole batch of already-encoded questions.

    Args:
        model: Seq2Seq-style object exposing ``encoder`` and ``decoder``.
        src_tensor: padded question ids, shape (batch_size, max_src_len), on ``device``.
        src_len: true question lengths, shape (batch_size,).
        word_to_index: token -> index map containing '<BOS>' and '<EOS>'.
        index_to_word: index -> token map (unknown indices render as '<UNK>').
        device: device on which the decoder inputs are created.
        max_len: maximum number of decoding steps.

    Returns:
        One answer string per batch row; each row stops collecting tokens at its
        first <EOS> (not included) or after ``max_len`` tokens.
    """
    model.eval()
    batch_size = src_tensor.shape[0]
    eos_idx = word_to_index['<EOS>']

    with torch.no_grad():
        h, c = model.encoder(src_tensor, src_len)
        hidden = h.unsqueeze(0)
        cell = c.unsqueeze(0)

        input_token = torch.full((batch_size,), word_to_index['<BOS>'], dtype=torch.long, device=device)
        batch_results = [[] for _ in range(batch_size)]

        # Per-row "still decoding" flags, kept as a plain Python list so checking
        # them does not trigger a device synchronisation per element.
        unfinished = [True] * batch_size

        for _ in range(max_len):
            output, hidden, cell = model.decoder(input_token, hidden, cell)

            top_tokens = output.argmax(1)

            # One device-to-host transfer per step instead of one .item() per row.
            for i, token_id in enumerate(top_tokens.tolist()):
                if not unfinished[i]:
                    continue
                if token_id == eos_idx:
                    unfinished[i] = False
                else:
                    batch_results[i].append(index_to_word.get(token_id, '<UNK>'))

            if not any(unfinished):
                break

            # Finished rows keep being fed, but their outputs are ignored above.
            input_token = top_tokens

    return [detokenize_bangla(res) for res in batch_results]

def generate_stepwise(model, question, word_to_index, index_to_word, device, max_len=50):
    """Interactively greedy-decode one question, printing the top-5 candidates per step.

    After every generated token the user is prompted; entering 'q' (any case)
    stops decoding early.

    Args:
        model: Seq2Seq-style object exposing ``encoder`` and ``decoder``.
        question: raw question text.
        word_to_index: token -> index map containing '<BOS>', '<EOS>', '<UNK>'.
        index_to_word: index -> token map (unknown indices render as '<UNK>').
        device: device the model lives on.
        max_len: maximum number of decoding steps.

    Returns:
        The tokens chosen so far, joined by spaces (<EOS> excluded).
    """
    model.eval()

    # Encode the question as [<BOS>, token ids..., <EOS>]; unknown tokens map to <UNK>.
    question_tokens = tokenize_bangla(question)
    bos_idx = word_to_index['<BOS>']
    token_ids = [bos_idx]
    token_ids.extend(
        word_to_index.get(token, word_to_index['<UNK>']) for token in question_tokens
    )
    eos_idx = word_to_index['<EOS>']
    token_ids.append(eos_idx)

    src_tensor = torch.tensor(token_ids, dtype=torch.long).unsqueeze(0).to(device)
    src_len = torch.tensor([len(token_ids)], dtype=torch.long)

    print(f"Question: {question}\n")

    with torch.no_grad():
        h, c = model.encoder(src_tensor, src_len)
        hidden, cell = h.unsqueeze(0), c.unsqueeze(0)

        input_token = torch.tensor([bos_idx], dtype=torch.long).to(device)
        chosen_words = []

        for step in range(max_len):
            output, hidden, cell = model.decoder(input_token, hidden, cell)

            # Show the five most probable next tokens with their probabilities.
            top_probs, top_indices = torch.topk(F.softmax(output, dim=1), 5, dim=1)

            print(f"--- Token {step + 1} ---")
            candidates = zip(top_indices[0], top_probs[0])
            for rank, (candidate_id, candidate_prob) in enumerate(candidates, start=1):
                word = index_to_word.get(candidate_id.item(), '<UNK>')
                prob = candidate_prob.item() * 100
                print(f"  {rank}. {word} ({prob:.2f}%)")

            # The actual (greedy) choice.
            top_token = output.argmax(1)
            chosen_id = top_token.item()
            chosen_word = index_to_word.get(chosen_id, '<UNK>')
            print(f">> Model chose: '{chosen_word}'\n")

            if chosen_id == eos_idx:
                print(">> Reached <EOS>.")
                break

            chosen_words.append(chosen_word)
            input_token = top_token

            if input("Proceed? [ENTER/q]\n").lower() == 'q':
                print("\n>> Stopped.")
                break
            print()

    return detokenize_bangla(chosen_words)

Testing

In [17]:
def get_metrics(predicted_sentence, truth_sentence):
    """Token-level precision, recall, F1 and exact match between two sentences.

    Both sentences are tokenised with ``tokenize_bangla``; overlap is counted as a
    multiset intersection, so repeated tokens are matched at most as often as they
    occur in both.

    Exact match compares the token lists, so it is insensitive to spacing around
    punctuation (the tokeniser cannot distinguish e.g. an apostrophe from a quote,
    or restore asymmetric spacing around parentheses).

    Args:
        predicted_sentence: model output text.
        truth_sentence: reference answer text.

    Returns:
        ``(precision, recall, f1, exact_match)``: three floats in [0, 1] and a bool.
        If exactly one side has no tokens, all three scores are 0.0; if both
        sides have no tokens, the prediction is perfect and all three are 1.0.
    """
    pred_tokens = tokenize_bangla(predicted_sentence)
    truth_tokens = tokenize_bangla(truth_sentence)

    exact_match = pred_tokens == truth_tokens

    if not pred_tokens or not truth_tokens:
        # Keep the scores consistent with exact_match: two empty sentences agree
        # perfectly, while an empty side against a non-empty one shares nothing.
        score = 1.0 if exact_match else 0.0
        return (score, score, score, exact_match)

    num_same = sum((Counter(pred_tokens) & Counter(truth_tokens)).values())
    if num_same == 0:
        # No shared tokens: avoid 0/0 in the F1 formula and return floats.
        return (0.0, 0.0, 0.0, exact_match)

    # Both token lists are non-empty here, so the divisions are safe.
    precision = num_same / len(pred_tokens)
    recall = num_same / len(truth_tokens)
    f1 = (2 * precision * recall) / (precision + recall)

    return precision, recall, f1, exact_match

Batched predictions over the test set (runs on the GPU when one is available, otherwise on the CPU)

In [None]:
# Decode the whole test set batch by batch; predictions are appended in loader
# order, i.e. in the same order as the rows of df_test (the loader is not shuffled).
all_predictions = []
for src_tensor, src_len, _ in tqdm(test_loader, desc="Inference"):
    # The ground-truth answers (third item) are not needed for generation.
    src_tensor = src_tensor.to(device)
    src_len = src_len.to(device)

    batch_preds = generate_answer_batched(model, src_tensor, src_len, word_to_index, index_to_word, device)
    all_predictions.extend(batch_preds)

Comparison

In [None]:
# Score each prediction against its reference answer. zip() pairs them by
# position, which is valid because the predictions were produced in df_test row order.
results = []
for pred, truth in zip(all_predictions, df_test['answer']):
    metrics = get_metrics(pred, truth)
    results.append(metrics)

# Transpose the list of (precision, recall, f1, exact_match) tuples into four
# sequences and report their means. Exact match is a bool, so its mean is the hit rate.
p, r, f1, em = zip(*results)
print(f"\n--- Final Performance ---")
print(f"Precision:   {sum(p)/len(p):.4f}")
print(f"Recall:      {sum(r)/len(r):.4f}")
print(f"F1-Score:    {sum(f1)/len(f1):.4f}")
print(f"Exact Match: {sum(em)/len(em)*100:.2f}%")

In [28]:
# Manual spot check on one question. generate_stepwise is interactive (it prompts
# after every token); the commented-out call is the non-interactive alternative.
sample_question = 'দিল্লি কোন দেশের রাজধানী?'
generate_stepwise(model, sample_question, word_to_index, index_to_word, device)
#generate_answer(model, sample_question, word_to_index, index_to_word, device)

Question: দিল্লি কোন দেশের রাজধানী?

--- Token 1 ---
  1. পাকিস্তান (7.59%)
  2. দক্ষিণ (7.27%)
  3. বাংলাদেশ (5.21%)
  4. আফগানিস্তান (4.92%)
  5. <UNK> (3.12%)
>> Model chose: 'পাকিস্তান'


--- Token 2 ---
  1. । (98.25%)
  2. আরব (0.25%)
  3. যুক্তরাষ্ট্র (0.20%)
  4. ও (0.18%)
  5. - (0.14%)
>> Model chose: '।'


--- Token 3 ---
  1. <EOS> (97.57%)
  2. । (1.90%)
  3. আমিরাত (0.11%)
  4. টোবাগো (0.06%)
  5. <UNK> (0.03%)
>> Model chose: '<EOS>'

>> Reached <EOS>.


'পাকিস্তান ।'