In [18]:

import os
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, f1_score
from nltk.translate.bleu_score import sentence_bleu
from tqdm import tqdm
from transformers import BertTokenizer, BertModel

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# LSTM Model
class LSTMModel(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(LSTMModel, self).__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        _, (h_n, _) = self.lstm(x)
        out = self.fc(h_n[-1])
        return out

# BERT Model
class BERTModel(nn.Module):
    def __init__(self):
        super(BERTModel, self).__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids, attention_mask=attention_mask)
        pooled_output = outputs[1]
        return pooled_output

# Ensemble Model
class EnsembleModel(nn.Module):
    def __init__(self, bert_model, lstm_model, output_size):
        super(EnsembleModel, self).__init__()
        self.bert_model = bert_model
        self.lstm_model = lstm_model
        self.fc = nn.Linear(2 * output_size, output_size) # Adjust based on models output

    def forward(self, input_ids, attention_mask, lstm_input):
        bert_output = self.bert_model(input_ids, attention_mask)
        lstm_output = self.lstm_model(lstm_input)
        combined_output = torch.cat((bert_output, lstm_output), dim=1)
        final_output = self.fc(combined_output)
        return final_output

# Dataset handling
class QADataset(Dataset):
    def __init__(self, data_path, max_length, batch_size):
        self.data = pd.read_csv(data_path)
        self.tokenizer = BertTokenizer.from_pretrained('sagorsarker/bangla-bert-base')
        self.model = BertModel.from_pretrained('sagorsarker/bangla-bert-base')
        
        tokenized_data = [self.tokenize_and_encode(q, c, max_length) for q, c in zip(self.data['QUESTION'], self.data['sentence segment 4'])]
        max_len = max(len(item['input_ids'][0]) for item in tokenized_data)

        input_ids = torch.stack([torch.cat([item['input_ids'][0].long(), torch.zeros(max_len - len(item['input_ids'][0]), dtype=torch.long)], dim=0) for item in tokenized_data], dim=0)
        attention_masks = torch.stack([torch.cat([item['attention_mask'][0], torch.zeros(max_len - len(item['attention_mask'][0]))], dim=0) for item in tokenized_data], dim=0)

        self.input_data = TensorDataset(input_ids, attention_masks)
        self.labels = torch.tensor(pd.factorize(self.data['ANSWER'])[0])

    def tokenize_and_encode(self, question, context, max_length):
        input_text = f"প্রশ্ন: {question} প্রস্তুতি: {context}"
        inputs = self.tokenizer.encode_plus(input_text, add_special_tokens=True, return_tensors="pt", max_length=max_length, truncation=True)
        return inputs

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.input_data[idx], self.labels[idx]
hidden_size = 256
lr = 2e-4
num_epochs = 5
batch_size = 8
max_length = 40

dataset = QADataset('/mnt/f/Huggingface/rubayet/alldata.xlsx.csv', max_length, batch_size)
train_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
input_size = dataset.model.config.hidden_size
lstm_model = LSTMModel(input_size, hidden_size, output_size=len(torch.unique(dataset.labels))).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(lstm_model.parameters(), lr=lr)

for epoch in range(num_epochs):
    lstm_model.train()
    total_loss = 0

    for inputs, labels in tqdm(train_loader, desc=f"Epoch {epoch + 1}"):
        inputs, labels = inputs[:, 0, :].to(device), labels.squeeze(1).to(device)



        optimizer.zero_grad()
        outputs = lstm_model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    average_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch + 1} - Average Loss: {average_loss}")

lstm_model.eval()
predictions = []
true_labels = []

for inputs, labels in tqdm(train_loader, desc=f"Epoch {epoch + 1}"):
    inputs, labels = inputs[:, 0, :].to(device), labels.squeeze(1).to(device)



    optimizer.zero_grad()
    outputs = lstm_model(inputs)
    loss = criterion(outputs, labels)
    loss.backward()
    optimizer.step()

    total_loss += loss.item()

    with torch.no_grad():
        outputs = lstm_model(inputs)

    predictions.extend(outputs.argmax(dim=1).cpu().numpy())
    true_labels.extend(labels.cpu().numpy())

predicted_answers = predictions
true_answers = true_labels

accuracy = accuracy_score(true_answers, predicted_answers) * 100
f1 = f1_score(true_answers, predicted_answers, average='weighted') * 100

references = [[str(answer)] for answer in true_answers]
bleu_scores = [sentence_bleu(references, predicted.split()) for predicted in predicted_answers]

print(f"Accuracy: {accuracy:.2f}%")
print(f"F1 Score: {f1:.2f}")
print(f"BLEU Score: {sum(bleu_scores) / len(bleu_scores):.4f}")

    # Modify this class based on the specific dataset format

# Add training, testing, and evaluation methods here
# Include methods to compute accuracy, F1 score, and GLUE score (for a specific task)



Epoch 1:   0%|                                                                                 | 0/1004 [00:00<?, ?it/s]


TypeError: list indices must be integers or slices, not tuple