In [None]:
import torch
torch.cuda.empty_cache()
from transformers import (
    AutoTokenizer, AutoModelForQuestionAnswering,
    AdamW, get_linear_schedule_with_warmup
)
from torch.utils.data import Dataset, DataLoader
from sentence_transformers import SentenceTransformer
from faiss import IndexFlatL2
import numpy as np
import json
from typing import List, Dict, Tuple
import nltk
import pickle

# Fetch the 'punkt_tab' sentence-tokenizer resource at import time.
# NOTE(review): the tokenizer built below is constructed without arguments, so
# it is not obvious that it consumes this download -- confirm it is needed.
nltk.download('punkt_tab')

# Module-level Punkt sentence tokenizer.
# NOTE(review): PunktSentenceTokenizer() is created with no training text or
# pre-trained model, so it presumably runs with default parameters rather than
# the downloaded data -- confirm whether nltk.sent_tokenize was intended.
# This name is not referenced elsewhere in the visible code.
tokenizer = nltk.tokenize.PunktSentenceTokenizer()


class DocumentLoader:
    """Read a text file and group its sentences into fixed-size passages."""

    def __init__(self, tokenizer):
        # Any object whose ``tokenize(text)`` returns a sequence of sentences.
        self.tokenizer = tokenizer

    def load_and_split(self, filepath: str, chunk_size: int = 3) -> List[str]:
        """
        Split the file at ``filepath`` into passages.

        The file is read as UTF-8, cut into sentences by the injected
        tokenizer, and every run of ``chunk_size`` consecutive sentences is
        joined with single spaces. The final passage may hold fewer sentences.
        """
        with open(filepath, 'r', encoding='utf-8') as handle:
            raw_text = handle.read()

        sentence_list = self.tokenizer.tokenize(raw_text)

        # One passage per window start: 0, chunk_size, 2 * chunk_size, ...
        window_starts = range(0, len(sentence_list), chunk_size)
        return [
            ' '.join(sentence_list[begin:begin + chunk_size])
            for begin in window_starts
        ]



class QADataset(Dataset):
    """
    Torch dataset of tokenized (question, context) pairs with answer-span labels.

    Each item is a dict with ``input_ids``, ``attention_mask``,
    ``start_positions`` and ``end_positions`` (plus ``token_type_ids`` when the
    tokenizer produces them). The span labels are token indices into the
    *combined* question + context sequence that the model actually receives.

    The tokenizer must support ``return_offsets_mapping=True`` and
    ``BatchEncoding.sequence_ids()`` (Hugging Face "fast" tokenizers do).
    """

    def __init__(self, questions, contexts, answers, tokenizer, max_length=512):
        """
        Args:
            questions: sequence of question strings.
            contexts: sequence of context strings, aligned with ``questions``.
            answers: sequence of dicts with keys ``'text'`` and
                ``'answer_start'`` (character offset into the context).
            tokenizer: callable tokenizer used to encode each pair.
            max_length: padded/truncated length of every encoded example.
        """
        self.tokenizer = tokenizer
        self.questions = questions
        self.contexts = contexts
        self.answers = answers
        self.max_length = max_length

    def __len__(self):
        return len(self.questions)

    @staticmethod
    def _find_answer_span(offsets, sequence_ids, answer_start, answer_end):
        """
        Map a character span of the context onto token indices.

        Args:
            offsets: per-token ``(char_start, char_end)`` pairs for the full
                encoded sequence.
            sequence_ids: per-token segment marker; only tokens marked ``1``
                (the second sequence, i.e. the context) are considered, so
                question and special tokens can never be selected.
            answer_start: first character of the answer in the context.
            answer_end: one past the last character of the answer.

        Returns:
            ``(start_token, end_token)``, or ``(0, 0)`` when the answer is
            empty or not fully present (for example, it was truncated away).
        """
        if answer_end <= answer_start:
            return 0, 0

        start_token = end_token = None
        for token_idx, (segment, (tok_start, tok_end)) in enumerate(zip(sequence_ids, offsets)):
            if segment != 1 or tok_start == tok_end:
                continue
            if start_token is None and tok_start <= answer_start < tok_end:
                start_token = token_idx
            if tok_start < answer_end <= tok_end:
                end_token = token_idx
                break

        # A half-found span would be a bogus label; fall back to index 0.
        if start_token is None or end_token is None:
            return 0, 0
        return start_token, end_token

    def __getitem__(self, idx):
        """Encode example ``idx`` and return its tensors and span labels."""
        # Offsets are requested on the *pair* encoding so that the resulting
        # token indices line up with the input_ids fed to the model.
        encoding = self.tokenizer(
            self.questions[idx],
            self.contexts[idx],
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_offsets_mapping=True,
            return_tensors='pt'
        )

        answer = self.answers[idx]
        answer_start_char = answer['answer_start']
        answer_end_char = answer_start_char + len(answer['text'])

        # return_tensors='pt' adds a leading batch dimension of size 1.
        offsets = encoding['offset_mapping'][0].tolist()
        sequence_ids = encoding.sequence_ids(0)
        start_position, end_position = self._find_answer_span(
            offsets, sequence_ids, answer_start_char, answer_end_char
        )

        item = {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'start_positions': torch.tensor(start_position),
            'end_positions': torch.tensor(end_position)
        }
        # Keep segment ids when available so training can see the same
        # question/context separation that inference passes to the model.
        if 'token_type_ids' in encoding:
            item['token_type_ids'] = encoding['token_type_ids'].flatten()
        return item

class RAGEnhancedQA:
    """
    Retrieval-augmented extractive question answering.

    Passages are embedded with a SentenceTransformer model and stored in a
    FAISS ``IndexFlatL2``. At question time the nearest passages are joined
    into one context and a BERT question-answering model predicts the answer
    span inside it.
    """

    def __init__(self, device='cuda' if torch.cuda.is_available() else 'cpu'):
        """Load the reader and embedding models onto ``device``."""
        self.device = device
        self.model_name = 'bert-large-uncased-whole-word-masking-finetuned-squad'
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        self.qa_model = AutoModelForQuestionAnswering.from_pretrained(self.model_name).to(device)

        # Initialize embedding model for retrieval on the same device as the
        # reader, so an explicit device='cpu' is honoured by both models.
        self.embedding_model = SentenceTransformer(
            'sentence-transformers/all-MiniLM-L6-v2', device=device
        )
        self.index = None
        self.passages = []

    def load_documents_from_file(self, filepath: str):
        """Load a text file, split it into sentence chunks, and index them."""
        doc_loader = DocumentLoader(nltk.tokenize.PunktSentenceTokenizer())
        chunks = doc_loader.load_and_split(filepath)
        self.index_documents(chunks)
        print(f"Loaded and indexed {len(chunks)} passages from {filepath}")

    def index_documents(self, documents, batch_size=4):
        """
        Replace the current index with embeddings of ``documents``.

        Args:
            documents: iterable of passage strings.
            batch_size: number of passages embedded per ``encode`` call.
        """
        # Materialize once so generators work and later mutation of the
        # caller's list cannot desynchronize passages from index rows.
        self.passages = list(documents)
        print("Encoding documents...")

        # Initialize FAISS index
        embedding_dim = self.embedding_model.get_sentence_embedding_dimension()
        self.index = IndexFlatL2(embedding_dim)

        # Stream embeddings into FAISS batch by batch
        for batch_start in range(0, len(self.passages), batch_size):
            batch = self.passages[batch_start:batch_start + batch_size]
            embeddings = self.embedding_model.encode(batch, convert_to_tensor=True).cpu().numpy()
            self.index.add(embeddings)

        print("Documents indexed successfully")

    def retrieve_relevant_passages(self, query, top_k=5):
        """
        Retrieve top_k relevant passages for the given query using FAISS index.

        Raises:
            ValueError: if no documents have been indexed yet.
        """
        if self.index is None:
            raise ValueError("No documents have been indexed. Please index documents first.")

        # Encode the query using the same embedding model
        query_embedding = self.embedding_model.encode(query, convert_to_tensor=True)
        query_embedding = query_embedding.cpu().numpy()

        # FAISS expects a 2D (n_queries, dim) array
        if query_embedding.ndim == 1:
            query_embedding = query_embedding.reshape(1, -1)

        # Perform similarity search in FAISS index
        _, indices = self.index.search(query_embedding, top_k)

        # -1 entries (filtered here) appear when fewer than top_k rows exist
        return [self.passages[row] for row in indices[0] if row != -1]

    def answer_question(self, question, max_length=384):
        """
        Answer question using RAG approach.

        Args:
            question: the question string.
            max_length: maximum token length of the question + context input
                (longer inputs are truncated).

        Returns:
            ``(answer, relevant_passages)``; ``answer`` is a fallback message
            when no span could be extracted.

        Raises:
            ValueError: if no documents have been indexed yet.
        """
        if self.index is None:
            raise ValueError("No documents have been indexed. Please index documents first.")

        no_answer = "Could not find a relevant answer in the context."

        # Retrieve relevant passages
        relevant_passages = self.retrieve_relevant_passages(question)
        if not relevant_passages:
            return no_answer, relevant_passages
        combined_context = " ".join(relevant_passages)

        inputs = self.tokenizer(
            question,
            combined_context,
            max_length=max_length,
            truncation=True,
            return_tensors='pt'
        ).to(self.device)

        # fine_tune() puts the model in train mode; make sure dropout is off
        # so predictions are deterministic.
        self.qa_model.eval()
        with torch.no_grad():
            outputs = self.qa_model(**inputs)

        start_idx = int(torch.argmax(outputs.start_logits[0]))
        end_idx = int(torch.argmax(outputs.end_logits[0]))
        tokens = self.tokenizer.convert_ids_to_tokens(inputs['input_ids'][0])
        # An end before the start yields an empty slice -> fallback message.
        answer = self.tokenizer.convert_tokens_to_string(tokens[start_idx:end_idx + 1]).strip()

        if not answer:
            answer = no_answer

        return answer, relevant_passages

    def prepare_training_data(self, squad_file: str) -> "QADataset":
        """
        Prepare training data from SQuAD format JSON.

        Only questions that have at least one answer are kept, and only the
        first listed answer of each is used.
        """
        with open(squad_file, 'r', encoding='utf-8') as f:
            squad_data = json.load(f)

        questions = []
        contexts = []
        answers = []

        for article in squad_data['data']:
            for paragraph in article['paragraphs']:
                context = paragraph['context']
                for qa in paragraph['qas']:
                    if qa['answers']:  # Only use examples with answers
                        first_answer = qa['answers'][0]
                        questions.append(qa['question'])
                        contexts.append(context)
                        answers.append({
                            'text': first_answer['text'],
                            'answer_start': first_answer['answer_start']
                        })

        return QADataset(questions, contexts, answers, self.tokenizer)

    @staticmethod
    def _optimizer_steps_per_epoch(num_batches, accumulation_steps):
        """Return the number of optimizer updates in one epoch (ceil division)."""
        return (num_batches + accumulation_steps - 1) // accumulation_steps

    def fine_tune(self, train_dataset, epochs=3, batch_size=1, learning_rate=5e-5, gradient_accumulation_steps=4):
        """
        Fine-tune the QA model on ``train_dataset`` with gradient accumulation.

        Args:
            train_dataset: dataset whose items are dicts containing at least
                ``input_ids``, ``attention_mask``, ``start_positions`` and
                ``end_positions``; ``token_type_ids`` is used when present.
            epochs: number of passes over the dataset.
            batch_size: examples per forward/backward pass.
            learning_rate: peak learning rate of the linear schedule.
            gradient_accumulation_steps: batches accumulated per optimizer
                update (values below 1 are treated as 1).

        Raises:
            ValueError: if ``train_dataset`` is empty.
        """
        print("Starting fine-tuning process...")
        if len(train_dataset) == 0:
            raise ValueError("Cannot fine-tune on an empty training dataset.")

        train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
        num_batches = len(train_dataloader)
        accumulation = max(1, int(gradient_accumulation_steps))

        # NOTE(review): transformers' AdamW is deprecated upstream in favour of
        # torch.optim.AdamW -- consider migrating together with the import.
        optimizer = AdamW(self.qa_model.parameters(), lr=learning_rate)

        # The scheduler advances once per optimizer update, not once per
        # batch, so its horizon must be counted in updates.
        total_steps = self._optimizer_steps_per_epoch(num_batches, accumulation) * epochs
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=total_steps // 10,
            num_training_steps=total_steps
        )

        input_keys = ('input_ids', 'attention_mask', 'token_type_ids',
                      'start_positions', 'end_positions')

        self.qa_model.train()
        for epoch in range(epochs):
            total_loss = 0.0
            optimizer.zero_grad()

            for batch_idx, batch in enumerate(train_dataloader):
                model_inputs = {
                    key: batch[key].to(self.device)
                    for key in input_keys if key in batch
                }
                outputs = self.qa_model(**model_inputs)

                # Track the unscaled loss for reporting; only the gradient is
                # scaled down for accumulation.
                batch_loss = outputs.loss
                total_loss += batch_loss.item()
                (batch_loss / accumulation).backward()

                # Also step on the last batch so a trailing partial group of
                # accumulated gradients is applied instead of being discarded
                # (it is still scaled by 1/accumulation).
                is_last_batch = (batch_idx + 1) == num_batches
                if (batch_idx + 1) % accumulation == 0 or is_last_batch:
                    torch.nn.utils.clip_grad_norm_(self.qa_model.parameters(), 1.0)
                    optimizer.step()
                    scheduler.step()
                    optimizer.zero_grad()

                if (batch_idx + 1) % 50 == 0:
                    print(f"Epoch {epoch+1}/{epochs}, Batch {batch_idx+1}/{num_batches}, Loss: {batch_loss.item():.4f}")

            avg_loss = total_loss / num_batches
            print(f"Epoch {epoch+1}/{epochs}, Average Loss: {avg_loss:.4f}")

        # Leave the model ready for inference.
        self.qa_model.eval()



def main():
    """
    Build the QA system, fine-tune it, then answer questions interactively.

    The prompt loop ends on 'quit' (case-insensitive, surrounding whitespace
    ignored), on end-of-input, or on Ctrl-C. Blank lines are skipped.
    """
    # Initialize enhanced QA system
    qa_system = RAGEnhancedQA(device='cpu')

    # Load and index documents from MoralEconomics.txt
    qa_system.load_documents_from_file('MoralEconomics.txt')

    # Example of fine-tuning (if you have SQuAD format data)
    squad_file = "squad_manchester_united.json"
    train_dataset = qa_system.prepare_training_data(squad_file)
    qa_system.fine_tune(train_dataset, epochs=3, batch_size=1)

    # Test the system
    while True:
        try:
            question = input("\nEnter your question (or 'quit' to exit): ").strip()
        except (EOFError, KeyboardInterrupt):
            # Closed stdin or Ctrl-C: leave the loop instead of crashing.
            print()
            break

        if question.lower() == 'quit':
            break
        if not question:
            # Nothing to ask; prompt again rather than querying the model.
            continue

        answer, relevant_passages = qa_system.answer_question(question)
        print(f"\nAnswer: {answer}")
        print("\nRelevant passages:")
        for i, passage in enumerate(relevant_passages, 1):
            print(f"{i}. {passage}")

if __name__ == "__main__":
    main()

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of the model checkpoint at bert-large-uncased-whole-word-masking-finetuned-squad were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expec

Encoding documents...
Documents indexed successfully
Loaded and indexed 105 passages from MoralEconomics.txt
Starting fine-tuning process...




Epoch 1/3, Average Loss: 2.8515
Epoch 2/3, Average Loss: 1.6483
Epoch 3/3, Average Loss: 1.0098

Enter your question (or 'quit' to exit): Manchester United announced they were joining 11 other European clubs as founding members of what?

Answer: the european super league

Relevant passages:
1. [78]

On 18 April 2021, Manchester United announced they were joining 11 other European clubs as founding members of the European Super League, a proposed 20-team competition intended to rival the UEFA Champions League. [79] The announcement drew a significant backlash from supporters, other clubs, media partners, sponsors, players and the UK Government, forcing the club to withdraw just two days later. [80][81][82][83][84] The failure of the project led to the resignation of executive vice-chairman Ed Woodward, while resultant protests against Woodward and the Glazer family led to a pitch invasion ahead of a league match against Liverpool on 2 May 2021, which saw the first postponement of a Prem