In [1]:
!pip install datasets faiss-cpu langchain-community sentence-transformers nltk rouge-score evaluate




[notice] A new release of pip is available: 25.0 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
!pip install datasets



[notice] A new release of pip is available: 25.0 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip




In [3]:
pip install datasets faiss-cpu langchain-community sentence-transformers nltk rouge-score 

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.0 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [4]:
pip install rouge-score sacrebleu


Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.0 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [5]:
from datasets import load_dataset

ds = load_dataset("squad_v2", split="train")

# Take a random sample of 2,000
ds = ds.select(range(2000))


  from .autonotebook import tqdm as notebook_tqdm


In [6]:
import re
import unicodedata

def preprocess_batch(batch):
    def clean_text(text):
        WHITESPACE_PATTERN = re.compile(r"\s+")
        SPECIAL_CHAR_PATTERN = re.compile(r"[^a-zA-Z0-9áéíóúüñ&?.!,;:()'\"\s-]")  # Allow accents & hyphens

        if not isinstance(text, str) or not text.strip():
            return ""  # Handle empty input safely

        text = unicodedata.normalize("NFC", text)  # Preserve accents correctly
        
        # Remove IPA pronunciation (anything in parentheses containing a "/")
        text = re.sub(r"\([^)]*\/[^)]*\)", "", text)

        text = text.lower().strip()  # Normalize case & spaces
        text = WHITESPACE_PATTERN.sub(" ", text)  # Normalize whitespace
        text = SPECIAL_CHAR_PATTERN.sub("", text)  # Remove unwanted characters but keep allowed ones
        return text

    processed_data = {
        "document": [clean_text(doc) for doc in batch["context"]],
        "question": [clean_text(q) for q in batch["question"]],
        "answers": [[clean_text(ans) for ans in a["text"]] if isinstance(a, dict) and "text" in a else [""] for a in batch["answers"]]
    }

    return processed_data

# ✅ Run debugging on a small sample
merged_dataset = ds.map(preprocess_batch, batched=True, batch_size=2)

In [7]:
from sklearn.preprocessing import normalize
from langchain_community.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings

# 🔹 Load Embedding Model
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# 🔹 Create FAISS Vector Store
documents = merged_dataset["document"]
document_embeddings = embedding_model.embed_documents(documents)

# ✅ Fix: Zip documents with their embeddings
text_embedding_pairs = list(zip(documents, document_embeddings))

# ✅ Correctly pass the zipped list
vector_db = FAISS.from_embeddings(text_embedding_pairs, embedding=embedding_model)

# 🔹 Save & Reload FAISS
vector_db.save_local("faiss_narrativeqa")
vector_db = FAISS.load_local("faiss_narrativeqa", embedding_model, allow_dangerous_deserialization=True)

# 🔹 Retrieval Function
def retrieve_context(query, top_k=3):
    query_embedding = embedding_model.embed_query(query)  # No normalization needed
    docs = vector_db.similarity_search_by_vector(query_embedding, k=top_k)
    return " ".join([doc.page_content for doc in docs])


  embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")


In [8]:
# 🔹 Retrieval Function
def retrieve_context(query, top_k=3):
    query_embedding = embedding_model.embed_query(query)
    query_embedding = normalize([query_embedding], norm="l2")[0]  # Normalize query embedding
    docs = vector_db.similarity_search_by_vector(query_embedding, k=top_k)
    return " ".join([doc.page_content for doc in docs])

query = "What is the main theme of Chopin's works?"
retrieved_context = retrieve_context(query)
print("🔹 Retrieved Context:\n", retrieved_context)


🔹 Retrieved Context:
 j. barrie jones suggests that "amongst the works that chopin intended for concert use, the four ballades and four scherzos stand supreme", and adds that "the barcarolle op. 60 stands apart as an example of chopin's rich harmonic palette coupled with an italianate warmth of melody." temperley opines that these works, which contain "immense variety of mood, thematic material and structural detail", are based on an extended "departure and return" form; "the more the middle section is extended, and the further it departs in key, mood and theme, from the opening idea, the more important and dramatic is the reprise when it at last comes." j. barrie jones suggests that "amongst the works that chopin intended for concert use, the four ballades and four scherzos stand supreme", and adds that "the barcarolle op. 60 stands apart as an example of chopin's rich harmonic palette coupled with an italianate warmth of melody." temperley opines that these works, which contain "imme

In [10]:
# Load Tokenizer and Model
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from peft import LoraConfig, get_peft_model

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model_name = "facebook/bart-base"  # ✅ Using BART for better performance in QA
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)

# Apply LoRA to BART
lora_config = LoraConfig(
    r=8, lora_alpha=32, lora_dropout=0.1,
    target_modules=["q_proj", "v_proj"]  # ✅ Corrected for BART
)
model = get_peft_model(model, lora_config)

# Ensure PAD token is set
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Tokenize Dataset
def tokenize_function(example):
    return tokenizer(
        example["document"],  # Ensure this field exists in your dataset
        padding="max_length",
        truncation=True,
        max_length=200,
    )

tokenized_dataset = merged_dataset.map(tokenize_function, batched=True)

Map: 100%|██████████| 2000/2000 [00:00<00:00, 2403.72 examples/s]


In [11]:
# Split Dataset (80% Train, 10% Validation, 10% Test)
dataset_split = tokenized_dataset.train_test_split(test_size=0.2)
temp_split = dataset_split["test"].train_test_split(test_size=0.5)

train_dataset = dataset_split["train"]
eval_dataset = temp_split["train"]
test_dataset = temp_split["test"]

# Add Labels to Dataset
def add_labels(batch):
    batch["labels"] = batch["input_ids"].copy()
    return batch

train_dataset = train_dataset.map(add_labels)
eval_dataset = eval_dataset.map(add_labels)
test_dataset = test_dataset.map(add_labels)

# Print Dataset Structure
print("Train Dataset Columns:", train_dataset.column_names)
print("Validation Dataset Columns:", eval_dataset.column_names)
print("Test Dataset Columns:", test_dataset.column_names)

Map: 100%|██████████| 1600/1600 [00:00<00:00, 2855.75 examples/s]
Map: 100%|██████████| 200/200 [00:00<00:00, 2597.12 examples/s]
Map: 100%|██████████| 200/200 [00:00<00:00, 2574.01 examples/s]

Train Dataset Columns: ['id', 'title', 'context', 'question', 'answers', 'document', 'input_ids', 'attention_mask', 'labels']
Validation Dataset Columns: ['id', 'title', 'context', 'question', 'answers', 'document', 'input_ids', 'attention_mask', 'labels']
Test Dataset Columns: ['id', 'title', 'context', 'question', 'answers', 'document', 'input_ids', 'attention_mask', 'labels']





In [12]:
from transformers import TrainingArguments, Trainer, DataCollatorForSeq2Seq

training_args = TrainingArguments(
    output_dir="./bookintel",
    per_device_train_batch_size=2,  
    gradient_accumulation_steps=4,
    learning_rate=5e-5,
    num_train_epochs=3,
    logging_steps=100,
    save_steps=500,
    save_total_limit=2,
    evaluation_strategy="epoch",
    fp16=torch.cuda.is_available(),  
    remove_unused_columns=True,
    report_to="none",
)

# Use Data Collator to handle padding correctly
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator  
)

# Train the Model
trainer.train()



  trainer = Trainer(
No label_names provided for model class `PeftModel`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Epoch,Training Loss,Validation Loss
1,2.6895,No log
2,1.5908,No log
3,1.4253,No log


TrainOutput(global_step=600, training_loss=2.074619394938151, metrics={'train_runtime': 861.546, 'train_samples_per_second': 5.571, 'train_steps_per_second': 0.696, 'total_flos': 574175969280000.0, 'train_loss': 2.074619394938151, 'epoch': 3.0})

In [14]:
eval_results = trainer.evaluate(eval_dataset)
print(eval_results)  # Debugging step


{'eval_runtime': 2.275, 'eval_samples_per_second': 87.913, 'eval_steps_per_second': 10.989, 'epoch': 3.0}


In [15]:
test_results = trainer.evaluate(test_dataset)
print(test_results)  # Debugging step


{'eval_runtime': 1.9173, 'eval_samples_per_second': 104.313, 'eval_steps_per_second': 13.039, 'epoch': 3.0}


In [91]:
print(model)

PeftModel(
  (base_model): LoraModel(
    (model): BartForConditionalGeneration(
      (model): BartModel(
        (shared): BartScaledWordEmbedding(50265, 768, padding_idx=1)
        (encoder): BartEncoder(
          (embed_tokens): BartScaledWordEmbedding(50265, 768, padding_idx=1)
          (embed_positions): BartLearnedPositionalEmbedding(1026, 768)
          (layers): ModuleList(
            (0-5): 6 x BartEncoderLayer(
              (self_attn): BartSdpaAttention(
                (k_proj): Linear(in_features=768, out_features=768, bias=True)
                (v_proj): lora.Linear(
                  (base_layer): Linear(in_features=768, out_features=768, bias=True)
                  (lora_dropout): ModuleDict(
                    (default): Dropout(p=0.1, inplace=False)
                  )
                  (lora_A): ModuleDict(
                    (default): Linear(in_features=768, out_features=8, bias=False)
                  )
                  (lora_B): ModuleDict(
            