## Install the required libraries

In [1]:
pip install transformers peft accelerate bitsandbytes faiss-cpu datasets torch sentence-transformers

Collecting peft
  Downloading peft-0.13.2-py3-none-any.whl.metadata (13 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.44.1-py3-none-manylinux_2_24_x86_64.whl.metadata (3.5 kB)
Collecting faiss-cpu
  Downloading faiss_cpu-1.9.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.4 kB)
Collecting datasets
  Downloading datasets-3.0.1-py3-none-any.whl.metadata (20 kB)
Collecting sentence-transformers
  Downloading sentence_transformers-3.2.0-py3-none-any.whl.metadata (10 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.17-py310-none-any.whl.metadata (7.2 kB)
INFO: pip is looking at multiple versions of multiprocess to determine which version is compatible with other requirements. This could take

## Load the SQuAD v2 dataset

In [2]:
from datasets import load_dataset

# Fetch SQuAD v2 from the Hugging Face Hub; the result is kept in `dataset`
# because later cells read its "train" and "validation" splits.
dataset = load_dataset("squad_v2")

# Sanity check: show the first training record so the field layout
# (question / context / answers) is visible before any preprocessing.
first_train_example = dataset["train"][0]
print(first_train_example)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/8.92k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/16.4M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/1.35M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/130319 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/11873 [00:00<?, ? examples/s]

{'id': '56be85543aeaaa14008c9063', 'title': 'Beyoncé', 'context': 'Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny\'s Child. Managed by her father, Mathew Knowles, the group became one of the world\'s best-selling girl groups of all time. Their hiatus saw the release of Beyoncé\'s debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles "Crazy in Love" and "Baby Boy".', 'question': 'When did Beyonce start becoming popular?', 'answers': {'text': ['in the late 1990s'], 'answer_start': [269]}}


## Tokenize the question + context pairs

In [None]:
import os
from getpass import getpass

from transformers import AutoModelForCausalLM, AutoTokenizer


# Never store the access token in the notebook itself: read it from the
# HF_TOKEN environment variable and fall back to an interactive prompt.
# Any token that was ever saved in a notebook must be treated as leaked and
# revoked at https://huggingface.co/settings/tokens.
HUGGINGFACE_TOKEN = os.environ.get("HF_TOKEN") or getpass("Hugging Face access token: ")

# Load the model with authentication (`token=` replaces the deprecated
# `use_auth_token=` keyword).
# NOTE(review): the fine-tuning cell loads this same checkpoint again in 8-bit,
# so this full-precision load looks redundant — confirm before removing it.
model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-2-7b-hf",
    token=HUGGINGFACE_TOKEN,
)

tokenizer = AutoTokenizer.from_pretrained(
    "meta-llama/Llama-2-7b-hf",
    token=HUGGINGFACE_TOKEN,
)

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]



config.json:   0%|          | 0.00/609 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
from transformers import AutoTokenizer

MAX_SEQ_LENGTH = 512      # total tokens per training example (prompt + answer + padding)
MAX_ANSWER_LENGTH = 128   # cap on answer tokens (including EOS) inside that budget
IGNORE_INDEX = -100       # label id that the causal-LM cross-entropy loss skips
NO_ANSWER_TEXT = "unanswerable"  # target used for SQuAD v2 rows that have no answer

# Load tokenizer for LLaMA-2
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")
if tokenizer.pad_token is None:
    # The LLaMA tokenizer ships without a pad token; reuse EOS so padding works.
    tokenizer.pad_token = tokenizer.eos_token


def _encode_example(question, context, answer_texts):
    """Turn one question/context/answer triple into a fixed-length causal-LM example.

    The sequence is ``<prompt> Answer: <answer><eos>`` right-padded to
    MAX_SEQ_LENGTH. ``labels`` has the same length as ``input_ids``; prompt and
    padding positions are set to IGNORE_INDEX so only answer tokens are scored.
    """
    # SQuAD v2 contains unanswerable questions whose answer list is empty.
    answer = answer_texts[0] if answer_texts else NO_ANSWER_TEXT
    answer_ids = tokenizer(answer, add_special_tokens=False)["input_ids"]
    answer_ids = answer_ids[: MAX_ANSWER_LENGTH - 1] + [tokenizer.eos_token_id]
    separator_ids = tokenizer("Answer:", add_special_tokens=False)["input_ids"]

    # Truncate only the question/context part so the separator and the answer
    # are never cut off by a long passage.
    prompt_budget = MAX_SEQ_LENGTH - len(separator_ids) - len(answer_ids)
    prompt_ids = tokenizer(
        "Question: " + question + " Context: " + context,
        truncation=True,
        max_length=prompt_budget,
    )["input_ids"]
    prompt_ids = prompt_ids + separator_ids

    input_ids = prompt_ids + answer_ids
    labels = [IGNORE_INDEX] * len(prompt_ids) + answer_ids
    pad = MAX_SEQ_LENGTH - len(input_ids)
    return {
        "input_ids": input_ids + [tokenizer.pad_token_id] * pad,
        "attention_mask": [1] * len(input_ids) + [0] * pad,
        "labels": labels + [IGNORE_INDEX] * pad,
    }


# Tokenize function
def preprocess(data):
    """Tokenize SQuAD rows for causal-LM fine-tuning.

    Accepts either a single example (string fields) or a batch as produced by
    ``Dataset.map(..., batched=True)`` (each field is a list).

    Returns a dict with ``input_ids``, ``attention_mask`` and ``labels``; for a
    batch, every value is a list with one entry per input row.
    """
    questions, contexts, answers = data["question"], data["context"], data["answers"]

    if isinstance(questions, str):
        return _encode_example(questions, contexts, answers["text"])

    encoded = [
        _encode_example(question, context, answer["text"])
        for question, context, answer in zip(questions, contexts, answers)
    ]
    return {
        key: [example[key] for example in encoded]
        for key in ("input_ids", "attention_mask", "labels")
    }


# Apply tokenization
train_data = dataset["train"].map(preprocess, batched=True)
test_data = dataset["validation"].map(preprocess, batched=True)

## Fine-tune LLaMA with LoRA

In [None]:
from transformers import (
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    Trainer,
    TrainingArguments,
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

# Load LLaMA-2-7B in 8-bit precision. The quantization settings go through
# BitsAndBytesConfig; passing `load_in_8bit=True` directly is deprecated.
model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-2-7b-hf",
    quantization_config=BitsAndBytesConfig(load_in_8bit=True),
    device_map="auto",
)

# Make the quantized model trainable as recommended by PEFT before adding
# adapters (freezes base weights and prepares it for k-bit training).
model = prepare_model_for_kbit_training(model)

# Configure LoRA: rank-8 adapters on the attention query/value projections.
# task_type tells PEFT to wrap the model as a causal language model.
lora_config = LoraConfig(
    r=8, lora_alpha=16, lora_dropout=0.1,
    target_modules=["q_proj", "v_proj"],
    task_type="CAUSAL_LM",
)

# Apply LoRA to the model
model = get_peft_model(model, lora_config)

# Training configuration: effective batch size is
# per_device_train_batch_size * gradient_accumulation_steps = 8 per device.
training_args = TrainingArguments(
    output_dir="./lora_llama",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_dir="./logs",
    num_train_epochs=1,
    learning_rate=5e-5,
)

# Initialize Trainer
trainer = Trainer(
    model=model, args=training_args,
    train_dataset=train_data, eval_dataset=test_data
)

# Start training
trainer.train()

## Generate Embeddings for Contextual Search

In [None]:
from sentence_transformers import SentenceTransformer

# Load a pre-trained embedding model
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

# Extract contexts from the dataset. SQuAD stores one row per question, so the
# same passage appears in many rows; keep each passage once (first-seen order)
# so it is embedded once and retrieval cannot return the same text repeatedly.
contexts = list(dict.fromkeys(dataset["train"]["context"]))

# Generate embeddings for all unique contexts; row i corresponds to contexts[i].
context_embeddings = embedding_model.encode(contexts, convert_to_numpy=True)

## Store Embeddings in FAISS

In [None]:
import faiss
import numpy as np

# The index is sized by the width of one embedding vector, i.e. the second
# axis of the context embedding matrix.
dimension = context_embeddings.shape[1]

# Create a flat index that ranks vectors by L2 distance and load every
# context embedding into it in one call.
faiss_index = faiss.IndexFlatL2(dimension)
faiss_index.add(context_embeddings)

# Report how many vectors the index now holds.
print(f"Total contexts indexed: {faiss_index.ntotal}")

## Perform Retrieval and Augment the Query

In [None]:
def retrieve_relevant_context(query, k=3):
    """Return the passages most similar to ``query`` as one string.

    Args:
        query: Natural-language question to search for.
        k: Maximum number of passages to retrieve (default 3).

    Returns:
        The retrieved passages joined by a single space, best match first.
        Empty string when ``k`` is not positive or nothing is found.
    """
    if k <= 0:
        return ""

    # Generate embedding for the query
    query_embedding = embedding_model.encode([query], convert_to_numpy=True)

    # Retrieve the top-k most similar contexts
    distances, indices = faiss_index.search(query_embedding, k=k)

    # FAISS pads the result with -1 when fewer than k vectors are available;
    # skip those so they do not silently index contexts[-1]. dict.fromkeys
    # drops repeated passages while keeping the ranking order.
    relevant_contexts = dict.fromkeys(contexts[i] for i in indices[0] if i >= 0)
    return " ".join(relevant_contexts)

# Example query
query = "Who were the Normans?"
relevant_text = retrieve_relevant_context(query)
print(relevant_text)

## Generate Answer Using LLaMA

In [None]:
def generate_answer(question, context, max_new_tokens=50):
    """Generate an answer to ``question`` grounded in ``context``.

    Args:
        question: The user's question.
        context: Retrieved passage text to condition the model on.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        Only the newly generated text (the prompt is stripped from the output).
    """
    # Use the same "Question: ... Context: ..." layout that `preprocess` feeds
    # the model during fine-tuning, followed by an explicit answer cue.
    input_text = f"Question: {question} Context: {context} Answer:"

    # Send the inputs to wherever the model actually lives instead of
    # assuming a CUDA device.
    inputs = tokenizer(input_text, return_tensors="pt").to(model.device)
    prompt_length = inputs["input_ids"].shape[1]

    # Make sure dropout (including LoRA dropout) is disabled for inference.
    model.eval()

    # Generate response
    output = model.generate(**inputs, max_new_tokens=max_new_tokens)

    # The returned sequence starts with the prompt tokens; decode only what
    # the model added after them.
    generated_ids = output[0][prompt_length:]
    return tokenizer.decode(generated_ids, skip_special_tokens=True).strip()

# Generate answer
answer = generate_answer(query, relevant_text)
print(f"Answer: {answer}")

## Testing the System