In [2]:
!pip install torch langchain langchain-community langchain-huggingface transformers datasets peft trl gradio faiss-cpu sentence-transformers evaluate accelerate sentencepiece



In [3]:
!pip install -U bitsandbytes



In [4]:
!pip install --upgrade huggingface_hub



In [5]:
pip install rouge_score



In [6]:
import json
import torch
import logging
from langchain_huggingface import HuggingFaceEmbeddings, HuggingFacePipeline
from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainExtractor
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, BitsAndBytesConfig, TrainingArguments, DataCollatorForLanguageModeling, TrainerCallback
from peft import LoraConfig, PeftModel, get_peft_model
from trl import SFTTrainer
from datasets import load_dataset
from sklearn.model_selection import train_test_split
from evaluate import load
from sentence_transformers import SentenceTransformer, util
import gradio as gr
import os

# Set up logging
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger(__name__)

# Verify GPU availability
logger.info("Checking GPU availability...")
!nvidia-smi
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
logger.info(f"Using device: {device}")

Sat Aug 16 13:23:43 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   56C    P8             10W /   70W |       2MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [8]:
# Load and preprocess all 14 JSON datasets
qa_data = []
for i in range(1, 15):
    file_name = f"/content/ml_qa_synthetic_set_{i}.json"
    try:
        with open(file_name, "r") as f:
            data = json.load(f)
            qa_data.extend(data)
            logger.info(f"Loaded {file_name} with {len(data)} Q&A pairs")
    except FileNotFoundError:
        logger.warning(f"{file_name} not found. Skipping...")
        continue

if not qa_data:
    logger.error("No JSON files loaded. Please upload the files to /content/")
    raise ValueError("No JSON files loaded. Please upload the files to /content/")

# Extract documents and metadata for embedding
documents = [f"Question: {item['question']}\nAnswer: {item['answer']}" for item in qa_data]
metadata = [{"id": item["id"], "source": item["source"]} for item in qa_data]

# Prepare data for fine-tuning (instruction format)
fine_tune_data = [
    {
        "text": f"### Instruction: Answer the following question concisely and accurately.\n### Question: {item['question']}\n### Answer: {item['answer']}"
    } for item in qa_data
]

# Split data into train and validation
train_data, val_data = train_test_split(fine_tune_data, test_size=0.1, random_state=42)

# Save train and validation data as JSONL
with open("/content/train_data.jsonl", "w") as f:
    for item in train_data:
        json.dump(item, f)
        f.write("\n")
with open("/content/val_data.jsonl", "w") as f:
    for item in val_data:
        json.dump(item, f)
        f.write("\n")

logger.info(f"Loaded {len(qa_data)} Q&A pairs. Train: {len(train_data)}, Validation: {len(val_data)}")

In [9]:
logger.info("Setting up embeddings and FAISS vector store...")
from langchain_huggingface import HuggingFaceEmbeddings
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")
vector_store = FAISS.from_texts(documents, embedding_model, metadatas=metadata)
retriever = vector_store.as_retriever(search_kwargs={"k": 5})  # Increased k for more context
logger.info("FAISS vector store and base retriever initialized")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


**RUN THIS CELL TO TRAIN**

In [21]:
# # Set up model with 8-bit quantization
# model_name = "facebook/opt-1.3b"
# bnb_config = BitsAndBytesConfig(
#     load_in_8bit=True,
#     bnb_8bit_compute_dtype=torch.bfloat16
# )
# tokenizer = AutoTokenizer.from_pretrained(model_name)
# model = AutoModelForCausalLM.from_pretrained(
#     model_name,
#     quantization_config=bnb_config,
#     device_map="auto",
#     torch_dtype=torch.bfloat16
# )

# # Remove any existing PEFT adapters to avoid multiple adapter warnings
# if hasattr(model, "peft_config"):
#     model.unload()
#     logger.info("Removed existing PEFT adapters from the model")

# # Configure LoRA
# lora_config = LoraConfig(
#     r=16,
#     target_modules=["q_proj", "k_proj", "v_proj", "out_proj", "fc1", "fc2"],
#     lora_alpha=32,
#     lora_dropout=0.1,
#     task_type="CAUSAL_LM",
#     base_model_name_or_path=model_name
# )
# model = get_peft_model(model, lora_config)

# # Set up training arguments
# # Note: bitsandbytes may warn about casting inputs to float16 during quantization; this is expected
# training_args = TrainingArguments(
#     output_dir="/content/finetune_opt_1.3b",
#     per_device_train_batch_size=2,
#     gradient_accumulation_steps=4,
#     warmup_steps=20,
#     num_train_epochs=3,
#     learning_rate=2e-4,
#     fp16=True,
#     logging_steps=5,
#     logging_strategy="steps",
#     save_strategy="epoch",
#     eval_strategy="epoch",
#     save_steps=50,
#     optim="paged_adamw_8bit",
#     report_to="none",
#     load_best_model_at_end=True,
#     metric_for_best_model="eval_loss",
#     lr_scheduler_type="cosine",
#     push_to_hub=True,
#     hub_model_id="sravan837/ML_RAG_MODEl"  # Replace with your Hugging Face username
# )

# # Load datasets
# train_dataset = load_dataset("json", data_files="/content/train_data.jsonl", split="train")
# val_dataset = load_dataset("json", data_files="/content/val_data.jsonl", split="train")

# # Tokenize datasets
# def tokenize_function(examples):
#     return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=512)

# tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True, remove_columns=["text"])
# tokenized_val_dataset = val_dataset.map(tokenize_function, batched=True, remove_columns=["text"])

# # Define early stopping callback
# class EarlyStoppingCallback(TrainerCallback):
#     def __init__(self, early_stopping_patience=3):
#         self.early_stopping_patience = early_stopping_patience
#         self.best_loss = float('inf')
#         self.patience_counter = 0

#     def on_evaluate(self, args, state, control, metrics, **kwargs):
#         eval_loss = metrics.get("eval_loss")
#         if eval_loss < self.best_loss:
#             self.best_loss = eval_loss
#             self.patience_counter = 0
#         else:
#             self.patience_counter += 1
#         if self.patience_counter >= self.early_stopping_patience:
#             control.should_training_stop = True

# # Define custom callback to print metrics
# class PrintMetricsCallback(TrainerCallback):
#     def on_log(self, args, state, control, logs=None, **kwargs):
#         if logs:
#             train_loss = logs.get("loss")
#             eval_loss = logs.get("eval_loss")
#             if train_loss is not None:
#                 logger.info(f"Step {state.global_step}: Training Loss = {train_loss:.4f}")
#             if eval_loss is not None:
#                 logger.info(f"Step {state.global_step}: Validation Loss = {eval_loss:.4f}")

# # Initialize trainer
# data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
# trainer = SFTTrainer(
#     model=model,
#     args=training_args,
#     train_dataset=tokenized_train_dataset,
#     eval_dataset=tokenized_val_dataset,
#     peft_config=lora_config,
#     data_collator=data_collator,
#     callbacks=[EarlyStoppingCallback(early_stopping_patience=3), PrintMetricsCallback()]
# )
# # Log in to Hugging Face Hub
# from huggingface_hub import notebook_login
# notebook_login()

# # Fine-tune the model
# logger.info("Starting fine-tuning...")
# trainer.train()
# logger.info("Saving fine-tuned model locally...")
# model.save_pretrained("/content/finetune_opt_1.3b/final_checkpoint")
# tokenizer.save_pretrained("/content/finetune_opt_1.3b/final_checkpoint")
# logger.info("Pushing fine-tuned model to Hugging Face Hub...")
# trainer.push_to_hub()

Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/4500 [00:00<?, ? examples/s]

Map:   0%|          | 0/501 [00:00<?, ? examples/s]



Truncating train dataset:   0%|          | 0/4500 [00:00<?, ? examples/s]

Truncating eval dataset:   0%|          | 0/501 [00:00<?, ? examples/s]

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…



Epoch,Training Loss,Validation Loss
1,1.1876,1.107462
2,0.8625,0.95752
3,0.8033,0.936291




adapter_model.safetensors:   0%|          | 0.00/56.7M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/sravan837/ML_RAG_MODEl/commit/43cd86d8ea32e4502292a07bb94e5d25f7cd4696', commit_message='End of training', commit_description='', oid='43cd86d8ea32e4502292a07bb94e5d25f7cd4696', pr_url=None, repo_url=RepoUrl('https://huggingface.co/sravan837/ML_RAG_MODEl', endpoint='https://huggingface.co', repo_type='model', repo_id='sravan837/ML_RAG_MODEl'), pr_revision=None, pr_num=None)

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


In [10]:
# Load fine-tuned model
from transformers import AutoTokenizer, BitsAndBytesConfig, AutoModelForCausalLM, pipeline
from peft import PeftModel
from langchain_huggingface import HuggingFacePipeline
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainExtractor
import torch
import gc # Import garbage collection

# Set up model with 8-bit quantization
bnb_config = BitsAndBytesConfig(
    load_in_8bit=True,
    bnb_8bit_compute_dtype=torch.bfloat16
)

model_name = "facebook/opt-1.3b" # Original base model name

# Explicitly delete model and clear cache to ensure a clean load
if 'model' in locals():
    del model
if torch.cuda.is_available():
    torch.cuda.empty_cache()
gc.collect()


tokenizer = AutoTokenizer.from_pretrained(model_name) # Load tokenizer from base model

# Load the base model first
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",
    torch_dtype=torch.bfloat16
)

# Load the PEFT adapter from the checkpoint
adapter_path = "/content/finetune_opt_1.3b/final_checkpoint" # Path to your saved adapter weights
model = PeftModel.from_pretrained(model, adapter_path)

# Set up text generation pipeline
text_generation_pipeline = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    do_sample=True,
    temperature=0.6,
    top_p=0.85,
    max_new_tokens=100 # Reduced max_new_tokens
)
llm = HuggingFacePipeline(pipeline=text_generation_pipeline)

# Set up reranking (Optional - uncomment to use)
# compressor = LLMChainExtractor.from_llm(llm)
# if 'retriever' in locals(): # Check if retriever is defined
#     compression_retriever = ContextualCompressionRetriever(base_compressor=compressor, base_retriever=retriever)
#     logger.info("Reranking retriever initialized")
# else:
#     logger.warning("Retriever not found. Skipping reranking initialization.")
#     compression_retriever = None # Set to None if retriever is not available


# Refined prompt template
prompt_template = """Based on the following Q&A pairs, provide a concise and accurate answer to the user's question. Use only the most relevant information from the context and avoid adding unnecessary details or multiple answers unless explicitly requested.

Retrieved Context:
{context}

User Question: {question}

Answer: """
prompt = PromptTemplate(template=prompt_template, input_variables=["context", "question"])

# Set up RAG pipeline
# Use the base retriever directly, without compression
if 'retriever' in locals(): # Ensure retriever is available before creating the chain
    qa_chain = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=retriever, # Use the base retriever
        return_source_documents=True,
        chain_type_kwargs={"prompt": prompt}
    )
    logger.info("RAG pipeline initialized using base retriever")
else:
    logger.error("Retriever is not defined. Cannot initialize RAG pipeline.")
    qa_chain = None # Set to None if retriever is not available

Device set to use cuda:0


In [11]:
# Query function with error handling, returning answer and retrieval details
def query_rag(question, show_retrieval=False):
    try:
        result = qa_chain.invoke({"query": question})
        answer = result["result"].strip()
        sources = [doc.metadata["source"] for doc in result["source_documents"]]
        context = [doc.page_content for doc in result["source_documents"]]
        logger.info(f"Processed question: {question}, Answer: {answer}")

        # Default output: answer only
        output = f"**Answer**: {answer}"

        # If show_retrieval is True, append retrieval details
        if show_retrieval:
            output += f"\n\n**Retrieval Details**:\n- **Retrieved Context**:\n"
            for i, ctx in enumerate(context, 1):
                output += f"  {i}. {ctx}\n"
            output += f"- **Sources**: {', '.join(sources)}"

        return output
    except Exception as e:
        logger.error(f"Error processing question '{question}': {str(e)}")
        return f"Error: Unable to process question. Please try again or rephrase."

# Set up Gradio interface
with gr.Blocks() as iface:
    gr.Markdown("# Machine Learning Q&A RAG Model")
    gr.Markdown("Ask questions about machine learning based on 5,000 Q&A pairs. Supports short or full-form questions.")

    with gr.Row():
        question_input = gr.Textbox(label="Enter your question (e.g., 'What's gradient boosting?')", lines=2)

    with gr.Row():
        show_retrieval = gr.Checkbox(label="Show Retrieval Details (Context and Sources)", value=False)

    output = gr.Markdown(label="Answer")

    submit_button = gr.Button("Submit")

    submit_button.click(
        fn=query_rag,
        inputs=[question_input, show_retrieval],
        outputs=output
    )

# Launch Gradio interface
logger.info("Launching Gradio interface...")
try:
    iface.launch()
except Exception as e:
    logger.error(f"Gradio launch failed: {str(e)}")
    logger.info("Trying ngrok fallback...")

It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://4e8ca22ea62ffd42df.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


In [12]:
# Manual testing function
def query_rag_manual(question):
    return query_rag(question)

# Example usage
print(query_rag_manual("What's gradient boosting?"))

Both `max_new_tokens` (=100) and `max_length`(=21) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Both `max_new_tokens` (=100) and `max_length`(=21) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


**Answer**: Based on the following Q&A pairs, provide a concise and accurate answer to the user's question. Use only the most relevant information from the context and avoid adding unnecessary details or multiple answers unless explicitly requested.

Retrieved Context:
Question: What is gradient boosting in machine learning?
Answer: Gradient boosting is an ensemble method that builds sequential weak learners, typically decision trees, to minimize a loss function using gradient descent, improving predictive accuracy.

Question: What is gradient boosting in supervised learning?
Answer: Gradient boosting builds an ensemble of weak learners, typically decision trees, by iteratively minimizing a loss function using gradient descent, improving prediction accuracy.

Question: What is gradient boosting in supervised learning?
Answer: Gradient boosting builds an ensemble of decision trees, iteratively minimizing a loss function using gradient descent.

Question: How does gradient boosting work?

**Security Warning:** Hardcoding your Hugging Face token directly into the notebook is not recommended. Consider using Colab's Secrets Manager for better security.