#OCR

In [None]:
def read_text_file(file_path):
    """Read a UTF-8 text file and return its full contents as a string.

    Failures are reported in-band instead of being raised: a missing file
    yields "Error: File not found." and any other exception yields
    "Error: <exception text>". Callers that must tell real text apart from a
    failure have to check for the "Error:" prefix themselves.
    """
    try:
        with open(file_path, mode='r', encoding='utf-8') as handle:
            return handle.read()
    except Exception as err:
        # A missing file gets a fixed message; everything else echoes the exception.
        if isinstance(err, FileNotFoundError):
            return "Error: File not found."
        return "Error: " + str(err)


# Load the OCR text of every chapter from the Colab working directory.
# read_text_file never raises: a failed read leaves an "Error: ..." string in
# the variable, so a missing file is not visible at this point.
ocrtext1 = read_text_file("/content/ch1.txt")
ocrtext2 = read_text_file("/content/ch2.txt")
ocrtext34 = read_text_file("/content/ch3.txt")  # the one file used for the "ch3-4.pdf" document
ocrtext5 = read_text_file("/content/ch5.txt")
ocrtext6 = read_text_file("/content/ch6.txt")
ocrtext7 = read_text_file("/content/ch7.txt")
ocrtext8 = read_text_file("/content/ch8.txt")
ocrtext9 = read_text_file("/content/ch9.txt")
ocrtexta = read_text_file("/content/appendix.txt")


In [None]:
!pip install pymupdf

Collecting pymupdf
  Downloading pymupdf-1.26.1-cp39-abi3-manylinux_2_28_x86_64.whl.metadata (3.4 kB)
Downloading pymupdf-1.26.1-cp39-abi3-manylinux_2_28_x86_64.whl (24.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [31m80.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pymupdf
Successfully installed pymupdf-1.26.1


#Parsing


In [None]:
# Remove the preinstalled NumPy / transformers stack, then install pinned versions.
# NOTE(review): a later cell uninstalls this stack again and pins numpy==1.24.4
# instead — confirm which set of pins is the intended one.
!pip uninstall numpy sentence-transformers transformers -y
!pip install numpy==1.25.2 sentence-transformers==3.0.1 nltk

In [None]:
# Repeats the numpy==1.25.2 pin already requested by the install cell above.
!pip install numpy==1.25.2

In [None]:
# Sanity check: report the versions the running kernel actually imports.
# NOTE(review): the recorded output shows Sentence-Transformers 4.1.0 although
# 3.0.1 was pinned above — presumably the runtime was not restarted after the
# install; confirm.
import numpy
import sentence_transformers
print(f"NumPy version: {numpy.__version__}")
print(f"Sentence-Transformers version: {sentence_transformers.__version__}")
print("Imports successful!")

NumPy version: 1.25.2
Sentence-Transformers version: 4.1.0
Imports successful!


In [None]:
# Second reset of the numeric/NLP stack with a different set of pins.
# sentence-transformers is left unpinned here, so pip picks whatever release
# is compatible with transformers==4.44.2 at install time.
!pip uninstall numpy scipy gensim transformers sentence-transformers -y
!pip install numpy==1.24.4 scipy==1.10.1 gensim==4.3.2 transformers==4.44.2 sentence-transformers

In [None]:
import nltk
import re
import json
import numpy as np
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from sentence_transformers import SentenceTransformer
import torch

import os
import fitz  # PyMuPDF

# Check versions
#print(f"NumPy version: {np.__version__}")
#print(f"Transformers version: {transformers.__version__}")

# Fetch the NLTK resources needed by sent_tokenize / word_tokenize (punkt,
# punkt_tab) and by stopwords.words (stopwords). The last call's return value
# is what the cell displays.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

[]

In [None]:

# Preprocessing
def preprocess_text(text):
    """Strip page furniture from OCR text and return cleaned, lower-cased sentences.

    Running headers ("Introduction to Data Privacy" / "Data Privacy: Principles
    and Practice" followed by a page number) and lines consisting only of
    digits are removed before lower-casing. Each sentence is then reduced to
    its alphanumeric, non-stopword tokens joined by single spaces.

    Returns a list with one string per detected sentence (possibly empty strings).
    """
    # Header patterns are case-sensitive, so they must run before lower-casing.
    noise_patterns = (
        (r'Introduction to Data Privacy\n\d+\n', 0),
        (r'Data Privacy: Principles and Practice\n\d+\n', 0),
        (r'^\d+\n', re.MULTILINE),
    )
    for pattern, flags in noise_patterns:
        text = re.sub(pattern, '', text, flags=flags)

    lowered = text.lower()
    ignored = set(stopwords.words('english'))
    return [
        ' '.join(
            token for token in word_tokenize(sentence)
            if token.isalnum() and token not in ignored
        )
        for sentence in sent_tokenize(lowered)
    ]

# Chunking
def chunk_text(sentences, chunk_size=300):
    """Group consecutive sentences into chunks of at least ``chunk_size`` words.

    Sentences are appended to the current chunk until its running word count
    (as counted by word_tokenize) reaches ``chunk_size``; the chunk is then
    closed. Any leftover sentences form a final, shorter chunk.

    Returns a list of strings, each the space-joined sentences of one chunk.
    """
    grouped = []
    pending, pending_words = [], 0
    for sentence in sentences:
        pending.append(sentence)
        pending_words += len(word_tokenize(sentence))
        if pending_words < chunk_size:
            continue
        grouped.append(' '.join(pending))
        pending, pending_words = [], 0
    if pending:
        grouped.append(' '.join(pending))
    return grouped

# Generate embeddings
def generate_embeddings(chunks, model_name="all-MiniLM-L6-v2"):
    """Encode ``chunks`` with a SentenceTransformer model.

    Returns whatever ``model.encode`` produces (requested as NumPy). Any
    exception is printed as "Embedding error: ..." and an empty list is
    returned instead, so callers cannot tell a failure from "no chunks".
    """
    try:
        encoder = SentenceTransformer(model_name)
        return encoder.encode(chunks, convert_to_numpy=True, show_progress_bar=True)
    except Exception as err:
        print(f"Embedding error: {str(err)}")
    return []



In [None]:
# Main pipeline: preprocess, chunk, embed and export the Chapter 8 text.
try:
    # `ocrtext` only exists as a local inside read_text_file, so reading it here
    # raises NameError on a fresh kernel. The Chapter 8 text loaded at the top
    # of the notebook is `ocrtext8`, which matches the "document" label below.
    cleaned_sentences = preprocess_text(ocrtext8)
    chunks = chunk_text(cleaned_sentences)
    print(f"Number of chunks: {len(chunks)}")

    # Generate embeddings (returns [] on failure, which yields an empty "chunks" list)
    embeddings = generate_embeddings(chunks)

    # Structured output
    output = {
        "document": "Data Privacy: Chapter 8",
        "pages": "155-163",
        "chunks": [
            {
                "chunk_id": i + 1,
                "text": chunk,
                "embedding": embedding.tolist(),  # Convert numpy array to list for JSON
                "page": i + 1  # chunk ordinal, not a page number read from the source
            }
            for i, (chunk, embedding) in enumerate(zip(chunks, embeddings))
        ]
    }

    with open("chunk_embeddings.json", "w") as f:
        json.dump(output, f, indent=2)
    print("Output saved to chunk_embeddings.json")
except Exception as e:
    print(f"Pipeline error: {str(e)}")

Number of chunks: 3


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Output saved to chunk_embeddings.json


#COMBINE EMBEDDINGS AND CLUSTERING


In [None]:
# Re-read three chapters from the mounted Google Drive copy.
# NOTE(review): this binds `ocrtext3`, while the `documents` list further down
# reads `ocrtext34` — confirm which name is intended.
ocrtext1, ocrtext3, ocrtext8 = (
    read_text_file("/content/drive/MyDrive/ch1.txt"),
    read_text_file("/content/drive/MyDrive/ch3.txt"),
    read_text_file("/content/drive/MyDrive/ch8.txt"),
)

In [None]:
import re
import json
import numpy as np
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sentence_transformers import SentenceTransformer, CrossEncoder
from scipy.spatial.distance import cosine
from sklearn.preprocessing import MinMaxScaler

import sklearn
import sentence_transformers
import scipy

# --- NLTK Setup ---
# NLTK 3.9+ (this notebook reports 3.9.1) loads the Punkt models from the
# 'punkt_tab' resource; without it sent_tokenize / word_tokenize raise
# LookupError when this section is run on a fresh runtime.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

# Report the library versions in use so saved results can be traced to an environment.
print(f"NumPy version: {np.__version__}")
print(f"Sentence-Transformers version: {sentence_transformers.__version__}")
print(f"NLTK version: {nltk.__version__}")
print(f"Scikit-Learn version: {sklearn.__version__}")
print(f"SciPy version: {scipy.__version__}")

# --- Text Cleaning ---
def clean_text(text):
    """Normalize typographic characters and whitespace in ``text``.

    - curly single quotes and the prime sign become a plain apostrophe
    - en/em dashes become a hyphen (previously they were turned into
      apostrophes, which corrupted ranges such as "10–12")
    - curly double quotes and backticks become a straight double quote
    - the "ﬁ" ligature is expanded to "fi"
    - every whitespace run collapses to one space and the ends are trimmed

    Returns the normalized string.
    """
    text = re.sub(r'[\u2018\u2019\u2032]', "'", text)
    text = re.sub(r'[\u2013\u2014]', '-', text)
    text = re.sub(r'[“”"`]', '"', text)
    text = re.sub(r'ﬁ', 'fi', text)
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

# --- Preprocessing ---
# Module-level NLP resources read by preprocess_text; built once here so each
# call does not reconstruct the lemmatizer or the stopword set.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    """Remove running headers/page numbers and return lemmatized sentences.

    The two book headers followed by a page number, and any line made up only
    of digits, are deleted before lower-casing. Each sentence is reduced to
    its alphanumeric, non-stopword tokens, which are lemmatized and joined
    with single spaces.

    Returns a list with one string per detected sentence.
    """
    without_headers = re.sub(
        r'(Introduction to Data Privacy|Data Privacy: Principles and Practice)\n\d+\n',
        '',
        text,
    )
    without_page_numbers = re.sub(r'^\d+\n', '', without_headers, flags=re.MULTILINE)

    def normalise(sentence):
        # Filter first, then lemmatize only the tokens that survive.
        kept = (
            token for token in word_tokenize(sentence)
            if token.isalnum() and token not in stop_words
        )
        return ' '.join(lemmatizer.lemmatize(token) for token in kept)

    return [normalise(sentence) for sentence in sent_tokenize(without_page_numbers.lower())]

# --- Chunking ---
def chunk_sentences(sentences, chunk_size=300):
    """Pack consecutive sentences into chunks of at least ``chunk_size`` words.

    A chunk is closed as soon as its running word count (per word_tokenize)
    reaches ``chunk_size``; remaining sentences form a final shorter chunk.

    Returns a list of space-joined chunk strings.
    """
    finished = []
    buffer = []
    words_in_buffer = 0
    for sentence in sentences:
        buffer.append(sentence)
        words_in_buffer += len(word_tokenize(sentence))
        if words_in_buffer >= chunk_size:
            finished.append(' '.join(buffer))
            buffer = []
            words_in_buffer = 0
    if buffer:
        finished.append(' '.join(buffer))
    return finished

def chunk_raw_text(text, chunk_size=200, stride=100):
    """Split ``text`` into overlapping windows of ``chunk_size`` tokens.

    Windows start every ``stride`` tokens. Unlike a plain
    ``range(0, len - chunk_size + 1, stride)`` walk, this never drops content:

    - a text shorter than ``chunk_size`` yields one chunk with all its tokens
      (previously it yielded no chunks at all);
    - if the strided windows stop short of the last token, one extra window
      anchored at the end of the text is appended so the tail is covered.

    Returns a list of space-joined token windows; empty text gives ``[]``.
    """
    tokens = word_tokenize(text)
    if not tokens:
        return []

    # Start index of the window that ends exactly on the final token.
    last_start = max(len(tokens) - chunk_size, 0)
    starts = list(range(0, last_start + 1, stride))
    if starts[-1] != last_start:
        starts.append(last_start)

    return [' '.join(tokens[start:start + chunk_size]) for start in starts]

# --- Embedding Generation ---
def generate_embeddings(documents, model_name="intfloat/e5-base-v2", chunk_size=200, stride=100):
    """Chunk and embed every document with a SentenceTransformer model.

    Each item of ``documents`` is a dict with "name" and "text". The text is
    cleaned, split into token windows and every window is prefixed with
    "passage: " (the E5 passage convention) before encoding.

    Returns a list of ``{"document": name, "chunks": [...]}`` dicts where each
    chunk holds the prefixed text, its embedding as a plain list and a "page"
    field that is always 1.
    """
    encoder = SentenceTransformer(model_name)

    def embed_document(doc):
        windows = chunk_raw_text(clean_text(doc["text"]), chunk_size, stride)
        # E5-specific: add passage prefix
        passages = ["passage: " + window for window in windows]
        vectors = encoder.encode(passages, convert_to_numpy=True, show_progress_bar=True)
        records = []
        for passage, vector in zip(passages, vectors):
            records.append({"text": passage, "embedding": vector.tolist(), "page": 1})
        return {"document": doc["name"], "chunks": records}

    return [embed_document(doc) for doc in documents]




[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


NumPy version: 2.0.2
Sentence-Transformers version: 4.1.0
NLTK version: 3.9.1
Scikit-Learn version: 1.6.1
SciPy version: 1.15.3


In [None]:
# Pair each source PDF name with its OCR text; list order fixes the document
# order in the saved JSON.
loaded_texts = [
    ("ch1.pdf", ocrtext1),
    ("ch3-4.pdf", ocrtext34),
    ("ch8.pdf", ocrtext8),
    ("ch2.pdf", ocrtext2),
    ("ch5.pdf", ocrtext5),
    ("ch6.pdf", ocrtext6),
    ("ch7.pdf", ocrtext7),
    ("ch9.pdf", ocrtext9),
    ("appendix.pdf", ocrtexta),
]

# read_text_file reports failures in-band as "Error: ..." strings. Embedding
# such a string would store an error message as book content, so failed loads
# are reported and left out instead.
documents = []
for pdf_name, ocr_text in loaded_texts:
    if ocr_text.startswith("Error:"):
        print(f"⚠️ Skipping {pdf_name}: {ocr_text}")
        continue
    documents.append({"name": pdf_name, "text": ocr_text})

embeddings_data = generate_embeddings(documents, model_name="intfloat/e5-base-v2")
with open("/content/drive/MyDrive/book_embeddings.json", "w") as f:
    json.dump(embeddings_data, f, indent=2)
print("✅ New embeddings saved using e5-base-v2.")

In [None]:
import re
import json
import numpy as np
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sentence_transformers import SentenceTransformer, CrossEncoder
from scipy.spatial.distance import cosine
from sklearn.preprocessing import MinMaxScaler

import sklearn
import sentence_transformers
import scipy
from collections import defaultdict
# --- Load Embeddings ---
def _collect_chunks(data, target_docs):
    """Gather parallel text / embedding / document / page lists from ``data``.

    ``target_docs=None`` selects every document; otherwise only documents whose
    name is contained in ``target_docs`` are kept.
    """
    chunks, embeddings, docs, pages = [], [], [], []
    for doc in data:
        doc_name = doc["document"]
        if target_docs is not None and doc_name not in target_docs:
            continue
        for chunk in doc["chunks"]:
            chunks.append(clean_text(chunk["text"]))
            embeddings.append(chunk["embedding"])
            docs.append(doc_name)
            pages.append(chunk.get("page", 1))  # older files may lack "page"
    return chunks, embeddings, docs, pages


def load_embeddings(file_path, target_docs):
    """Load stored chunk embeddings for the requested documents.

    Args:
        file_path: JSON file holding a list of ``{"document", "chunks"}`` dicts.
        target_docs: names of the documents to load. An empty/None value, or a
            selection that matches nothing, falls back to all documents.

    Returns:
        ``(texts, embeddings, docs, pages)`` where ``embeddings`` is a NumPy
        array and the other three are parallel lists. On any error the message
        is printed and four empty lists are returned.

    The fallback used to call ``load_embeddings(file_path, [])``, but an empty
    selection matches no document either, so it recursed until RecursionError
    and never returned data. It now selects all documents explicitly.
    """
    try:
        with open(file_path, "r") as f:
            data = json.load(f)
        chunk_counts = {doc["document"]: len(doc["chunks"]) for doc in data}

        selection = target_docs if target_docs else None
        chunks, embeddings, docs, pages = _collect_chunks(data, selection)
        if not chunks and selection is not None:
            print(f"No chunks found for {target_docs}. Using all documents.")
            chunks, embeddings, docs, pages = _collect_chunks(data, None)

        print("Chunk counts per document:", chunk_counts)
        return chunks, np.array(embeddings), docs, pages

    except Exception as e:
        print(f"Error loading embeddings: {str(e)}")
        return [], [], [], []

# --- Similarity Search with Reranking ---
# Models are expensive to construct; keep one instance per (class, name) so a
# loop over many queries does not reload them on every call.
_RETRIEVAL_MODEL_CACHE = {}


def _get_cached_model(model_cls, model_name):
    """Return a shared ``model_cls(model_name)`` instance, building it on first use."""
    key = (model_cls, model_name)
    if key not in _RETRIEVAL_MODEL_CACHE:
        _RETRIEVAL_MODEL_CACHE[key] = model_cls(model_name)
    return _RETRIEVAL_MODEL_CACHE[key]


def similarity_search_with_reranking(query, embeddings, texts, docs, pages,
                                     model_name="intfloat/e5-base-v2",
                                     cross_encoder_model="cross-encoder/ms-marco-MiniLM-L-6-v2",
                                     initial_k=30, final_k=5):
    """Retrieve and rerank stored chunks for ``query``.

    Steps:
      1. Embed the query (with the E5 "query: " prefix) and compute cosine
         similarity against every row of ``embeddings``.
      2. Keep the 3 most similar chunks of each document as candidates.
      3. Score the candidates with a cross-encoder.
      4. Min-max normalize both score sets over the candidates and combine
         them as 0.6 * similarity + 0.4 * rerank.
      5. Return up to ``final_k`` candidates whose combined score is >= 0.8.

    Because the 0.8 cut-off applies to the normalized combined score, it is
    relative to this query's candidate pool, not an absolute cosine similarity.

    Args:
        query: natural-language question.
        embeddings, texts, docs, pages: parallel chunk data as returned by
            load_embeddings.
        model_name: SentenceTransformer used for the query embedding.
        cross_encoder_model: CrossEncoder used for reranking.
        initial_k: accepted for backward compatibility; currently unused.
        final_k: maximum number of results returned.

    Returns:
        A list of dicts with "document", "text" (chunk text + "..."), "page"
        and "similarity" (combined score rounded to 4 places). Any error is
        printed as "Search error: ..." and yields an empty list.
    """
    try:
        # Nothing to search: return early rather than failing inside MinMaxScaler.
        if len(embeddings) == 0:
            print(f"Search error: no embeddings available for query: {query}")
            return []

        # Format query for E5
        formatted_query = f"query: {clean_text(query)}"
        model = _get_cached_model(SentenceTransformer, model_name)
        query_vec = model.encode([formatted_query], convert_to_numpy=True)[0]

        # Compute cosine similarities
        sims = [1 - cosine(query_vec, emb) for emb in embeddings]
        # Group similarities by document
        doc_sims_map = defaultdict(list)
        for doc_name, sim in zip(docs, sims):
            doc_sims_map[doc_name].append(sim)

        print("\nSimilarity Summary Per Document:")
        for doc, sim_list in doc_sims_map.items():
            print(f"{doc}: Max Sim = {max(sim_list):.4f} | Avg Sim = {np.mean(sim_list):.4f}")
        # Top-k chunks per document (e.g., top 3 from each)
        top_k_per_doc = 3
        doc_to_indices = defaultdict(list)
        for i, doc_name in enumerate(docs):
            doc_to_indices[doc_name].append((i, sims[i]))

        candidates = []
        for doc_name, doc_sims in doc_to_indices.items():
            top_doc_chunks = sorted(doc_sims, key=lambda x: x[1], reverse=True)[:top_k_per_doc]
            candidates.extend([(texts[i], docs[i], pages[i], sims[i]) for i, _ in top_doc_chunks])

        # Rerank
        cross_encoder = _get_cached_model(CrossEncoder, cross_encoder_model)
        rerank_scores = cross_encoder.predict([
            (formatted_query, cand[0]) for cand in candidates
        ])

        # Normalize scores
        sim_norm = MinMaxScaler().fit_transform(np.array([c[3] for c in candidates]).reshape(-1, 1)).flatten()
        rerank_norm = MinMaxScaler().fit_transform(np.array(rerank_scores).reshape(-1, 1)).flatten()
        combined_scores = [0.6 * s + 0.4 * r for s, r in zip(sim_norm, rerank_norm)]

        # Final top-k with normalized combined score >= 0.8
        reranked_indices = np.argsort(combined_scores)[::-1]
        final_results = []
        for i in reranked_indices:
            if len(final_results) >= final_k:
                break
            if combined_scores[i] >= 0.8:
                final_results.append({
                    "document": candidates[i][1],
                    "text": candidates[i][0] + "...",
                    "page": candidates[i][2],
                    "similarity": round(combined_scores[i], 4)
                })

        # Print similarities
        print(f"\nQuery: {query}")
        for idx, result in enumerate(final_results, 1):
            print(f"{idx}. Similarity: {result['similarity']} | Doc: {result['document']}")

        return final_results

    except Exception as e:
        print(f"Search error: {str(e)}")
        return []

In [None]:
import torch
torch.cuda.empty_cache()
try:

    # Load embeddings and run search
    texts, embeddings, docs, pages = load_embeddings(
        "/content/drive/MyDrive/book_embeddings.json",
        target_docs=["ch1.pdf", "ch3-4.pdf", "ch8.pdf", "ch9.pdf", "appendix.pdf",
                     "ch2.pdf", "ch5.pdf", "ch6.pdf", "ch7.pdf"],
    )
    print(f"Loaded {len(texts)} chunks from {len(set(docs))} documents")

    queries = []

    with open('/content/drive/MyDrive/queries.txt', 'r', encoding='utf-8') as file:
        for line in file:
            # Remove the leading "N. " numbering using regex
            clean_line = re.sub(r'^\d+\.\s*', '', line.strip())
            if clean_line:  # Avoid adding empty lines
                queries.append(clean_line)

    all_results = {}
    for query in queries:
        all_results[query] = similarity_search_with_reranking(query, embeddings, texts, docs, pages)
        print(f"Completed search for: {query}")
        torch.cuda.empty_cache()

    # The success message previously named a different file
    # (rag_reranked_fixed_results.json) from the one actually written.
    # NOTE(review): this writes to the working directory, while the clean-up
    # cell reads the file from /content/drive/MyDrive — confirm the two refer
    # to the same location.
    results_path = "similarties_queries.json"
    with open(results_path, "w") as f:
        json.dump({"queries": all_results}, f, indent=2)
    print(f"Results saved to {results_path}")


    torch.cuda.empty_cache()

except Exception as e:
    print(f"Pipeline error: {str(e)}")

Chunk counts per document: {'ch1.pdf': 135, 'ch3-4.pdf': 101, 'ch8.pdf': 13, 'ch2.pdf': 0, 'ch5.pdf': 27, 'ch6.pdf': 94, 'ch7.pdf': 43, 'ch9.pdf': 62, 'appendix.pdf': 49}
Loaded 524 chunks from 8 documents

Similarity Summary Per Document:
ch1.pdf: Max Sim = 0.8486 | Avg Sim = 0.8029
ch3-4.pdf: Max Sim = 0.8450 | Avg Sim = 0.7923
ch8.pdf: Max Sim = 0.8096 | Avg Sim = 0.7977
ch5.pdf: Max Sim = 0.8370 | Avg Sim = 0.7884
ch6.pdf: Max Sim = 0.8557 | Avg Sim = 0.7888
ch7.pdf: Max Sim = 0.8256 | Avg Sim = 0.7948
ch9.pdf: Max Sim = 0.8754 | Avg Sim = 0.8071
appendix.pdf: Max Sim = 0.8362 | Avg Sim = 0.8043

Query: What are the main reasons for privacy preservation in data handling according to global regulations?
1. Similarity: 1.0 | Doc: ch9.pdf
Completed search for: What are the main reasons for privacy preservation in data handling according to global regulations?

Similarity Summary Per Document:
ch1.pdf: Max Sim = 0.8306 | Avg Sim = 0.7847
ch3-4.pdf: Max Sim = 0.8170 | Avg Sim = 0.7766
c

In [None]:
import json

# The results file is cleaned in place: it is read, filtered and written back
# to the same path.
queries_path = '/content/drive/MyDrive/similarties_queries.json'

# Load the file
with open(queries_path, 'r', encoding='utf-8') as f:
    data = json.load(f)

# Remove queries with empty results
cleaned_queries = {query: results for query, results in data['queries'].items() if results}

# Count remaining queries
remaining_count = len(cleaned_queries)
print(f"Number of non-empty queries: {remaining_count}")

# Save the cleaned file
with open(queries_path, 'w', encoding='utf-8') as f:
    json.dump({"queries": cleaned_queries}, f, indent=2)

# The message previously named 'cleaned_similarities_queries.json', a file
# that is never written.
print(f"Cleaned file saved to '{queries_path}' (overwritten in place)")

Number of non-empty queries: 116
Cleaned file saved as 'cleaned_similarities_queries.json'


#Generate Answers

In [None]:
# Replace the preinstalled torch with the newest release available from pip.
# No version is pinned, so the build installed depends on when this cell runs.
!pip uninstall torch -y
!pip install torch --upgrade --force-reinstall

In [None]:
pip install transformers_stream_generator

Collecting transformers_stream_generator
  Downloading transformers-stream-generator-0.0.5.tar.gz (13 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: transformers_stream_generator
  Building wheel for transformers_stream_generator (setup.py) ... [?25l[?25hdone
  Created wheel for transformers_stream_generator: filename=transformers_stream_generator-0.0.5-py3-none-any.whl size=12426 sha256=853c865f7ade249218f1c9e8e770a87f65fdf050b6d149985001199f2badebe4
  Stored in directory: /root/.cache/pip/wheels/23/e8/f0/b3c58c12d1ffe60bcc8c7d121115f26b2c1878653edfca48db
Successfully built transformers_stream_generator
Installing collected packages: transformers_stream_generator
Successfully installed transformers_stream_generator-0.0.5


Loading Qwen Model

In [None]:

import torch

from transformers import AutoTokenizer, AutoModelForCausalLM
from accelerate import init_empty_weights, load_checkpoint_and_dispatch

# Hugging Face id of the checkpoint used for answer generation.
model_name = "Qwen/Qwen1.5-4B"  # or Qwen-7B if not using chat version

# Load tokenizer
# NOTE(review): trust_remote_code=True allows code shipped with the checkpoint
# to run. The recorded output shows the built-in Qwen2ForCausalLM class being
# used, so the flag is presumably unnecessary here — confirm.
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

# Load model with disk_offload if you're running out of GPU memory
# device_map="auto" lets the loader place layers across available devices;
# anything offloaded to disk is written under offload_folder.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    trust_remote_code=True,
    torch_dtype=torch.float16,
    offload_folder="qwen_offload"  # Disk offload folder
)
# Switch to evaluation mode; as the cell's last expression this also displays
# the model structure.
model.eval()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/1.29k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/39.6k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/3.99G [00:01<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.91G [00:01<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/138 [00:00<?, ?B/s]

Qwen2ForCausalLM(
  (model): Qwen2Model(
    (embed_tokens): Embedding(151936, 2560)
    (layers): ModuleList(
      (0-39): 40 x Qwen2DecoderLayer(
        (self_attn): Qwen2Attention(
          (q_proj): Linear(in_features=2560, out_features=2560, bias=True)
          (k_proj): Linear(in_features=2560, out_features=2560, bias=True)
          (v_proj): Linear(in_features=2560, out_features=2560, bias=True)
          (o_proj): Linear(in_features=2560, out_features=2560, bias=False)
        )
        (mlp): Qwen2MLP(
          (gate_proj): Linear(in_features=2560, out_features=6912, bias=False)
          (up_proj): Linear(in_features=2560, out_features=6912, bias=False)
          (down_proj): Linear(in_features=6912, out_features=2560, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): Qwen2RMSNorm((2560,), eps=1e-06)
        (post_attention_layernorm): Qwen2RMSNorm((2560,), eps=1e-06)
      )
    )
    (norm): Qwen2RMSNorm((2560,), eps=1e-06)
    (rotary_emb): 

In [None]:


import re
import logging

# Set up logging for debugging: INFO and above is emitted, and generate_answer
# reports contexts, raw model output and warnings through `logger`.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

def clean_text(text):
    """Normalize typographic characters and whitespace in ``text``.

    Curly single quotes and primes become an apostrophe, en/em dashes become a
    hyphen, curly double quotes and backticks become a straight double quote,
    the "ﬁ" ligature is expanded, and whitespace runs collapse to one space.

    Dashes used to be mapped to an apostrophe as well, which turned ranges
    such as "10–12" into "10'12"; they now map to "-".
    """
    text = re.sub(r'[\u2018\u2019\u2032]', "'", text)  # Curly single quotes, prime
    text = re.sub(r'[\u2013\u2014]', '-', text)  # En dash, em dash
    text = re.sub(r'[“”"`]', '"', text)  # Replace quotes and backticks
    text = re.sub(r'ﬁ', 'fi', text)  # Replace ligature 'fi'
    text = re.sub(r'\s+', ' ', text)  # Normalize whitespace
    return text.strip()

def is_non_english(text):
    """Return True when ``text`` contains a character from the CJK Unified
    Ideographs block (U+4E00–U+9FFF) or the Arabic block (U+0600–U+06FF).

    Every other character, including accented Latin letters, punctuation and
    URLs, is treated as acceptable.
    """
    return re.search(r'[\u4e00-\u9fff\u0600-\u06ff]', text) is not None

def generate_answer(query, top_chunks, max_new_tokens=1024):
    """Generate a context-grounded answer for ``query`` with the global Qwen model.

    Args:
        query: the question to answer.
        top_chunks: retrieved chunk dicts with "document", "page" and "text".
        max_new_tokens: generation budget; the answer is also capped at
            ``max_new_tokens * 8`` characters.

    Returns:
        ``(answer, cleaned_chunks)``. ``cleaned_chunks`` are the chunks used
        as context, with the "passage: " prefix removed and text normalized.
        Fallback messages are returned when no usable chunk remains, when the
        answer contains Chinese/Arabic characters, or when generation raises
        (in the first and last case the chunk list is empty).

    Relies on the module-level ``tokenizer``, ``model``, ``logger``,
    ``clean_text`` and ``is_non_english``.
    """
    try:
        # NOTE(review): both filters look at the raw chunk text, which still
        # begins with "passage: ", so the digit-prefix regex can never match;
        # in addition `'-,` inside the character class is a range (' to ,),
        # not the three characters ' - , — confirm the intended filter before
        # enabling it, since it would drop whole chunks.
        cleaned_chunks = [
            {**chunk, "text": clean_text(chunk["text"].replace("passage: ", ""))}
            for chunk in top_chunks
            if len(chunk["text"].strip()) > 50 and not re.match(r'^\d+[\'-,]', chunk["text"])
        ]
        if not cleaned_chunks:
            logger.warning(f"No valid chunks for query '{query}' after cleaning.")
            return "No valid context provided to answer the query.", []
        context = "\n\n".join(
            [f"[{chunk['document']}, Page {chunk['page']}]: {chunk['text']}"
             for chunk in cleaned_chunks]
        )
        prompt = f"""You are a precise data privacy expert.
        Answer the query using **only** the provided context.
        If the context is insufficient, state: 'The provided context does not contain enough information to answer the query fully.'
        and provide a concise summary of relevant information from the context, if any.
        Start the answer with 'Answer: ', keep it under {max_new_tokens} tokens, and avoid repetition or external knowledge.
        Stop after answering the question. Do not continue with unrelated content.
Context:
{context}
Query: {query}
Answer: """
        logger.info(f"Context for query '{query}': {context[:500]}...")
        inputs = tokenizer(
            prompt,
            return_tensors="pt",
            max_length=2048,
            truncation=True,
            padding=True
        ).to(model.device)
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=max_new_tokens,
                do_sample=True,
                temperature=0.2,
                top_p=0.7,
                top_k=11,
                repetition_penalty=1.3,
                eos_token_id=tokenizer.eos_token_id,
                pad_token_id=tokenizer.pad_token_id
            )
        # The returned sequence starts with the prompt tokens. Decode only what
        # the model generated, so prompt text (which itself contains
        # "Answer: ") cannot end up in the answer when the prompt was
        # truncated or the model never repeats the marker.
        prompt_length = inputs["input_ids"].shape[1]
        output_text = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
        logger.info(f"Raw output for query '{query}': {output_text[:500]}...")
        answer = output_text.split("Answer:")[-1].strip() if "Answer:" in output_text else output_text.strip()
        # Cap the answer length and report the limit that is actually applied
        # (the warning used to say `* 6` while the cut was at `* 8`).
        max_answer_chars = max_new_tokens * 8
        if len(answer) > max_answer_chars:
            answer = answer[:max_answer_chars]
            logger.warning(f"Answer for query '{query}' truncated at {max_answer_chars} characters.")
        if is_non_english(answer):
            logger.warning(f"Non-English content detected in answer for query '{query}'.")
            return "The provided context does not contain enough information to answer the query fully.", cleaned_chunks
        # Soft quality check only: an off-topic answer is logged, not replaced.
        privacy_keywords = ["privacy", "tokenization", "anonymization", "data protection", "encryption", "compliance"]
        if not any(kw in answer.lower() for kw in privacy_keywords) and "not contain enough information" not in answer.lower():
            logger.warning(f"Answer for query '{query}' lacks privacy-related keywords.")
        return answer, cleaned_chunks
    except Exception as e:
        logger.error(f"Generation error for query '{query}': {str(e)}")
        return "Unable to generate answer due to model error.", []

In [None]:
import json
import torch
torch.cuda.empty_cache()
# Load JSON data
# NOTE(review): this reads the results file from the working directory, while
# the clean-up cell rewrites the copy under /content/drive/MyDrive — confirm
# both refer to the same file.
with open("similarties_queries.json", "r") as f:
    data = json.load(f)

queries = data["queries"]
fine_tuning_data = []

# Process each query
count = 0
for query, results in queries.items():
    count += 1
    print(f"\n{count} Query: {query}")
    answer, context = generate_answer(query, results)
    print("Answer: ", answer[:50])  # preview only; the full answer is stored below

    # Store data for fine-tuning
    fine_tuning_data.append({
        "query": query,
        "retrieved_contexts": context,
        "answer": answer
    })
    torch.cuda.empty_cache()


# Human-readable dump of every query/answer pair.
with open("results.txt", "w", encoding="utf-8") as f:
    for item in fine_tuning_data:
        f.write("QUERY: " + item["query"].strip() + "\n")
        f.write("ANSWER:\n" + item["answer"].strip() + "\n")
        f.write("\n" + "-" * 80 + "\n\n")

# Save fine-tuning data to JSON. The confirmation used to name
# 'fine_tuning_data.json', which is not the file written here.
fine_tuning_path = "/content/drive/MyDrive/fine_tuning_data (7).json"
with open(fine_tuning_path, "w", encoding="utf-8") as f:
    json.dump(fine_tuning_data, f, indent=2, ensure_ascii=False)
print(f"\nFine-tuning data saved to '{fine_tuning_path}'")


 Query: How does the International Statistical Institute (ISI) ensure ethical data handling in official statistics?
 Answer: processing...


 Query: Explain the role of fine print clauses in historical data sharing practices.
 Answer: processing...






 Query: What are the ethical principles outlined by the ISI for protecting survey respondents' identities?
 Answer: processing...


 Query: How does data mining impact the need for privacy preservation in retail data?
 Answer: processing...






 Query: What challenges arise in protecting personal data during healthcare data mining?
 Answer: processing...

✅ Fine-tuning data saved to fine_tuning_data.json


In [None]:
torch.cuda.empty_cache()

#Fine Tuning

In [None]:
# Unpinned install: the recorded log resolves unsloth 2025.7.5, while the
# training output further down reports Unsloth 2025.6.5.
# NOTE(review): pin the version the fine-tuning run is meant to use.
!pip install unsloth


[0mCollecting unsloth
  Using cached unsloth-2025.7.5-py3-none-any.whl.metadata (47 kB)
Collecting unsloth_zoo>=2025.7.7 (from unsloth)
  Using cached unsloth_zoo-2025.7.7-py3-none-any.whl.metadata (8.1 kB)
Collecting xformers>=0.0.27.post2 (from unsloth)
  Using cached xformers-0.0.31.post1-cp39-abi3-manylinux_2_28_x86_64.whl.metadata (1.1 kB)
Collecting bitsandbytes (from unsloth)
  Using cached bitsandbytes-0.46.1-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Collecting triton>=3.0.0 (from unsloth)
  Using cached triton-3.3.1-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (1.5 kB)
Collecting tyro (from unsloth)
  Using cached tyro-0.9.26-py3-none-any.whl.metadata (12 kB)
Collecting datasets<4.0.0,>=3.4.1 (from unsloth)
  Using cached datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting trl!=0.15.0,!=0.19.0,!=0.9.0,!=0.9.1,!=0.9.2,!=0.9.3,>=0.7.9 (from unsloth)
  Using cached trl-0.19.1-py3-none-any.whl.metadata (10 kB)
Collecting fsspec<=2025.3.0,

In [None]:
import os
import torch
from unsloth import FastLanguageModel
from trl import SFTTrainer
from transformers import TrainingArguments
from datasets import load_dataset

# Disable Triton optimizations
# NOTE(review): this is set after unsloth and torch have already been imported
# in this cell — confirm the variable is still honoured at that point.
os.environ["TRITON_DISABLE"] = "1"

# Load model with LoRA
# The base model is loaded 4-bit quantized with float16 compute.
max_seq_length = 1024  # Reduced for memory
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="Qwen/Qwen1.5-4B",
    max_seq_length=max_seq_length,
    dtype=torch.float16,
    load_in_4bit=True,
    trust_remote_code=True
)

# Configure LoRA
# Rank-16 adapters are attached to every attention and MLP projection; the
# base weights stay frozen.
model = FastLanguageModel.get_peft_model(
    model,
    r=16,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    lora_alpha=16,
    lora_dropout=0,
    bias="none",
    use_gradient_checkpointing=True
)

# Load fine-tuning data
# NOTE(review): this loads "fine_tuning_data (6).json", whereas the generation
# cell above writes "fine_tuning_data (7).json" — confirm the intended file.
dataset = load_dataset("json", data_files="/content/drive/MyDrive/fine_tuning_data (6).json")["train"]

# Format data
def format_example(example):
    """Render one dataset record as a "Context / Query / Answer" training string.

    Records with a "context" field are formatted exactly as before. Records
    produced by this notebook's generation step store their chunks under
    "retrieved_contexts" instead (a list of dicts with "document", "page" and
    "text"); those previously raised KeyError and are now rendered as
    "[document, Page N]: text" blocks separated by blank lines, the same
    layout the generation prompt uses.

    Returns a dict with a single "text" key.
    """
    if "context" in example:
        context = example["context"]
    else:
        retrieved = example.get("retrieved_contexts") or []
        context = "\n\n".join(
            f"[{chunk['document']}, Page {chunk['page']}]: {chunk['text']}"
            for chunk in retrieved
        )
    return {
        "text": f"Context:\n{context}\n\nQuery: {example['query']}\n\nAnswer: {example['answer']}"
    }

# Add the rendered "text" column that the trainer consumes.
dataset = dataset.map(format_example)

# Configure trainer
# Effective batch size is per_device_train_batch_size * gradient_accumulation_steps = 2;
# training stops after max_steps optimizer steps regardless of dataset size.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    args=TrainingArguments(
        per_device_train_batch_size=1,  # Reduced
        gradient_accumulation_steps=2,  # Reduced
        warmup_steps=5,
        max_steps=20,  # Reduced for small dataset
        learning_rate=2e-4,
        fp16=True,
        logging_steps=1,
        output_dir="qwen_finetuned",
        optim="adamw_8bit",
        report_to="none"  # Disable W&B
    )
)

# Train
trainer.train()

# Save LoRA adapters
# The adapter and tokenizer go to the same local directory, which a later cell
# copies to Google Drive.
model.save_pretrained("qwen_finetuned_lora")
tokenizer.save_pretrained("qwen_finetuned_lora")

# Clear memory
import gc
gc.collect()
torch.cuda.empty_cache()

Are you certain you want to do remote code execution?
==((====))==  Unsloth 2025.6.5: Fast Qwen2 patching. Transformers: 4.52.4.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.7.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.3.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.30. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

tokenizer_config.json:   0%|          | 0.00/1.29k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

Qwen/Qwen1.5-4B does not have a padding token! Will use pad_token = <|PAD_TOKEN|>.


Unsloth 2025.6.5 patched 40 layers with 40 QKV layers, 40 O layers and 40 MLP layers.


Map:   0%|          | 0/4 [00:00<?, ? examples/s]

Unsloth: Tokenizing ["text"]:   0%|          | 0/4 [00:00<?, ? examples/s]

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 4 | Num Epochs = 10 | Total steps = 20
O^O/ \_/ \    Batch size per device = 1 | Gradient accumulation steps = 2
\        /    Data Parallel GPUs = 1 | Total batch size (1 x 2 x 1) = 2
 "-____-"     Trainable parameters = 31,293,440/4,000,000,000 (0.78% trained)


Step,Training Loss
1,2.2608
2,1.8791
3,1.8714
4,2.2269
5,1.9613
6,1.9609
7,1.8233
8,1.8331
9,1.7661
10,1.6293




In [None]:
#Save model
from google.colab import drive
import shutil
import os


# Copy to Google Drive
# Any previous copy at the destination is deleted first, so the Drive folder
# ends up holding exactly the contents of the freshly saved local directory.
dest_dir = "/content/drive/MyDrive/qwen_finetuned_lora"
if os.path.exists(dest_dir):
    shutil.rmtree(dest_dir)  # Remove existing directory to avoid conflicts
shutil.copytree("/content/qwen_finetuned_lora", dest_dir)
print(f"Model saved to {dest_dir}")

Model saved to /content/drive/MyDrive/qwen_finetuned_lora


In [None]:
import os
import torch
from unsloth import FastLanguageModel
from datasets import Dataset
from ragas import evaluate
from ragas.metrics import faithfulness, answer_correctness, context_precision
import json
import logging

# Set up logging: INFO-level root configuration plus a module-level logger
# that the generation cells below use for progress and error messages.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Disable Triton optimizations
# NOTE(review): this is assigned after `torch` and `unsloth` have already been
# imported in this cell; if the variable is read at import time it will have no
# effect here — confirm, and move it above the imports if so.
os.environ["TRITON_DISABLE"] = "1"


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


In [None]:

# Load the fine-tuned model (LoRA adapters saved on Drive) in 4-bit, fp16.
max_seq_length = 2048
finetuned_load_options = dict(
    model_name="/content/drive/MyDrive/qwen_finetuned_lora",  # Path to fine-tuned LoRA adapters
    max_seq_length=max_seq_length,
    load_in_4bit=True,
    dtype=torch.float16,
    trust_remote_code=True,
)
model, tokenizer = FastLanguageModel.from_pretrained(**finetuned_load_options)

# Switch the model to Unsloth's inference mode for faster generation.
FastLanguageModel.for_inference(model)

# Ground-truth items (query / context / answer) used to drive generation below.
with open("/content/drive/MyDrive/fine_tuning_data (7).json", "r", encoding="utf-8") as ground_truth_file:
    ground_truth_data = json.load(ground_truth_file)




Are you certain you want to do remote code execution?
==((====))==  Unsloth 2025.6.5: Fast Qwen2 patching. Transformers: 4.52.4.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.7.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.3.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.30. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
# Function to generate answers with fine-tuned model
def generate_answer(query, context, max_new_tokens=1024):
    """Generate an answer to `query` grounded in `context` with the loaded model.

    Uses the module-level `tokenizer`, `model` and `logger`.

    Args:
        query: The question to answer.
        context: Supporting text interpolated into the prompt.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The generated answer text with any leading "Answer:" marker removed,
        or a fixed fallback message if tokenization/generation raises.
    """
    try:
        prompt = f"""You are a precise data privacy expert.

          Answer the query using all relevant information from the provided context.
          If the context is insufficient, reply exactly:
          "The provided context does not contain enough information to answer the query fully."

          Do not introduce external knowledge.
          Do not ask the user for further input.
          Do not add personal commentary.

          Start your answer with: "Answer: "
          Provide a detailed, complete answer that covers all parts of the query based on the provided context.

          Context:
          {context}

          Query: {query}

          Answer:
          """
        inputs = tokenizer(
            prompt,
            return_tensors="pt",
            max_length=2048,
            truncation=True,
            padding=True
        ).to(model.device)
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=max_new_tokens,
                do_sample=True,
                temperature=0.1,
                top_p=0.8,
                top_k=20,
                repetition_penalty=1.3,
                eos_token_id=tokenizer.eos_token_id,
                pad_token_id=tokenizer.pad_token_id
            )
        # The returned sequence starts with the prompt tokens. Decode only what
        # was generated after them instead of splitting the full text on
        # "Answer:": the prompt itself contains that marker twice, so when a
        # long context is truncated (dropping the trailing marker) the split
        # used to return instructions + context as the "answer", and an
        # "Answer:" inside the generation discarded everything before it.
        prompt_length = inputs["input_ids"].shape[1]
        completion = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
        answer = completion.strip()
        if answer.startswith("Answer:"):
            answer = answer[len("Answer:"):].strip()
        return answer
    except Exception as e:
        # Deliberate best-effort: one failed item must not abort the whole run.
        logger.error(f"Generation error for query '{query}': {str(e)}")
        return "Unable to generate answer due to model error."



# Run the model over every ground-truth item and collect two parallel record
# lists: one for evaluation (with the reference answer) and one for storage.
eval_data = []
generated_answers_data = []
count = 1
for item in ground_truth_data:
    query, context = item["query"], item["context"]
    ground_truth = item["answer"]
    generated_answer = generate_answer(query, context)

    # Evaluation record: model output side by side with the reference answer.
    eval_record = {
        "question": query,
        "context": context,
        "answer": generated_answer,
        "ground_truth": ground_truth,
    }
    eval_data.append(eval_record)

    # Storage record: reuse the item's metadata, defaulting to an empty document list.
    metadata = item.get("metadata", {"documents": []})
    stored_record = {
        "query": query,
        "context": context,
        "answer": generated_answer,
        "metadata": metadata,
    }
    generated_answers_data.append(stored_record)

    logger.info(f"Generated answer for query '{query}': {generated_answer[:100]}...")
    print("query no. ", count)
    count = count + 1


# Persist the evaluation records; failures are logged rather than raised.
eval_data_file = "eval_data.json"
try:
    with open(eval_data_file, "w", encoding="utf-8") as eval_handle:
        json.dump(eval_data, eval_handle, indent=2)
    logger.info(f"Evaluation data saved to {eval_data_file}")
except Exception as save_error:
    logger.error(f"Error saving eval_data: {str(save_error)}")

# Persist the generated answers; failures are logged rather than raised.
output_file = "fine_tuned_generated_answers.json"
try:
    with open(output_file, "w", encoding="utf-8") as answers_handle:
        json.dump(generated_answers_data, answers_handle, indent=2)
    logger.info(f"Generated answers saved to {output_file}")
except Exception as save_error:
    logger.error(f"Error saving generated answers: {save_error}")

query no.  1
query no.  2
query no.  3
query no.  4
query no.  5
query no.  6
query no.  7
query no.  8
query no.  9
query no.  10
query no.  11
query no.  12
query no.  13
query no.  14
query no.  15
query no.  16
query no.  17
query no.  18
query no.  19
query no.  20
query no.  21
query no.  22
query no.  23
query no.  24
query no.  25
query no.  26
query no.  27
query no.  28
query no.  29
query no.  30
query no.  31
query no.  32
query no.  33
query no.  34
query no.  35
query no.  36
query no.  37
query no.  38
query no.  39
query no.  40
query no.  41
query no.  42
query no.  43
query no.  44
query no.  45
query no.  46
query no.  47
query no.  48
query no.  49
query no.  50
query no.  51
query no.  52
query no.  53
query no.  54
query no.  55
query no.  56
query no.  57
query no.  58
query no.  59
query no.  60
query no.  61
query no.  62
query no.  63
query no.  64
query no.  65
query no.  66
query no.  67
query no.  68
query no.  69
query no.  70
query no.  71
query no.  72
q

# Evaluation on Training Data


In [None]:
import getpass
import os
from google import genai

# Never commit the Gemini API key to the notebook: take it from the
# environment and fall back to an interactive prompt when it is not set.
# NOTE(review): the key that used to be hardcoded here should be treated as
# leaked and rotated.
if not os.environ.get("GOOGLE_API_KEY"):
    os.environ["GOOGLE_API_KEY"] = getpass.getpass("Gemini API key: ")

# Initialize the client
try:
    client = genai.Client(api_key=os.environ["GOOGLE_API_KEY"])
except Exception as e:
    # Raise instead of calling exit(): exit() inside a notebook takes the whole
    # kernel down, whereas an exception just stops this cell with a clear message.
    raise RuntimeError(f"Failed to initialize Gemini client: {e}") from e

# List of known Gemini models
known_models = [
    "gemini-2.0-flash-001",
    "gemini-2.5-flash-001",
    "gemini-2.5-pro-001",
    "gemini-2.5-flash-lite-001",
    "gemini-2.0-flash-lite-001"
]

# Probe each candidate with a cheap count_tokens call; any exception is
# reported as "not available" and the loop moves on to the next model.
print("Checking available models:")
for model_name in known_models:
    try:
        # Test model with a simple count_tokens call
        response = client.models.count_tokens(
            model=model_name,
            contents="Test prompt"
        )
        print(f"Model: {model_name} - Available (Supported methods likely include: generateContent, countTokens)")
    except Exception as e:
        print(f"Model: {model_name} - Not available or error: {e}")

Checking available models:
Model: gemini-2.0-flash-001 - Available (Supported methods likely include: generateContent, countTokens)
Model: gemini-2.5-flash-001 - Not available or error: 404 NOT_FOUND. {'error': {'code': 404, 'message': 'models/gemini-2.5-flash-001 is not found for API version v1beta, or is not supported for countTokens. Call ListModels to see the list of available models and their supported methods.', 'status': 'NOT_FOUND'}}
Model: gemini-2.5-pro-001 - Not available or error: 404 NOT_FOUND. {'error': {'code': 404, 'message': 'models/gemini-2.5-pro-001 is not found for API version v1beta, or is not supported for countTokens. Call ListModels to see the list of available models and their supported methods.', 'status': 'NOT_FOUND'}}
Model: gemini-2.5-flash-lite-001 - Not available or error: 404 NOT_FOUND. {'error': {'code': 404, 'message': 'models/gemini-2.5-flash-lite-001 is not found for API version v1beta, or is not supported for countTokens. Call ListModels to see the 

In [None]:
# NOTE(review): the surrounding cells import `google.genai`, while this command
# inspects the `google-generativeai` distribution — presumably a different
# package; confirm which one is meant to be checked.
!pip show google-generativeai

Name: google-generativeai
Version: 0.8.5
Summary: Google Generative AI High level API client library and tools.
Home-page: https://github.com/google/generative-ai-python
Author: Google LLC
Author-email: googleapis-packages@google.com
License: Apache 2.0
Location: /usr/local/lib/python3.11/dist-packages
Requires: google-ai-generativelanguage, google-api-core, google-api-python-client, google-auth, protobuf, pydantic, tqdm, typing-extensions
Required-by: 


Evaluating finetuned data

In [None]:
import getpass
import json
import os
import time
from google.genai import Client

# Read the Gemini key from the environment (prompting as a fallback) instead
# of hardcoding it in the notebook.
client = Client(api_key=os.environ.get("GOOGLE_API_KEY") or getpass.getpass("Gemini API key: "))

MAX_ATTEMPTS = 5          # tries per entry before an error is recorded
RETRY_WAIT_SECONDS = 20   # pause between tries after a transient API error
TRANSIENT_MARKERS = ("RESOURCE_EXHAUSTED", "503", "UNAVAILABLE")

# Load files
with open("eval_data.json", "r", encoding="utf-8") as f:
    eval_data = json.load(f)
with open("generated_complete_answers.json", "r", encoding="utf-8") as f:
    truth_data = json.load(f)

# Build lookup dictionary for fast ground truth access (items missing either
# field are skipped instead of aborting the cell with a KeyError).
truth_lookup = {
    item["query"]: item["ground_truth_data"]
    for item in truth_data
    if "query" in item and "ground_truth_data" in item
}

results = []

for i, entry in enumerate(eval_data):
    # Retry the SAME entry on transient errors. Previously the loop slept and
    # then advanced to the next entry, silently dropping this one from the results.
    for attempt in range(1, MAX_ATTEMPTS + 1):
        try:
            query = entry["question"]
            context = entry["context"] or "No context provided."
            model_answer = entry["answer"]
            ground_truth = truth_lookup.get(query, entry.get("ground_truth", "No ground truth available"))

            # Same prompt and 0-10 scale as the baseline (non-fine-tuned)
            # evaluation, so the two sets of scores are comparable. Restricting
            # the judge to 8-10 here made a high average a foregone conclusion.
            prompt = f"""You are an expert evaluator and this evaluation is for reviewing strong model performance.

Given:
- Query: {query}
- Context: {context}
- Model's Generated Answer: {model_answer}
- Ground Truth Answer: {ground_truth}

Provide your evaluation in the format below with each score between 0 and 10:
Faithfulness : [score]
Correctness : [score]
Relevance : [score]
Clarity : [score]
"""

            response = client.models.generate_content(
                model='gemini-2.0-flash-001',
                contents=prompt
            )

            results.append({
                "query": query,
                "evaluation": response.text
            })
            break

        except Exception as e:
            error_message = str(e)
            print(f"Error at index {i}: {error_message}")
            # A per-day quota cannot recover within a short wait, so only
            # per-minute limits and availability errors are worth retrying.
            retryable = (
                any(marker in error_message for marker in TRANSIENT_MARKERS)
                and "PerDay" not in error_message
            )
            if retryable and attempt < MAX_ATTEMPTS:
                print(f"Quota exhausted at index {i}. Waiting {RETRY_WAIT_SECONDS} seconds to retry...")
                time.sleep(RETRY_WAIT_SECONDS)
            else:
                # Record the failure so every input entry is accounted for.
                results.append({
                    "query": entry.get("question", f"Index {i}"),
                    "evaluation": f"ERROR: {error_message}"
                })
                break

# Save the results
with open("evaluation_results.json", "w", encoding="utf-8") as f:
    json.dump(results, f, indent=2)


Error at index 0: 429 RESOURCE_EXHAUSTED. {'error': {'code': 429, 'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits.', 'status': 'RESOURCE_EXHAUSTED', 'details': [{'@type': 'type.googleapis.com/google.rpc.QuotaFailure', 'violations': [{'quotaMetric': 'generativelanguage.googleapis.com/generate_content_free_tier_requests', 'quotaId': 'GenerateRequestsPerDayPerProjectPerModel-FreeTier', 'quotaDimensions': {'location': 'global', 'model': 'gemini-2.0-flash'}, 'quotaValue': '200'}]}, {'@type': 'type.googleapis.com/google.rpc.Help', 'links': [{'description': 'Learn more about Gemini API quotas', 'url': 'https://ai.google.dev/gemini-api/docs/rate-limits'}]}, {'@type': 'type.googleapis.com/google.rpc.RetryInfo', 'retryDelay': '39s'}]}}
Quota exhausted at index 0. Waiting 20 seconds to retry...
Error at index 1: 429 RESOURCE_EXHAUSTED. {'error': {'code': 429, 'me

In [None]:
# Sanity-check that the saved results parse as JSON; on failure, report where
# the parser stopped together with a 40-character window around that position.
with open('evaluation_results.json', 'r', encoding='utf-8') as results_file:
    content = results_file.read()

try:
    json.loads(content)
except json.JSONDecodeError as decode_error:
    window_start = max(0, decode_error.pos-20)
    window_end = decode_error.pos+20
    print(f"Error at line {decode_error.lineno}, column {decode_error.colno}: {decode_error.msg}")
    print(f"Context: {content[window_start:window_end]}")

Error at line 360, column 19: Unterminated string starting at
Context: ,
    "evaluation": "Faithfulness (8-10)


In [None]:
import json
import re

# Read the judge's free-text evaluations.
with open('evaluations.json', 'r', encoding='utf-8') as f:
    evaluations = json.load(f)

# One list of integer scores per metric, also reachable by capitalised name.
faithfulness_scores = []
correctness_scores = []
relevance_scores = []
clarity_scores = []
score_lists = {
    'Faithfulness': faithfulness_scores,
    'Correctness': correctness_scores,
    'Relevance': relevance_scores,
    'Clarity': clarity_scores,
}

# Matches lines shaped like "Faithfulness (0-10): 9" (case-insensitive).
score_pattern = re.compile(r'(Faithfulness|Correctness|Relevance|Clarity)\s*\(\d+-\d+\):\s*(\d+)', re.IGNORECASE)

for item in evaluations:
    evaluation_text = item.get('evaluation', '')

    try:
        # Later matches for the same metric overwrite earlier ones.
        scores = {}
        for metric_name, raw_score in score_pattern.findall(evaluation_text):
            scores[metric_name.capitalize()] = int(raw_score)

        # Only metrics actually found in this evaluation contribute a score.
        for metric_name, bucket in score_lists.items():
            if metric_name in scores:
                bucket.append(scores[metric_name])

    except Exception as e:
        print(f"Error processing item: {item.get('query', 'Unknown')}")
        print(f"Evaluation text: {evaluation_text}")
        print(f"Error: {e}")


def _mean_or_zero(values):
    """Arithmetic mean of `values`, or 0 when the list is empty."""
    return sum(values) / len(values) if values else 0


avg_faithfulness = _mean_or_zero(faithfulness_scores)
avg_correctness = _mean_or_zero(correctness_scores)
avg_relevance = _mean_or_zero(relevance_scores)
avg_clarity = _mean_or_zero(clarity_scores)

# Report
print(f"\nEvaluated {len(faithfulness_scores)} queries.")
print(f"Average Faithfulness: {avg_faithfulness:.2f}")
print(f"Average Correctness: {avg_correctness:.2f}")
print(f"Average Relevance: {avg_relevance:.2f}")
print(f"Average Clarity: {avg_clarity:.2f}")


Evaluated 89 queries.
Average Faithfulness: 9.13
Average Correctness: 9.40
Average Relevance: 9.99
Average Clarity: 9.12


Evaluating Non-finetuned data

In [None]:
import getpass
import json
import os
import time
from google.genai import Client

# Read the Gemini key from the environment (prompting as a fallback) instead
# of hardcoding it in the notebook.
client = Client(api_key=os.environ.get("GOOGLE_API_KEY") or getpass.getpass("Gemini API key: "))

MAX_ATTEMPTS = 5          # tries per entry before an error is recorded
RETRY_WAIT_SECONDS = 20   # pause between tries after a transient API error
TRANSIENT_MARKERS = ("RESOURCE_EXHAUSTED", "503", "UNAVAILABLE")

# Load files
with open("/content/fine_tuning_data (8).json", "r", encoding="utf-8") as f:
    eval_data = json.load(f)
with open("generated_complete_answers.json", "r", encoding="utf-8") as f:
    truth_data = json.load(f)

# Build lookup dictionary for fast ground truth access (items missing either
# field are skipped instead of aborting the cell with a KeyError).
truth_lookup = {
    item["query"]: item["ground_truth_data"]
    for item in truth_data
    if "query" in item and "ground_truth_data" in item
}

results = []

for i, entry in enumerate(eval_data):
    # Retry the SAME entry on transient errors. Previously the loop slept and
    # then advanced to the next entry, silently dropping this one from the results.
    for attempt in range(1, MAX_ATTEMPTS + 1):
        try:
            query = entry["query"]
            # Use the first retrieved context; a missing or empty list falls
            # back to the placeholder instead of raising IndexError.
            contexts = entry.get("retrieved_contexts") or [{"text": "No context provided."}]
            context = contexts[0]["text"]
            model_answer = entry["answer"]
            ground_truth = truth_lookup.get(query, entry.get("ground_truth", "No ground truth available"))

            prompt = f"""You are an expert evaluator and this evaluation is for reviewing strong model performance.

Given:
- Query: {query}
- Context: {context}
- Model's Generated Answer: {model_answer}
- Ground Truth Answer: {ground_truth}

Provide your evaluation in the format below with each score between 0 and 10:
Faithfulness : [score]
Correctness : [score]
Relevance : [score]
Clarity : [score]
"""

            response = client.models.generate_content(
                model='gemini-2.0-flash-lite-001',
                contents=prompt
            )

            results.append({
                "query": query,
                "evaluation": response.text
            })
            break

        except Exception as e:
            error_message = str(e)
            print(f"Error at index {i}: {error_message}")
            # A per-day quota cannot recover within a short wait, so only
            # per-minute limits and availability errors are worth retrying.
            retryable = (
                any(marker in error_message for marker in TRANSIENT_MARKERS)
                and "PerDay" not in error_message
            )
            if retryable and attempt < MAX_ATTEMPTS:
                print(f"Quota exhausted at index {i}. Waiting {RETRY_WAIT_SECONDS} seconds to retry...")
                time.sleep(RETRY_WAIT_SECONDS)
            else:
                # Record the failure so every input entry is accounted for.
                results.append({
                    "query": entry.get("query", f"Index {i}"),
                    "evaluation": f"ERROR: {error_message}"
                })
                break

# Save the results
with open("evaluation_results_before.json", "w", encoding="utf-8") as f:
    json.dump(results, f, indent=2)


Error at index 58: 429 RESOURCE_EXHAUSTED. {'error': {'code': 429, 'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits.', 'status': 'RESOURCE_EXHAUSTED', 'details': [{'@type': 'type.googleapis.com/google.rpc.QuotaFailure', 'violations': [{'quotaMetric': 'generativelanguage.googleapis.com/generate_content_free_tier_requests', 'quotaId': 'GenerateRequestsPerMinutePerProjectPerModel-FreeTier', 'quotaDimensions': {'location': 'global', 'model': 'gemini-2.0-flash-lite'}, 'quotaValue': '30'}]}, {'@type': 'type.googleapis.com/google.rpc.Help', 'links': [{'description': 'Learn more about Gemini API quotas', 'url': 'https://ai.google.dev/gemini-api/docs/rate-limits'}]}, {'@type': 'type.googleapis.com/google.rpc.RetryInfo', 'retryDelay': '14s'}]}}
Quota exhausted at index 58. Waiting 20 seconds to retry...
Error at index 198: 429 RESOURCE_EXHAUSTED. {'error': {'code

In [None]:
import json
import re

# Load the JSON data
with open('trained_evaluation.json', 'r', encoding='utf-8') as f:
    evaluations = json.load(f)

METRICS = ("Faithfulness", "Correctness", "Relevance", "Clarity")
TOP_N = 89  # number of best-scoring evaluations kept for the averages

# Matches "Faithfulness (9)" style scores, optionally wrapped in ** markdown bold.
score_pattern = re.compile(
    r'(?:\*\*)?\s*(Faithfulness|Correctness|Relevance|Clarity)\s*\(\s*(\d+)\s*\)\s*:?\.?\s*(?:\*\*)?',
    re.IGNORECASE
)

scored_evaluations = []

# Extract and store scores; only evaluations with all four metrics are kept.
for item in evaluations:
    evaluation_text = item.get('evaluation', '')
    try:
        matches = score_pattern.findall(evaluation_text)
        scores = {metric.capitalize(): int(score) for metric, score in matches}

        if all(metric in scores for metric in METRICS):
            total_score = sum(scores.values())
            item['scores'] = scores
            item['total_score'] = total_score
            scored_evaluations.append(item)

    except Exception as e:
        print(f"⚠️ Error processing item: {item.get('query', 'Unknown')}")
        print(f"Error: {e}")  # surface the reason instead of swallowing it

# Sort by total score descending
scored_evaluations.sort(key=lambda x: x['total_score'], reverse=True)

# Keep only the top TOP_N
# NOTE(review): averaging only the highest-scoring evaluations biases the
# reported means upward and is not comparable with averages taken over every
# evaluation — confirm this selection is intended.
top_89 = scored_evaluations[:TOP_N]

# Per-metric score lists
faithfulness_scores = [item['scores']['Faithfulness'] for item in top_89]
correctness_scores = [item['scores']['Correctness'] for item in top_89]
relevance_scores = [item['scores']['Relevance'] for item in top_89]
clarity_scores = [item['scores']['Clarity'] for item in top_89]

# Averages: divide by the number of evaluations actually kept. Dividing by a
# literal 89 understated every mean whenever fewer than 89 evaluations parsed.
evaluated_count = len(top_89)
avg_faithfulness = sum(faithfulness_scores) / evaluated_count if evaluated_count else 0
avg_correctness = sum(correctness_scores) / evaluated_count if evaluated_count else 0
avg_relevance = sum(relevance_scores) / evaluated_count if evaluated_count else 0
avg_clarity = sum(clarity_scores) / evaluated_count if evaluated_count else 0

# Print results
print(f"\nEvaluated {evaluated_count} queries.")
print(f"Average Faithfulness: {avg_faithfulness:.2f}")
print(f"Average Correctness: {avg_correctness:.2f}")
print(f"Average Relevance: {avg_relevance:.2f}")
print(f"Average Clarity: {avg_clarity:.2f}")





Evaluated 89 queries.
Average Faithfulness: 8.96
Average Correctness: 9.21
Average Relevance: 9.93
Average Clarity: 9.25


# Generating answers for untrained data

Load qwen model

In [None]:
import torch

from transformers import AutoTokenizer, AutoModelForCausalLM
from accelerate import init_empty_weights, load_checkpoint_and_dispatch

model_name = "Qwen/Qwen1.5-4B"  # or Qwen-7B if not using chat version

# Tokenizer for the checkpoint named above.
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

# Half-precision model with automatic device placement; "qwen_offload" is
# passed as the folder to use for disk offload.
base_load_options = dict(
    torch_dtype=torch.float16,
    device_map="auto",
    offload_folder="qwen_offload",  # Disk offload folder
    trust_remote_code=True,
)
model = AutoModelForCausalLM.from_pretrained(model_name, **base_load_options)
model.eval()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.91G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/3.99G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/138 [00:00<?, ?B/s]

Qwen2ForCausalLM(
  (model): Qwen2Model(
    (embed_tokens): Embedding(151936, 2560)
    (layers): ModuleList(
      (0-39): 40 x Qwen2DecoderLayer(
        (self_attn): Qwen2Attention(
          (q_proj): Linear(in_features=2560, out_features=2560, bias=True)
          (k_proj): Linear(in_features=2560, out_features=2560, bias=True)
          (v_proj): Linear(in_features=2560, out_features=2560, bias=True)
          (o_proj): Linear(in_features=2560, out_features=2560, bias=False)
        )
        (mlp): Qwen2MLP(
          (gate_proj): Linear(in_features=2560, out_features=6912, bias=False)
          (up_proj): Linear(in_features=2560, out_features=6912, bias=False)
          (down_proj): Linear(in_features=6912, out_features=2560, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): Qwen2RMSNorm((2560,), eps=1e-06)
        (post_attention_layernorm): Qwen2RMSNorm((2560,), eps=1e-06)
      )
    )
    (norm): Qwen2RMSNorm((2560,), eps=1e-06)
    (rotary_emb): 

In [None]:
import json
import torch
import re
from transformers import AutoTokenizer, AutoModelForCausalLM

def clean_text(text):
    """Collapse every run of whitespace into one space and trim both ends."""
    # split() with no argument breaks on whitespace runs and drops empty
    # leading/trailing pieces, so re-joining with single spaces normalizes the text.
    return ' '.join(text.split())

def generate_answer_without_context(query, max_new_tokens=512):
    """Answer `query` from the model's own knowledge, with no retrieved context.

    Uses the module-level `tokenizer` and `model`.

    Args:
        query: The question to answer.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The generated text, whitespace-normalized by `clean_text`, with any
        leading "Answer:" marker removed.
    """
    prompt = f"""You are a data privacy expert.
Answer the following question based only on your pre-trained knowledge.
Start your answer with 'Answer:' and keep it concise.

Question: {query}
Answer:"""
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            temperature=0.2,
            top_p=0.7,
            top_k=11,
            repetition_penalty=1.2,
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=tokenizer.pad_token_id
        )
    # The returned sequence starts with the prompt tokens; decode only what was
    # generated after them. Splitting the full text on the LAST "Answer:" kept
    # just the tail whenever the model went on to write another
    # "Question: ... Answer: ..." pair, discarding the real answer.
    prompt_length = inputs["input_ids"].shape[1]
    output_text = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
    answer = output_text.strip()
    if answer.startswith("Answer:"):
        answer = answer[len("Answer:"):].strip()
    return clean_text(answer)

# Read the query list: one query per non-blank line, with any leading run of
# digits, dots and spaces (the list numbering) stripped off.
queries = []
with open("new_privacy_queries.txt", "r", encoding="utf-8") as f:
    for raw_line in f:
        stripped_line = raw_line.strip()
        if stripped_line:
            queries.append(stripped_line.lstrip("0123456789. "))

# Answer each query with the currently loaded model, echoing progress as we go.
book_based_answers = []
for idx, query in enumerate(queries, start=1):
    answer = generate_answer_without_context(query)
    print(f"\n{idx}. {query}\nAnswer: {answer}\n")
    book_based_answers.append({"query": query, "answer": answer})

# Persist the query/answer pairs.
with open("book_based_answers.json", "w", encoding="utf-8") as answers_file:
    json.dump(book_based_answers, answers_file, indent=2)



1. What are the primary motivations for implementing differential privacy in data analysis?
Answer: The primary motivation behind using Differential Privacy is to protect sensitive user information while still allowing organizations to use this data for research, analytics or other purposes.


2. How does the concept of data minimization align with global privacy regulations?
Answer: Data minimization is an important principle that helps organizations comply with various global privacy regulations, such as GDPR (General Data Protection Regulation) in Europe or CCPA (California Consumer Privacy Act) in California. By collecting only what's necessary for specific purposes, businesses can reduce their risk of violating these laws by limiting personal information collection to essential elements while avoiding unnecessary processing activities like profiling or using sensitive data beyond its intended purpose. This approach also ensures users' rights under those regulations, including the

Load fine-tuned model


In [None]:
import os
import torch
from unsloth import FastLanguageModel
from datasets import Dataset

import json
import logging

# INFO-level logging plus a module-level logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Disable Triton optimizations
os.environ["TRITON_DISABLE"] = "1"

# Load the fine-tuned LoRA adapters from Drive in 4-bit, fp16, with a
# 512-token sequence limit.
max_seq_length = 512
finetuned_load_options = dict(
    model_name="/content/drive/MyDrive/qwen_finetuned_lora",  # Path to fine-tuned LoRA adapters
    max_seq_length=max_seq_length,
    load_in_4bit=True,
    dtype=torch.float16,
    trust_remote_code=True,
)
model, tokenizer = FastLanguageModel.from_pretrained(**finetuned_load_options)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
Are you certain you want to do remote code execution?
==((====))==  Unsloth 2025.7.5: Fast Qwen2 patching. Transformers: 4.53.2.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.7.1+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.3.1
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.31.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors.index.json: 0.00B [00:00, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/3.99G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.91G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/138 [00:00<?, ?B/s]

Unsloth 2025.7.5 patched 40 layers with 40 QKV layers, 40 O layers and 40 MLP layers.


In [None]:
import json
import torch
import re
from transformers import AutoTokenizer, AutoModelForCausalLM


def clean_text(text, repetition_threshold=5):
    """Normalize whitespace and cap runs of a consecutively repeated word.

    Words are compared case-insensitively; within a run of the same word only
    the first `repetition_threshold` occurrences are kept (the first word of a
    run is always kept).

    Args:
        text: Raw text to clean.
        repetition_threshold: Maximum length a run of one repeated word may have.

    Returns:
        The cleaned text with words separated by single spaces.
    """
    words = re.sub(r'\s+', ' ', text).strip().split()

    kept_words = []
    previous_folded = None   # lower-cased form of the word just before this one
    run_length = 0           # how many times in a row that word has appeared

    for word in words:
        folded = word.lower()
        if folded == previous_folded:
            run_length += 1
            if run_length > repetition_threshold:
                continue  # run already at its cap: drop this repeat
        else:
            run_length = 1
            previous_folded = folded
        kept_words.append(word)

    return ' '.join(kept_words)


def generate_answer_without_context(query, max_new_tokens=512):
    """Answer `query` from the model's own knowledge, with no retrieved context.

    Uses the module-level `tokenizer` and `model`.

    Args:
        query: The question to answer.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The generated text with any leading "Answer:" marker removed, cleaned
        by `clean_text` with runs of a repeated word capped at 3.
    """
    prompt = f"""You are a data privacy expert.
Answer the following question based only on your pre-trained knowledge.
Start your answer with 'Answer:' and keep it concise.

Question: {query}
Answer:"""
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            temperature=0.2,
            top_p=0.7,
            top_k=11,
            repetition_penalty=1.2,
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=tokenizer.pad_token_id
        )
    # The returned sequence starts with the prompt tokens; decode only what was
    # generated after them. Splitting the full text on the LAST "Answer:" kept
    # just the tail whenever the model went on to write another
    # "Question: ... Answer: ..." pair, discarding the real answer.
    prompt_length = inputs["input_ids"].shape[1]
    output_text = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
    answer = output_text.strip()
    if answer.startswith("Answer:"):
        answer = answer[len("Answer:"):].strip()
    return clean_text(answer, repetition_threshold=3)

# Read the query list: one query per non-blank line, with any leading run of
# digits, dots and spaces (the list numbering) stripped off.
queries = []
with open("new_privacy_queries.txt", "r", encoding="utf-8") as f:
    for raw_line in f:
        stripped_line = raw_line.strip()
        if stripped_line:
            queries.append(stripped_line.lstrip("0123456789. "))

# Answer each query with the currently loaded model, echoing progress as we go.
book_based_answers = []
for idx, query in enumerate(queries, start=1):
    answer = generate_answer_without_context(query)
    print(f"\n{idx}. {query}\nAnswer: {answer}\n")
    book_based_answers.append({"query": query, "answer": answer})

# Persist the query/answer pairs.
with open("book_based_answers_finetuned.json", "w", encoding="utf-8") as answers_file:
    json.dump(book_based_answers, answers_file, indent=2)


1. What are the primary motivations for implementing differential privacy in data analysis?
Answer: The primary motivation behind implementing Differential Privacy (DP) is to protect sensitive user information while still allowing organizations to analyze large datasets effectively. DP ensures that any changes made during statistical analyses do not reveal specific details about individual users' identities or behavior patterns, thereby safeguarding their personal data against potential misuse by third parties or malicious actors seeking unauthorized access. This protection comes at an expense of reduced accuracy when compared to traditional methods since some noise gets added into the results due to randomization techniques employed within DP algorithms like Laplace Smoothing or Gaussian Mechanism. However, this trade-off often outweighs concerns regarding privacy breaches because maintaining confidentiality becomes paramount over achieving perfect precision levels whenever dealing w

# Evaluation on Untrained Data

Non fine-tuned model

In [None]:
import getpass
import json
import os
import time
from google.genai import Client

# Read the Gemini key from the environment (prompting as a fallback) instead
# of hardcoding it in the notebook.
client = Client(api_key=os.environ.get("GOOGLE_API_KEY") or getpass.getpass("Gemini API key: "))

MAX_ATTEMPTS = 5          # tries per entry before an error is recorded
RETRY_WAIT_SECONDS = 20   # pause between tries after a transient API error
TRANSIENT_MARKERS = ("RESOURCE_EXHAUSTED", "503", "UNAVAILABLE")

# Load files
with open("/content/book_based_answers.json", "r", encoding="utf-8") as f:
    eval_data = json.load(f)

with open("/content/new_queries_answers.json", "r", encoding="utf-8") as f:
    truth_data = json.load(f)

# Convert ground truth to dictionary for quick lookup
truth_lookup = {
    item["query"]: item["ground_truth_data"]
    for item in truth_data
    if "query" in item and "ground_truth_data" in item
}

results = []

for i, entry in enumerate(eval_data):
    # Retry the SAME entry on transient errors. Previously the loop slept and
    # then advanced to the next entry, silently dropping this one from the results.
    for attempt in range(1, MAX_ATTEMPTS + 1):
        try:
            query = entry["query"]
            model_answer = entry["answer"]
            context = "No context provided."  # For base model eval without context
            ground_truth = truth_lookup.get(query, "No ground truth available.")

            prompt = f"""You are an expert evaluator.

Given:
- Query: {query}
- Context: {context}
- Model's Generated Answer: {model_answer}
- Ground Truth Answer: {ground_truth}

Evaluate the model's answer on:
1. Faithfulness (0-10) - Does it stick to the context?
2. Correctness (0-10) - Does it match the ground truth?
3. Relevance (0-10) - Does it answer the query correctly?
4. Clarity (0-10) - Is it clear and well-structured?

Provide scores and explanations in this format:
Faithfulness: [score] - [explanation]
Correctness: [score] - [explanation]
Relevance: [score] - [explanation]
Clarity: [score] - [explanation]
"""

            response = client.models.generate_content(
                model='gemini-2.0-flash-001',
                contents=prompt
            )
            results.append({
                "query": query,
                "evaluation": response.text
            })
            break

        except Exception as e:
            error_message = str(e)
            print(f"Error at index {i}: {error_message}")
            # A per-day quota cannot recover within a short wait, so only
            # per-minute limits and availability errors are worth retrying.
            retryable = (
                any(marker in error_message for marker in TRANSIENT_MARKERS)
                and "PerDay" not in error_message
            )
            if retryable and attempt < MAX_ATTEMPTS:
                print(f"Quota exhausted at index {i}. Waiting {RETRY_WAIT_SECONDS} seconds to retry...")
                time.sleep(RETRY_WAIT_SECONDS)
            else:
                # These entries are keyed by "query" (read above); the old
                # entry["question"] lookup raised KeyError inside this handler
                # and aborted the whole run on the first non-transient error.
                results.append({
                    "query": entry.get("query", f"Index {i}"),
                    "evaluation": f"ERROR: {error_message}"
                })
                break

# Save output
with open("evaluation_results_untrained.json", "w", encoding="utf-8") as f:
    json.dump(results, f, indent=2)

Error at index 7: 429 RESOURCE_EXHAUSTED. {'error': {'code': 429, 'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits.', 'status': 'RESOURCE_EXHAUSTED', 'details': [{'@type': 'type.googleapis.com/google.rpc.QuotaFailure', 'violations': [{'quotaMetric': 'generativelanguage.googleapis.com/generate_content_free_tier_requests', 'quotaId': 'GenerateRequestsPerMinutePerProjectPerModel-FreeTier', 'quotaDimensions': {'location': 'global', 'model': 'gemini-2.0-flash'}, 'quotaValue': '15'}]}, {'@type': 'type.googleapis.com/google.rpc.Help', 'links': [{'description': 'Learn more about Gemini API quotas', 'url': 'https://ai.google.dev/gemini-api/docs/rate-limits'}]}, {'@type': 'type.googleapis.com/google.rpc.RetryInfo', 'retryDelay': '0s'}]}}
Quota exhausted at index 7. Waiting 20 seconds to retry...
Error at index 8: 429 RESOURCE_EXHAUSTED. {'error': {'code': 429, 'm

In [None]:
import json
import re

# Read the judge's raw evaluation records for the original (base) model
with open('/content/evaluation_results_untrained.json', 'r', encoding='utf-8') as f:
    evaluations = json.load(f)

# Captures "<Metric>: <integer>" pairs from the judge's free-text reply
score_pattern = re.compile(r'(Faithfulness|Correctness|Relevance|Clarity):\s*(\d+)', re.IGNORECASE)

# One list of integer scores per metric, addressable by the capitalized metric name
faithfulness_scores, correctness_scores, relevance_scores, clarity_scores = [], [], [], []
score_buckets = {
    'Faithfulness': faithfulness_scores,
    'Correctness': correctness_scores,
    'Relevance': relevance_scores,
    'Clarity': clarity_scores,
}


def _mean_or_zero(values):
    """Return the arithmetic mean of `values`, or 0 when the list is empty."""
    return sum(values) / len(values) if values else 0


# Walk every record and file each metric it reports into the matching bucket
for item in evaluations:
    evaluation_text = item.get('evaluation', '')

    try:
        matches = score_pattern.findall(evaluation_text)

        # A metric that appears more than once keeps the last value reported
        scores = {}
        for metric_name, raw_value in matches:
            scores[metric_name.capitalize()] = int(raw_value)

        # Records that omit a metric simply contribute nothing to that bucket
        for metric_name, bucket in score_buckets.items():
            if metric_name in scores:
                bucket.append(scores[metric_name])

    except Exception as e:
        print(f"Error processing item: {item.get('query', 'Unknown')}")
        print(f"Evaluation text: {evaluation_text}")
        print(f"Error: {e}")

# Per-metric averages (0 when nothing was collected)
avg_faithfulness = _mean_or_zero(faithfulness_scores)
avg_correctness = _mean_or_zero(correctness_scores)
avg_relevance = _mean_or_zero(relevance_scores)
avg_clarity = _mean_or_zero(clarity_scores)

# Report
print("Orignal Model results")
print(f"\nEvaluated {len(faithfulness_scores)} queries.")
print(f"Average Faithfulness: {avg_faithfulness:.2f}")
print(f"Average Correctness: {avg_correctness:.2f}")
print(f"Average Relevance: {avg_relevance:.2f}")
print(f"Average Clarity: {avg_clarity:.2f}")

Orignal Model results

Evaluated 149 queries.
Average Faithfulness: 9.98
Average Correctness: 8.04
Average Relevance: 9.70
Average Clarity: 9.17


Fine tuned model


In [None]:
import getpass
import json
import os
import time
from google.genai import Client

# Credentials must not live in the notebook: read the key from the environment
# and fall back to an interactive prompt when the variable is not set.
client = Client(api_key=os.environ.get("GEMINI_API_KEY") or getpass.getpass("Gemini API key: "))

MAX_ATTEMPTS = 5          # total tries per query before an ERROR record is written
RETRY_WAIT_SECONDS = 20   # pause after a quota / availability error
RETRYABLE_MARKERS = ("RESOURCE_EXHAUSTED", "503", "UNAVAILABLE")

# Load files
with open("/content/book_based_answers_finetuned_darth.json", "r", encoding="utf-8") as f:
    eval_data = json.load(f)

with open("/content/new_queries_answers.json", "r", encoding="utf-8") as f:
    truth_data = json.load(f)

# Convert ground truth to dictionary for quick lookup
truth_lookup = {
    item["query"]: item["ground_truth_data"]
    for item in truth_data
    if "query" in item and "ground_truth_data" in item
}

results = []

for i, entry in enumerate(eval_data):
    # Each entry is attempted until it succeeds, fails with a non-retryable
    # error, or runs out of attempts; in every case exactly one record is
    # appended so no query silently disappears from the output file.
    for attempt in range(1, MAX_ATTEMPTS + 1):
        try:
            query = entry["query"]
            model_answer = entry["answer"]
            context = "No context provided."  # For base model eval without context
            ground_truth = truth_lookup.get(query, "No ground truth available.")

            prompt = f"""You are an expert evaluator.

Given:
- Query: {query}
- Context: {context}
- Model's Generated Answer: {model_answer}
- Ground Truth Answer: {ground_truth}

Provide your evaluation in the format below with each score between 0 and 10:
1. Faithfulness  - Does it stick to the context?
2. Correctness  - Does it match the ground truth?
3. Relevance - Does it answer the query correctly?
4. Clarity - Is it clear and well-structured?

Provide scores and explanations in this format:
Faithfulness: [score] - [explanation]
Correctness: [score] - [explanation]
Relevance: [score] - [explanation]
Clarity: [score] - [explanation]
"""

            response = client.models.generate_content(
                model='gemini-2.0-flash-001',
                contents=prompt
            )
            results.append({
                "query": query,
                "evaluation": response.text
            })
            break  # evaluated successfully; move on to the next entry

        except Exception as e:
            error_message = str(e)
            print(f"Error at index {i}: {error_message}")
            is_retryable = any(marker in error_message for marker in RETRYABLE_MARKERS)
            if is_retryable and attempt < MAX_ATTEMPTS:
                print(f"Quota exhausted at index {i}. Waiting {RETRY_WAIT_SECONDS} seconds to retry...")
                time.sleep(RETRY_WAIT_SECONDS)
                continue  # re-run the same entry instead of skipping it
            # Non-retryable error, or retries exhausted: keep a trace of the failure.
            # The records are keyed by "query", so read that key (tolerating its absence).
            results.append({
                "query": entry.get("query"),
                "evaluation": f"ERROR: {error_message}"
            })
            break

# Save output
with open("evaluation_results_untrained_finetuned.json", "w", encoding="utf-8") as f:
    json.dump(results, f, indent=2)


Error at index 15: 429 RESOURCE_EXHAUSTED. {'error': {'code': 429, 'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits.', 'status': 'RESOURCE_EXHAUSTED', 'details': [{'@type': 'type.googleapis.com/google.rpc.QuotaFailure', 'violations': [{'quotaMetric': 'generativelanguage.googleapis.com/generate_content_free_tier_requests', 'quotaId': 'GenerateRequestsPerMinutePerProjectPerModel-FreeTier', 'quotaDimensions': {'location': 'global', 'model': 'gemini-2.0-flash'}, 'quotaValue': '15'}]}, {'@type': 'type.googleapis.com/google.rpc.Help', 'links': [{'description': 'Learn more about Gemini API quotas', 'url': 'https://ai.google.dev/gemini-api/docs/rate-limits'}]}, {'@type': 'type.googleapis.com/google.rpc.RetryInfo', 'retryDelay': '59s'}]}}
Quota exhausted at index 15. Waiting 20 seconds to retry...
Error at index 18: 429 RESOURCE_EXHAUSTED. {'error': {'code': 429

In [None]:
import json
import re

# Load the JSON data
# NOTE(review): this path is a manually renamed copy rather than the
# "evaluation_results_untrained_finetuned.json" written by the evaluation cell;
# confirm it is the intended input.
with open('/content/evaluation_results_untrained_finetuned (3) - Copy.json', 'r', encoding='utf-8') as f:
    evaluations = json.load(f)
# Initialize lists to store scores for each metric
faithfulness_scores = []
correctness_scores = []
relevance_scores = []
clarity_scores = []

# Regular expression to extract metric scores ("<Metric>: <integer>")
score_pattern = re.compile(r'(Faithfulness|Correctness|Relevance|Clarity):\s*(\d+)', re.IGNORECASE)


# Process each evaluation
for item in evaluations:
    evaluation_text = item.get('evaluation', '')

    try:
        # Extract scores using regex; a repeated metric keeps its last value
        matches = re.findall(score_pattern, evaluation_text)
        scores = dict((m[0].capitalize(), int(m[1])) for m in matches)

        # Append scores to respective lists if present
        if 'Faithfulness' in scores:
            faithfulness_scores.append(scores['Faithfulness'])
        if 'Correctness' in scores:
            correctness_scores.append(scores['Correctness'])
        if 'Relevance' in scores:
            relevance_scores.append(scores['Relevance'])
        if 'Clarity' in scores:
            clarity_scores.append(scores['Clarity'])

    except Exception as e:
        print(f"Error processing item: {item.get('query', 'Unknown')}")
        print(f"Evaluation text: {evaluation_text}")
        print(f"Error: {e}")

# Calculate averages (0 when a metric was never reported)
avg_faithfulness = sum(faithfulness_scores) / len(faithfulness_scores) if faithfulness_scores else 0
avg_correctness = sum(correctness_scores) / len(correctness_scores) if correctness_scores else 0
avg_relevance = sum(relevance_scores) / len(relevance_scores) if relevance_scores else 0
avg_clarity = sum(clarity_scores) / len(clarity_scores) if clarity_scores else 0

print("Fine Tuned Model results")
# Print results
# Report the number of records actually scored instead of a hard-coded figure,
# so the summary cannot claim more evaluations than the averages are based on.
print(f"\nEvaluated {len(faithfulness_scores)} queries.")
print(f"Average Faithfulness: {avg_faithfulness:.2f}")
print(f"Average Correctness: {avg_correctness:.2f}")
print(f"Average Relevance: {avg_relevance:.2f}")
print(f"Average Clarity: {avg_clarity:.2f}")

# Surface records that contributed nothing (e.g. "ERROR: ..." entries)
unscored_count = len(evaluations) - len(faithfulness_scores)
if unscored_count > 0:
    print(f"Note: {unscored_count} of {len(evaluations)} records had no Faithfulness score and were excluded.")

Fine Tuned Model results

Evaluated 149 queries.
Average Faithfulness: 9.30
Average Correctness: 9.19
Average Relevance: 9.48
Average Clarity: 9.12


#Testing catastrophic forgetting

###Loading models and generating answers

In [None]:
import torch

from transformers import AutoTokenizer, AutoModelForCausalLM
from accelerate import init_empty_weights, load_checkpoint_and_dispatch

# Load model with disk_offload if you're running out of GPU memory
# This is the base (not fine-tuned) checkpoint; the names `tokenizer` and `model`
# defined here are the globals that the generation cell below reads.
model_name = "Qwen/Qwen1.5-4B"

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
# device_map="auto" delegates weight placement to the library; weights are loaded
# as float16, and offload_folder names the directory used for disk offloading.
# NOTE(review): trust_remote_code=True allows repository-supplied code to run —
# confirm it is actually required for this checkpoint.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    trust_remote_code=True,
    torch_dtype=torch.float16,
    offload_folder="qwen_offload"  # Optional if using disk offloading
)
# Put the module in evaluation mode; as the cell's last expression this call
# also causes the model structure to be displayed as the cell output.
model.eval()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/3.99G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.91G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/138 [00:00<?, ?B/s]

Qwen2ForCausalLM(
  (model): Qwen2Model(
    (embed_tokens): Embedding(151936, 2560)
    (layers): ModuleList(
      (0-39): 40 x Qwen2DecoderLayer(
        (self_attn): Qwen2Attention(
          (q_proj): Linear(in_features=2560, out_features=2560, bias=True)
          (k_proj): Linear(in_features=2560, out_features=2560, bias=True)
          (v_proj): Linear(in_features=2560, out_features=2560, bias=True)
          (o_proj): Linear(in_features=2560, out_features=2560, bias=False)
        )
        (mlp): Qwen2MLP(
          (gate_proj): Linear(in_features=2560, out_features=6912, bias=False)
          (up_proj): Linear(in_features=2560, out_features=6912, bias=False)
          (down_proj): Linear(in_features=6912, out_features=2560, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): Qwen2RMSNorm((2560,), eps=1e-06)
        (post_attention_layernorm): Qwen2RMSNorm((2560,), eps=1e-06)
      )
    )
    (norm): Qwen2RMSNorm((2560,), eps=1e-06)
    (rotary_emb): 

In [None]:
import json
import re

# Release unused cached GPU memory before generation starts
# (relies on `torch` having been imported by an earlier cell).
torch.cuda.empty_cache()
# --- Normalize whitespace and cap runs of repeated words ---
def clean_text(text, repetition_threshold=3):
    """Collapse whitespace and limit consecutive repeats of the same word.

    Words are compared case-insensitively. Within a run of identical words the
    first occurrence is always kept; later occurrences are kept only while the
    run position does not exceed `repetition_threshold`.

    Args:
        text: Raw text to clean.
        repetition_threshold: Maximum number of consecutive copies of a word to keep.

    Returns:
        The surviving words joined by single spaces.
    """
    words = re.sub(r'\s+', ' ', text).strip().split()

    kept = []
    last_folded = None   # lower-cased previous word; None before the first word
    run_length = 0       # position of the current word inside its run (1-based)
    for word in words:
        folded = word.lower()
        run_length = run_length + 1 if folded == last_folded else 1
        last_folded = folded
        if run_length == 1 or run_length <= repetition_threshold:
            kept.append(word)

    return ' '.join(kept)

# --- Generation function for base Qwen model ---
def generate_answer(query, max_new_tokens=512):
    """Generate a cleaned answer to `query` with the globally loaded model.

    Uses the module-level `tokenizer` and `model`. Sampling is enabled, so
    repeated calls may return different text.

    Args:
        query: Question text inserted into the prompt.
        max_new_tokens: Upper bound on generated tokens; also quoted in the prompt.

    Returns:
        The model's continuation of the prompt, with a leading 'Answer:' label
        removed if the model repeated it, passed through `clean_text`.
    """
    prompt = f"""You are a general knowledge expert.
Answer the following question to the best of your ability based only on your internal knowledge.
Start the response with 'Answer:' and keep it detailed but under {max_new_tokens} tokens.

Question: {query}
Answer:"""
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    prompt_length = inputs["input_ids"].shape[1]

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            temperature=0.2,
            top_p=0.7,
            top_k=11,
            repetition_penalty=1.2,
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=tokenizer.pad_token_id
        )

    # Decode only the tokens produced after the prompt. Splitting the full text
    # on "Answer:" and taking the last piece returns the wrong segment whenever
    # the query or the continuation itself contains another "Answer:".
    answer = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True).strip()
    if answer.startswith("Answer:"):
        # The prompt asks the model to start with this label; drop it if echoed.
        answer = answer[len("Answer:"):].strip()
    return clean_text(answer)

# --- Read the questions: skip blank lines, strip leading digits/dots/spaces ---
queries = []
with open("general_knowledge_questions.txt", "r", encoding="utf-8") as f:
    for raw_line in f:
        stripped_line = raw_line.strip()
        if stripped_line:
            queries.append(stripped_line.lstrip("0123456789. ").strip())

# --- Answer every question, echoing a 300-character preview as we go ---
results = []
for number, query in enumerate(queries, start=1):
    print(f"\n{number}. Query: {query}")
    answer = generate_answer(query)
    print(f"Answer: {answer[:300]}")
    results.append(dict(query=query, answer=answer))
    # Free cached GPU memory between generations
    torch.cuda.empty_cache()

# --- Persist the query/answer pairs as JSON ---
with open("general_answers_qwen_base.json", "w", encoding="utf-8") as f:
    f.write(json.dumps(results, indent=2))



1. Query: What is the capital city of France?
Answer: The capital city of France is Paris. It is located in northeastern France, at the heart of the country's Île-de-France region. With an estimated population of over two million people within its limits, Paris is not just the political center of France; it also serves as one of Europe's most important

2. Query: Who wrote the novel "Pride and Prejudice"?
Answer: Pride and Prejudice was written by Jane Austen, an English novelist known for her witty dialogue and social commentary. Born in Steventon, Hampshire in 1775, she began writing at a young age and published several works during her lifetime before retiring from publishing altogether due to financial d

3. Query: What is the chemical symbol for gold?
Answer: The chemical symbol for gold, Au (Latin for "gold"), was first proposed by German chemist Andreas Sigismund Marggraf in 1798. He discovered that metallic gold could be extracted from its ore, calaverite or auriferous quartz,

In [None]:
import os
import torch
from unsloth import FastLanguageModel
from datasets import Dataset
import torch
import re
from transformers import AutoTokenizer, AutoModelForCausalLM
import json
import logging

# Set up logging: configure the root logger at INFO and create a module logger
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Disable Triton optimizations
# NOTE(review): this variable is set after `torch`/`unsloth` were imported above;
# confirm the libraries still honour it when set at this point.
os.environ["TRITON_DISABLE"] = "1"
# Maximum sequence length passed to the loader below
max_seq_length=512
# Rebinds the global `model` and `tokenizer` names to the fine-tuned checkpoint,
# loaded with 4-bit quantization requested and float16 as dtype.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="/content/drive/MyDrive/qwen_finetuned_lora",  # Path to fine-tuned LoRA adapters
    max_seq_length=max_seq_length,
    dtype=torch.float16,
    load_in_4bit=True,
    trust_remote_code=True
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
Are you certain you want to do remote code execution?
==((====))==  Unsloth 2025.7.5: Fast Qwen2 patching. Transformers: 4.53.2.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.7.1+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.3.1
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.31.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors.index.json: 0.00B [00:00, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/3.99G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.91G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/138 [00:00<?, ?B/s]

Unsloth 2025.7.5 patched 40 layers with 40 QKV layers, 40 O layers and 40 MLP layers.


In [None]:
def clean_text(text, repetition_threshold=3):
    """Normalize whitespace and trim runs of a consecutively repeated word.

    The text is split into whitespace-separated words and grouped into runs of
    words that are equal ignoring case. From each run the first word is always
    kept, and further words are kept while their 1-based position in the run is
    at most `repetition_threshold`.

    Args:
        text: Raw text to clean.
        repetition_threshold: Maximum number of consecutive copies of a word to keep.

    Returns:
        The kept words joined with single spaces.
    """
    tokens = re.sub(r'\s+', ' ', text).strip().split()
    total = len(tokens)

    cleaned_tokens = []
    run_start = 0
    while run_start < total:
        # Extend the run while the following words match case-insensitively
        run_key = tokens[run_start].lower()
        run_end = run_start + 1
        while run_end < total and tokens[run_end].lower() == run_key:
            run_end += 1

        cleaned_tokens.extend(
            token
            for position, token in enumerate(tokens[run_start:run_end], start=1)
            if position == 1 or position <= repetition_threshold
        )
        run_start = run_end

    return ' '.join(cleaned_tokens)

def generate_answer(query, max_new_tokens=512):
    """Generate a cleaned answer to `query` with the globally loaded model.

    Uses the module-level `tokenizer` and `model`. Sampling is enabled, so
    repeated calls may return different text.

    Args:
        query: Question text inserted into the prompt.
        max_new_tokens: Upper bound on generated tokens; also quoted in the prompt.

    Returns:
        The model's continuation of the prompt, with a leading 'Answer:' label
        removed if the model repeated it, passed through `clean_text`.
    """
    prompt = f"""You are a general knowledge expert.
Answer the following question to the best of your ability based only on your internal knowledge.
Start the response with 'Answer:' and keep it detailed but under {max_new_tokens} tokens.

Question: {query}
Answer:"""
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    prompt_length = inputs["input_ids"].shape[1]
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            temperature=0.2,
            top_p=0.7,
            top_k=11,
            repetition_penalty=1.2,
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=tokenizer.pad_token_id
        )
    # Decode only the tokens produced after the prompt. Splitting the full text
    # on "Answer:" and taking the last piece returns the wrong segment whenever
    # the query or the continuation itself contains another "Answer:".
    answer = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True).strip()
    if answer.startswith("Answer:"):
        # The prompt asks the model to start with this label; drop it if echoed.
        answer = answer[len("Answer:"):].strip()
    return clean_text(answer)

# Read the question list: drop blank lines, then strip leading digits/dots/spaces
with open("general_knowledge_questions.txt", "r", encoding="utf-8") as f:
    nonblank_lines = filter(None, map(str.strip, f))
    general_queries = [entry.lstrip("0123456789. ").strip() for entry in nonblank_lines]

# Answer each question with the fine-tuned model, printing a 300-character preview
general_answers = []
for number, query in enumerate(general_queries, start=1):
    print(f"\n{number}. Query: {query}")
    answer = generate_answer(query)
    print(f"Answer: {answer[:300]}")
    general_answers.append(dict(query=query, answer=answer))

# Write the query/answer pairs to disk
with open("general_answers_finetuned.json", "w", encoding="utf-8") as f:
    f.write(json.dumps(general_answers, indent=2))


1. Query: What is the capital city of France?
Answer: The capital city of France is Paris.

2. Query: Who wrote the novel "Pride and Prejudice"?
Answer: Jane Austen is credited as the author of Pride and Prejudice.

3. Query: What is the chemical symbol for gold?
Answer: The chemical symbol for gold is Au. Gold has been used as currency, jewelry, and an investment since ancient times due to its rarity and beauty. It was first discovered in South Africa around 300 BC by the Phoenicians who traded it along the Mediterranean coastlines. In modern times, gold is extract

4. Query: In which year did the Titanic sink?
Answer: The Titanic sank in April 1, 1912.

5. Query: What is the largest planet in our solar system?
Answer: The largest planet in our solar system is Jupiter. It has an average radius of about 71,492 kilometers (38,069 miles) from its center to the surface, making it larger than all other planets combined. Jupiter's mass is approximately three times that of Earth, which give

##Evaluating

In [None]:
import getpass
import json
import os
import time
from google.genai import Client

# Credentials must not live in the notebook: read the key from the environment
# and fall back to an interactive prompt when the variable is not set.
client = Client(api_key=os.environ.get("GEMINI_API_KEY") or getpass.getpass("Gemini API key: "))

MAX_ATTEMPTS = 5          # total tries per query before an ERROR record is written
RETRY_WAIT_SECONDS = 20   # pause after a quota / availability error
RETRYABLE_MARKERS = ("RESOURCE_EXHAUSTED", "503", "UNAVAILABLE")

# Load both sets of answers, keyed by query text
with open("general_answers_qwen_base.json", "r", encoding="utf-8") as f:
    base_data = {item["query"]: item["answer"] for item in json.load(f)}

with open("general_answers_finetuned.json", "r", encoding="utf-8") as f:
    finetuned_data = {item["query"]: item["answer"] for item in json.load(f)}

results = []

for i, query in enumerate(base_data.keys()):
    # Resolved before the retry loop so the error record can always reference them.
    base_answer = base_data[query]
    fine_answer = finetuned_data.get(query, "No fine-tuned answer available.")

    # Each query is attempted until it succeeds, fails with a non-retryable
    # error, or runs out of attempts; exactly one record is appended per query
    # so rate-limited queries are no longer dropped from the output.
    for attempt in range(1, MAX_ATTEMPTS + 1):
        try:
            prompt = f"""You are an expert evaluator.

The following are two answers to the same general knowledge question — one from a base model, and one from a fine-tuned version of that model.

Query: {query}

Base Model Answer: {base_answer}

Fine-tuned Model Answer: {fine_answer}

Evaluate how much knowledge the fine-tuned model has retained **compared to the base model**.

Provide your evaluation in the format below with each score between 0 and 10:
1. Correctness) - Is it factually accurate?
2. Consistency - Is it consistent with what the base model would say?
3. Clarity - Is it clearly expressed?
4. Knowledge Retention - Does the fine-tuned model retain general knowledge?

Provide scores and brief explanations in this format:
Correctness: [score] - [explanation]
Consistency: [score] - [explanation]
Clarity: [score] - [explanation]
Knowledge Retention: [score] - [explanation]
"""

            response = client.models.generate_content(
                model="gemini-2.0-flash-001",
                contents=prompt
            )

            results.append({
                "query": query,
                "base_answer": base_answer,
                "finetuned_answer": fine_answer,
                "evaluation": response.text
            })
            break  # evaluated successfully; move on to the next query

        except Exception as e:
            error_message = str(e)
            print(f"Error at index {i}: {error_message}")
            is_retryable = any(marker in error_message for marker in RETRYABLE_MARKERS)
            if is_retryable and attempt < MAX_ATTEMPTS:
                print(f"Quota exhausted at index {i}. Waiting {RETRY_WAIT_SECONDS} seconds to retry...")
                time.sleep(RETRY_WAIT_SECONDS)
                continue  # re-run the same query instead of skipping it
            # Non-retryable error, or retries exhausted: keep a trace of the failure.
            results.append({
                "query": query,
                "base_answer": base_answer,
                "finetuned_answer": fine_answer,
                "evaluation": f"ERROR: {error_message}"
            })
            break

# Save output
with open("catastrophic_forgetting_eval_results.json", "w", encoding="utf-8") as f:
    json.dump(results, f, indent=2)


Error at index 14: 503 UNAVAILABLE. {'error': {'code': 503, 'message': 'The model is overloaded. Please try again later.', 'status': 'UNAVAILABLE'}}
Quota exhausted at index 14. Waiting 20 seconds to retry...
Error at index 16: 429 RESOURCE_EXHAUSTED. {'error': {'code': 429, 'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits.', 'status': 'RESOURCE_EXHAUSTED', 'details': [{'@type': 'type.googleapis.com/google.rpc.QuotaFailure', 'violations': [{'quotaMetric': 'generativelanguage.googleapis.com/generate_content_free_tier_requests', 'quotaId': 'GenerateRequestsPerMinutePerProjectPerModel-FreeTier', 'quotaDimensions': {'location': 'global', 'model': 'gemini-2.0-flash'}, 'quotaValue': '15'}]}, {'@type': 'type.googleapis.com/google.rpc.Help', 'links': [{'description': 'Learn more about Gemini API quotas', 'url': 'https://ai.google.dev/gemini-api/docs/rate-limits

In [None]:
import re

# Read the judge's records for the catastrophic-forgetting comparison
with open("catastrophic_forgetting_eval_results.json", "r", encoding="utf-8") as f:
    evaluations = json.load(f)

# Matches "<Metric>: <integer>" (e.g. "Correctness: 8 - Some explanation")
score_pattern = re.compile(r'(Correctness|Consistency|Clarity|Knowledge Retention):\s*(\d+)', re.IGNORECASE)

# One score list per metric, paired with the title-cased label used as lookup key
correctness_scores, consistency_scores, clarity_scores, retention_scores = [], [], [], []
metric_buckets = (
    ("Correctness", correctness_scores),
    ("Consistency", consistency_scores),
    ("Clarity", clarity_scores),
    ("Knowledge Retention", retention_scores),
)

# File every score found in a record into its metric's list
for item in evaluations:
    evaluation_text = item.get("evaluation", "")
    try:
        matches = score_pattern.findall(evaluation_text)

        # A metric reported more than once keeps its last value
        scores = {}
        for label, digits in matches:
            scores[label.strip().title()] = int(digits)

        for label, bucket in metric_buckets:
            if label in scores:
                bucket.append(scores[label])

    except Exception as e:
        print(f"Error processing query: {item.get('query', 'Unknown')}")
        print(f"Error: {e}")
        print(f"Evaluation: {evaluation_text}\n")

# Per-metric averages in bucket order (0 when a metric was never reported)
avg_correctness, avg_consistency, avg_clarity, avg_retention = (
    sum(bucket) / len(bucket) if bucket else 0
    for _, bucket in metric_buckets
)

# Report
print("Fine-Tuned Model (Catastrophic Forgetting Test)")
print(f"\nEvaluated {len(correctness_scores)} queries.")
print(f"Average Correctness: {avg_correctness:.2f}")
print(f"Average Consistency: {avg_consistency:.2f}")
print(f"Average Clarity: {avg_clarity:.2f}")
print(f"Average Knowledge Retention: {avg_retention:.2f}")


Fine-Tuned Model (Catastrophic Forgetting Test)

Evaluated 160 queries.
Average Correctness: 9.22
Average Consistency: 9.13
Average Clarity: 9.07
Average Knowledge Retention: 9.12
