In [1]:
%pip install PyPDF2 dotenv feedparser ragas datasets

Note: you may need to restart the kernel to use updated packages.


In [1]:
import os
import PyPDF2
import markdown
from transformers import pipeline
import pandas as pd
import warnings

warnings.filterwarnings("ignore", category=UserWarning, module="PyPDF2")

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
from dotenv import load_dotenv
load_dotenv()

os.environ['OPENAI_API_KEY'] = os.getenv('OPENAI_API_KEY')

#### Download additional research papers from the internet

In [7]:
import feedparser
import urllib.parse

# Function to fetch AI research papers from arXiv
def fetch_arxiv_papers(query="large language models", max_results=20):
    base_url = "http://export.arxiv.org/api/query?"
    encoded_query = urllib.parse.quote(query)  # Encode the query to handle spaces
    search_query = f"search_query=ti:{encoded_query}+OR+abs:{encoded_query}+OR+cat:cs.LG&start=0&max_results={max_results}&sortBy=relevance&sortOrder=descending"

    response = feedparser.parse(base_url + search_query)

    papers = []
    for entry in response.entries:
        papers.append({
            "title": entry.title,
            "summary": entry.summary,
            "pdf_url": entry.link.replace("abs", "pdf")
        })

    return papers

# Fetch and print latest AI papers
ai_papers = fetch_arxiv_papers("Reasoning Capability in LLMs OR Reinforcement Learning OR Logical Reasoning", max_results=20)
for paper in ai_papers:
    print(f"Title: {paper['title']}\nPDF: {paper['pdf_url']}\n")


Title: Code to Think, Think to Code: A Survey on Code-Enhanced Reasoning and
  Reasoning-Driven Code Intelligence in LLMs
PDF: http://arxiv.org/pdf/2502.19411v1

Title: Rewarding Graph Reasoning Process makes LLMs more Generalized Reasoners
PDF: http://arxiv.org/pdf/2503.00845v1

Title: Enhancing Reasoning Capabilities of LLMs via Principled Synthetic Logic
  Corpus
PDF: http://arxiv.org/pdf/2411.12498v2

Title: Reasoning-as-Logic-Units: Scaling Test-Time Reasoning in Large Language
  Models Through Logic Unit Alignment
PDF: http://arxiv.org/pdf/2502.07803v1

Title: Logical Reasoning in Large Language Models: A Survey
PDF: http://arxiv.org/pdf/2502.09100v1

Title: Leveraging LLM Reasoning Enhances Personalized Recommender Systems
PDF: http://arxiv.org/pdf/2408.00802v1

Title: Reinforcing Thinking through Reasoning-Enhanced Reward Models
PDF: http://arxiv.org/pdf/2501.01457v1

Title: ZebraLogic: On the Scaling Limits of LLMs for Logical Reasoning
PDF: http://arxiv.org/pdf/2502.01100v1



In [4]:
import requests

DATA_FOLDER = "data/dataset/"
OUTPUT_FOLDER = "data/processed_data"

In [9]:
def sanitize_filename(filename):
    return "".join(c if c.isalnum() or c in (' ', '_', '-') else '_' for c in filename)

def download_papers(papers, save_folder):
    if not os.path.exists(save_folder):
        os.makedirs(save_folder)

    for paper in papers:
        pdf_url = paper["pdf_url"]
        response = requests.get(pdf_url)
        sanitized_title = sanitize_filename(paper['title'])
        pdf_filename = os.path.join(save_folder, f"{sanitized_title}.pdf")

        with open(pdf_filename, "wb") as file:
            file.write(response.content)

        print(f"Downloaded: {pdf_filename}")

download_papers(ai_papers, DATA_FOLDER)

Downloaded: data/dataset/Code to Think_ Think to Code_ A Survey on Code-Enhanced Reasoning and_  Reasoning-Driven Code Intelligence in LLMs.pdf
Downloaded: data/dataset/Rewarding Graph Reasoning Process makes LLMs more Generalized Reasoners.pdf
Downloaded: data/dataset/Enhancing Reasoning Capabilities of LLMs via Principled Synthetic Logic_  Corpus.pdf
Downloaded: data/dataset/Reasoning-as-Logic-Units_ Scaling Test-Time Reasoning in Large Language_  Models Through Logic Unit Alignment.pdf
Downloaded: data/dataset/Logical Reasoning in Large Language Models_ A Survey.pdf
Downloaded: data/dataset/Leveraging LLM Reasoning Enhances Personalized Recommender Systems.pdf
Downloaded: data/dataset/Reinforcing Thinking through Reasoning-Enhanced Reward Models.pdf
Downloaded: data/dataset/ZebraLogic_ On the Scaling Limits of LLMs for Logical Reasoning.pdf
Downloaded: data/dataset/Disentangling Logic_ The Role of Context in Large Language Model_  Reasoning Capabilities.pdf
Downloaded: data/dataset/

#### Process Chunks

In [10]:
def extract_pdf_text(pdf_path):
    text = ""
    with open(pdf_path, "rb") as f:
        reader = PyPDF2.PdfReader(f)
        for page in reader.pages:
            text += page.extract_text()
    return text

def extract_md_text(md_path):
    with open(md_path, "r", encoding="utf-8") as f:
        md_text = f.read()
    return markdown.markdown(md_text)  # Convert MD to plain text if needed

In [18]:
import random

# Process all files
all_text = []

for file in os.listdir(DATA_FOLDER):
    file_path = os.path.join(DATA_FOLDER, file)
    if file.endswith(".pdf"):
        all_text.append(extract_pdf_text(file_path))
    elif file.endswith(".md"):
        all_text.append(extract_md_text(file_path))

# Split into chunks (adjust chunk_size as needed)
chunk_size = 4000
overlap = 500
chunks = [text[i:i+chunk_size] for text in all_text for i in range(0, len(text), chunk_size - overlap)]

random.shuffle(chunks)

In [19]:
import torch
print(torch.cuda.is_available())  # Should output True

True


#### Generate Q&A Pairs **Prompt 1**

In [None]:
import openai

target_pairs = 1000  # Set desired number
qa_pairs = []
current_count = 0

for chunk in chunks:
    if current_count >= target_pairs:
        break

    response = openai.chat.completions.create(
        model="gpt-4o",  # or "gpt-4"
        messages=[
            {"role": "system", "content": "Generate ONE question and answer from technical text. Use format: Q: [question]\nA: [answer]"},
            {"role": "user", "content": f"Text: {chunk[:2000]}"}
        ],
        temperature=0.1,  # Lower for factual accuracy
        max_tokens=256,
    )

    try:
        full_response = response.choices[0].message.content
        q, a = full_response.split("A: ")
        q = q.replace("Q: ", "").strip()
        qa_pairs.append({"question": q, "answer": a.strip(), "context": chunk})
        current_count += 1
        print (f"Generated QA pair {current_count}")
    except:
        continue  # Skip malformed responses

# Convert to DataFrame
df = pd.DataFrame(qa_pairs)
print(f"Generated {len(df)} QA pairs")

Generated QA pair 1
Generated QA pair 2
Generated QA pair 3
Generated QA pair 4
Generated QA pair 5
Generated QA pair 6
Generated QA pair 7
Generated QA pair 8
Generated QA pair 9
Generated QA pair 10
Generated QA pair 11
Generated QA pair 12
Generated QA pair 13
Generated QA pair 14
Generated QA pair 15
Generated QA pair 16
Generated QA pair 17
Generated QA pair 18
Generated QA pair 19
Generated QA pair 20
Generated QA pair 21
Generated QA pair 22
Generated QA pair 23
Generated QA pair 24
Generated QA pair 25
Generated QA pair 26
Generated QA pair 27
Generated QA pair 28
Generated QA pair 29
Generated QA pair 30
Generated QA pair 31
Generated QA pair 32
Generated QA pair 33
Generated QA pair 34
Generated QA pair 35
Generated QA pair 36
Generated QA pair 37
Generated QA pair 38
Generated QA pair 39
Generated QA pair 40
Generated QA pair 41
Generated QA pair 42
Generated QA pair 43
Generated QA pair 44
Generated QA pair 45
Generated QA pair 46
Generated QA pair 47
Generated QA pair 48
G

#### Generate Q&A Pairs **Prompt 2**

In [20]:
import openai
import pandas as pd
import re
import time

target_pairs = 2000
qa_pairs = []
current_count = 0
failed_attempts = 0
max_retries = 3

# Improved prompt template
system_prompt = """You are an AI research assistant creating high-quality question-answer pairs from technical AI literature. Follow these rules:

1. Generate **at least 3-5** DEEP, TECHNICAL questions per chunk.
2. Answers must be:
   - Comprehensive but concise (1-3 sentences)
   - Contain technical details from the text
3. Question types should include:
   - Conceptual understanding
   - Experimental results analysis
   - Technical comparisons

🚨 STRICT OUTPUT FORMAT:
Q1: [question]
A1: [answer]
Q2: [question]
A2: [answer]
...
DO NOT include explanations or unnecessary text."""

In [23]:
for chunk in chunks:
    if current_count >= target_pairs:
        break  # Stop if we have reached the target pairs

    # Preprocess chunk to preserve technical content
    chunk = chunk[:4000].replace("\n", " ")  # Remove newlines but keep other formatting
    chunk = re.sub(r'\s+', ' ', chunk).strip()  # Collapse multiple spaces

    for attempt in range(max_retries):
        try:
            # API call with GPT-4o for question generation
            response = openai.chat.completions.create(
                model="gpt-4o",
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": f"Text excerpt from AI research paper:\n{chunk}"}
                ],
                temperature=0.3,  # Controlled randomness for better phrasing
                max_tokens=1500,  # Allow longer technical answers
                top_p=0.95,
                frequency_penalty=0.2,
                presence_penalty=0.1
            )

            full_response = response.choices[0].message.content

            # Extract multiple Q&A pairs using regex
            matches = re.findall(r"Q\d*:\s*(.*?)\nA\d*:\s*(.*?)(?=\nQ\d*:|\Z)", full_response, re.DOTALL)

            if matches:
                for q, a in matches:
                    qa_pairs.append({
                        "question": q.strip(),
                        "answer": a.strip(),
                        "context": chunk[:500] + "..."  # Store only first 500 chars to reduce redundancy
                    })
                    current_count += 1

                    print(f"✅ Generated QA pair {current_count}/{target_pairs}")

                    # Stop processing this chunk if we reach target pairs
                    if current_count >= target_pairs:
                        break

                failed_attempts = 0  # Reset failures after success
                break  # Stop retrying if we succeeded

            else:
                raise ValueError("Invalid format detected")

        except openai.RateLimitError:
            wait_time = 2 ** attempt  # Exponential backoff
            print(f"⚠️ Rate limit reached. Retrying in {wait_time} seconds...")
            time.sleep(wait_time)

        except Exception as e:
            failed_attempts += 1
            print(f"⚠️ Retry {attempt+1} for chunk. Error: {str(e)}")

            # If too many failures, skip the chunk and reset failure count
            if failed_attempts > 20:
                print("⚠️ Skipping this chunk after 20 failures.")
                failed_attempts = 0
                break

# Convert to DataFrame with metadata
df = pd.DataFrame(qa_pairs)

print(f"✅ Generation complete. Final dataset: {len(df)} QA pairs")

✅ Generated QA pair 1389/2000
✅ Generated QA pair 1390/2000
✅ Generated QA pair 1391/2000
✅ Generated QA pair 1392/2000
✅ Generated QA pair 1393/2000
✅ Generated QA pair 1394/2000
✅ Generated QA pair 1395/2000
✅ Generated QA pair 1396/2000
✅ Generated QA pair 1397/2000
✅ Generated QA pair 1398/2000
✅ Generated QA pair 1399/2000
✅ Generated QA pair 1400/2000
✅ Generated QA pair 1401/2000
✅ Generated QA pair 1402/2000
✅ Generated QA pair 1403/2000
✅ Generated QA pair 1404/2000
✅ Generated QA pair 1405/2000
✅ Generated QA pair 1406/2000
✅ Generated QA pair 1407/2000
✅ Generated QA pair 1408/2000
✅ Generated QA pair 1409/2000
✅ Generated QA pair 1410/2000
✅ Generated QA pair 1411/2000
✅ Generated QA pair 1412/2000
✅ Generated QA pair 1413/2000
✅ Generated QA pair 1414/2000
✅ Generated QA pair 1415/2000
✅ Generated QA pair 1416/2000
✅ Generated QA pair 1417/2000
✅ Generated QA pair 1418/2000
✅ Generated QA pair 1419/2000
✅ Generated QA pair 1420/2000
✅ Generated QA pair 1421/2000
✅ Generate

#### Save Q&A Pairs

In [24]:
df.to_csv(os.path.join(OUTPUT_FOLDER, "qa_pairs2.csv"), index=False)

In [25]:
print(df)

                                               question  \
0     What are the potential societal benefits of us...   
1     How does the paper "Program synthesis with lar...   
2     What is the focus of the research by Chen et a...   
3     Describe the approach taken by Chen et al. in ...   
4     What is the significance of teaching large lan...   
...                                                 ...   
1995  What framework is SARA built on, and what is i...   
1996  Which datasets are used to evaluate the method...   
1997  How does SARA's performance compare to other b...   
1998  What evaluation metrics are used for HotpotQA,...   
1999  What problem does the proposed framework aim t...   

                                                 answer  \
0     The potential societal benefits include creati...   
1     The paper explores the use of large language m...   
2     The research focuses on improving code generat...   
3     The approach involves leveraging a divide-and-...

In [None]:
df = pd.read_csv(os.path.join(OUTPUT_FOLDER, "qa_pairs2.csv"))

# Add quality checks
df = df[df['answer'].str.len() > 30]  # Remove short answers
df = df.drop_duplicates(subset=['question'])  # Remove duplicates

#### Evaluate Q&A Pairs

In [None]:
from ragas import evaluate
from ragas.metrics import answer_relevancy
from datasets import Dataset

# Fix 1: Prepare dataset with RAGAS-required columns
df["retrieved_contexts"] = df["context"].apply(lambda x: [x])  # Convert context to list of contexts
dataset = Dataset.from_pandas(df)

# Fix 2: Rename columns to match RAGAS expectations
dataset = dataset.rename_column("context", "ground_truths")  # Optional: Only if using ground_truths
dataset = dataset.rename_column("retrieved_contexts", "contexts")  # Required for faithfulness metric

# Evaluate
score = evaluate(
    dataset,
    metrics=[answer_relevancy],
    column_map={
        "question": "question",
        "answer": "answer",
        "contexts": "contexts"  # Explicitly map retrieved contexts
    }
)

score_df = score.to_pandas()  # Convert RAGAS scores to DataFrame
score_df.to_csv(os.path.join(OUTPUT_FOLDER, "score_df2.csv"), index=False)

Evaluating:   0%|          | 0/916 [00:00<?, ?it/s]

In [None]:
score_df = score.to_pandas()  # Convert RAGAS scores to DataFrame

In [None]:
score_df = pd.read_csv(os.path.join(OUTPUT_FOLDER, "score_df2.csv"))
# Output scores
print(score_df)

                                            user_input  \
0    What is the primary focus of existing unlearni...   
1    What is the purpose of the Unlearning Improvem...   
2    What is the purpose of LLM unlearning in the c...   
3    What is one of the key factors contributing to...   
4    What did the preliminary experiment using a vi...   
..                                                 ...   
911  What is the focus of the paper titled "LongRAG...   
912  What is the focus of the paper titled "GraphRe...   
913  What was the outcome of the Broken Mirror Test...   
914  What is a significant logical flaw in the fair...   
915  What does the LongWriter-Ruler test demonstrat...   

                                    retrieved_contexts  \
0    ['UIPE: Enhancing LLM Unlearning by Removing K...   
1    ['eason for the suboptimal unlearn-\ning perfo...   
2    ["d as a critical research direction.\nLLM unl...   
3    ['umerous studies have emerged aimed\nat impro...   
4    ['osed w

#### Filter Q&A Pairs

In [None]:
low_relevancy_count = len(score_df[score_df['answer_relevancy'] < 0.8])
print(f"Number of rows with answer_relevancy less than 0.8: {low_relevancy_count}")

filtered_score_df = score_df[score_df['answer_relevancy'] >= 0.8]
print(f"Number of rows after filtering: {len(filtered_score_df)}")

filtered_score_df.to_csv(os.path.join(OUTPUT_FOLDER, "filtered_score_df2.csv"), index=False)

Number of rows with answer_relevancy less than 0.8: 37
Number of rows after filtering: 879


In [None]:
filtered_score_df = pd.read_csv(os.path.join(OUTPUT_FOLDER, "filtered_score_df2.csv"))
# Output scores
print(filtered_score_df)

                                            user_input  \
0    What is the primary focus of existing unlearni...   
1    What is the purpose of the Unlearning Improvem...   
2    What is the purpose of LLM unlearning in the c...   
3    What is one of the key factors contributing to...   
4    What did the preliminary experiment using a vi...   
..                                                 ...   
874  What is the focus of the paper titled "LongRAG...   
875  What is the focus of the paper titled "GraphRe...   
876  What was the outcome of the Broken Mirror Test...   
877  What is a significant logical flaw in the fair...   
878  What does the LongWriter-Ruler test demonstrat...   

                                    retrieved_contexts  \
0    ['UIPE: Enhancing LLM Unlearning by Removing K...   
1    ['eason for the suboptimal unlearn-\ning perfo...   
2    ["d as a critical research direction.\nLLM unl...   
3    ['umerous studies have emerged aimed\nat impro...   
4    ['osed w

In [None]:
!pip install unsloth

Collecting unsloth
  Downloading unsloth-2025.3.9-py3-none-any.whl.metadata (59 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/59.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.3/59.3 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting unsloth_zoo>=2025.3.8 (from unsloth)
  Downloading unsloth_zoo-2025.3.8-py3-none-any.whl.metadata (16 kB)
Collecting xformers>=0.0.27.post2 (from unsloth)
  Downloading xformers-0.0.29.post3-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (1.0 kB)
Collecting bitsandbytes (from unsloth)
  Downloading bitsandbytes-0.45.3-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Collecting tyro (from unsloth)
  Downloading tyro-0.9.16-py3-none-any.whl.metadata (9.4 kB)
Collecting trl!=0.15.0,!=0.9.0,!=0.9.1,!=0.9.2,!=0.9.3,<=0.15.2,>=0.7.9 (from unsloth)
  Downloading trl-0.15.2-py3-none-any.whl.metadata (11 kB)
Collecting protobuf<4.0.0 (from unsloth)
  Downloading

In [None]:
!pip uninstall unsloth -y && pip install --upgrade --no-cache-dir --no-deps git+https://github.com/unslothai/unsloth.git

Found existing installation: unsloth 2025.3.9
Uninstalling unsloth-2025.3.9:
  Successfully uninstalled unsloth-2025.3.9
Collecting git+https://github.com/unslothai/unsloth.git
  Cloning https://github.com/unslothai/unsloth.git to /tmp/pip-req-build-ch5fnmel
  Running command git clone --filter=blob:none --quiet https://github.com/unslothai/unsloth.git /tmp/pip-req-build-ch5fnmel
  Resolved https://github.com/unslothai/unsloth.git to commit 2b5d81d75281c02480927cf3ca0dea7c8e98d484
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: unsloth
  Building wheel for unsloth (pyproject.toml) ... [?25l[?25hdone
  Created wheel for unsloth: filename=unsloth-2025.3.9-py3-none-any.whl size=191200 sha256=1ad30a70ca64aa85d8bb6575faffaf613d4af52d95ea1c83a658ba31425b465c
  Stored in directory: /tmp/pip-ephem-wheel-cache-maxjrkzv/wheels/d1/17/

#### Loaded a Pre-trained Model

**Method followed will be finetuning after quantization**

In [None]:
from unsloth import FastLanguageModel
import torch
max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Qwen2.5-3B-Instruct",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit
  )

==((====))==  Unsloth 2025.3.9: Fast Qwen2 patching. Transformers: 4.48.3.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


#### Applied LoRA

In [None]:
model = FastLanguageModel.get_peft_model(
    model,
    r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 32,
    lora_dropout = 0.05, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = True,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

In [None]:
import pandas as pd

#### Loading the Dataset

In [None]:
df = pd.read_csv("data/processed_data/filtered_score_df.csv") # Changed load_csv to read_csv

In [None]:
df.head(5)

Unnamed: 0,user_input,retrieved_contexts,response,answer_relevancy
0,What is the primary focus of existing unlearni...,['UIPE: Enhancing LLM Unlearning by Removing K...,Existing unlearning methods for Large Language...,0.967899
1,What is the purpose of the Unlearning Improvem...,['eason for the suboptimal unlearn-\ning perfo...,The purpose of the Unlearning Improvement via ...,1.0
2,What is the purpose of LLM unlearning in the c...,"[""d as a critical research direction.\nLLM unl...",LLM unlearning aims to mitigate the influence ...,0.964369
3,What is one of the key factors contributing to...,['umerous studies have emerged aimed\nat impro...,One of the key factors contributing to the sub...,0.979143
4,What did the preliminary experiment using a vi...,['osed with dia-\nbetes” from the target forge...,The preliminary experiment revealed that unlea...,0.946121


In [None]:
df = df.drop(columns=["retrieved_contexts", "answer_relevancy"])

In [None]:
df.head(5)

Unnamed: 0,user_input,response
0,What is the primary focus of existing unlearni...,Existing unlearning methods for Large Language...
1,What is the purpose of the Unlearning Improvem...,The purpose of the Unlearning Improvement via ...
2,What is the purpose of LLM unlearning in the c...,LLM unlearning aims to mitigate the influence ...
3,What is one of the key factors contributing to...,One of the key factors contributing to the sub...
4,What did the preliminary experiment using a vi...,The preliminary experiment revealed that unlea...


In [None]:
from datasets import Dataset
import pandas as pd

dataset = Dataset.from_pandas(df)
print(dataset)

Dataset({
    features: ['user_input', 'response'],
    num_rows: 879
})


#### Tokenized the Dataset

In [None]:
from transformers import AutoTokenizer

model_name = "Qwen/Qwen2.5-3B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def preprocess_function(examples):
  return tokenizer(
      examples["user_input"],  # Input (Questions)
      text_target=examples["response"],  # Output (Answers)
      truncation=True,
      padding="max_length",
      max_length=512
  )

tokenized_dataset = dataset.map(preprocess_function, batched=True)

print(tokenized_dataset)


Map:   0%|          | 0/879 [00:00<?, ? examples/s]

Dataset({
    features: ['user_input', 'response', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 879
})


#### Set Up Training Parameters

#### Fine-Tuned the Model

In [None]:
from transformers import TrainingArguments, DataCollatorForSeq2Seq
from trl import SFTTrainer

# Define Training Arguments
training_args = TrainingArguments(
    output_dir="./qwen-finetuned",
    per_device_train_batch_size=4,  # Adjust based on GPU memory
    gradient_accumulation_steps=4,
    warmup_steps=50,
    num_train_epochs=3,  # Train for 3 full epochs
    learning_rate=2e-4,
    fp16=True,  # Mixed precision for speed
    logging_steps=10,
    optim="adamw_8bit",  # Optimized for QLoRA
    weight_decay=0.01,
    lr_scheduler_type="cosine",
    seed=42,
    save_strategy="epoch",
    report_to="none",
)

# Define Trainer
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=tokenized_dataset,
    dataset_text_field="user_input",
    max_seq_length=2048,
    data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer, padding=True),
    dataset_num_proc=2,
    packing=False,
    args=training_args,
)

# ✅ Start Fine-Tuning
trainer.train()


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 879 | Num Epochs = 3 | Total steps = 165
O^O/ \_/ \    Batch size per device = 4 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (4 x 4 x 1) = 16
 "-____-"     Trainable parameters = 29,933,568/1,830,055,936 (1.64% trained)


Step,Training Loss
10,7.9288


KeyboardInterrupt: 

In [None]:
#@title Show final memory and time stats
import torch

# Get initial GPU memory usage
start_gpu_memory = round(torch.cuda.memory_allocated() / 1024 / 1024 / 1024, 3)
max_memory = round(torch.cuda.get_device_properties(0).total_memory / 1024 / 1024 / 1024, 3) # Assuming you're using GPU 0


used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory         /max_memory*100, 3)
lora_percentage = round(used_memory_for_lora/max_memory*100, 3)
# Access training runtime from trainer.state
print(f"{trainer.state.log_history[-1]['train_runtime']} seconds used for training.")
print(f"{round(trainer.state.log_history[-1]['train_runtime']/60, 2)} minutes used for training.")
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

1785.0773 seconds used for training.
29.75 minutes used for training.
Peak reserved memory = 4.012 GB.
Peak reserved memory for training = 1.596 GB.
Peak reserved memory % of max memory = 27.217 %.
Peak reserved memory for training % of max memory = 10.827 %.


#### Saved the Fine-Tuned Model

In [None]:
model.save_pretrained("qwen-finetuned")

In [None]:
tokenizer.save_pretrained("qwen-finetuned_tokenizer")

('qwen-finetuned_tokenizer/tokenizer_config.json',
 'qwen-finetuned_tokenizer/special_tokens_map.json',
 'qwen-finetuned_tokenizer/vocab.json',
 'qwen-finetuned_tokenizer/merges.txt',
 'qwen-finetuned_tokenizer/added_tokens.json',
 'qwen-finetuned_tokenizer/tokenizer.json')

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
%ls

[0m[01;34mdata[0m/   [01;34mhuggingface_tokenizers_cache[0m/  [01;34mqwen-finetuned_tokenizer[0m/  [01;34munsloth_compiled_cache[0m/
[01;34mdrive[0m/  [01;34mqwen-finetuned[0m/                [01;34msample_data[0m/


In [None]:
!zip -r qwen-finetuned_tokenizer.zip qwen-finetuned_tokenizer/

  adding: qwen-finetuned_tokenizer/ (stored 0%)
  adding: qwen-finetuned_tokenizer/added_tokens.json (deflated 67%)
  adding: qwen-finetuned_tokenizer/vocab.json (deflated 61%)
  adding: qwen-finetuned_tokenizer/tokenizer.json (deflated 81%)
  adding: qwen-finetuned_tokenizer/tokenizer_config.json (deflated 83%)
  adding: qwen-finetuned_tokenizer/special_tokens_map.json (deflated 69%)
  adding: qwen-finetuned_tokenizer/merges.txt (deflated 57%)


In [None]:
from google.colab import files
files.download('qwen-finetuned_tokenizer.zip')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
from unsloth import FastLanguageModel

model_path = "qwen-finetuned"
model, tokenizer = FastLanguageModel.from_pretrained(model_path)

==((====))==  Unsloth 2025.3.9: Fast Qwen2 patching. Transformers: 4.48.3.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


In [None]:
model.save_pretrained_gguf("model", tokenizer, quantization_method="q4_k_m")

Unsloth: You have 1 CPUs. Using `safe_serialization` is 10x slower.
We shall switch to Pytorch saving, which might take 3 minutes and not 30 minutes.
To force `safe_serialization`, set it to `None` instead.
Unsloth: Kaggle/Colab has limited disk space. We need to delete the downloaded
model which will save 4-16GB of disk space, allowing you to save on Kaggle/Colab.
Unsloth: Will remove a cached repo with size 2.4G


Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 4.51 out of 12.67 RAM for saving.
Unsloth: Saving model... This might take 5 minutes ...


100%|██████████| 36/36 [00:02<00:00, 16.05it/s]


Unsloth: Saving tokenizer... Done.
Unsloth: Saving model/pytorch_model-00001-of-00002.bin...
Unsloth: Saving model/pytorch_model-00002-of-00002.bin...
Done.


Unsloth: Converting qwen2 model. Can use fast conversion = False.


==((====))==  Unsloth: Conversion from QLoRA to GGUF information
   \\   /|    [0] Installing llama.cpp might take 3 minutes.
O^O/ \_/ \    [1] Converting HF to GGUF 16bits might take 3 minutes.
\        /    [2] Converting GGUF 16bits to ['q4_k_m'] might take 10 minutes each.
 "-____-"     In total, you will have to wait at least 16 minutes.

Unsloth: Installing llama.cpp. This might take 3 minutes...
Unsloth: CMAKE detected. Finalizing some steps for installation.
Unsloth: [1] Converting model at model into f16 GGUF format.
The output location will be /content/model/unsloth.F16.gguf
This might take 3 minutes...
INFO:hf-to-gguf:Loading model: model
INFO:gguf.gguf_writer:gguf: This GGUF file is for Little Endian only
INFO:hf-to-gguf:Exporting model...
INFO:hf-to-gguf:gguf: loading model weight map from 'pytorch_model.bin.index.json'
INFO:hf-to-gguf:gguf: loading model part 'pytorch_model-00001-of-00002.bin'
INFO:hf-to-gguf:token_embd.weight,         torch.float16 --> F16, shape = {2048

In [None]:
def generate_answer(question):
    inputs = tokenizer(question, return_tensors="pt").to("cuda")
    output = model.generate(
      **inputs,
      max_new_tokens=3000,
      # num_beams=5,  # Number of beams to use
      early_stopping=True  # Stop when the model generates an end-of-sequence token
    )
    return tokenizer.decode(output[0], skip_special_tokens=True)

print(generate_answer("What is Deepseek R1?"))

What is Deepseek R1? a a,, large and with, with and with  with,. language model,. that of


In [None]:
from transformers import pipeline

qa_pipeline = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=2000,
    temperature=0.2,
)
print(qa_pipeline("What is Deepseek R1?")[0]["generated_text"])

Device set to use cuda:0
The model 'PeftModelForCausalLM' is not supported for text-generation. Supported models are ['AriaTextForCausalLM', 'BambaForCausalLM', 'BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'LlamaForCausalLM', 'CodeGenForCausalLM', 'CohereForCausalLM', 'Cohere2ForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'DbrxForCausalLM', 'DiffLlamaForCausalLM', 'ElectraForCausalLM', 'Emu3ForCausalLM', 'ErnieForCausalLM', 'FalconForCausalLM', 'FalconMambaForCausalLM', 'FuyuForCausalLM', 'GemmaForCausalLM', 'Gemma2ForCausalLM', 'GitForCausalLM', 'GlmForCausalLM', 'GPT2LMHeadModel', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTNeoForCausalLM', 'GPTNeoXForCausalLM', 'GPTNeoXJapaneseForCausalLM', 'GPTJForCausalLM', 'GraniteForCausalLM', 'GraniteMoeForCausalLM', 'Jam

What is Deepseek R1? a, of and, for,,.,,,,,
