In [1]:
import os
import PyPDF2
import markdown
from transformers import pipeline
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [14]:
from dotenv import load_dotenv

# Populate os.environ from a local .env file (if one exists).
load_dotenv()

# Fail fast with a readable message when the key is missing. Re-assigning
# os.environ['OPENAI_API_KEY'] from os.getenv() is a no-op when the key exists
# and raises an opaque TypeError when it does not (environment values must be str).
if not os.getenv('OPENAI_API_KEY'):
    raise RuntimeError(
        "OPENAI_API_KEY is not set; define it in the environment or in a .env file."
    )

In [3]:
import feedparser
import urllib.parse

# Function to fetch AI research papers from arXiv
def fetch_arxiv_papers(query="large language models", max_results=5):
    """Query the arXiv export API and return basic metadata for the matching papers.

    Args:
        query: Free-text search phrase. It is URL-encoded and searched across
            all fields (the ``all:`` prefix).
        max_results: Maximum number of entries requested from the API.

    Returns:
        A list of dicts with the keys ``"title"``, ``"summary"`` and ``"pdf_url"``,
        in the order the feed returns them (the request asks for last-updated
        date, descending).
    """
    base_url = "http://export.arxiv.org/api/query?"
    encoded_query = urllib.parse.quote(query)  # Encode the query to handle spaces
    search_query = f"search_query=all:{encoded_query}&start=0&max_results={max_results}&sortBy=lastUpdatedDate&sortOrder=descending"

    response = feedparser.parse(base_url + search_query)

    papers = []
    for entry in response.entries:
        papers.append({
            "title": entry.title,
            "summary": entry.summary,
            # Swap only the "/abs/" path segment, and only once: a bare
            # replace("abs", "pdf") would also rewrite any "abs" substring that
            # happens to appear elsewhere in the URL.
            "pdf_url": entry.link.replace("/abs/", "/pdf/", 1),
        })

    return papers

# Retrieve ten "LLM fine-tuning" papers and list each title with its PDF link
ai_papers = fetch_arxiv_papers(query="LLM fine-tuning", max_results=10)
for paper in ai_papers:
    title, pdf_link = paper['title'], paper['pdf_url']
    print(f"Title: {title}\nPDF: {pdf_link}\n")


Title: LLMVoX: Autoregressive Streaming Text-to-Speech Model for Any LLM
PDF: http://arxiv.org/pdf/2503.04724v1

Title: Shifting Long-Context LLMs Research from Input to Output
PDF: http://arxiv.org/pdf/2503.04723v1

Title: Enough Coin Flips Can Make LLMs Act Bayesian
PDF: http://arxiv.org/pdf/2503.04722v1

Title: Predictable Scale: Part I -- Optimal Hyperparameter Scaling Law in Large
  Language Model Pretraining
PDF: http://arxiv.org/pdf/2503.04715v1

Title: How Far Are We on the Decision-Making of LLMs? Evaluating LLMs' Gaming
  Ability in Multi-Agent Environments
PDF: http://arxiv.org/pdf/2403.11807v7

Title: Universality of Layer-Level Entropy-Weighted Quantization Beyond Model
  Architecture and Size
PDF: http://arxiv.org/pdf/2503.04704v1

Title: UIPE: Enhancing LLM Unlearning by Removing Knowledge Related to
  Forgetting Targets
PDF: http://arxiv.org/pdf/2503.04693v1

Title: Quantifying the Reasoning Abilities of LLMs on Real-world Clinical Cases
PDF: http://arxiv.org/pdf/2503.0

In [5]:
import requests

# Directory the downloaded paper PDFs are written to (passed to download_papers)
DATA_FOLDER = "data/dataset/"
# Directory that receives derived artifacts (the QA-pair CSV and the evaluation-score CSV)
OUTPUT_FOLDER = "data/processed_data"

In [None]:
def sanitize_filename(filename):
    """Return ``filename`` with every character that is not alphanumeric, a space,
    an underscore or a hyphen replaced by an underscore."""
    allowed_punctuation = (' ', '_', '-')
    cleaned = []
    for ch in filename:
        keep = ch.isalnum() or ch in allowed_punctuation
        cleaned.append(ch if keep else '_')
    return "".join(cleaned)

def download_papers(papers, save_folder, timeout=30):
    """Download the PDF of every paper into ``save_folder``.

    Args:
        papers: Iterable of dicts carrying at least ``"title"`` and ``"pdf_url"``.
        save_folder: Target directory; created if it does not exist.
        timeout: Seconds to wait for the server before giving up on a request.

    A paper whose request fails (network error, timeout, or non-2xx status) is
    reported and skipped, so an HTTP error page is never stored under a ``.pdf``
    name and one bad link does not abort the whole batch.
    """
    # exist_ok avoids the check-then-create race of a separate os.path.exists test
    os.makedirs(save_folder, exist_ok=True)

    for paper in papers:
        pdf_url = paper["pdf_url"]
        sanitized_title = sanitize_filename(paper['title'])
        pdf_filename = os.path.join(save_folder, f"{sanitized_title}.pdf")

        try:
            response = requests.get(pdf_url, timeout=timeout)
            response.raise_for_status()  # turn 4xx/5xx responses into exceptions
        except requests.RequestException as exc:
            print(f"Failed to download {pdf_url}: {exc}")
            continue

        with open(pdf_filename, "wb") as file:
            file.write(response.content)

        print(f"Downloaded: {pdf_filename}")

# Save the PDFs of the fetched papers into the dataset folder
download_papers(papers=ai_papers, save_folder=DATA_FOLDER)


Downloaded: data/dataset/LLMVoX_ Autoregressive Streaming Text-to-Speech Model for Any LLM.pdf
Downloaded: data/dataset/Shifting Long-Context LLMs Research from Input to Output.pdf
Downloaded: data/dataset/Enough Coin Flips Can Make LLMs Act Bayesian.pdf
Downloaded: data/dataset/Predictable Scale_ Part I -- Optimal Hyperparameter Scaling Law in Large_  Language Model Pretraining.pdf
Downloaded: data/dataset/How Far Are We on the Decision-Making of LLMs_ Evaluating LLMs_ Gaming_  Ability in Multi-Agent Environments.pdf
Downloaded: data/dataset/Universality of Layer-Level Entropy-Weighted Quantization Beyond Model_  Architecture and Size.pdf
Downloaded: data/dataset/UIPE_ Enhancing LLM Unlearning by Removing Knowledge Related to_  Forgetting Targets.pdf
Downloaded: data/dataset/Quantifying the Reasoning Abilities of LLMs on Real-world Clinical Cases.pdf
Downloaded: data/dataset/RAAD-LLM_ Adaptive Anomaly Detection Using LLMs and RAG Integration.pdf
Downloaded: data/dataset/DIMSUM_ Discou

In [5]:
def extract_pdf_text(pdf_path):
    """Read the PDF at ``pdf_path`` and return the extracted text of all pages,
    concatenated in page order with no separator between pages."""
    with open(pdf_path, "rb") as pdf_file:
        # Extraction must happen while the file handle is still open
        pages = PyPDF2.PdfReader(pdf_file).pages
        page_texts = [page.extract_text() for page in pages]
    return "".join(page_texts)

def extract_md_text(md_path):
    """Read a UTF-8 Markdown file and return its content as plain text.

    ``markdown.markdown`` renders Markdown to HTML rather than to plain text, so
    the rendered HTML is run through an HTML parser that keeps only the text
    nodes. Tags are dropped and character references such as ``&amp;`` are decoded.

    Args:
        md_path: Path of the Markdown file.

    Returns:
        The document text without Markdown or HTML markup.
    """
    from html.parser import HTMLParser  # stdlib; imported locally to keep the cell self-contained

    class _TextCollector(HTMLParser):
        """Accumulates the text nodes of an HTML document."""

        def __init__(self):
            super().__init__()
            self.parts = []

        def handle_data(self, data):
            self.parts.append(data)

    with open(md_path, "r", encoding="utf-8") as f:
        md_text = f.read()

    collector = _TextCollector()
    collector.feed(markdown.markdown(md_text))
    collector.close()  # flush any text still buffered by the parser
    return "".join(collector.parts)

In [6]:
# Process all files
data_dir = "data/dataset/"
all_text = []

# os.listdir returns entries in arbitrary order; sorting makes the document
# order, and therefore the chunk order, reproducible across runs and machines.
for file in sorted(os.listdir(data_dir)):
    file_path = os.path.join(data_dir, file)
    # Compare the lower-cased extension so files like "PAPER.PDF" are not skipped
    extension = os.path.splitext(file)[1].lower()
    if extension == ".pdf":
        all_text.append(extract_pdf_text(file_path))
    elif extension == ".md":
        all_text.append(extract_md_text(file_path))

# Split into fixed-size character windows (adjust chunk_size as needed).
# Windows never span two documents; the last window of a document may be shorter.
chunk_size = 1024  # characters
chunks = [text[i:i+chunk_size] for text in all_text for i in range(0, len(text), chunk_size)]

In [3]:
import torch
# Sanity check that PyTorch can see a CUDA device; the Qwen pipeline cell requests device_map="cuda"
print(torch.cuda.is_available())  # Should output True

True


#### Using Qwen

**DO NOT RUN**

In [6]:
# Local generator: Qwen2.5-3B-Instruct behind a transformers text-generation pipeline on the GPU
model_name = "Qwen/Qwen2.5-3B-Instruct"
pipe = pipeline(
    task="text-generation",
    model=model_name,
    device_map="cuda",
)

Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Device set to use cuda


In [8]:
import numpy as np

target_pairs = 500  # <--- SET YOUR DESIRED NUMBER HERE
qa_pairs = []
current_count = 0

# Shuffle a private copy between passes so the global `chunks` order, which
# other cells iterate over, is left untouched.
chunk_pool = list(chunks)

while current_count < target_pairs:
    count_at_pass_start = current_count

    for chunk in chunk_pool:

        prompt = f"""
        Generate **ONE** question and answer based on this text.
        Return ONLY in format: "Q: [question]\nA: [answer]".
        Text: {chunk[:1000]}
        """

        # return_full_text=False: by default the pipeline returns prompt + completion,
        # and the prompt itself contains "A: [answer]", so splitting on the answer
        # marker would cut inside the prompt instead of inside the model's reply.
        response = pipe(
            prompt,
            max_new_tokens=256,
            temperature=0.3,
            return_full_text=False,
        )[0]['generated_text']

        # Split on the first answer marker only; anything after it belongs to the answer.
        question_part, separator, answer_part = response.partition("A:")
        question = question_part.replace("Q:", "", 1).strip()
        answer = answer_part.strip()
        if not separator or not question or not answer:
            continue  # Skip replies that do not follow the requested Q/A format

        qa_pairs.append({"question": question, "answer": answer, "context": chunk})
        print (f"Generated QA pair {current_count + 1}")
        current_count += 1
        if current_count >= target_pairs:
            break  # Stop when target is reached

    # A full pass that produced nothing (no chunks at all, or no parsable reply)
    # would otherwise repeat forever.
    if current_count == count_at_pass_start:
        print("No new QA pairs were produced in a full pass over the chunks; stopping early.")
        break

    # Optional: Shuffle chunks to avoid bias from looping
    np.random.shuffle(chunk_pool)

# Convert to DataFrame
df = pd.DataFrame(qa_pairs)
print(f"Generated {len(df)} QA pairs")  # Verify count

Generated QA pair 1
Generated QA pair 2


KeyboardInterrupt: 

#### Using OpenAI API Key

In [None]:
import openai

target_pairs = 200  # Set desired number
qa_pairs = []
current_count = 0

# One request per chunk, in chunk order, until target_pairs well-formed pairs are collected
for chunk in chunks:
    if current_count >= target_pairs:
        break

    response = openai.chat.completions.create(
        model="gpt-4o",  # or "gpt-4"
        messages=[
            {"role": "system", "content": "Generate ONE question and answer from technical text. Use format: Q: [question]\nA: [answer]"},
            {"role": "user", "content": f"Text: {chunk[:1000]}"}
        ],
        temperature=0.2,  # Lower for factual accuracy
        max_tokens=256
    )

    # message.content can be None (for example on a refusal); treat that as an empty reply
    full_response = response.choices[0].message.content or ""

    # Split on the first answer marker only, so an answer that itself contains
    # "A:" is kept whole instead of being discarded by a failed two-way unpack.
    question_part, separator, answer_part = full_response.partition("A:")
    question = question_part.replace("Q:", "", 1).strip()
    answer = answer_part.strip()
    if not separator or not question or not answer:
        continue  # Skip malformed responses

    qa_pairs.append({"question": question, "answer": answer, "context": chunk})
    current_count += 1
    print (f"Generated QA pair {current_count}")

# Convert to DataFrame
df = pd.DataFrame(qa_pairs)
print(f"Generated {len(df)} QA pairs")

Generated QA pair 1
Generated QA pair 2
Generated QA pair 3
Generated QA pair 4
Generated QA pair 5
Generated QA pair 6
Generated QA pair 7
Generated QA pair 8
Generated QA pair 9
Generated QA pair 10
Generated QA pair 11
Generated QA pair 12
Generated QA pair 13
Generated QA pair 14
Generated QA pair 15
Generated QA pair 16
Generated QA pair 17
Generated QA pair 18
Generated QA pair 19
Generated QA pair 20
Generated QA pair 21
Generated QA pair 22
Generated QA pair 23
Generated QA pair 24
Generated QA pair 25
Generated QA pair 26
Generated QA pair 27
Generated QA pair 28
Generated QA pair 29
Generated QA pair 30
Generated QA pair 31
Generated QA pair 32
Generated QA pair 33
Generated QA pair 34
Generated QA pair 35
Generated QA pair 36
Generated QA pair 37
Generated QA pair 38
Generated QA pair 39
Generated QA pair 40
Generated QA pair 41
Generated QA pair 42
Generated QA pair 43
Generated QA pair 44
Generated QA pair 45
Generated QA pair 46
Generated QA pair 47
Generated QA pair 48
G

In [9]:
# DataFrame.to_csv does not create missing parent directories, so make sure the
# output folder exists before writing.
os.makedirs(OUTPUT_FOLDER, exist_ok=True)
df.to_csv(os.path.join(OUTPUT_FOLDER, "qa_pairs.csv"), index=False)

In [15]:
from ragas import evaluate
from ragas.metrics import answer_relevancy
from datasets import Dataset

# Build the evaluation dataset. Each row's single context string is wrapped in a
# one-element list and stored in a new "retrieved_contexts" column.
# NOTE: this adds the column to the global `df` in place.
df["retrieved_contexts"] = df["context"].apply(lambda x: [x])  # one-element list per row
dataset = Dataset.from_pandas(df)

# Rename columns to the names passed to RAGAS below.
# NOTE(review): the expected column names depend on the installed ragas version — confirm.
dataset = dataset.rename_column("context", "ground_truths")  # Optional: Only if using ground_truths
dataset = dataset.rename_column("retrieved_contexts", "contexts")  # context column; only answer_relevancy is evaluated below

# Evaluate answer relevancy only (no other metric is requested)
score = evaluate(
    dataset,
    metrics=[answer_relevancy],
    column_map={
        "question": "question",
        "answer": "answer",
        "contexts": "contexts"  # Explicitly map retrieved contexts
    }
)

# Persist the per-row scores so later cells can reload them without re-running the evaluation
score_df = score.to_pandas()  # Convert RAGAS scores to DataFrame
score_df.to_csv(os.path.join(OUTPUT_FOLDER, "score_df.csv"), index=False)

Evaluating:   0%|          | 0/190 [00:00<?, ?it/s]

In [None]:
# Convert the RAGAS result object into a DataFrame. This repeats the conversion
# already done in the evaluation cell; no filtering happens here.
score_df = score.to_pandas()  # Convert RAGAS scores to DataFrame

In [6]:
# Reload the persisted scores from disk instead of relying on the in-memory `score` object.
# NOTE(review): after the CSV round-trip, list-valued columns such as retrieved_contexts
# appear to come back as their string representation (the output shows "['...']" values);
# parse them before treating them as lists.
score_df = pd.read_csv(os.path.join(OUTPUT_FOLDER, "score_df.csv"))
# Output scores
print(score_df)

                                            user_input  \
0    What are the main differences between DeepSeek...   
1    What is the accuracy percentage of DeepSeek-R1...   
2    What is the focus of section 2.3 in the document?   
3    What recent development in Large Language Mode...   
4    What is the main goal of the research discusse...   
..                                                 ...   
185  What is the purpose of querying language model...   
186  How do language models initially handle the to...   
187  How do instruct models perform compared to non...   
188  How do instruct-based models perform in the bi...   
189  How does the relative ordering among different...   

                                    retrieved_contexts  \
0    ['DeepSeek-R1: Incentivizing Reasoning Capabil...   
1    ['024\n(Pass@1)Codeforces\n(Percentile)GPQA Di...   
2    ['. . . 6\n2.2.3 Training Template . . . . . ....   
3    ['4.2 Unsuccessful Attempts . . . . . . . . . ...   
4    ['oding,

In [7]:
# Minimum answer_relevancy a QA pair must reach to be kept (single place to tune the cut-off)
RELEVANCY_THRESHOLD = 0.8

relevancy = score_df['answer_relevancy']

low_relevancy_count = int((relevancy < RELEVANCY_THRESHOLD).sum())
print(f"Number of rows with answer_relevancy less than {RELEVANCY_THRESHOLD}: {low_relevancy_count}")

# Rows without a score fail both comparisons: they are not counted as "low" but
# are still removed by the filter below, so report them to keep the totals explainable.
missing_relevancy_count = int(relevancy.isna().sum())
if missing_relevancy_count:
    print(f"Number of rows with missing answer_relevancy (also dropped): {missing_relevancy_count}")

filtered_score_df = score_df[relevancy >= RELEVANCY_THRESHOLD]
print(f"Number of rows after filtering: {len(filtered_score_df)}")

Number of rows with answer_relevancy less than 0.8: 3
Number of rows after filtering: 187


#### Tokenization

In [9]:
# %%
import os
import torch
import pandas as pd
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling,
    BitsAndBytesConfig,
)
from datasets import Dataset
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

# Pick the compute device: CUDA when PyTorch can see a GPU, otherwise the CPU
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device}")


Using device: cuda


In [15]:
!pip install triton

ERROR: Could not find a version that satisfies the requirement triton (from versions: none)
ERROR: No matching distribution found for triton


In [18]:
from unsloth import FastLanguageModel
import torch

max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

model, tokenizer = FastLanguageModel.from_pretrained(
    # Can select any from the below:
    # "unsloth/Qwen2.5-0.5B", "unsloth/Qwen2.5-1.5B", "unsloth/Qwen2.5-3B"
    # "unsloth/Qwen2.5-14B",  "unsloth/Qwen2.5-32B",  "unsloth/Qwen2.5-72B",
    # And also all Instruct versions and Math. Coding versions!
    model_name = "unsloth/Qwen2.5-3B-Instruct",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

# A 4-bit (bitsandbytes) model is already placed on its device at load time, and
# calling .to() on such a model can raise, so only an unquantized model is moved.
# Assigning the result also keeps the cell from echoing the whole model repr.
if not load_in_4bit:
    model = model.to(device)


Please restructure your imports with 'import unsloth' at the top of your file.
  from unsloth import FastLanguageModel


ModuleNotFoundError: No module named 'triton'