<a href="https://colab.research.google.com/github/RiccardoRubini93/ML-AI-cookbook/blob/main/Fine_tune_on_pdf_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!mkdir pdfs

In [2]:
!pip install PyPDF2

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1


In [3]:
import os
import re
import PyPDF2

In [4]:
def remove_references(text):
    # Remove text after "REFERENCES"
    reference_start = text.find("REFERENCES")
    if reference_start != -1:
        text = text[:reference_start]

    return text

def remove_links(text):
    pattern = r"\[\d+\]|\(http[s]?://\S+\)|www\.\S+|[^a-zA-Z0-9\s]"
    return re.sub(pattern, "", text)

def remove_special_chars(text):
    pattern = r"[^\w\s.]"
    return re.sub(pattern, "", text)

def preprocess_text(text):
    text = remove_references(text)
    text = remove_links(text)
    text = text.lower()
    text = remove_special_chars(text)

    text = re.sub(r'\[\d*\]', '', text)  # Remove square brackets containing numbers
    text = re.sub(r'\[.*?\]', '', text)   # Remove other text between square brackets

    # Remove occurrences of "fig"
    text = re.sub(r'\bfig.\b', '', text)


    # Remove numbers
    text = re.sub(r'\b\d+\b', '', text)  # Remove numbers

    # Remove single characters or numbers in a line
    text = re.sub(r'\b\w\b|\b\d\b', '', text)

    # Filter out lines with only a single character, number, or special character
    lines = text.split('\n')
    lines = [line for line in lines if len(line.strip()) > 1]  # Filter out lines with length <= 1
    text = '\n'.join(lines)

    return text

In [5]:
!pip install pdfminer.six nltk

Collecting pdfminer.six
  Downloading pdfminer_six-20250416-py3-none-any.whl.metadata (4.1 kB)
Downloading pdfminer_six-20250416-py3-none-any.whl (5.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/5.6 MB[0m [31m43.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pdfminer.six
Successfully installed pdfminer.six-20250416


In [6]:
import nltk

# Download the punkt tokenizer data
nltk.download('punkt')
print("NLTK punkt tokenizer successfully downloaded!")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


NLTK punkt tokenizer successfully downloaded!


In [7]:
import os
import re
from pdfminer.high_level import extract_text
from nltk.tokenize import sent_tokenize

def preprocess_text(text):
    """
    Preprocess the extracted text from PDF files.
    - Remove extra whitespace
    - Remove non-ascii characters
    - Convert to lowercase
    """
    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text)
    # Remove non-ascii characters
    text = re.sub(r'[^\x00-\x7F]+', '', text)
    # Convert to lowercase
    text = text.lower()
    return text.strip()

def extract_text_from_pdf(pdf_path):
    """Extract text from PDF using PyPDF2."""
    text = ""
    with open(pdf_path, 'rb') as file:
        pdf_reader = PyPDF2.PdfReader(file)
        for page_num in range(len(pdf_reader.pages)):
            page = pdf_reader.pages[page_num]
            text += page.extract_text() + "\n"
    return text

def extract_sentences_from_folder(folder_path):
    if not os.path.isdir(folder_path):
        print("Folder path does not exist.")
        return []

    sentences = []
    for pdf_file in os.listdir(folder_path):
        if pdf_file.endswith(".pdf"):
            pdf_path = os.path.join(folder_path, pdf_file)
            try:
                # Extract text from PDF
                text = extract_text_from_pdf(pdf_path)
                # Preprocess the extracted text
                text = preprocess_text(text)

                try:
                    # Try to tokenize using NLTK
                    pdf_sentences = sent_tokenize(text)
                except LookupError:
                    # Fallback: simple sentence splitting if NLTK fails
                    print(f"Warning: NLTK punkt tokenizer not available, using simple sentence splitting for {pdf_file}")
                    pdf_sentences = [s.strip() + '.' for s in text.split('.') if s.strip()]

                sentences.extend(pdf_sentences)
                print(f"Processed: {pdf_file} - {len(pdf_sentences)} sentences")
            except Exception as e:
                print(f"Error processing {pdf_file}: {str(e)}")

    return sentences
def write_to_txt(sentences, output_file):
    with open(output_file, 'w', encoding='utf-8') as txt_file:
        for sentence in sentences:
            # Write each sentence in a new line
            txt_file.write(sentence.strip() + "\n")

In [8]:
# Folder containing PDF files
pdf_folder = "pdfs"
# Output file for preprocessed text
output_file = "text.txt"

# Extract sentences from PDF files
sentences = extract_sentences_from_folder(pdf_folder)
print("Total number of sentences extracted:", len(sentences))

# Write sentences to a text file
write_to_txt(sentences, output_file)

Processed: 2006.05736v2.pdf - 1166 sentences
Total number of sentences extracted: 1166


In [9]:
def split_into_passages(input_file, output_file, words_per_passage=500):
    with open(input_file, 'r', encoding='utf-8') as f:
        text = f.read()

    words = text.split()
    passages = []
    passage = ""
    word_count = 0

    # Iterate through words in the text
    for word in words:
        # Add word to current passage
        passage += word + " "
        word_count += 1

        # Check if the word count exceeds the limit for a passage
        if word_count >= words_per_passage:
            passages.append(passage.strip())
            passage = ""
            word_count = 0

    # Write passages to the output file
    with open(output_file, 'w', encoding='utf-8') as f:
        for passage in passages:
            f.write(passage + "\n\n")

# Input and output file paths
input_file = "text.txt"
output_file = "passages.txt"

# Call the function
split_into_passages(input_file, output_file)

In [10]:
import json
import os

def text_to_json(text_file, json_file, chunk_size=5):
    """
    Convert a text file with sentences to a JSON file with passages.
    Each passage is formed by combining chunk_size consecutive sentences.

    Args:
        text_file (str): Path to the input text file
        json_file (str): Path to the output JSON file
        chunk_size (int): Number of sentences to combine into one passage
    """
    # Check if the text file exists
    if not os.path.exists(text_file):
        print(f"Text file '{text_file}' does not exist.")
        return False

    # Read the sentences from the text file
    with open(text_file, 'r', encoding='utf-8') as f:
        sentences = [line.strip() for line in f if line.strip()]

    # Create passages by combining sentences
    passages = []
    for i in range(0, len(sentences), chunk_size):
        chunk = sentences[i:i+chunk_size]
        passage = " ".join(chunk)
        if passage:  # Only add non-empty passages
            passages.append({"answer": passage})

    # Write passages to JSON file
    with open(json_file, 'w', encoding='utf-8') as f:
        json.dump(passages, f, indent=2, ensure_ascii=False)

    print(f"Converted {len(sentences)} sentences into {len(passages)} passages.")
    print(f"JSON file saved as '{json_file}'")
    return True

text_file = "passages.txt"  # Your input text file
json_file = "output.json"  # Output JSON file
text_to_json(text_file, json_file, chunk_size=5)

Converted 27 sentences into 6 passages.
JSON file saved as 'output.json'


True

In [11]:
!pip install transformers



In [12]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
import json

def generate_questions_with_transformers(json_file, output_file):
    # Load the pre-trained question generation model and tokenizer
    model_name = "valhalla/t5-small-qg-hl"
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    with open(json_file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    instructions = []
    for item in data:
        passage = item['answer']
        # Tokenize the passage
        inputs = tokenizer.encode("question: " + passage, return_tensors="pt", max_length=512, truncation=True)

        # Generate questions using the model
        questions = model.generate(inputs, max_length=64, num_beams=3, num_return_sequences=3, early_stopping=True)
        for question in questions:
            question_str = tokenizer.decode(question, skip_special_tokens=True)
            instructions.append({
                "instruction": f"Based on the following passage: '{passage}', provide an accurate and relevant question.",
                "input": "",
                "output": question_str
            })

    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(instructions, f, indent=2, ensure_ascii=False)

generate_questions_with_transformers('output.json', 'instructions.json')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/627 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/242M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/90.0 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/31.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [13]:
import json

def swap_instruction_output(json_file, output_file):
    with open(json_file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    swapped_data = []
    for item in data:
        swapped_data.append({
            "instruction": item["output"],
            "input": item["input"],
            "output": item["instruction"]
        })

    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(swapped_data, f, indent=2, ensure_ascii=False)

swap_instruction_output('instructions.json', 'swapped_instructions.json')

In [14]:
def clean_answer_text2(data):
    cleaned_data = []
    for item in data:
        output = item["output"]

        cleaned_answer = output.replace("Based on the following passage: ", "").replace(", provide an accurate and relevant question.", "").strip()

        cleaned_item = {
            "instruction": item["instruction"],
            "input": item["input"],
            "output": cleaned_answer
        }
        cleaned_data.append(cleaned_item)
    return cleaned_data


def process_json_file3(input_file, output_file):
    with open(input_file, 'r', encoding='utf-8') as f:
        data = json.load(f)
    cleaned_data = clean_answer_text2(data)
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(cleaned_data, f, indent=2, ensure_ascii=False)

process_json_file3("swapped_instructions.json", "final.json")

Fine tuning

In [1]:
!git clone https://github.com/hiyouga/LLaMA-Factory.git

fatal: destination path 'LLaMA-Factory' already exists and is not an empty directory.


In [2]:
%cd LLaMA-Factory/
!pip install -r requirements.txt

/content/LLaMA-Factory


In [3]:
!pip install bitsandbytes

Collecting bitsandbytes
  Downloading bitsandbytes-0.45.5-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Downloading bitsandbytes-0.45.5-py3-none-manylinux_2_24_x86_64.whl (76.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.1/76.1 MB[0m [31m11.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0.45.5


In [4]:
!pip install llamafactory

Collecting llamafactory
  Downloading llamafactory-0.9.2-py3-none-any.whl.metadata (66 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m66.3/66.3 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting transformers!=4.46.*,!=4.47.*,!=4.48.0,<=4.49.0,>=4.41.2 (from llamafactory)
  Downloading transformers-4.49.0-py3-none-any.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.0/44.0 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets<=3.2.0,>=2.16.0 (from llamafactory)
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting accelerate<=1.2.1,>=0.34.0 (from llamafactory)
  Downloading accelerate-1.2.1-py3-none-any.whl.metadata (19 kB)
Collecting peft<=0.12.0,>=0.11.1 (from llamafactory)
  Downloading peft-0.12.0-py3-none-any.whl.metadata (13 kB)
Collecting tokenizers<=0.21.0,>=0.19.0 (from llamafactory)
  Downloading tokenizers-0.21.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.wh

In [None]:
!GRADIO_SHARE=1 llamafactory-cli webui

2025-04-25 20:04:13.497536: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1745611453.517689    2895 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1745611453.523619    2895 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-04-25 20:04:13.543679: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
* Running on local URL:  http://0.0.0.0:7860
* Running on public URL: https://d7550f5b9d0a8140a3.gradio.live

This sh