# 📌 1️⃣ Extract & Clean Text from PDFs

In [5]:
import fitz  # PyMuPDF for PDF processing
import os
import json
import re


In [6]:

def extract_text_from_pdf(pdf_path):
    """ Extract raw text from a PDF file """
    text = ""
    try:
        doc = fitz.open(pdf_path)
        for page in doc:
            text += page.get_text("text") + "\n"
    except Exception as e:
        print(f"⚠️ Error reading {pdf_path}: {e}")
    return text

def clean_text(text):
    """ Clean extracted text by removing unwanted characters, extra spaces, and symbols """
    text = re.sub(r'\n+', '\n', text)  # Remove extra newlines
    text = re.sub(r'\s+', ' ', text)  # Remove extra spaces
    text = re.sub(r'[^A-Za-z0-9أ-ي .,؛:؟!-]', '', text)  # Keep only meaningful characters
    text = re.sub(r'\bPage \d+\b', '', text, flags=re.IGNORECASE)  # Remove "Page X" footers
    text = re.sub(r'\bFigure \d+\b', '', text, flags=re.IGNORECASE)  # Remove "Figure X" references
    text = re.sub(r'\bTable \d+\b', '', text, flags=re.IGNORECASE)  # Remove "Table X" references
    return text.strip()

def process_pdfs_in_folder(folder_path, output_json="clean_reports.json"):
    """ Extract and clean text from all PDFs in a folder and save to JSON """
    pdf_files = [os.path.join(folder_path, f) for f in os.listdir(folder_path) if f.endswith(".pdf")]

    reports = []
    for pdf_file in pdf_files:
        raw_text = extract_text_from_pdf(pdf_file)
        cleaned_text = clean_text(raw_text)
        if cleaned_text:
            reports.append({"file": os.path.basename(pdf_file), "text": cleaned_text})

    # Save extracted text to JSON
    with open(output_json, "w", encoding="utf-8") as f:
        json.dump(reports, f, ensure_ascii=False, indent=4)

    print(f"✅ Extracted and cleaned text from {len(reports)} reports. Saved to {output_json}.")

# Run the script on the PDF folder
process_pdfs_in_folder(r"C:\Users\rkhm3\Desktop\GeoKnowlogy_Dataset")


✅ Extracted and cleaned text from 55 reports. Saved to clean_reports.json.


# 📌 2️⃣ Convert Cleaned Text into LLM Training Data

In [7]:
def prepare_data_for_training(json_file, output_file="training_data.json"):
    """ Convert cleaned text into structured LLM training data """
    with open(json_file, "r", encoding="utf-8") as f:
        reports = json.load(f)

    dataset = []
    for report in reports:
        text = report["text"]
        sections = re.split(r'\.\s+', text)  # Split text into sentences

        for section in sections:
            if len(section.strip()) > 50:  # Ignore very short lines
                dataset.append({
                    "instruction": "Summarize this geological information:",
                    "input": section,
                    "output": "Summary of this section..."
                })

    # Save formatted training data
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(dataset, f, ensure_ascii=False, indent=4)

    print(f"✅ Prepared {len(dataset)} samples for training. Saved to {output_file}.")

# Convert extracted data into structured training data
prepare_data_for_training("clean_reports.json")


✅ Prepared 37836 samples for training. Saved to training_data.json.


# 📌 3️⃣ Fine-Tune the LLM on Your Data (LoRA for Efficiency)

In [19]:
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer
from peft import LoraConfig, get_peft_model
import json


In [20]:
# Load model and tokenizer
model_name = "ALLaM-AI/ALLaM-7B-Instruct-preview"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    trust_remote_code=True,
    torch_dtype="auto",
    device_map="auto"
)


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Some parameters are on the meta device because they were offloaded to the cpu and disk.


In [21]:

# Apply LoRA for memory-efficient fine-tuning
lora_config = LoraConfig(r=8, lora_alpha=32, target_modules=["q_proj", "v_proj"],
                         lora_dropout=0.1, bias="none", task_type="CAUSAL_LM")
model = get_peft_model(model, lora_config)


In [22]:

# Load training data
with open("training_data.json", "r", encoding="utf-8") as f:
    train_data = json.load(f)


In [None]:

# Training settings
training_args = TrainingArguments(
    output_dir="./trained_model",
    per_device_train_batch_size=4,
    num_train_epochs=3,
    save_steps=500,
    logging_steps=100
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data
)

# Start training
trainer.train()


# 📌 4️⃣ Deploy the Trained Model as an API

In [None]:
from fastapi import FastAPI
from transformers import AutoModelForCausalLM, AutoTokenizer

app = FastAPI()

# Load trained model
model_name = "./trained_model"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", device_map="auto")

@app.get("/ask")
def ask(question: str):
    """ API to interact with the trained model """
    inputs = tokenizer(question, return_tensors="pt")
    output = model.generate(**inputs, max_length=500)
    return {"answer": tokenizer.decode(output[0], skip_special_tokens=True)}

