Step 1: Data Ingestion and Preprocessing

Part 1: Extraction

In [5]:
import os
import json
import logging
import fitz  # PyMuPDF for handling PDFs
from nltk.tokenize import sent_tokenize  # For sentence tokenization
import nltk
nltk.download('punkt')


def extract_text_with_formatting(page):
    """
    Extract paragraphs from a single PDF page, excluding the "References" section.
    Args:
        page (fitz.Page): A page object from the PDF.
    Returns:
        list: A list of sentences extracted from the page.
    """
    sentences = []
    blocks = page.get_text("dict")["blocks"]
    references_detected = False  # Flag to detect the "References" section

    for block in blocks:
        if block["type"] == 0:  # Only process text blocks
            paragraph = " ".join(
                " ".join(span["text"] for span in line["spans"]).strip()
                for line in block["lines"]
            )

            # Check if the paragraph contains the word "References" (case-insensitive)
            if not references_detected and "references" in paragraph.lower():
                references_detected = True
                continue  # Skip the "References" heading and all subsequent content

            if not references_detected and paragraph.strip():
                # Tokenize the paragraph into sentences
                sentences.extend(sent_tokenize(paragraph))

    return sentences


def extract_images(page, output_dir, page_number):
    """
    Extract images from a PDF page and save them to the output directory.
    Args:
        page (fitz.Page): A page object from the PDF.
        output_dir (str): Directory to save the extracted images.
        page_number (int): The page number (0-indexed).
    Returns:
        list: A list of image file paths.
    """
    image_paths = []
    for img_index, img in enumerate(page.get_images(full=True)):
        xref = img[0]
        pixmap = fitz.Pixmap(page.parent, xref)
        image_filename = f"page_{page_number + 1}_img_{img_index + 1}.png"
        image_path = os.path.join(output_dir, image_filename)
        pixmap.save(image_path)
        image_paths.append(image_path)
        logging.info(f"Saved image: {image_path}")

    return image_paths


def process_pdf(pdf_path, output_dir):
    """
    Process a single PDF file and save structured data to the output directory.
    Args:
        pdf_path (str): Path to the PDF file.
        output_dir (str): Directory to save structured data.
    """
    try:
        doc = fitz.open(pdf_path)
        output_data = []

        # Create a subdirectory for images
        images_dir = os.path.join(output_dir, "images")
        if not os.path.exists(images_dir):
            os.makedirs(images_dir)

        for page_number in range(len(doc)):
            page = doc.load_page(page_number)
            sentences = extract_text_with_formatting(page)
            extract_images(page, images_dir, page_number)  # Extract images but don't store paths

            page_data = {
                'page_number': page_number + 1,
                'sentences': sentences,  # Store sentences
            }
            output_data.append(page_data)

        json_filename = os.path.splitext(os.path.basename(pdf_path))[0] + '.json'
        json_path = os.path.join(output_dir, json_filename)
        with open(json_path, 'w', encoding='utf-8') as json_file:
            json.dump(output_data, json_file, indent=4, ensure_ascii=False)

        logging.info(f"Saved structured data to {json_path}")

    except Exception as e:
        logging.error(f"Error processing {pdf_path}: {e}")


def process_pdfs_in_directory(input_dir, output_dir):
    """
    Process all PDF files in the input directory and save structured data to the output directory.
    Args:
        input_dir (str): Path to the directory containing PDF files.
        output_dir (str): Path to the directory to save structured data.
    """
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    pdf_files = [f for f in os.listdir(input_dir) if f.endswith('.pdf')]

    for pdf_file in pdf_files:
        pdf_path = os.path.join(input_dir, pdf_file)
        logging.info(f"Processing {pdf_path}...")
        process_pdf(pdf_path, output_dir)


# Example Usage for Extraction
if __name__ == "__main__":
    pdf_directory = r"C:\Users\Amina\Downloads\freelanceProject\CHatbotUrgent\Dataset"
    extracted_directory = r"C:\Users\Amina\Downloads\freelanceProject\CHatbotUrgent\Extracteddataset"

    # Step 1: Extract data from PDFs
    process_pdfs_in_directory(pdf_directory, extracted_directory)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Amina\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
2025-01-23 14:22:12,118 - INFO - Processing C:\Users\Amina\Downloads\freelanceProject\CHatbotUrgent\Dataset\2106.10270v2.pdf...
2025-01-23 14:22:12,256 - INFO - Saved image: C:\Users\Amina\Downloads\freelanceProject\CHatbotUrgent\Extracteddataset\images\page_7_img_1.png
2025-01-23 14:22:12,256 - INFO - Saved image: C:\Users\Amina\Downloads\freelanceProject\CHatbotUrgent\Extracteddataset\images\page_7_img_2.png
2025-01-23 14:22:12,265 - INFO - Saved image: C:\Users\Amina\Downloads\freelanceProject\CHatbotUrgent\Extracteddataset\images\page_7_img_3.png
2025-01-23 14:22:12,265 - INFO - Saved image: C:\Users\Amina\Downloads\freelanceProject\CHatbotUrgent\Extracteddataset\images\page_7_img_4.png
2025-01-23 14:22:12,272 - INFO - Saved image: C:\Users\Amina\Downloads\freelanceProject\CHatbotUrgent\Extracteddataset\images\page_7_img_5.png
2025-

Part 2: Cleaning and Tokenization

In [6]:
import os
import json
import re
import logging
from nltk.tokenize import sent_tokenize  # For sentence tokenization
from nltk.corpus import stopwords
from string import punctuation
# Ensure NLTK resources are available
import nltk
nltk.download('punkt')
nltk.download('stopwords')


def clean_sentence(sentence):
    """
    Cleans a sentence by removing URLs, punctuation, and stopwords.
    Args:
        sentence (str): The sentence to clean.
    Returns:
        str: The cleaned sentence.
    """
    # Remove URLs using a robust regex pattern
    sentence = re.sub(r"http\S+|www\S+|https\S+", "", sentence, flags=re.IGNORECASE)
    
    # Remove punctuation
    sentence = re.sub(f"[{re.escape(punctuation)}]", "", sentence)
    
    # Lowercase the text
    sentence = sentence.lower()
    
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    cleaned_sentence = " ".join(word for word in sentence.split() if word not in stop_words)
    
    return cleaned_sentence


def clean_json_file(input_path, output_path):
    """
    Cleans sentences in a JSON file and saves the cleaned data.
    Args:
        input_path (str): Path to the input JSON file.
        output_path (str): Path to save the cleaned JSON file.
    """
    try:
        with open(input_path, 'r', encoding='utf-8') as infile:
            data = json.load(infile)

        for page in data:
            page['sentences'] = [clean_sentence(sentence) for sentence in page.get('sentences', [])]

        with open(output_path, 'w', encoding='utf-8') as outfile:
            json.dump(data, outfile, indent=4, ensure_ascii=False)

        logging.info(f"Cleaned data saved to {output_path}")

    except Exception as e:
        logging.error(f"Error cleaning {input_path}: {e}")


def process_directory(input_dir, output_dir):
    """
    Processes all JSON files in a directory, cleaning them and saving the results.
    Args:
        input_dir (str): Path to the directory containing JSON files.
        output_dir (str): Path to save cleaned JSON files.
    """
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    json_files = [f for f in os.listdir(input_dir) if f.endswith('.json')]

    for json_file in json_files:
        input_path = os.path.join(input_dir, json_file)
        output_path = os.path.join(output_dir, json_file)
        logging.info(f"Processing {json_file}...")
        clean_json_file(input_path, output_path)


# Example Usage for Cleaning and Tokenization
if __name__ == "__main__":
    extracted_directory = r"C:\Users\Amina\Downloads\freelanceProject\CHatbotUrgent\Extracteddataset"
    cleaned_directory = r"C:\Users\Amina\Downloads\freelanceProject\CHatbotUrgent\Cleaneddataset"

    # Step 2: Clean the extracted data
    process_directory(extracted_directory, cleaned_directory)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Amina\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Amina\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
2025-01-23 14:32:39,323 - INFO - Processing 2106.10270v2.json...
2025-01-23 14:32:39,522 - INFO - Cleaned data saved to C:\Users\Amina\Downloads\freelanceProject\CHatbotUrgent\Cleaneddataset\2106.10270v2.json
2025-01-23 14:32:39,522 - INFO - Processing 2112.13492v1.json...
2025-01-23 14:32:39,657 - INFO - Cleaned data saved to C:\Users\Amina\Downloads\freelanceProject\CHatbotUrgent\Cleaneddataset\2112.13492v1.json
2025-01-23 14:32:39,657 - INFO - Processing 2307.08461v3.json...
2025-01-23 14:32:39,673 - INFO - Cleaned data saved to C:\Users\Amina\Downloads\freelanceProject\CHatbotUrgent\Cleaneddataset\2307.08461v3.json
2025-01-23 14:32:39,673 - INFO - Processing 2310.05421v1.json

Step 1 (Load and Parse JSON Files), Step 2 (Generate Embeddings), and Step 3 (Index the Embeddings),+Retrieve Relevant Documents

In [5]:
import os
import json
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer

# Step 1: Load and Parse JSON Files
def load_and_parse_json_files(directory):
    """
    Load JSON files from the directory and combine sentences into chunks.
    """
    documents = []
    for filename in os.listdir(directory):
        if filename.endswith(".json"):
            filepath = os.path.join(directory, filename)
            with open(filepath, "r", encoding="utf-8") as f:
                data = json.load(f)
                #print(f"Loaded JSON file: {filename}")
                #print(f"Structure: {data}")  # Debug: Print the structure of the JSON file
                documents.append(data)

    # Combine sentences into chunks (e.g., by page)
    text_chunks = []
    for doc in documents:
        # Debug: Print the structure of each document
        #print(f"Document structure: {doc}")
        
        # Check if the document is a list or a dictionary
        if isinstance(doc, list):
            # If the document is a list, iterate through it
            for item in doc:
                page_number = item.get("page_number", "Unknown")  # Use .get() to avoid KeyError
                sentences = item.get("sentences", [])  # Use .get() to avoid KeyError
                chunk = " ".join(sentences)  # Combine sentences into a single chunk
                text_chunks.append({
                    "page_number": page_number,
                    "text": chunk
                })
        elif isinstance(doc, dict):
            # If the document is a dictionary, process it directly
            page_number = doc.get("page_number", "Unknown")  # Use .get() to avoid KeyError
            sentences = doc.get("sentences", [])  # Use .get() to avoid KeyError
            chunk = " ".join(sentences)  # Combine sentences into a single chunk
            text_chunks.append({
                "page_number": page_number,
                "text": chunk
            })
        else:
            print(f"Unexpected document format: {type(doc)}")
    
    return text_chunks

# Step 2: Generate Embeddings
def generate_embeddings(text_chunks):
    """
    Generate embeddings for the text chunks using Sentence-BERT.
    """
    # Load a pre-trained embedding model
    embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

    # Extract text from chunks
    texts = [chunk["text"] for chunk in text_chunks]

    # Generate embeddings for all text chunks
    embeddings = embedding_model.encode(texts)
    
    return embeddings, embedding_model

# Step 3: Index the Embeddings with Cosine Similarity
def index_embeddings(embeddings):
    """
    Index the embeddings using FAISS with cosine similarity.
    """
    # Normalize embeddings to unit vectors (required for cosine similarity)
    embeddings = np.array(embeddings).astype('float32')
    faiss.normalize_L2(embeddings)  # Normalize embeddings

    # Create a FAISS index for cosine similarity
    dimension = embeddings.shape[1]  # Dimension of the embeddings
    index = faiss.IndexFlatIP(dimension)  # Inner product (cosine similarity)
    index.add(embeddings)  # Add embeddings to the index
    
    return index

# Step 4: Retrieve Relevant Documents
def retrieve_documents(query, embedding_model, index, text_chunks, top_k=3):
    """
    Retrieve the most relevant documents for a given query using cosine similarity.
    """
    # Generate embedding for the query
    query_embedding = embedding_model.encode([query])
    query_embedding = np.array(query_embedding).astype('float32')
    faiss.normalize_L2(query_embedding)  # Normalize query embedding

    # Search the FAISS index
    distances, indices = index.search(query_embedding, top_k)

    # Retrieve the relevant text chunks
    results = []
    for idx in indices[0]:
        results.append(text_chunks[idx])
    
    return results

# Main function to execute all steps
def main():
    # Directory containing cleaned JSON files
    cleaned_directory = r"C:\Users\Amina\Downloads\freelanceProject\CHatbotUrgent\Cleaneddataset"

    # Step 1: Load and parse JSON files
    print("Loading and parsing JSON files...")
    text_chunks = load_and_parse_json_files(cleaned_directory)
    print(f"Loaded {len(text_chunks)} text chunks.")

    # Step 2: Generate embeddings
    print("Generating embeddings...")
    embeddings, embedding_model = generate_embeddings(text_chunks)
    print("Embeddings generated.")

    # Step 3: Index embeddings with cosine similarity
    print("Indexing embeddings...")
    index = index_embeddings(embeddings)
    print("Embeddings indexed.")

    # Step 4: Retrieve relevant documents for a query
    query = "What is the vision transformer?"
    print(f"Query: {query}")

    # Retrieve the top-k relevant documents
    top_k = 3
    relevant_chunks = retrieve_documents(query, embedding_model, index, text_chunks, top_k)

    # Print the results
    print(f"Top {top_k} results:")
    for i, chunk in enumerate(relevant_chunks):
        print(f"Result {i+1}: Page {chunk['page_number']}")
        print(f"Text: {chunk['text']}\n")

# Run the main function
if __name__ == "__main__":
    main()

Loading and parsing JSON files...
Loaded 168 text chunks.
Generating embeddings...
Embeddings generated.
Indexing embeddings...
Embeddings indexed.
Query: What is the vision transformer?
Top 3 results:
Result 1: Page 1
Text: vision transformer smallsize datasets seung hoon lee inha university incheon south korea aanna0701gmailcom seunghyun lee inha university incheon south korea lsh910703gmailcom byung cheol song inha university incheon south korea bcsonginhaackr abstract recently vision transformer vit applied transformer structure image classiﬁcation task outperformed convolutional neural networks however high performance vit results pretraining using largesize dataset jft300m depen dence large dataset interpreted due low locality inductive bias paper proposes shifted patch tokeniza tion spt locality selfattention lsa effec tively solve lack locality inductive bias enable learn scratch even smallsize datasets moreover spt lsa generic effective addon modules easily applicable various 

In [15]:
import os
import json
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer
import ollama

# Step 1: Load the FAISS index and embedding model
def load_faiss_index(index_path):
    try:
        index = faiss.read_index(index_path)
        print("FAISS index loaded successfully.")
        return index
    except Exception as e:
        print(f"Error loading FAISS index: {e}")
        raise

def load_embedding_model():
    try:
        embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
        print("Embedding model loaded successfully.")
        return embedding_model
    except Exception as e:
        print(f"Error loading embedding model: {e}")
        raise

# Step 2: Load all JSON files from a directory
def load_text_chunks_from_directory(directory_path):
    text_chunks = []
    try:
        for file_name in os.listdir(directory_path):
            if file_name.endswith('.json'):
                file_path = os.path.join(directory_path, file_name)
                with open(file_path, 'r', encoding='utf-8') as file:  # Explicitly use UTF-8
                    data = json.load(file)
                    if isinstance(data, list):
                        text_chunks.extend(data)
        print(f"Loaded {len(text_chunks)} text chunks from directory.")
        return text_chunks
    except Exception as e:
        print(f"Error loading text chunks from directory: {e}")
        raise


# Step 3: Retrieve context using FAISS index
def retrieve_context(query, embedding_model, index, text_chunks, top_k=3):
    try:
        query_embedding = embedding_model.encode([query])
        query_embedding = np.array(query_embedding).astype('float32')
        faiss.normalize_L2(query_embedding)

        distances, indices = index.search(query_embedding, top_k)
        context = "\n".join([text_chunks[idx]["text"] for idx in indices[0]])
        return context
    except Exception as e:
        print(f"Error retrieving context: {e}")
        return None

# Step 4: Load the LLaMA model using Ollama
def load_llama_model():
    try:
        models = ollama.list()
        model_list = models.get('models', [])
        model_names = [model.get('model') for model in model_list]
        
        if "llama3.2:latest" not in model_names:
            print("Downloading LLaMA 3.2 model...")
            ollama.pull("llama3.2:latest")
        print("LLaMA 3.2 model ready")
    except Exception as e:
        print(f"Model loading failed: {e}")
        raise

# Step 5: Generate response using retrieved context
def generate_response(context, question):
    try:
        prompt = f"""Perform these tasks on the text below:
        1. Summarize in 3 sentences
        2. Answer: {question}

        Text: {context}

        Format exactly as:
        Summary: <summary>
        Answer: <answer>"""

        response = ollama.generate(
            model="llama3.2:latest",
            prompt=prompt,
            options={"temperature": 0.3}
        )
        return parse_response(response["response"])
    except Exception as e:
        print(f"Error generating response: {e}")
        return None, None

# Step 6: Parse the response
def parse_response(response):
    summary, answer = None, None
    lines = response.split("\n")
    for line in lines:
        line = line.strip()
        if line.startswith("Summary:"):
            summary = line.split("Summary:", 1)[1].strip()
        elif line.startswith("Answer:"):
            answer = line.split("Answer:", 1)[1].strip()
    if not summary or not answer:
        print("Warning: Response format incorrect")
    return summary, answer

# Main workflow
def main():
    # File paths
    faiss_index_path = "faiss_index_cosine.index"  # Replace with your FAISS index path
    directory_path = "C:\\Users\\Amina\\Downloads\\freelanceProject\\CHatbotUrgent\\Cleaneddataset"  # JSON files directory

    # Load FAISS index, embedding model, and text chunks
    index = load_faiss_index(faiss_index_path)
    embedding_model = load_embedding_model()
    text_chunks = load_text_chunks_from_directory(directory_path)

    # Load the LLaMA model
    load_llama_model()

    # Retrieve context and generate response
    query = "What is the vision transformer?"  # Replace with your query
    context = retrieve_context(query, embedding_model, index, text_chunks, top_k=3)
    print(f"Retrieved Context:\n{context}\n")

    summary, answer = generate_response(context, query)
    if summary is not None and answer is not None:
        print(f"Summary:\n{summary}\n")
        print(f"Answer:\n{answer}\n")
    else:
        print("Failed to generate summary and answer.")

if __name__ == "__main__":
    main()


FAISS index loaded successfully.
Embedding model loaded successfully.
Loaded 168 text chunks from directory.
LLaMA 3.2 model ready
Error retrieving context: 
Retrieved Context:
None

Summary:
The vision transformer (ViT) is a neural network architecture that processes visual data efficiently using self-attention mechanisms. It was introduced in 2020 and has become popular for image classification tasks due to its ability to capture long-range dependencies. ViT uses this compact representation to transform visual features into a useful output.

Answer:
The vision transformer (ViT) is a type of neural network architecture that uses self-attention mechanisms to process visual data efficiently, capturing long-range dependencies and contextual information.

