In [1]:
!pip install PyPDF2 pytesseract pdf2image pillow

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Collecting pytesseract
  Downloading pytesseract-0.3.13-py3-none-any.whl.metadata (11 kB)
Collecting pdf2image
  Downloading pdf2image-1.17.0-py3-none-any.whl.metadata (6.2 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pytesseract-0.3.13-py3-none-any.whl (14 kB)
Downloading pdf2image-1.17.0-py3-none-any.whl (11 kB)
Installing collected packages: pytesseract, PyPDF2, pdf2image
Successfully installed PyPDF2-3.0.1 pdf2image-1.17.0 pytesseract-0.3.13


In [2]:
!sudo apt-get install tesseract-ocr tesseract-ocr-ara

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
tesseract-ocr is already the newest version (4.1.1-2.1build1).
The following NEW packages will be installed:
  tesseract-ocr-ara
0 upgraded, 1 newly installed, 0 to remove and 34 not upgraded.
Need to get 645 kB of archives.
After this operation, 1,447 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr-ara all 1:4.00~git30-7274cfa-1.1 [645 kB]
Fetched 645 kB in 1s (573 kB/s)
debconf: unable to initialize frontend: Dialog
debconf: (No usable dialog-like program is installed, so the dialog based frontend cannot be used. at /usr/share/perl5/Debconf/FrontEnd/Dialog.pm line 78, <> line 1.)
debconf: falling back to frontend: Readline
debconf: unable to initialize frontend: Readline
debconf: (This frontend requires a controlling tty.)
debconf: falling back to frontend: Teletype
dpkg-preconfigure: unable to re-open stdin: 
Selecting p

In [7]:
# Install required packages
!pip install PyPDF2 pytesseract pdf2image

# Download and install tesseract + Arabic language pack if running in Colab
import os
if 'google.colab' in str(get_ipython()):
    !apt-get install tesseract-ocr
    !apt-get install tesseract-ocr-ara
    !apt-get install poppler-utils

import re
import json
import PyPDF2
import pytesseract
from pdf2image import convert_from_path
import numpy as np
from PIL import Image
import tempfile
from google.colab import files

def extract_text_from_pdf(pdf_path, use_ocr=True):
    """
    Extract the text of a PDF file, page by page.

    Parameters:
    - pdf_path: path of the PDF file to read
    - use_ocr: when True (default, recommended for Arabic PDFs) every page is
      rendered to an image and read with Tesseract using the 'ara' language
      data; when False the text layer embedded in the PDF is read directly
      with PyPDF2 (might not work well with Arabic)

    Returns:
    - one string in which the text of every page is followed by a blank line

    Note: the two modes are independent; there is no automatic fallback from
    direct extraction to OCR.
    """
    if not use_ocr:
        # Read the embedded text layer directly from the PDF
        with open(pdf_path, 'rb') as file:
            pdf_reader = PyPDF2.PdfReader(file)
            # Guard against extract_text() handing back None instead of a
            # string for a page without text, which would otherwise raise a
            # TypeError when the page separator is appended.
            page_texts = [(page.extract_text() or "") for page in pdf_reader.pages]
        return "".join(page_text + "\n\n" for page_text in page_texts)

    print("Using OCR to extract text from PDF...")

    # Convert PDF to images (one image per page)
    print("Converting PDF to images...")
    images = convert_from_path(pdf_path)

    # Extract text from each image using OCR with Arabic language
    page_texts = []
    for i, image in enumerate(images):
        print(f"Processing page {i+1}/{len(images)}...")
        page_texts.append(pytesseract.image_to_string(image, lang='ara'))  # 'ara' for Arabic

    # Join once at the end instead of growing a string inside the loop
    return "".join(page_text + "\n\n" for page_text in page_texts)

def split_into_paragraphs(text):
    """Break text apart at runs of line breaks and return the non-blank pieces, trimmed."""
    # Trim every piece lazily, then keep only the ones that still have content
    trimmed_pieces = (piece.strip() for piece in re.split(r'\n+', text))
    return [piece for piece in trimmed_pieces if piece]

def split_into_sentences(paragraph):
    """
    Split a paragraph into sentences, keeping each sentence's end punctuation.

    A sentence ends at a run of one or more of: '.', '!', '?', the Arabic
    question mark '؟', the danda '।', or a newline.

    Parameters:
    - paragraph: the text to split

    Returns:
    - list of stripped, non-empty sentences; text after the last terminator
      (or the whole paragraph when it has no terminator) is kept as a final
      sentence
    """
    # Because the pattern has a capture group, re.split always returns an
    # odd-length list: [text, terminator, text, terminator, ..., trailing text]
    parts = re.split(r'([.!?؟।\n]+)', paragraph)

    # Re-attach every terminator run to the text that precedes it
    # (zip stops before the unpaired trailing text)
    sentences = [body + terminator for body, terminator in zip(parts[0::2], parts[1::2])]

    # Trailing text that is not followed by any terminator
    if parts[-1].strip():
        sentences.append(parts[-1])

    return [s.strip() for s in sentences if s.strip()]

def chunk_paragraphs(paragraphs, min_sentences=2, max_sentences=4):
    """
    Split paragraphs into chunks of min_sentences..max_sentences sentences.

    Parameters:
    - paragraphs: iterable of paragraph strings
    - min_sentences: a paragraph with at most this many sentences is kept as a
      single chunk; it is also the target minimum size of the last chunk of a
      longer paragraph
    - max_sentences: maximum number of sentences per chunk

    Returns:
    - list of chunk strings (the sentences of a chunk joined by one space);
      chunks never span two paragraphs, and paragraphs without any sentence
      contribute nothing
    """
    chunks = []
    # A non-positive max would make the slicing step invalid; one sentence per
    # chunk is the smallest meaningful size
    step = max(1, max_sentences)

    for paragraph in paragraphs:
        sentences = split_into_sentences(paragraph)

        # Nothing to emit (avoids appending an empty-string chunk)
        if not sentences:
            continue

        # Short paragraphs are kept as is
        if len(sentences) <= min_sentences:
            chunks.append(" ".join(sentences))
            continue

        # Cut the paragraph into groups of at most `step` sentences
        groups = [sentences[start:start + step] for start in range(0, len(sentences), step)]

        # The last group may be shorter than min_sentences (e.g. 5 sentences
        # -> 4 + 1). Move sentences over from the previous group when that
        # group can spare them without itself dropping below the minimum.
        if len(groups) > 1:
            shortfall = min_sentences - len(groups[-1])
            if shortfall > 0 and len(groups[-2]) - shortfall >= min_sentences:
                groups[-1] = groups[-2][-shortfall:] + groups[-1]
                groups[-2] = groups[-2][:-shortfall]

        chunks.extend(" ".join(group) for group in groups)

    return chunks

def process_arabic_pdf(pdf_path, output_file='chunked_arabic_book.json', use_ocr=True):
    """
    Run the full PDF -> text -> paragraphs -> chunks pipeline and write the results to disk.

    Two files are written: the raw extracted text (the name of output_file with
    its extension replaced by "_full_text.txt") and output_file itself, a JSON
    document holding the PDF's base name under "source" and the numbered chunks
    under "chunks".

    Returns a tuple (chunks, text_file, output_file).
    """
    text = extract_text_from_pdf(pdf_path, use_ocr)

    # Keep the raw text next to the JSON so the extraction quality can be inspected
    output_stem, _extension = os.path.splitext(output_file)
    text_file = output_stem + "_full_text.txt"
    with open(text_file, 'w', encoding='utf-8') as f:
        f.write(text)
    print(f"Full extracted text saved to {text_file}")

    paragraphs = split_into_paragraphs(text)
    chunks = chunk_paragraphs(paragraphs)

    # Number the chunks in order of appearance
    indexed_chunks = []
    for i, chunk in enumerate(chunks):
        indexed_chunks.append({"id": i, "text": chunk})
    output_data = {
        "source": os.path.basename(pdf_path),
        "chunks": indexed_chunks,
    }

    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(output_data, f, ensure_ascii=False, indent=2)

    print(f"Processed {len(paragraphs)} paragraphs into {len(chunks)} chunks")
    print(f"Output saved to {output_file}")

    return chunks, text_file, output_file

# This section for uploading a PDF file in Colab
print("Please upload your Arabic PDF file...")
uploaded = files.upload()

# If nothing was uploaded (e.g. the dialog was dismissed) fail with a clear
# message instead of an IndexError on the filename lookup below
if not uploaded:
    raise RuntimeError("No PDF file was uploaded; re-run this cell and choose a file.")

# Get the filename of the uploaded file (Colab may rename it, e.g. "book (1).pdf")
pdf_filename = next(iter(uploaded))
print(f"Processing {pdf_filename}...")

# Process the PDF
chunks, text_file, json_file = process_arabic_pdf(pdf_filename, use_ocr=True)

# Display a sample of chunks
print("\nSample chunks:")
for i, chunk in enumerate(chunks[:3]):
    print(f"Chunk {i}: {chunk[:100]}...")

# Download the output files
files.download(text_file)
files.download(json_file)

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
tesseract-ocr is already the newest version (4.1.1-2.1build1).
0 upgraded, 0 newly installed, 0 to remove and 34 not upgraded.
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
tesseract-ocr-ara is already the newest version (1:4.00~git30-7274cfa-1.1).
0 upgraded, 0 newly installed, 0 to remove and 34 not upgraded.
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following NEW packages will be installed:
  poppler-utils
0 upgraded, 1 newly installed, 0 to remove and 34 not upgraded.
Need to get 186 kB of archives.
After this operation, 696 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 poppler-utils amd64 22.02.0-2ubuntu0.7 [186 kB]
Fetched 186 kB in 0s (375 kB/s)
Selecting previously unselected package poppler-utils.
(Reading database

Saving book.pdf to book (1).pdf
Processing book (1).pdf...
Using OCR to extract text from PDF...
Converting PDF to images...
Processing page 1/192...
Processing page 2/192...
Processing page 3/192...
Processing page 4/192...
Processing page 5/192...
Processing page 6/192...
Processing page 7/192...
Processing page 8/192...
Processing page 9/192...
Processing page 10/192...
Processing page 11/192...
Processing page 12/192...
Processing page 13/192...
Processing page 14/192...
Processing page 15/192...
Processing page 16/192...
Processing page 17/192...
Processing page 18/192...
Processing page 19/192...
Processing page 20/192...
Processing page 21/192...
Processing page 22/192...
Processing page 23/192...
Processing page 24/192...
Processing page 25/192...
Processing page 26/192...
Processing page 27/192...
Processing page 28/192...
Processing page 29/192...
Processing page 30/192...
Processing page 31/192...
Processing page 32/192...
Processing page 33/192...
Processing page 34/192...


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [3]:
!pip install sentence-transformers transformers

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=1.11.0->sentence-transformers)
 

In [4]:
import json
import torch
import numpy as np
from sentence_transformers import SentenceTransformer
from tqdm.notebook import tqdm  # For progress tracking in notebooks

# Load the sentence-embedding model once at module level; generate_embeddings()
# below encodes with this shared instance instead of reloading it per call.
model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')

# Function to generate embeddings
def generate_embeddings(texts):
    """Encode texts with the module-level `model` and return the encoder's tensor output."""
    encoded = model.encode(texts, convert_to_tensor=True)
    return encoded

# Load chunks from the JSON file
def load_chunks(json_file_path):
    """Read the UTF-8 encoded JSON file at json_file_path and return its parsed content."""
    with open(json_file_path, 'r', encoding='utf-8') as f:
        return json.load(f)

# Process and save embeddings
def process_chunks_with_embeddings(json_file_path, output_path='chunks_with_embeddings.json',
                                   embeddings_path='chunk_embeddings.npy'):
    """
    Embed every chunk of a chunked-book JSON file and save the results.

    Parameters:
    - json_file_path: JSON file whose 'chunks' list holds dicts with a 'text' key
    - output_path: where to write the same JSON with an 'embedding' list added
      to every chunk
    - embeddings_path: where to write the raw embedding matrix as a NumPy file
      (previously hard-coded to 'chunk_embeddings.npy', which is still the default)

    Returns:
    - (data, embeddings): the loaded JSON data with the embeddings added, and
      the embeddings exactly as returned by generate_embeddings()
    """
    # Load data
    data = load_chunks(json_file_path)

    # Extract texts from chunks
    chunk_texts = [chunk['text'] for chunk in data['chunks']]
    print(f"Generating embeddings for {len(chunk_texts)} chunks...")

    # Generate embeddings
    embeddings = generate_embeddings(chunk_texts)
    print(f"Shape of embeddings: {embeddings.shape}")

    # Convert to numpy for saving (moved to the CPU first)
    embeddings_numpy = embeddings.cpu().numpy()

    # Attach each vector to its chunk; rows are in the same order as chunk_texts
    for chunk, embedding in zip(data['chunks'], embeddings_numpy):
        chunk['embedding'] = embedding.tolist()

    # Save the data with embeddings
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=2)

    # Also save embeddings separately as numpy array
    np.save(embeddings_path, embeddings_numpy)

    print(f"Saved chunks with embeddings to {output_path}")
    print(f"Saved raw embeddings to {embeddings_path}")

    return data, embeddings

# Detect Colab with a try block that covers ONLY the import, so that an
# ImportError raised while processing is not mistaken for "not running in
# Colab" (which would silently start the whole workflow a second time).
try:
    from google.colab import files
    RUNNING_IN_COLAB = True
except ImportError:
    # Not running in Colab
    RUNNING_IN_COLAB = False

if RUNNING_IN_COLAB:
    print("Please upload your chunked Arabic book JSON file...")
    uploaded = files.upload()
    # Fail with a clear message instead of an IndexError when nothing was uploaded
    if not uploaded:
        raise RuntimeError("No JSON file was uploaded; re-run this cell and choose a file.")
    json_file = next(iter(uploaded))
else:
    print("Enter the path to your chunked JSON file:")
    json_file = input()  # Or hardcode it: json_file = "chunked_arabic_book.json"

# Process the data (same call in both environments)
data, embeddings = process_chunks_with_embeddings(json_file)

if RUNNING_IN_COLAB:
    # Download the results
    print("Downloading the files with embeddings...")
    files.download('chunks_with_embeddings.json')
    files.download('chunk_embeddings.npy')

# Show the first embedding as a sample
print("\nSample embedding (first 5 dimensions):")
print(embeddings[0][:5])
print(f"Total embedding dimensions: {embeddings.shape[1]}")

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.89k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/645 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/471M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/480 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


tokenizer.json:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Please upload your chunked Arabic book JSON file...


Saving chunked_arabic_book.json to chunked_arabic_book (1).json
Generating embeddings for 4664 chunks...
Shape of embeddings: torch.Size([4664, 384])
Saved chunks with embeddings to chunks_with_embeddings.json
Saved raw embeddings to chunk_embeddings.npy
Downloading the files with embeddings...


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


Sample embedding (first 5 dimensions):
tensor([ 0.1642,  0.0462,  0.0783, -0.0278, -0.0244], device='cuda:0')
Total embedding dimensions: 384


In [5]:
!pip install faiss-cpu

Collecting faiss-cpu
  Downloading faiss_cpu-1.11.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.8 kB)
Downloading faiss_cpu-1.11.0-cp311-cp311-manylinux_2_28_x86_64.whl (31.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.3/31.3 MB[0m [31m47.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.11.0


In [6]:
# Sanity check: list the installed faiss package together with its version
!pip list | grep faiss

faiss-cpu                             1.11.0


In [15]:
import faiss
import os


def load_embeddings(embeddings_file='chunk_embeddings.npy'):
    """
    Read the embedding matrix stored in a NumPy file.

    Raises FileNotFoundError when embeddings_file does not exist; otherwise
    prints the shape of the loaded array and returns it.
    """
    # Guard clause: bail out early when the file is missing
    if not os.path.exists(embeddings_file):
        raise FileNotFoundError(f"Embeddings file {embeddings_file} not found")

    embeddings = np.load(embeddings_file)
    print(f"Loaded embeddings with shape: {embeddings.shape}")
    return embeddings

def load_chunks_with_embeddings(json_file='chunks_with_embeddings.json'):
    """Parse the UTF-8 JSON file, report how many entries its 'chunks' list has, and return the parsed data."""
    with open(json_file, mode='r', encoding='utf-8') as source:
        data = json.load(source)

    print(f"Loaded {len(data['chunks'])} chunks with embeddings")
    return data

def create_faiss_index(embeddings, index_type='flat'):
    """
    Create a FAISS index from the embeddings.

    The vectors are L2-normalized before being added, so the inner-product
    scores returned by the index are cosine similarities.

    Parameters:
    - embeddings: 2-D array-like of embeddings (one row per vector); it is
      never modified, the index is built from a private float32 copy
    - index_type: type of FAISS index to create ('flat', 'ivf', or 'hnsw')

    Returns:
    - faiss index containing all vectors

    Raises:
    - ValueError: unknown index_type, or embeddings is not 2-dimensional
    """
    # Validate the cheap argument first, before copying/normalizing anything
    if index_type not in ('flat', 'ivf', 'hnsw'):
        raise ValueError(f"Unknown index type: {index_type}")

    # FAISS operates on float32, C-contiguous arrays and normalize_L2 works in
    # place. A plain .copy() keeps the input dtype/layout (e.g. float64), which
    # depending on the FAISS version is either rejected or normalized on a
    # temporary copy - leaving un-normalized vectors in the index.
    embeddings_normalized = np.array(embeddings, dtype=np.float32, order='C', copy=True)
    if embeddings_normalized.ndim != 2:
        raise ValueError(
            f"Expected a 2-D array of embeddings, got shape {embeddings_normalized.shape}"
        )

    # Get dimensions
    num_vectors, vector_dimension = embeddings_normalized.shape

    print(f"Creating FAISS index for {num_vectors} vectors with {vector_dimension} dimensions")

    # Normalize vectors for cosine similarity
    faiss.normalize_L2(embeddings_normalized)

    # Choose index type
    if index_type == 'flat':
        # Flat index - exact search, but slower for large datasets
        index = faiss.IndexFlatIP(vector_dimension)  # Inner product for cosine similarity
        index.add(embeddings_normalized)

    elif index_type == 'ivf':
        # IVF index - approximate search, faster for larger datasets
        # Number of centroids - rule of thumb: sqrt(N); at least one cell is required
        nlist = max(1, int(np.sqrt(num_vectors)))
        # Create quantizer
        quantizer = faiss.IndexFlatIP(vector_dimension)
        # Create IVF index
        index = faiss.IndexIVFFlat(quantizer, vector_dimension, nlist, faiss.METRIC_INNER_PRODUCT)
        # Train the index
        print("Training IVF index...")
        index.train(embeddings_normalized)
        # Add vectors
        index.add(embeddings_normalized)
        # Set number of probes (higher = more accurate but slower)
        index.nprobe = min(10, nlist)

    else:  # 'hnsw'
        # HNSW index - very fast for large datasets, good accuracy
        # M parameter controls the maximum number of connections per layer
        M = 16
        # Create HNSW index
        index = faiss.IndexHNSWFlat(vector_dimension, M, faiss.METRIC_INNER_PRODUCT)
        # Add vectors
        index.add(embeddings_normalized)

    print(f"Created {index_type.upper()} index with {index.ntotal} vectors")
    return index

def save_faiss_index(index, output_file='arabic_embeddings.faiss'):
    """
    Persist a FAISS index to disk.

    Writes `index` to `output_file` with faiss.write_index, prints a
    confirmation line, and hands the path back to the caller.
    """
    faiss.write_index(index, output_file)

    # Only reached when the write above did not raise
    print(f"Saved FAISS index to {output_file}")

    return output_file

def test_faiss_index(index, embeddings, chunks_data, query_text="مرحبا", top_k=5):
    """
    Test the FAISS index with a query and display results.

    Parameters:
    - index: FAISS index
    - embeddings: original embeddings
    - chunks_data: original chunks with text
    - query_text: Arabic query text
    - top_k: number of results to return
    """
    print(f"\nTesting index with query: '{query_text}'")

    # Load the model to encode the query
    model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')

    # Encode the query
    query_embedding = model.encode([query_text], convert_to_tensor=True)
    query_embedding_np = query_embedding.cpu().numpy()

    # Normalize the query embedding
    faiss.normalize_L2(query_embedding_np)

    # Search in the index
    distances, indices = index.search(query_embedding_np, top_k)

    # Display results
    print(f"\nTop {top_k} results:")
    for i, (idx, distance) in enumerate(zip(indices[0], distances[0])):
        chunk_text = chunks_data['chunks'][idx]['text']
        # Truncate long chunks for display
        if len(chunk_text) > 100:
            chunk_text = chunk_text[:100] + "..."
        print(f"{i+1}. [Score: {distance:.4f}] {chunk_text}")

    return distances, indices

def main(embeddings_file='chunk_embeddings.npy',
         chunks_file='chunks_with_embeddings.json',
         index_type='flat',
         output_file='arabic_embeddings.faiss'):
    """
    Run the whole indexing workflow: load, build, save, then smoke-test.

    Loads the embeddings and the chunk JSON, builds a FAISS index of the
    requested type, writes it to output_file, runs one sample query against it
    and returns (index, chunks_data).
    """
    # Inputs: the raw vectors and the chunks they were computed from
    embeddings, chunks_data = (
        load_embeddings(embeddings_file),
        load_chunks_with_embeddings(chunks_file),
    )

    # Build the index and persist it
    index = create_faiss_index(embeddings, index_type)
    save_faiss_index(index, output_file)

    # Smoke test with a sample query (change it to a word from your book)
    test_faiss_index(index, embeddings, chunks_data, "مرحبا")

    print("\nFAISS indexing complete!")
    return index, chunks_data

# Detect Google Colab. The try block covers ONLY the import: if it also wrapped
# the workflow, an ImportError raised anywhere inside main() would be mistaken
# for "not in Colab" and the prompts and indexing would run a second time.
try:
    from google.colab import files
    RUNNING_IN_COLAB = True
except ImportError:
    RUNNING_IN_COLAB = False

if RUNNING_IN_COLAB:
    print("Running in Google Colab environment")
else:
    print("Not running in Colab environment")

# Ask for file paths (an empty answer selects the default)
embeddings_file = input("Enter path to embeddings file (default: chunk_embeddings.npy): ") or "chunk_embeddings.npy"
chunks_file = input("Enter path to chunks file (default: chunks_with_embeddings.json): ") or "chunks_with_embeddings.json"

# Choose index type
print("\nChoose FAISS index type:")
print("1. Flat index (exact search, slower but most accurate)")
print("2. IVF index (approximate search, good balance of speed and accuracy)")
print("3. HNSW index (very fast, good accuracy, best for large datasets)")
choice = input("Enter your choice (1/2/3): ")

# Anything other than 1/2/3 falls back to the exact flat index;
# surrounding whitespace in the answer is ignored
index_type = {
    '1': 'flat',
    '2': 'ivf',
    '3': 'hnsw'
}.get(choice.strip(), 'flat')

# Run indexing (identical in both environments)
index, chunks_data = main(embeddings_file, chunks_file, index_type)

# Download the index (only possible in Colab)
if RUNNING_IN_COLAB:
    files.download('arabic_embeddings.faiss')

Running in Google Colab environment
Enter path to embeddings file (default: chunk_embeddings.npy): /content/chunk_embeddings.npy
Enter path to chunks file (default: chunks_with_embeddings.json): /content/chunks_with_embeddings.json

Choose FAISS index type:
1. Flat index (exact search, slower but most accurate)
2. IVF index (approximate search, good balance of speed and accuracy)
3. HNSW index (very fast, good accuracy, best for large datasets)
Enter your choice (1/2/3): 3
Loaded embeddings with shape: (4664, 384)
Loaded 4664 chunks with embeddings
Creating FAISS index for 4664 vectors with 384 dimensions
Created HNSW index with 4664 vectors
Saved FAISS index to arabic_embeddings.faiss

Testing index with query: 'مرحبا'

Top 5 results:
1. [Score: 0.8195] هه
2. [Score: 0.8132] متسر
3. [Score: 0.8098] عات
4. [Score: 0.7823] ىه
5. [Score: 0.7815] اده

FAISS indexing complete!


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>