# Install the dependencies

In [None]:
! pip install langchain
! pip install sentence-transformers
! pip install langchain-community
! pip install translate
! pip install pypdfium2
! pip install pypdf2
! pip install boto3
! pip install "pydantic>=1.10.0"
! pip install faiss-cpu


: 

# Data preparation

This section extracts PDF text from S3, splits it into chunks, generates embeddings for the chunks, and finally retrieves the chunks most relevant to a query.

##### Key Steps:
1. Setup and Initialization
    - Import necessary libraries, including boto3 for AWS S3 access, sentence_transformers for embeddings, pypdfium2 for PDF text extraction, and faiss for efficient similarity search.
    - Define S3 credentials and the bucket where PDFs are stored.

2. Define Helper Functions
    - Chunking Function (chunk_text_with_overlap): Splits the text into manageable chunks with some overlap. Each chunk has at most max_length words, and overlap is the number of words each chunk shares with the next one. This helps maintain context across chunk boundaries.
    - PDF Text Extraction (extract_text_from_pdf_stream): Loads a PDF file in memory, extracts text from each page, and stores it along with the page number. This is useful for associating each chunk with the correct page later on.
    - Read PDF from S3 (read_pdf_from_s3): Reads a PDF file from S3 directly into memory, avoiding the need to save it locally. Returns the file's bytes, which can be passed to extract_text_from_pdf_stream.

3. Extract Text and Chunk the PDF Content
    - List Files in S3 Bucket: Connect to the S3 bucket and list all files. 
    - Process Each PDF
    - Retrieve the PDF Stream
    - Chunk the Text by Page
    - Storage for Chunks and Metadata

4. Generate Embeddings
    - Initialize the Model
    - Compute Embeddings for Chunks
    - Normalize the Embeddings

5. Set Up Similarity Search with FAISS
    - Initialize FAISS Index
    - Add Embeddings to the Index

6. Query Function for Retrieval
    - Define the Query Function (query_faiss): This function takes a query, computes its embedding, and retrieves the most similar chunks using FAISS.
    - Generate Query Embedding
    - Perform FAISS Search
    - Return Results with Metadata

In [2]:
import os
import numpy as np
import boto3
from sentence_transformers import SentenceTransformer
import pypdfium2 as pdfium
import io
import faiss  # Import FAISS for similarity search

# Initialize the SentenceTransformer model once at module level; the same
# instance embeds the document chunks and, in query_faiss, the queries.
model = SentenceTransformer('distiluse-base-multilingual-cased')  # Multilingual SBERT, supports Arabic

def chunk_text_with_overlap(text, max_length, overlap):
    """Split ``text`` into overlapping chunks of whitespace-separated words.

    Args:
        text: The text to split; words are obtained with ``str.split()``.
        max_length: Maximum number of words per chunk. Must be positive.
        overlap: Number of words each chunk shares with the next one. Must
            be smaller than ``max_length``.

    Returns:
        A list of chunks, each a single-space-joined string of at most
        ``max_length`` words. Empty or whitespace-only text yields ``[]``.

    Raises:
        ValueError: If ``max_length`` is not positive, or ``overlap`` is not
            smaller than ``max_length``.
    """
    if max_length <= 0:
        raise ValueError("max_length must be a positive number of words")

    step = max_length - overlap
    if step <= 0:
        # A non-positive step would never advance the window, so chunking
        # would loop forever.
        raise ValueError("overlap must be smaller than max_length")

    words = text.split()
    return [
        " ".join(words[start:start + max_length])
        for start in range(0, len(words), step)
    ]

def extract_text_from_pdf_stream(pdf_stream):
    """Extract the text of every page of an in-memory PDF.

    Args:
        pdf_stream: The PDF content as a bytes-like object; it is wrapped in
            ``io.BytesIO`` before being handed to pypdfium2.

    Returns:
        A list with one dict per page, in page order, of the form
        ``{"text": <page text>, "page_number": <1-based page number>}``.
    """
    document = pdfium.PdfDocument(io.BytesIO(pdf_stream))
    pages = []
    for page_number in range(1, len(document) + 1):
        page = document[page_number - 1]  # pypdfium2 pages are indexed from 0
        page_text = page.get_textpage().get_text_range()
        pages.append({"text": page_text, "page_number": page_number})
    return pages

def read_pdf_from_s3(bucket_name, pdf_key):
    """Fetch one object from S3 and return its full content as bytes.

    The whole object is read into memory; nothing is written to local disk.

    Credentials are resolved by boto3's default provider chain (environment
    variables, shared credentials/config files, or an attached IAM role)
    instead of being embedded in the source, where they would leak through
    version control and shared notebooks.

    Args:
        bucket_name: Name of the S3 bucket holding the object.
        pdf_key: Key of the object inside the bucket.

    Returns:
        The object's content as ``bytes``.
    """
    s3 = boto3.client('s3')
    response = s3.get_object(Bucket=bucket_name, Key=pdf_key)
    return response['Body'].read()  # Read the file content into memory

# Step 1: Read and chunk the documents, and also store metadata.
# The four lists below are parallel: position i in each describes chunk i.
documents = []  # Text of each chunk
document_ids = []  # Unique ID of each chunk
metadata = []  # To store metadata for each chunk
page_numbers = []  # To store the page numbers for each chunk

# Your S3 bucket name
bucket_name = 'areeb-data-s3'

# Credentials are taken from boto3's default provider chain (environment
# variables, shared config files, IAM role) rather than hard-coded here.
s3 = boto3.client('s3')

# A single list_objects_v2 call returns at most 1000 keys, so walk every
# page of the listing to make sure no PDF in a large bucket is skipped.
paginator = s3.get_paginator('list_objects_v2')

for listing_page in paginator.paginate(Bucket=bucket_name):
    # A listing page carries no 'Contents' key when it holds no objects.
    for item in listing_page.get('Contents', []):
        pdf_key = item['Key']
        if not pdf_key.endswith('.pdf'):
            continue

        print(f"Processing PDF: {pdf_key}")
        # Read the PDF file from S3 into memory and extract its pages
        pdf_stream = read_pdf_from_s3(bucket_name, pdf_key)
        document_pages = extract_text_from_pdf_stream(pdf_stream)

        # Chunk numbering runs across the whole document. Restarting it on
        # every page would give chunks from different pages the same ID.
        chunks_in_document = 0

        # Process each page separately
        for page in document_pages:
            page_text = page["text"]
            page_number = page["page_number"]

            # Chunk the page text and associate each chunk with the current page number
            chunks = chunk_text_with_overlap(page_text, max_length=300, overlap=75)
            chunk_ids = [
                f"{pdf_key}_chunk_{chunks_in_document + i}"
                for i in range(len(chunks))
            ]
            chunks_in_document += len(chunks)

            documents.extend(chunks)
            document_ids.extend(chunk_ids)
            page_numbers.extend([page_number] * len(chunks))  # Store the page number for each chunk
            metadata.extend(
                {"filename": pdf_key, "chunk_id": chunk_id, "page_number": page_number}
                for chunk_id in chunk_ids
            )

# Step 2: Generate embeddings for each chunk using embedding model
if not documents:
    # Without this guard the failure only shows up further down as an
    # opaque IndexError on embeddings_np.shape[1].
    raise ValueError("No PDF chunks were collected; there is nothing to embed or index.")

# Passing the whole list lets the model batch the chunks internally instead
# of being invoked once per chunk.
embeddings = model.encode(documents)

# Copy the embeddings into a float32 NumPy array (the dtype FAISS works
# with). The copy keeps `embeddings` untouched by the in-place normalization.
embeddings_np = np.array(embeddings, dtype='float32')

# Step 3: Use FAISS for cosine similarity search
# On L2-normalized vectors the inner product equals the cosine similarity.
faiss.normalize_L2(embeddings_np)  # Normalize embeddings in place
index = faiss.IndexFlatIP(embeddings_np.shape[1])  # Using Inner Product for cosine similarity
index.add(embeddings_np)  # Add embeddings to the FAISS index

# Step 4: Define a query and perform retrieval with similarity percentage and metadata lookup
def query_faiss(query_text, top_k=2, search_k=3):
    """Return the chunks most similar to ``query_text``.

    The query is embedded with the same model as the chunks, L2-normalized,
    and searched against the inner-product FAISS index, so each score is a
    cosine similarity (reported here multiplied by 100).

    Args:
        query_text: The query string to embed and search for.
        top_k: Maximum number of results to return.
        search_k: Number of neighbours requested from FAISS; raised to
            ``top_k`` when smaller.

    Returns:
        A list of at most ``top_k`` dicts sorted by descending similarity,
        each with the keys ``"document_id"``, ``"document"``,
        ``"metadata"``, ``"Page_No"`` and ``"similarity"``.
    """
    search_k = max(search_k, top_k)

    # FAISS expects a 2-D float32 array with one row per query vector.
    query_embedding = model.encode(query_text).astype('float32').reshape(1, -1)
    faiss.normalize_L2(query_embedding)  # Normalizes the array in place

    # Perform FAISS search
    scores, indices = index.search(query_embedding, search_k)

    # Convert cosine similarities to percentages
    similarities = scores[0] * 100

    results = []
    for idx, similarity in zip(indices[0], similarities):
        if idx < 0:
            # FAISS pads with -1 when the index holds fewer than search_k
            # vectors; used as a list index it would pick the last chunk.
            continue
        results.append({
            "document_id": document_ids[idx],
            "document": documents[idx],
            "metadata": metadata[idx],  # Retrieve the corresponding metadata
            "Page_No": metadata[idx]["page_number"],  # Add the page number from metadata
            # Stored so the sort below has a key to read.
            "similarity": float(similarity),
        })

    # Sort results by similarity in descending order
    results.sort(key=lambda hit: hit['similarity'], reverse=True)
    return results[:top_k]


  from tqdm.autonotebook import tqdm, trange


Processing PDF: Biology/Biology_Chapter1.pdf




Processing PDF: Biology/Biology_Chapter10.pdf
Processing PDF: Biology/Biology_Chapter11.pdf
Processing PDF: Biology/Biology_Chapter12.pdf
Processing PDF: Biology/Biology_Chapter13.pdf
Processing PDF: Biology/Biology_Chapter14.pdf
Processing PDF: Biology/Biology_Chapter15.pdf
Processing PDF: Biology/Biology_Chapter2.pdf
Processing PDF: Biology/Biology_Chapter3.pdf
Processing PDF: Biology/Biology_Chapter4.pdf
Processing PDF: Biology/Biology_Chapter5.pdf
Processing PDF: Biology/Biology_Chapter6.pdf
Processing PDF: Biology/Biology_Chapter7.pdf
Processing PDF: Biology/Biology_Chapter8.pdf
Processing PDF: Biology/Biology_Chapter9.pdf
Processing PDF: Chemistry/Chemistry_Chapter1.pdf
Processing PDF: Chemistry/Chemistry_Chapter2.pdf
Processing PDF: Chemistry/Chemistry_Chapter3.pdf
Processing PDF: Chemistry/Chemistry_Chapter4.pdf
Processing PDF: Frontiers of Galaxy Evolution /Frontiers of Galaxy Evolution_Chapter1.pdf
Processing PDF: Frontiers of Galaxy Evolution /Frontiers of Galaxy Evolution_C

### Test the RAG

In [3]:
# Example query: run a single retrieval end to end.
query = "How do scientists test their ideas or experiments?"
retrieved_chunks = query_faiss(query)

# Show each hit's ID, text, metadata and page number, followed by a blank line.
for result in retrieved_chunks:
    print(
        f"Document ID: {result['document_id']}",
        f"Document Text: {result['document']}",
        f"Metadata: {result['metadata']}",
        f"Page number: {result['Page_No']}\n",
        sep="\n",
    )



Document ID: Chemistry/Chemistry_Chapter1.pdf_chunk_1
Document Text: will likely draw a more complete conclusion and a more evidence based answer. Running an experiment requires that you have previous knowledge, that you understand all the steps, and that when you get a result, does that result make sense. Can you draw a conclusion from your results? How can you confirm that your result is correct? Scientists run experiments to answer a question, and then they run that same experiment again and again to check their result. They also run other experiments to confirm that other variables are not affecting the conclusion. In addition to confirming your result, consider what evidence do you have that your experiment did not work? Consider the errors that could have occurred during the experimental process. Science is very methodical, but it does not always follow the same linear method. You can visit this awesome site to learn more about how science works! 1.5: The Process of Science is sh

# Predefined questions for each category

In [4]:
# English questions for the chemistry topic. Each one is translated, used as
# a retrieval query, and written to chemistry_results.csv.
chemistry_questions = [
    "What is a hypothesis in science?",
    "How do scientists test their ideas or experiments?",
    "What is radiation, and where can we find it in nature?",
    "Can you name one good thing about using radiation in medicine?",
    "What is the difference between fission and fusion?",
    "Why do some scientists say radiation can be dangerous?",
    "What is energy, and why do we need it?",
    "How do we use energy in our everyday lives?",
    "What are covalent bonds, and how do they hold things together?",
    "Can you name something that causes pollution in the air?",
    "What makes water special compared to other liquids?",
    "Can you explain what happens when salt dissolves in water?",
    "What is the pH scale, and why is it important for water?",
    "What is the difference between acids and bases?"
]
# English questions for the galaxy-evolution topic. Each one is translated,
# used as a retrieval query, and written to galaxy_results.csv.
galaxy_questions = [
    "Can you name one way that galaxies are different from stars?",
    "How did scientists first discover that the universe is expanding?",
    "What is the Big Bang, and why is it important in understanding space?",
    "What is a quasar, and why is it special?",
    "How can scientists find quasars by looking at their light?",
    "Why do quasars change their brightness over time?",
    "What makes quasars look different from other stars or galaxies?",
    "Why do quasars sometimes change color as they get brighter?",
    "What can scientists learn by studying the color of quasars?",
    "How do scientists use special telescopes to observe quasars?",
    "What is the difference between red and blue quasars?",
    "What happens when two galaxies merge together?",
    "How do new stars form when galaxies collide?",
    "Why do scientists study galaxy mergers from long ago?"
]
# English questions for the biology topic. Each one is translated, used as a
# retrieval query, and written to biology_results.csv.
biology_questions = [
    "What is the difference between prokaryotic and eukaryotic cells?",
    "How do cell membranes help protect the cell?",
    "What is the role of the nucleus inside a cell?",
    "What is energy, and why do cells need it?",
    "How do cells use glucose in the process of glycolysis?",
    "What happens in the citric acid cycle?",
    "Can you explain what fermentation is and when it occurs?",
    "How do plants use sunlight to make their own food?",
    "What are the light-dependent reactions in photosynthesis?",
    "Can you explain what the Calvin cycle is?",
    "What is the cell cycle, and why is it important?",
    "How do cells divide to make more cells?",
    # NOTE(review): the next question reads as truncated (no verb after
    # "the cell cycle") - confirm the intended wording with the author.
    "What happens when the cell cycle?",
    "What is sexual reproduction, and why is it important?",
    "What happens during meiosis, and how is it different from mitosis?",
    "Why do children look like their parents?",
    "What is homeostasis, and why is it important for the body?",
    "How does the digestive system help our body get energy from food?",
    "Can you name two types of animals that are invertebrates?",
    "What is the difference between a flatworm and an arthropod?",
    "Can you explain what seedless plants are?",
    "What is a gymnosperm, and how is it different from an angiosperm?",
    "Why are flowers important for plant reproduction?"
]

This code translates a set of English questions into Arabic, retrieves related text for each question, and stores the results in CSV files. It enables quick creation of bilingual question datasets with context.

##### Key Steps
1. Initialize Tools:
    - Sets up a translator to convert questions from English to Arabic.
    - Uses a retrieval function to find relevant text chunks for each question.
2. Process and Save Data:
     - Defines a function to:
        - Translate each question,
        - Retrieve related text,
        - Save the English question, its Arabic translation, and relevant text to a CSV file.       
3. Generate CSVs for Multiple Topics:

##### Result
-  Each CSV contains:
    - English and Arabic versions of each question,
    - Relevant text for context.

In [7]:
import csv
from translate import Translator  # Assumes you have a Translator model or API for translation

# Initialize the English -> Arabic translator once at module level; it is
# reused for every question by save_results_to_csv.
translator = Translator(from_lang="en", to_lang="ar")

# Function to save results to CSV with Arabic translation of the questions
def save_results_to_csv(filename, questions):
    """Write each question, its Arabic translation and its retrieved chunks to a CSV.

    For every question, one row is written per chunk returned by
    ``query_faiss``; a question with several hits therefore spans several
    rows. The "ID" column holds the retrieved chunk's document ID.

    Args:
        filename: Path of the CSV file to create; an existing file is
            overwritten.
        questions: Iterable of English question strings.
    """
    with open(filename, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(["ID", "Question", "Arabic_Question", "Retrieved Text"])

        for question in questions:
            # Translate once per question, not once per retrieved chunk.
            arabic_question = translator.translate(question)

            # query_faiss is the retrieval function this file defines; the
            # previous call targeted a name that does not exist here.
            for result in query_faiss(question):
                writer.writerow([result['document_id'], question, arabic_question, result['document']])

# Write one results CSV per topic, each including the Arabic translations.
for csv_name, topic_questions in (
    ("chemistry_results.csv", chemistry_questions),
    ("galaxy_results.csv", galaxy_questions),
    ("biology_results.csv", biology_questions),
):
    save_results_to_csv(csv_name, topic_questions)
