In [None]:
# Mount Google Drive

In [None]:
# Mount Google Drive at /content/drive; later cells read their input files
# from paths under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Install the necessary libraries

In [None]:
!pip install python-docx pymupdf pandas pdfplumber langchain tiktoken openai faiss-cpu nomic optimum auto-gptq numpy transformers huggingface_hub langdetect rouge-score openpyxl

In [None]:
# Nomic and Hugging Face login credentials

In [None]:
# Credentials are read from the environment (NOMIC_API_KEY / HF_TOKEN) or
# prompted for interactively, so no token is ever stored in the notebook.
# This cell runs before the main import cell, hence the local imports.
import os
from getpass import getpass

import nomic
from huggingface_hub import login

nomic_token = os.environ.get("NOMIC_API_KEY") or getpass("Nomic API token: ")
nomic.login(nomic_token)

hf_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face token: ")
login(token=hf_token)

In [None]:
# import necessary libraries

In [None]:
import ast
import csv
import os
import re
import time
import warnings

import docx
import faiss
import fitz  # PyMuPDF
import nltk
import numpy as np
import pandas as pd
import pdfplumber
import tiktoken

from docx import Document
from langdetect import detect
from nltk.tokenize import sent_tokenize
from nomic import embed
from rouge_score import rouge_scorer
from sentence_transformers import SentenceTransformer, util
from sklearn.metrics.pairwise import cosine_similarity

from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    AutoModelForSeq2SeqLM,
    pipeline,
    LlamaTokenizer,
    LlamaForCausalLM
)


In [None]:
#Task 1

# Reading the Publications:
# • You have different types of publications: Ms Word files (.docx), PDF files (.pdf), and 'csv', 'xlsx', 'xls', 'xlsm' files.
# • Write a Python function to extract the text from these files.
# • Remember, the files might contain tables. Extract the text from these tables in a readable format, but do not
# attempt to reconstruct their visual layout.

In [None]:

# Initialize tokenizer for token counting and chunking.
# This module-level `encoder` is read by tokenize_and_chunk() and
# process_and_save_chunks() below.
encoder = tiktoken.get_encoding("cl100k_base")

def tokenize_and_chunk(text_with_page, global_chunk_counter, chunk_size_tokens=1000):
    """
    Split one extracted item into chunks of at most `chunk_size_tokens` tokens.

    The text is encoded with the module-level `encoder`, the token list is cut
    into consecutive slices, and every slice is decoded back to text.

    Args:
        text_with_page (dict): Contains 'content', 'page_number', and optionally 'source'.
        global_chunk_counter (int): Number to assign to the first chunk produced here.
        chunk_size_tokens (int): Maximum number of tokens per chunk (default 1000).

    Returns:
        tuple: (list of chunk dicts with 'content', 'page_number', 'chunk_number'
        and 'source'; the next unused chunk number).

    Raises:
        ValueError: If `chunk_size_tokens` is smaller than 1.
    """
    if chunk_size_tokens < 1:
        raise ValueError("chunk_size_tokens must be a positive integer")

    tokens = encoder.encode(text_with_page['content'])
    page_number = text_with_page['page_number']
    source = text_with_page.get('source', 'Unknown')

    chunked_data = []
    # Slice by the requested token count itself. The previous version scaled the
    # size by the tokens-per-word ratio, which made every chunk larger than the
    # advertised 1000 tokens.
    for start in range(0, len(tokens), chunk_size_tokens):
        chunked_data.append({
            "content": encoder.decode(tokens[start:start + chunk_size_tokens]),  # Decode tokens back to text
            "page_number": page_number,
            "chunk_number": global_chunk_counter,
            "source": source,
        })
        global_chunk_counter += 1

    return chunked_data, global_chunk_counter

def extract_text_from_file(file_path):
    """
    Route a file to the extractor that matches its (case-insensitive) extension.

    Args:
        file_path (str): Path to the file.

    Returns:
        list: Whatever the selected extractor returns (dicts with content and metadata).

    Raises:
        ValueError: If the extension is not .docx, .pdf, .csv, .xlsx, .xls or .xlsm.
    """
    _, suffix = os.path.splitext(file_path)
    suffix = suffix.lower()

    if suffix in ('.csv', '.xlsx', '.xls', '.xlsm'):
        return extract_all_tables(file_path)
    if suffix == '.pdf':
        return extract_text_and_tables_from_pdf(file_path)
    if suffix == '.docx':
        return extract_text_from_docx(file_path)

    raise ValueError(f"Unsupported file type: {suffix}")

_WORD_NS = 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'
# Elements that separate words visually and therefore must become whitespace.
_WORD_BREAK_TAGS = frozenset(f'{{{_WORD_NS}}}{name}' for name in ('p', 'tab', 'br', 'cr'))


def _docx_element_text(element):
    """Return the text found under a WordprocessingML element, whitespace-normalized."""
    text_tag = f'{{{_WORD_NS}}}t'
    pieces = []
    for node in element.iter():
        if node.tag == text_tag:
            # Runs are concatenated without a separator: a single word may be
            # split across several runs, and real spaces are part of the run text.
            pieces.append(node.text or '')
        elif node.tag in _WORD_BREAK_TAGS:
            pieces.append(' ')
    return ' '.join(''.join(pieces).split())


def extract_text_from_docx(file_path):
    """
    Extracts paragraphs and tables from a Word (.docx) file in document order.

    Args:
        file_path (str): Path to the DOCX file.

    Returns:
        list: One dict per non-empty paragraph or table with the keys 'type'
        ('text' or 'table'), 'source', 'page_number', 'content' and 'table'
        (list of row strings for tables, None for paragraphs).

    Note:
        'page_number' is only an approximation: it advances after every five
        extracted items, because the body XML read here carries no page layout.
    """
    doc = docx.Document(file_path)
    namespaces = {'w': _WORD_NS}
    paragraph_tag = f'{{{_WORD_NS}}}p'
    table_tag = f'{{{_WORD_NS}}}tbl'
    text = []
    page_number = 1

    for element in doc.element.body:
        item = None

        if element.tag == paragraph_tag:
            para_text = _docx_element_text(element)
            if para_text:
                item = {
                    "type": "text",
                    "source": file_path,
                    "page_number": page_number,
                    "content": para_text,
                    "table": None
                }

        elif element.tag == table_tag:
            table_data = []
            for row in element.findall('.//w:tr', namespaces):
                # One entry per cell (not per text run); fall back to the whole
                # row when the cells are wrapped in another element.
                cells = row.findall('w:tc', namespaces) or [row]
                row_text = [cell_text for cell_text in map(_docx_element_text, cells) if cell_text]
                if row_text:
                    table_data.append(" | ".join(row_text))
            if table_data:
                item = {
                    "type": "table",
                    "source": file_path,
                    "page_number": page_number,
                    "content": "\n".join(table_data),
                    "table": table_data
                }

        if item is None:
            continue

        text.append(item)
        # Advance the approximate page only when an item was actually added;
        # empty elements must not move the counter.
        if len(text) % 5 == 0:
            page_number += 1

    return text

def extract_text_and_tables_from_pdf(file_path):
    """
    Extracts page text (PyMuPDF) and tables (pdfplumber) from a PDF.

    Both readers are opened as context managers so their file handles are
    released even when extraction fails part-way through.

    Args:
        file_path (str): Path to the PDF file.

    Returns:
        list: Dicts with 'type' ('text' or 'table'), 'source', 'page_number'
        (1-based) and 'content'. Table rows are joined with '|' and rows are
        separated by newlines; empty cells and empty rows are dropped.
    """
    combined_content = []

    with fitz.open(file_path) as doc, pdfplumber.open(file_path) as pdf:
        for page_index in range(len(doc)):
            page_number = page_index + 1

            # Extract text using PyMuPDF
            page_text = doc.load_page(page_index).get_text("text").strip()
            if page_text:
                combined_content.append({
                    "type": "text",
                    "source": file_path,
                    "page_number": page_number,
                    "content": page_text
                })

            # Extract tables using pdfplumber
            for table in pdf.pages[page_index].extract_tables():
                formatted_table = []
                for row in table:
                    cells = [str(cell) for cell in row if cell]
                    if cells:  # skip rows that contain no text at all
                        formatted_table.append("|".join(cells))
                if formatted_table:
                    combined_content.append({
                        "type": "table",
                        "source": file_path,
                        "page_number": page_number,
                        "content": "\n".join(formatted_table)
                    })

    return combined_content

def extract_all_tables(file_path):
    """
    Extracts tables from a CSV file or from every sheet of an Excel workbook.

    Rows are read without promoting the first row to a pandas header, so the
    first row of each table reaches extract_tables_from_sheet() as data and can
    serve as the table title.

    Args:
        file_path (str): Path to a .csv, .xlsx, .xls or .xlsm file.

    Returns:
        list: Extracted table dicts, in sheet order. For CSV input the file
        name is used in place of a sheet name.
    """
    if os.path.splitext(file_path)[1].lower() == '.csv':
        # pd.ExcelFile cannot open CSV files. Blank lines are kept because they
        # are what separates one table from the next.
        sheet_data = pd.read_csv(file_path, header=None, skip_blank_lines=False)
        return extract_tables_from_sheet(sheet_data, os.path.basename(file_path), file_path)

    tables = []
    with pd.ExcelFile(file_path) as xl:  # closes the workbook handle on exit
        for sheet_name in xl.sheet_names:
            sheet_data = xl.parse(sheet_name, header=None)
            tables.extend(extract_tables_from_sheet(sheet_data, sheet_name, file_path))
    return tables

def _sheet_row_cells(row):
    """Return the row's cells as stripped strings, without trailing empty cells."""
    # pandas marks empty cells as NaN/None; render those as '' instead of 'nan'.
    cells = ["" if pd.isna(cell) else str(cell).strip() for cell in row]
    while cells and not cells[-1]:
        cells.pop()
    return cells


def _format_sheet_table(title_cells, data_rows, sheet_name, file_path):
    """Build the result dict for one table from its title row and data rows."""
    title = " | ".join(title_cells)
    table_data = "\n".join(" | ".join(cells) for cells in data_rows)
    return {
        "type": "table",
        "source": file_path,
        "page_number": sheet_name,
        "title": title,
        "table_data": table_data,
        # Title on its own line, followed by one line per data row.
        "content": f"{title}\n{table_data}" if table_data else title,
    }


def extract_tables_from_sheet(sheet_data, sheet_name, file_path):
    """
    Splits a sheet into tables separated by blank rows.

    The first non-blank row of each group is treated as the table title/header
    and the following non-blank rows as its data. A row is blank when every
    cell is missing (NaN/None) or an empty string; numeric zeros count as data.

    Args:
        sheet_data (DataFrame): Parsed sheet contents.
        sheet_name (str): Name of the sheet (stored as 'page_number').
        file_path (str): File path for reference (stored as 'source').

    Returns:
        list: One dict per table with 'type', 'source', 'page_number', 'title',
        'table_data' and 'content'. A group that consists of a single row is
        returned with empty 'table_data'.
    """
    tables = []
    title_cells = None
    data_rows = []

    for row in sheet_data.values.tolist():
        cells = _sheet_row_cells(row)
        if cells:  # Non-empty row
            if title_cells is None:
                title_cells = cells
            else:
                data_rows.append(cells)
        elif title_cells is not None:
            # A blank row closes the current table, even a title-only one, so
            # its title cannot leak into the next table.
            tables.append(_format_sheet_table(title_cells, data_rows, sheet_name, file_path))
            title_cells = None
            data_rows = []

    # Capture last table if the sheet ends without a blank row
    if title_cells is not None:
        tables.append(_format_sheet_table(title_cells, data_rows, sheet_name, file_path))

    return tables

def process_and_save_chunks(file_path):
    """
    Extract a file, chunk every extracted item, report throughput and write a CSV.

    Args:
        file_path (str): File to be processed.

    Outputs:
        - Console report: total tokens, elapsed time and tokens per second for
          the chunking step.
        - '<input file name>_chunked_text_details.csv' with the columns
          source, page_number, chunk_number and content.
    """
    records = extract_text_from_file(file_path)
    print(f"Extracted text from {file_path}:")

    base_name = os.path.splitext(os.path.basename(file_path))[0]
    collected = []
    next_chunk_id = 1
    token_total = 0

    began = time.time()
    for record in records:
        pieces, next_chunk_id = tokenize_and_chunk(record, next_chunk_id)
        # Re-encode each chunk to count the tokens it actually contains.
        token_total += sum(len(encoder.encode(piece["content"])) for piece in pieces)
        collected.extend(pieces)
    duration = time.time() - began

    rate = token_total / duration if duration > 0 else 0

    # Print performance stats
    print(f"\n--- Performance Stats ---")
    print(f"Total tokens processed: {token_total}")
    print(f"Total time: {duration:.2f} seconds")
    print(f"Tokens per second (TPS): {rate:.2f}")

    # Save chunks to CSV
    csv_file = base_name + "_chunked_text_details.csv"
    columns = ["source", "page_number", "chunk_number", "content"]
    with open(csv_file, mode='w', newline='', encoding='utf-8') as handle:
        out = csv.DictWriter(handle, fieldnames=columns)
        out.writeheader()
        out.writerows(collected)

    print(f"Chunk details saved to {csv_file}")

# Example run: chunk one PDF from the mounted Drive. The resulting CSV is
# written under a relative file name, i.e. into the current working directory.
file_path = "/content/drive/MyDrive/osos_technical_testing_folder/Dr.X Files/new-approaches-and-procedures-for-cancer-treatment.pdf"
process_and_save_chunks(file_path)


In [None]:
  # Task 2

#   Breaking Down the Publications:
# • The publications are extensive, so you need to divide them into smaller, manageable parts.
# • Use the cl100k_base tokenizer to break the text into chunks.
# • For each chunk, record:
# o The file name (source).
# o The page number.
# o The chunk number (hint: add a counter).
# o The text of that chunk.

In [None]:
# Initialize the tokenizer used by OpenAI models (cl100k_base)
encoder = tiktoken.get_encoding("cl100k_base")

# Load the CSV file that contains chunked text, please provide the file path here
csv_file = '/content/drive/MyDrive/osos_technical/csv_ouput_osos/new-approaches-and-procedures-for-cancer-treatment_chunked_text_details (1).csv'
csv_file_name = os.path.basename(csv_file)
df = pd.read_csv(csv_file)

# Initialize lists and counters
embedding_results = []
total_tokens = 0  # Tokens of successfully embedded chunks only

start_time = time.time()  # Track start time for performance stats

# Process each chunk from the CSV
for index, row in df.iterrows():
    text = row['content']

    # An empty CSV cell is read back as NaN (a float); there is nothing to embed.
    if not isinstance(text, str) or not text.strip():
        print(f"Skipping chunk {row['chunk_number']}: empty content")
        continue

    # Count tokens using the tokenizer
    token_count = len(encoder.encode(text))

    try:
        # Generate embedding using the embedding model. The chunks are the
        # documents of the retrieval system, hence the explicit task type.
        output = embed.text(
            texts=[text],
            model='nomic-embed-text-v1',
            task_type='search_document',
        )
        # Extract embedding vector
        embedding = np.array(output['embeddings'])[0]
    except Exception as e:
        # Best effort: report the failing chunk and continue with the rest.
        print(f"Error embedding chunk {row['chunk_number']}: {e}")
        continue

    # Only chunks that were really embedded count towards the throughput.
    total_tokens += token_count

    # Append results to the list
    embedding_results.append({
        'source': row.get('source'),
        'chunk_number': row['chunk_number'],
        'page_number': row['page_number'],
        'content': text,
        'embedding': embedding.tolist(),
        'token_count': token_count
    })


# Performance Tracking

# Calculate total elapsed time
end_time = time.time()
elapsed_time = end_time - start_time

# Compute tokens processed per second
tps = total_tokens / elapsed_time if elapsed_time > 0 else 0

# Print performance summary
print(f"\n--- Embedding Performance Stats ---")
print(f"Total Chunks: {len(embedding_results)}")
print(f"Total Tokens: {total_tokens}")
print(f"Total Time: {elapsed_time:.2f} seconds")
print(f"Tokens per Second (TPS): {tps:.2f}")


# Save the embedding results to a new CSV
embedding_df = pd.DataFrame(embedding_results)


# give your required file path here:
file_path = '/content/'
output_path = file_path + csv_file_name.replace('.csv', '_embedding_results.csv')
embedding_df.to_csv(output_path, index=False)

print(f"Embeddings saved to: {output_path}")

In [None]:
# Task 3


# Building a Vector Database:
# • Create a vector database from the chunked publications.
# • Use nomic embedding model to generate vector embeddings for each chunk.
# • Store the embeddings along with the chunk metadata in a vector database.

# Task 4
# Creating a RAG Q&A System:
# • Develop a RAG-based Q&A system that can answer questions about the publications.
# • When a user asks a question, the system should:
# o Generate a vector embedding for the question.
# o Retrieve the most relevant chunks from the vector database.
# o Use llama LLM to generate an answer based on the retrieved chunks.
# • Hint: Can your code answer a user’s question based on the previous question?

In [None]:
# Load tokenizer and model (GPTQ)
model_id = "TheBloke/Llama-2-7B-Chat-GPTQ"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=True,
    device_map="auto",
    revision="main"
)

# LLaMA pipeline
llama_pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    device_map="auto",
    return_full_text=False  # This ensures only the answer is returned
)

# Load embeddings
csv_file = "/content/new-approaches-and-procedures-for-cancer-treatment_chunked_text_details (1)_embedding_results.csv"
df = pd.read_csv(csv_file)

# The 'embedding' column holds the string form of a Python list of floats.
# ast.literal_eval only parses literals, so a tampered CSV cannot execute code
# (eval would). FAISS works on float32 matrices, so the dtype is set explicitly.
embeddings = np.array(df['embedding'].apply(ast.literal_eval).tolist(), dtype='float32')
if embeddings.ndim != 2 or embeddings.shape[0] == 0:
    raise ValueError(f"No usable embeddings found in {csv_file}")

# Create FAISS index
embedding_dim = embeddings.shape[1]
index = faiss.IndexFlatL2(embedding_dim)
index.add(embeddings)

# Metadata (position i in each list belongs to vector i of the index)
contents = df['content'].tolist()
chunk_numbers = df['chunk_number'].tolist()
page_numbers = df['page_number'].tolist()

# Tokenizer for performance stats
encoder = tiktoken.get_encoding("cl100k_base")

# RAG function with proper prompt handling
def rag_query_with_history(query, conversation_history, top_k=3, max_new_tokens=200):
    """
    Answer a query with Retrieval-Augmented Generation (RAG).

    The query is embedded, the closest chunks are looked up in the module-level
    FAISS `index`, and a prompt made of those chunks, the recent conversation
    history and the query is passed to `llama_pipe`. Token counts and tokens per
    second are printed for every call.

    Parameters:
    - query (str): The user's query to be answered.
    - conversation_history (list): "Question: ..." / "Answer: ..." strings of earlier
      turns. The list is extended in place with the current turn.
    - top_k (int): The number of chunks to retrieve for context (default is 3).
    - max_new_tokens (int): The maximum number of tokens to generate for the answer (default is 200).

    Returns:
    - response_text (str): The generated response, or an apology text when any step fails.
    - conversation_history (list): The same list object that was passed in.
    """
    try:
        # Embed the query. The stored chunks were embedded as documents, so the
        # query side uses the matching 'search_query' task type.
        query_embedding = embed.text(
            texts=[query],
            model="nomic-embed-text-v1",
            task_type="search_query",
        )["embeddings"][0]
        # FAISS expects a float32 matrix of shape (n_queries, dim).
        query_vector = np.asarray(query_embedding, dtype="float32").reshape(1, -1)

        # Search FAISS index
        distances, indices = index.search(query_vector, top_k)
        # FAISS pads missing results with -1; a negative index would silently
        # select a chunk from the end of `contents`.
        retrieved_chunks = [contents[idx] for idx in indices[0] if 0 <= idx < len(contents)]

        # Deduplicate and filter out very short fragments
        seen = set()
        cleaned_chunks = []
        for chunk in retrieved_chunks:
            chunk = str(chunk).strip()
            if chunk not in seen and len(chunk) > 30:
                seen.add(chunk)
                cleaned_chunks.append(chunk)

        # Prepare context
        context = "\n\n".join(f"- {chunk}" for chunk in cleaned_chunks)

        # Clean conversation history (optional)
        trimmed_history = "\n".join(conversation_history[-4:])  # Only last two QA pairs

        # Prompt construction
        prompt = f"""You are a helpful research assistant. Use the context below to answer the question clearly and concisely.

Context:
{context}

Conversation history:
{trimmed_history}

Question: {query}
Answer:"""

        # Measure TPS
        prompt_tokens = len(encoder.encode(prompt))
        start_time = time.time()

        # Generate answer deterministically (greedy decoding). No temperature is
        # passed because it has no effect when sampling is disabled.
        result = llama_pipe(prompt, max_new_tokens=max_new_tokens, do_sample=False)[0]["generated_text"]
        end_time = time.time()

        # Extract response
        response_text = result.strip()

        # Token count
        response_tokens = len(encoder.encode(response_text))
        total_tokens = prompt_tokens + response_tokens
        elapsed = end_time - start_time
        tps = total_tokens / elapsed if elapsed > 0 else 0

        # Logging
        print(f"\nPerformance:")
        print(f" - Prompt tokens: {prompt_tokens}")
        print(f" - Response tokens: {response_tokens}")
        print(f" - Total tokens: {total_tokens}")
        print(f" - Time: {elapsed:.2f} sec")
        print(f" - Tokens/sec: {tps:.2f}")

        # Update history
        conversation_history.append(f"Question: {query}")
        conversation_history.append(f"Answer: {response_text}")

        return response_text, conversation_history

    except Exception as e:
        # Best effort for an interactive session: report the problem and leave
        # the history untouched instead of aborting the whole query loop.
        print(f"Error processing query: {e}")
        return "Sorry, something went wrong with this query.", conversation_history


# Initialize history (shared by all queries below, so later questions can
# build on earlier question/answer pairs)
conversation_history = []

# Queries
queries = [
    "What are the main advantages of targeted therapy in cancer treatment?",
    "What are the potential disadvantages associated with gene therapy, particularly in relation to RNA interference (RNAi)?",
    "How do natural antioxidants contribute to cancer prevention or treatment, and what are the common compounds under clinical trials?"
]

# Run all queries in order, feeding the updated history into the next call
for i, query in enumerate(queries):
    print(f"\nQuery {i+1}: {query}")
    answer, conversation_history = rag_query_with_history(query, conversation_history)
    print(f"\nAnswer {i+1}: {answer}")


In [None]:
# translation task

In [None]:
# Task 5
# Translating the publications:
# • Dr. X wrote some publications in different languages.
# • Build a tool (using any LLM) to translate between any language to English or Arabic.
# • A plus: Strive to maintain the original structure and formatting of the publications after translation.
# • Find creative ways to improve the translation accuracy and fluency.

In [None]:
# translation task -- Example: French to English, Spanish to English from txt/pdf files.

In [None]:
# ----------------------------
# Force CPU usage for all models
# ----------------------------
device = -1  # CPU
print("[*] Loading translation models (Helsinki, CPU)...")

translator_en = pipeline("translation", model="Helsinki-NLP/opus-mt-mul-en", device=device)
translator_ar = pipeline("translation", model="Helsinki-NLP/opus-mt-en-ar", device=device)

# ----------------------------
# Load Grammar Correction Model
# ----------------------------
# The checkpoint loaded below is a T5-base model; the log line names it accordingly.
print("[*] Loading grammar correction model (T5-base, CPU)...")
fluency_model_id = "vennify/t5-base-grammar-correction"
fluency_tokenizer = AutoTokenizer.from_pretrained(fluency_model_id)
fluency_model = AutoModelForSeq2SeqLM.from_pretrained(fluency_model_id)

def improve_fluency(text):
    """
    Improve the fluency of English text using a grammar correction model.

    The text is polished one paragraph at a time (paragraphs are separated by
    blank lines). Each model call is limited to 512 tokens, so sending the whole
    document at once would drop everything after the first 512 tokens and merge
    all paragraphs into one.

    Args:
        text (str): Raw English text.

    Returns:
        str: Grammatically improved text with the same paragraph breaks.
    """
    print("[*] Polishing translation (fluency)...")
    polished_paragraphs = []
    for paragraph in text.split("\n\n"):
        flat = paragraph.strip().replace("\n", " ")
        if not flat:
            # Keep empty paragraphs as they are instead of sending them to the model.
            polished_paragraphs.append("")
            continue
        inputs = fluency_tokenizer.encode("grammar: " + flat, return_tensors="pt", max_length=512, truncation=True)
        outputs = fluency_model.generate(inputs, max_length=512, num_beams=5, early_stopping=True)
        polished_paragraphs.append(fluency_tokenizer.decode(outputs[0], skip_special_tokens=True))
    return "\n\n".join(polished_paragraphs)

# ----------------------------
# Language Mapping
# ----------------------------
# Display names for the language codes returned by detect(); it is used only
# for log messages, and callers print 'Unknown' for codes missing here.
LANG_MAP = {
    "en": "English",
    "es": "Spanish",
    "fr": "French",
    "de": "German",
    "ar": "Arabic",
    "ru": "Russian",
    "zh-cn": "Chinese",
    "tr": "Turkish"
}

# ----------------------------
# File Extraction
# ----------------------------
def extract_text(file_path):
    """
    Read a document as plain text, choosing the reader from the file extension.

    Args:
        file_path (str): Path to the input file (.pdf, .docx, .txt).

    Returns:
        str: Extracted plain text.

    Raises:
        ValueError: If the extension is not pdf, docx or txt.
    """
    # Everything after the last dot, lower-cased (the whole path if there is no dot).
    suffix = file_path.lower().rpartition('.')[2]

    if suffix == "txt":
        return extract_from_txt(file_path)
    if suffix == "docx":
        return extract_from_docx(file_path)
    if suffix == "pdf":
        return extract_from_pdf(file_path)

    raise ValueError("Unsupported file format")

def extract_from_pdf(pdf_path):
    """
    Read the text of every page of a PDF with pdfplumber.

    Args:
        pdf_path (str): Path to the PDF file.

    Returns:
        str: Page texts joined by a blank line; pages without text contribute
        an empty string.
    """
    page_texts = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            extracted = page.extract_text()
            page_texts.append(extracted if extracted else "")
    return "\n\n".join(page_texts)

def extract_from_docx(docx_path):
    """
    Read the paragraph text of a DOCX file with python-docx.

    Args:
        docx_path (str): Path to the DOCX file.

    Returns:
        str: The text of all paragraphs, one paragraph per line.
    """
    lines = []
    for paragraph in Document(docx_path).paragraphs:
        lines.append(paragraph.text)
    return "\n".join(lines)

def extract_from_txt(txt_path):
    """
    Read a UTF-8 encoded plain text file.

    Args:
        txt_path (str): Path to the TXT file.

    Returns:
        str: Contents of the TXT file.
    """
    with open(txt_path, mode='r', encoding='utf-8') as source:
        contents = source.read()
    return contents

# ----------------------------
# Split for Max Length Translation
# ----------------------------
def split_text(text, max_len=512):
    """
    Split text into chunks for translation, breaking only after '. '.

    Sentences are packed greedily into chunks shorter than `max_len`
    characters. Every sentence keeps its own full stop; no extra punctuation is
    added. A single sentence that is longer than `max_len` is not split further
    and becomes a chunk of its own.

    Args:
        text (str): Full input text.
        max_len (int): Max character length for each chunk.

    Returns:
        List[str]: Non-empty text chunks (an empty list for blank input).
    """
    pieces = text.split('. ')
    # str.split removed the '.' of every sentence except the last one; put it
    # back only where it was removed so the final sentence is not given a
    # second full stop.
    sentences = [piece + '.' for piece in pieces[:-1]] + [pieces[-1]]

    chunks, current = [], ''
    for sent in sentences:
        sent = sent.strip()
        if not sent:
            continue
        candidate = f"{current} {sent}" if current else sent
        if len(candidate) < max_len:
            current = candidate
        else:
            if current:  # never emit an empty chunk
                chunks.append(current)
            current = sent
    if current:
        chunks.append(current)
    return chunks

# ----------------------------
# Translate Logic with TPS
# ----------------------------
def _translate_chunks(translator, chunks):
    """Translate every non-blank chunk; return (translations, number of input tokens)."""
    translations = []
    token_count = 0
    for chunk in chunks:
        if not chunk.strip():
            continue
        tokenized_input = translator.tokenizer(chunk, return_tensors="pt")
        token_count += len(tokenized_input["input_ids"][0])
        translated = translator(chunk, max_length=512)
        translations.append(translated[0]["translation_text"])
    return translations, token_count


def translate_text(text, target_lang="en"):
    """
    Translate input text into the desired target language (English or Arabic).

    English output is produced directly with `translator_en`. Arabic output is
    produced with `translator_ar` from English, so non-English sources are first
    translated to English. Text that is already detected as the target language
    is returned unchanged. Token count and tokens per second are printed.

    Args:
        text (str): Input text in any language.
        target_lang (str): 'en' for English or 'ar' for Arabic.

    Returns:
        str: Translated chunks joined by blank lines ('' for blank input).

    Raises:
        ValueError: If `target_lang` is neither 'en' nor 'ar'.
    """
    # Reject unsupported targets before any detection or model work is done.
    if target_lang not in ("en", "ar"):
        raise ValueError(f"Unsupported target language: {target_lang}")

    print(f"[*] Translating to {target_lang.upper()}...")

    # Language detection needs some text to work with; blank input has no translation.
    if not text or not text.strip():
        return ""

    detected_lang = detect(text)
    print(f"[*] Detected source language: {LANG_MAP.get(detected_lang, 'Unknown')}")

    if detected_lang == target_lang:
        print("[*] Source is already in the target language; skipping translation.")
        return text

    chunks = split_text(text)
    total_tokens = 0
    start_time = time.time()

    if target_lang == "ar":
        # Step 1: Translate to English if source is not English
        if detected_lang != "en":
            print("[*] Intermediate translation: Source → English")
            chunks, used_tokens = _translate_chunks(translator_en, chunks)
            total_tokens += used_tokens

        # Step 2: Translate English → Arabic
        print("[*] Final translation: English → Arabic")
        translated_chunks, used_tokens = _translate_chunks(translator_ar, chunks)
        total_tokens += used_tokens
    else:
        # Translate directly to English
        translated_chunks, used_tokens = _translate_chunks(translator_en, chunks)
        total_tokens += used_tokens

    elapsed_time = time.time() - start_time
    tps = total_tokens / elapsed_time if elapsed_time > 0 else 0
    print(f"[*] Processed {total_tokens} tokens in {elapsed_time:.2f} seconds.")
    print(f"[*] Tokens per second: {tps:.2f}")

    return "\n\n".join(translated_chunks)

# ----------------------------
# Save DOCX Output
# ----------------------------
def save_as_docx(text, output_path):
    """
    Write text to a DOCX file, one Word paragraph per blank-line-separated block.

    Args:
        text (str): Text to be saved.
        output_path (str): Destination file path (.docx).
    """
    document = Document()
    blocks = (block.strip() for block in text.split("\n\n"))
    for block in blocks:
        document.add_paragraph(block)
    document.save(output_path)

# ----------------------------
# Main Processor
# ----------------------------
def process_document(file_path, target_lang="en"):
    """
    Run the full pipeline for one file: extract, translate, polish (English only), save.

    The result is written next to the input file as
    '<input name>_translated_en.docx' or '<input name>_translated_ar.docx'.

    Args:
        file_path (str): Path to the input file.
        target_lang (str): Desired translation target ('en' or 'ar').
    """
    print(f"[+] Processing file: {file_path}")
    source_text = extract_text(file_path)
    source_lang = detect(source_text)
    print(f"[*] Detected language: {LANG_MAP.get(source_lang, 'Unknown')}")

    translation = translate_text(source_text, target_lang=target_lang)

    # Grammar polishing is applied to English output only
    to_english = target_lang == "en"
    final_text = improve_fluency(translation) if to_english else translation

    # Save output
    suffix = "_translated_en" if to_english else "_translated_ar"
    stem, _ = os.path.splitext(file_path)
    output_path = stem + suffix + ".docx"
    save_as_docx(final_text, output_path)
    print(f"Saved translated file: {output_path}")

# ----------------------------
# Example Usage
# ----------------------------
# Example run: translate the sample Spanish text file from the mounted Drive.
# process_document() writes the .docx result next to the input file.
if __name__ == "__main__":
    file_path = "/content/drive/MyDrive/osos_technical_testing_folder/transalation_files/osos_spanish_file.txt"
    process_document(file_path, target_lang="ar")  # Change 'ar' to 'en' for English output

In [None]:
# Task 6
# Finding the Main Ideas:
# • Create a tool (using any LLM) to summarize the publications.
# • Evaluate the quality of your summaries using the ROUGE metric.
# • Experiment with different summarization techniques and prompt strategies and record the result.

In [None]:
### summarization of the given text

In [None]:
# Load a summarization model (e.g., BART or T5)
# `summarizer` is a module-level pipeline that summarize_text() below looks up
# each time it is called.
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

def summarize_text(text, max_length=150):
    """
    Summarizes a given input text using the module-level `summarizer` pipeline.

    Input that is longer than the model accepts is truncated instead of making
    the model fail, and the reported token count reflects the truncated input.

    Parameters:
    - text (str): The full input text to summarize.
    - max_length (int): The maximum number of tokens in the summary. Default is 150.

    Returns:
    - summarized_text (str): The summary generated by the model.

    The function also prints performance metrics such as:
    - Number of input tokens
    - Time taken to summarize
    - Tokens processed per second (TPS)
    """
    # Tokenize the input text to count the tokens the model will actually see
    tokenized_input = summarizer.tokenizer(text, return_tensors="pt", truncation=True)
    num_tokens = len(tokenized_input["input_ids"][0])

    # The minimum summary length must never exceed the requested maximum.
    min_length = min(50, max_length)

    # Time only the summarization itself, not the token counting above
    start_time = time.time()
    summary = summarizer(
        text,
        max_length=max_length,
        min_length=min_length,
        do_sample=False,
        truncation=True,
    )
    elapsed_time = time.time() - start_time

    summarized_text = summary[0]['summary_text']

    # Calculate tokens per second (TPS)
    tps = num_tokens / elapsed_time if elapsed_time > 0 else 0

    print(f"[*] Processed {num_tokens} tokens in {elapsed_time:.2f} seconds.")
    print(f"[*] Tokens per second: {tps:.2f}")

    return summarized_text

# Example publication text (could be read from a file).
# `publication_text` is used again by the ROUGE evaluation cell further down.
publication_text = """
This is a long publication about advancements in AI. Artificial intelligence (AI) is the simulation of human intelligence in machines. The goal is to create systems capable of performing tasks that typically require human intelligence. These tasks include speech recognition, decision-making, and visual perception. In recent years, AI has seen significant growth in applications ranging from self-driving cars to healthcare diagnostics. With advancements in deep learning and neural networks, AI is expected to revolutionize industries worldwide. However, challenges like ethical considerations and data privacy remain.
"""

# Summarize the publication (also prints the token count and tokens per second)
summary = summarize_text(publication_text)
print("\nSummary:\n", summary)


In [None]:
# Experiment with Different Summarization Techniques & Prompt Strategies

# 1) Try different summarization models (e.g., T5, PEGASUS).

# 2) Try prompt variations (e.g., "summarize:", "tl;dr:", "In summary, ...").

# 3) Log and compare output quality, token usage, and tokens/sec.



In [None]:


# Text to summarize
text = """
This is a long publication about advancements in AI. Artificial intelligence (AI) is the simulation of human intelligence in machines. The goal is to create systems capable of performing tasks that typically require human intelligence. These tasks include speech recognition, decision-making, and visual perception. In recent years, AI has seen significant growth in applications ranging from self-driving cars to healthcare diagnostics. With advancements in deep learning and neural networks, AI is expected to revolutionize industries worldwide. However, challenges like ethical considerations and data privacy remain.
"""

# Models to compare
summarization_models = {
    "BART": "facebook/bart-large-cnn",
    "PEGASUS": "google/pegasus-xsum",
    "T5": "t5-base"
}

# Prompt strategies
prompt_styles = {
    "default": lambda t: t,
    "summarize_prefix": lambda t: "summarize: " + t,
    "tl;dr": lambda t: "tl;dr: " + t
}

# Run all combinations.
# Each model is held in its own variable: rebinding the global `summarizer`
# here would silently change the model used by summarize_text() in later cells.
for model_name, model_id in summarization_models.items():
    candidate_summarizer = pipeline("summarization", model=model_id)

    for prompt_name, prompt_func in prompt_styles.items():
        prompt_text = prompt_func(text)

        tokenized = candidate_summarizer.tokenizer(prompt_text, return_tensors="pt")
        num_tokens = len(tokenized["input_ids"][0])

        start = time.time()
        candidate_output = candidate_summarizer(prompt_text, max_length=150, min_length=50, do_sample=False)
        end = time.time()

        elapsed = end - start
        tps = num_tokens / elapsed if elapsed > 0 else 0
        summarized_text = candidate_output[0]["summary_text"]

        print(f"\n=== {model_name} | Prompt: {prompt_name} ===")
        print(f"Tokens: {num_tokens}, Time: {elapsed:.2f}s, TPS: {tps:.2f}")
        print("Summary:", summarized_text)




In [None]:
## ROUGE metrics evaluation

In [None]:
def evaluate_summary(reference_summary, generated_summary):
    """
    Score a model-generated summary against a human-written reference with ROUGE.

    Parameters:
    - reference_summary (str): The ground-truth (human-made) summary.
    - generated_summary (str): The summary produced by the model.

    Returns:
    - scores (dict): The result of RougeScorer.score, keyed by 'rouge1',
      'rouge2' and 'rougeL'.

    Metrics requested:
    - ROUGE-1: unigram (single-word) overlap.
    - ROUGE-2: bigram (two-word sequence) overlap.
    - ROUGE-L: similarity based on the longest common subsequence.

    Stemming is enabled (use_stemmer=True).
    """
    rouge_variants = ['rouge1', 'rouge2', 'rougeL']
    return rouge_scorer.RougeScorer(rouge_variants, use_stemmer=True).score(
        reference_summary,
        generated_summary,
    )

# Example reference summary (human-made)
reference_summary = """
AI is the simulation of human intelligence in machines to perform tasks like speech recognition and decision-making. It has grown in applications like self-driving cars and healthcare diagnostics, but challenges such as ethics and privacy remain.
"""

# Evaluate the model's summary using ROUGE
# NOTE(review): `summarize_text` and `publication_text` are not defined in this cell;
# they must already exist in the kernel from an earlier cell -- confirm before running.
# NOTE(review): the reference above describes an AI passage; verify that
# `publication_text` holds the matching source text, otherwise the scores are not meaningful.
generated_summary = summarize_text(publication_text)
rouge_scores = evaluate_summary(reference_summary, generated_summary)
print("ROUGE Scores:\n", rouge_scores)


In [None]:
# Task 7
# Performance Measurement:
# • During the embedding generation, translation, summarization, and RAG processes, record the "tokens per
# second" processed by the LLM. This will help us understand the efficiency of your algorithms.

# Answer

# To provide a complete and detailed implementation for performance measurement across all steps (embedding generation, translation, summarization, and RAG processes), I've added tokens per second (TPS) tracking throughout each process.

In [None]:
# Task 8

# Be Creative:
# • Demonstrate your creativity. For example:
# o Develop advanced chunking methods.
# o Enhance the accuracy and clarity of your translations.
# o Create unique evaluation metrics for the algorithms.
# o Implement creative algorithms for extracting text from tables and charts.

In [None]:
# Develop advanced chunking methods.

#  Smart Table-Aware Chunking
# For table-heavy content:

# Preserve full tables in one chunk regardless of token count.

# Flag them as "is_table": True in the chunk metadata.

In [None]:

# Initialize the tiktoken encoder (cl100k_base) used below for token counting.
encoder = tiktoken.get_encoding("cl100k_base")
# Clear NLTK's resource cache, then download the 'punkt_tab' resource
# (presumably the data sent_tokenize needs -- confirm for the installed NLTK version).
nltk.data.clear_cache()
nltk.download('punkt_tab')

def tokenize_and_chunk_semantic(text_with_page, global_chunk_counter, max_tokens=1000):
    """
    Splits text into chunks along sentence boundaries, packing as many whole
    sentences as fit into `max_tokens` tokens per chunk.

    Sentences are never split: a single sentence that is longer than
    `max_tokens` becomes its own (over-sized) chunk. Token counts are the sum of
    the per-sentence counts from the module-level `encoder`.

    Args:
    - text_with_page (dict): Must contain 'content' (text) and 'page_number';
      may contain 'source' (defaults to 'Unknown').
    - global_chunk_counter (int): Number to assign to the first chunk produced.
    - max_tokens (int, optional): Target maximum tokens per chunk (default 1000).

    Returns:
    - list: Chunk dicts with 'content', 'page_number', 'chunk_number' and 'source'.
    - int: The updated chunk counter (next unused chunk number).
    """
    text = text_with_page['content']
    page_number = text_with_page['page_number']
    source = text_with_page.get('source', 'Unknown')

    chunks = []
    pending_sentences = []   # sentences collected for the chunk being built
    pending_tokens = 0

    def build_chunk(sentences_in_chunk, chunk_number):
        """Assemble the chunk dict for a list of sentences."""
        return {
            "content": " ".join(sentences_in_chunk).strip(),
            "page_number": page_number,
            "chunk_number": chunk_number,
            "source": source
        }

    for sentence in sent_tokenize(text):
        sentence_token_count = len(encoder.encode(sentence))

        # Flush only when something has been collected. Without the
        # `pending_sentences` guard, an over-long first sentence would emit an
        # empty chunk and waste a chunk number.
        if pending_sentences and pending_tokens + sentence_token_count > max_tokens:
            chunks.append(build_chunk(pending_sentences, global_chunk_counter))
            global_chunk_counter += 1
            pending_sentences = []
            pending_tokens = 0

        pending_sentences.append(sentence)
        pending_tokens += sentence_token_count

    # Emit whatever is left, skipping whitespace-only remainders.
    if pending_sentences:
        final_chunk = build_chunk(pending_sentences, global_chunk_counter)
        if final_chunk["content"]:
            chunks.append(final_chunk)
            global_chunk_counter += 1

    return chunks, global_chunk_counter

def extract_text_from_file(file_path):
    """
    Dispatches to the extractor that matches the file's extension.

    Args:
    - file_path (str): The path to the file to be processed.

    Returns:
    - list: Whatever the selected extractor returns (a list of dictionaries
      describing extracted text or tables).

    Raises:
    - ValueError: If the (lower-cased) extension is not .docx, .pdf, .csv,
      .xlsx, .xls or .xlsm.
    """
    _, raw_extension = os.path.splitext(file_path)
    file_extension = raw_extension.lower()

    if file_extension == '.docx':
        return extract_text_from_docx(file_path)

    if file_extension == '.pdf':
        return extract_text_and_tables_from_pdf(file_path)

    if file_extension in ('.csv', '.xlsx', '.xls', '.xlsm'):
        return extract_all_tables(file_path)

    raise ValueError(f"Unsupported file type: {file_extension}")

def extract_text_from_docx(file_path):
    """
    Extracts paragraphs and tables from a DOCX file in document order.

    DOCX files carry no page information, so `page_number` is a rough
    estimate: it advances by one after every 5 extracted items. The counter
    only moves when an item was actually extracted; empty paragraphs and other
    body elements no longer advance it.

    Args:
    - file_path (str): The path to the DOCX file.

    Returns:
    - list: Dictionaries with 'type' ("text" or "table"), 'source',
      'page_number', 'content' and 'table' (list of row strings for tables,
      None for text).
    """
    doc = docx.Document(file_path)
    text = []
    namespaces = {'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'}
    # Fully-qualified tag names, so only real w:p / w:tbl elements match.
    paragraph_tag = '{' + namespaces['w'] + '}p'
    table_tag = '{' + namespaces['w'] + '}tbl'
    page_number = 1

    for element in doc.element.body:
        entry = None

        if element.tag == paragraph_tag:
            para_text = ' '.join([run.text.strip() for run in element.findall('.//w:t', namespaces) if run.text and run.text.strip() != ''])
            if para_text:
                entry = {
                    "type": "text",
                    "source": file_path,
                    "page_number": page_number,
                    "content": para_text.strip(),
                    "table": None
                }
        elif element.tag == table_tag:
            table_data = []
            for row in element.findall('.//w:tr', namespaces):
                row_text = [cell.text.strip() for cell in row.findall('.//w:t', namespaces) if cell.text and cell.text.strip()]
                if row_text:
                    table_data.append(" | ".join(row_text))
            if table_data:
                entry = {
                    "type": "table",
                    "source": file_path,
                    "page_number": page_number,
                    "content": "\n".join(table_data),
                    "table": table_data
                }

        if entry is None:
            # Nothing extracted from this element: leave the page estimate alone.
            continue

        text.append(entry)
        # Advance the estimated page after every 5th extracted item.
        if len(text) % 5 == 0:
            page_number += 1

    return text

def extract_text_and_tables_from_pdf(file_path):
    """
    Extracts page text (via PyMuPDF) and tables (via pdfplumber) from a PDF.

    For every page, the text entry (if the page has text) is emitted first,
    followed by one entry per table found on that page. Table cells are joined
    with "|" and rows with newlines; empty cells and fully-empty rows are skipped.

    Args:
    - file_path (str): The path to the PDF file.

    Returns:
    - list: Dictionaries with 'type' ("text" or "table"), 'source',
      'page_number' (1-based) and 'content'.
    """
    combined_content = []
    # Both handles are context-managed so the PyMuPDF document is closed too.
    with fitz.open(file_path) as doc, pdfplumber.open(file_path) as pdf:
        for page_num in range(len(doc)):
            page_text = doc.load_page(page_num).get_text("text")
            if page_text.strip():
                combined_content.append({
                    "type": "text",
                    "source": file_path,
                    "page_number": page_num + 1,
                    "content": page_text.strip()
                })

            for table in pdf.pages[page_num].extract_tables():
                formatted_table = []
                for row in table:
                    formatted_row = "|".join([str(cell) for cell in row if cell])
                    # Skip rows with no non-empty cells so they do not become blank lines.
                    if formatted_row:
                        formatted_table.append(formatted_row)
                if formatted_table:
                    combined_content.append({
                        "type": "table",
                        "source": file_path,
                        "page_number": page_num + 1,
                        "content": "\n".join(formatted_table)
                    })
    return combined_content

def extract_all_tables(file_path):
    """
    Extracts tables from a spreadsheet-like file (.csv, .xlsx, .xls, .xlsm).

    CSV files are read with pandas.read_csv (they cannot be opened with
    pandas.ExcelFile). Excel workbooks are read sheet by sheet, and tables from
    every sheet are returned. All data is loaded with header=None so that the
    first row of the sheet reaches extract_tables_from_sheet (which treats the
    first non-blank row as the table title) instead of being consumed as the
    DataFrame header.

    Args:
    - file_path (str): The path to the CSV or Excel file.

    Returns:
    - list: A list of table dictionaries as produced by
      extract_tables_from_sheet. For CSV input the 'page_number' field holds
      the file's base name; for Excel input it holds the sheet name.
    """
    extension = os.path.splitext(file_path)[1].lower()

    if extension == '.csv':
        # Keep blank lines as empty rows so they can act as table separators,
        # matching how blank rows arrive from Excel sheets.
        sheet_data = pd.read_csv(file_path, header=None, skip_blank_lines=False)
        sheet_name = os.path.splitext(os.path.basename(file_path))[0]
        return extract_tables_from_sheet(sheet_data, sheet_name, file_path)

    tables = []
    # Context manager closes the underlying workbook handle.
    with pd.ExcelFile(file_path) as xl:
        for sheet_name in xl.sheet_names:
            sheet_data = xl.parse(sheet_name, header=None)
            tables.extend(extract_tables_from_sheet(sheet_data, sheet_name, file_path))
    return tables

def extract_tables_from_sheet(sheet_data, sheet_name, file_path):
    """
    Splits a sheet into tables separated by blank rows.

    The first non-blank row of each table is used as its title; the following
    non-blank rows are its body. A row is blank when every cell is missing
    (None/NaN/NaT) or an empty/whitespace-only string. Numeric zeros are real
    data and do not make a row blank. A table is emitted only once it has at
    least one body row.

    Args:
    - sheet_data (pd.DataFrame): The DataFrame containing sheet data.
    - sheet_name (str): The name of the sheet (stored as 'page_number').
    - file_path (str): The path to the source file (stored as 'source').

    Returns:
    - list: Table dictionaries with 'type', 'source', 'page_number', 'title',
      'table_data' and 'content'.
    """
    def is_blank_cell(cell):
        """True for None/NaN/NaT and for empty or whitespace-only strings."""
        if cell is None:
            return True
        if isinstance(cell, str):
            return not cell.strip()
        try:
            return bool(pd.isna(cell))
        except (TypeError, ValueError):
            # Non-scalar cell values: treat as data.
            return False

    def join_cells(row):
        """Render one row as 'a | b | c'."""
        return " | ".join([str(cell) for cell in row])

    def build_table(title_row, body_rows):
        """Assemble the output dictionary for one table."""
        title = join_cells(title_row)
        table_data = "\n".join([join_cells(row) for row in body_rows])
        return {
            "type": "table",
            "source": file_path,
            "page_number": sheet_name,
            "title": title,
            "table_data": table_data,
            "content": title + " | " + table_data
        }

    tables = []
    current_table = []
    table_title = None

    for row in sheet_data.values.tolist():
        if all(is_blank_cell(cell) for cell in row):
            # Blank row: close the table in progress, if it has body rows.
            if current_table:
                tables.append(build_table(table_title, current_table))
                table_title = None
                current_table = []
            continue

        if table_title is None:
            table_title = row
        else:
            current_table.append(row)

    # Flush the last table when the sheet does not end with a blank row.
    if current_table:
        tables.append(build_table(table_title, current_table))
    return tables

def process_and_save_chunks(file_path):
    """
    Extracts a document, chunks it (tables kept whole, text split on sentence
    boundaries), prints throughput stats, and writes two CSV files.

    Output files are written to the current working directory and named after
    the input file's base name:
    - <name>_smart_table_aware_chunking_chunked_text_details.csv (with `is_table`)
    - <name>_chunked_text_details.csv (same rows without `is_table`)

    Args:
    - file_path (str): The path to the document (PDF, DOCX, CSV, Excel, etc.).
    """
    extracted_text = extract_text_from_file(file_path)
    print(f"Extracted text from {file_path}:")

    file_name = os.path.splitext(os.path.basename(file_path))[0]
    all_chunks = []
    global_chunk_counter = 1
    total_tokens = 0
    start_time = time.time()

    for item in extracted_text:
        if item["type"] != "table":
            # Plain text: sentence-aware chunking, then tag each piece as non-table.
            new_chunks, global_chunk_counter = tokenize_and_chunk_semantic(item, global_chunk_counter)
            for piece in new_chunks:
                piece["is_table"] = False
        else:
            # Tables are never split: one chunk per table regardless of size.
            new_chunks = [{
                "content": item["content"],
                "page_number": item["page_number"],
                "chunk_number": global_chunk_counter,
                "source": item.get('source', 'Unknown'),
                "is_table": True
            }]
            global_chunk_counter += 1

        for piece in new_chunks:
            total_tokens += len(encoder.encode(piece["content"]))
        all_chunks.extend(new_chunks)

    total_time = time.time() - start_time
    tps = total_tokens / total_time if total_time > 0 else 0

    print(f"\n--- Performance Stats ---")
    print(f"Total tokens processed: {total_tokens}")
    print(f"Total time: {total_time:.2f} seconds")
    print(f"Tokens per second (TPS): {tps:.2f}")

    # Full table-aware output, including the `is_table` flag.
    smart_csv = file_name + "_smart_table_aware_chunking_chunked_text_details.csv"
    smart_columns = ["source", "page_number", "chunk_number", "content", "is_table"]
    with open(smart_csv, mode='w', newline='', encoding='utf-8') as smart_handle:
        smart_writer = csv.DictWriter(smart_handle, fieldnames=smart_columns)
        smart_writer.writeheader()
        for record in all_chunks:
            smart_writer.writerow(record)
    print(f"Smart table-aware chunks saved to: {smart_csv}")

    # Simplified output: the same rows with the `is_table` column removed.
    simple_csv = file_name + "_chunked_text_details.csv"
    simple_columns = ["source", "page_number", "chunk_number", "content"]
    with open(simple_csv, mode='w', newline='', encoding='utf-8') as simple_handle:
        simple_writer = csv.DictWriter(simple_handle, fieldnames=simple_columns)
        simple_writer.writeheader()
        simple_writer.writerows(
            {key: value for key, value in record.items() if key != "is_table"}
            for record in all_chunks
        )
    print(f"Clean/simple chunks saved to: {simple_csv}")

# Example usage: run the table-aware chunking on one PDF from the mounted Drive.
# The two CSV outputs are written to the current working directory.
# file_path = "/your/local/path/to/document.xlsx"  # Change to your file path
file_path = "/content/drive/MyDrive/osos_technical_testing_folder/Dr.X Files/Ocean_ecogeochemistry_A_review.pdf"
process_and_save_chunks(file_path)


In [None]:
# To improve the translation accuracy and fluency of the tool, the following enhancements are integrated into the existing code:

# Context-Aware Chunking: We'll adjust the chunking strategy to better preserve context.

# Dynamic Tokenization: We'll adjust chunk sizes based on token limits to preserve context better.

In [None]:
# ----------------------------
# Force CPU usage for all models
# ----------------------------
device = -1  # CPU
print("[*] Device set to: CPU (forced to avoid GPU memory errors)")

# ----------------------------
# Load Translation Pipelines
# ----------------------------
print("[*] Loading translation models...")

# Translation pipeline for multilingual input to English
translator_to_en = pipeline("translation", model="Helsinki-NLP/opus-mt-mul-en", device=device)

# Translation pipeline from English to Arabic
translator_en_to_ar = pipeline("translation", model="Helsinki-NLP/opus-mt-en-ar", device=device)

# ----------------------------
# Load Grammar Correction Model (English only)
# ----------------------------
# Grammar correction model used for polishing English text
fluency_model_id = "vennify/t5-base-grammar-correction"
fluency_tokenizer = AutoTokenizer.from_pretrained(fluency_model_id)
fluency_model = AutoModelForSeq2SeqLM.from_pretrained(fluency_model_id)

# ----------------------------
# Text Extraction Functions
# ----------------------------
def extract_text(file_path):
    """
    Picks the extractor matching the file extension and returns the file's text.

    The extension is whatever follows the last '.' in the lower-cased path.

    Args:
        file_path (str): Path to the input file (.pdf, .docx, .txt)

    Returns:
        str: Extracted text content.

    Raises:
        ValueError: If the extension is not pdf, docx or txt.
    """
    ext = file_path.lower().rpartition('.')[2]

    if ext == "pdf":
        return extract_from_pdf(file_path)

    if ext == "docx":
        return extract_from_docx(file_path)

    if ext == "txt":
        return extract_from_txt(file_path)

    raise ValueError("Unsupported file format")

def extract_from_pdf(pdf_path):
    """
    Reads the text of every page of a PDF with pdfplumber.

    Pages for which no text is returned contribute an empty string, so the
    number of "\\n\\n"-separated parts always equals the number of pages.

    Args:
        pdf_path (str): Path to PDF.

    Returns:
        str: Page texts joined by a blank line.
    """
    page_texts = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            page_texts.append(page.extract_text() or "")
    return "\n\n".join(page_texts)

def extract_from_docx(docx_path):
    """
    Reads the paragraph text of a DOCX file (only `doc.paragraphs` is read).

    Args:
        docx_path (str): Path to DOCX file.

    Returns:
        str: Paragraph texts joined by newlines.
    """
    paragraphs = Document(docx_path).paragraphs
    return "\n".join(paragraph.text for paragraph in paragraphs)

def extract_from_txt(txt_path):
    """
    Reads a UTF-8 encoded plain text file in full.

    Args:
        txt_path (str): Path to TXT file.

    Returns:
        str: File content.
    """
    with open(txt_path, 'r', encoding='utf-8') as source_file:
        contents = source_file.read()
    return contents

# ----------------------------
# Text Chunking Helper
# ----------------------------
def split_text(text, max_len=512):
    """
    Splits long text into smaller chunks based on sentence boundaries.

    Sentences are detected as text ending in '.', '!' or '?' followed by one or
    more spaces. Sentences are packed greedily into chunks; a single sentence
    longer than `max_len` is kept whole as its own chunk. Empty chunks are never
    returned, so empty or whitespace-only input yields an empty list.

    Args:
        text (str): Full raw text.
        max_len (int): Maximum characters allowed per chunk.

    Returns:
        list: List of non-empty text chunks.
    """
    sentences = re.split(r'(?<=[.!?]) +', text)
    chunks = []
    current = ''

    def flush(buffer):
        """Store the buffer as a chunk unless it is empty after stripping."""
        piece = buffer.strip()
        if piece:
            chunks.append(piece)

    for sent in sentences:
        # Start a new chunk when this sentence would not fit. The flush skips
        # the empty buffer that occurs when the very first sentence is too long.
        if len(current) + len(sent) >= max_len:
            flush(current)
            current = ''
        current += sent + ' '

    flush(current)
    return chunks

# ----------------------------
# Translation Function
# ----------------------------
def translate_text(text, target_lang="en"):
    """
    Translates given text into the target language ('en' or 'ar').

    The text is split with split_text and translated chunk by chunk. With
    target_lang == "en" the multilingual-to-English pipeline is used; any other
    value selects the English-to-Arabic pipeline. Token counts (from the
    pipeline's tokenizer) and throughput are printed.

    Args:
        text (str): Input text.
        target_lang (str): Target language code ('en' or 'ar').

    Returns:
        str: Translated chunks joined by blank lines ("" if there was nothing
        to translate).
    """
    print(f"[*] Translating to {target_lang.upper()}...")
    # The pipeline does not change between chunks, so select it once.
    model = translator_to_en if target_lang == "en" else translator_en_to_ar
    translated_chunks = []

    start_time = time.time()
    total_tokens = 0

    for chunk in split_text(text):
        # Nothing to translate in an empty chunk; skip the model call.
        if not chunk.strip():
            continue
        tokenized_input = model.tokenizer(chunk, return_tensors="pt")
        total_tokens += len(tokenized_input["input_ids"][0])
        translated = model(chunk, max_length=512)
        translated_chunks.append(translated[0]["translation_text"])

    elapsed_time = time.time() - start_time
    # Guard against a zero elapsed time (e.g. no chunks), as done elsewhere in this notebook.
    tokens_per_second = total_tokens / elapsed_time if elapsed_time > 0 else 0
    print(f"[*] Processed {total_tokens} tokens in {elapsed_time:.2f} seconds.")
    print(f"[*] Tokens per second: {tokens_per_second:.2f}")

    return "\n\n".join(translated_chunks)

# ----------------------------
# Grammar Correction
# ----------------------------
def improve_fluency(text):
    """
    Improves fluency of English text using a grammar correction model.

    The model input is limited to 512 tokens, so the text is processed piece by
    piece instead of in a single call (which would silently drop everything
    past the limit). Paragraphs separated by a blank line are handled one at a
    time and re-joined with a blank line, so paragraph structure is kept. Within
    a paragraph, newlines are replaced by spaces and the text is split into
    sentence-aligned pieces with split_text.

    Args:
        text (str): Input English text.

    Returns:
        str: Polished, grammatically corrected text ("" for empty input).
    """
    print("[*] Polishing English translation (fluency)...")
    polished_paragraphs = []

    for paragraph in text.split("\n\n"):
        flat_paragraph = paragraph.strip().replace("\n", " ")
        if not flat_paragraph:
            continue

        polished_pieces = []
        for piece in split_text(flat_paragraph):
            if not piece:
                continue
            input_text = "grammar: " + piece
            # truncation stays on as a safety net for a single very long sentence.
            inputs = fluency_tokenizer.encode(input_text, return_tensors="pt", max_length=512, truncation=True)
            outputs = fluency_model.generate(inputs, max_length=512, num_beams=5, early_stopping=True)
            polished_pieces.append(fluency_tokenizer.decode(outputs[0], skip_special_tokens=True))

        polished_paragraphs.append(" ".join(polished_pieces))

    return "\n\n".join(polished_paragraphs)

# ----------------------------
# Save Output
# ----------------------------
def save_as_docx(text, output_path):
    """
    Writes text to a DOCX file, one document paragraph per blank-line-separated block.

    Args:
        text (str): Text to be saved.
        output_path (str): File path to save the DOCX.
    """
    output_doc = Document()
    blocks = [block.strip() for block in text.split("\n\n")]
    for block in blocks:
        output_doc.add_paragraph(block)
    output_doc.save(output_path)

# ----------------------------
# Main Processor
# ----------------------------
def process_document(file_path, target_lang="en"):
    """
    Main function to process a document: extract, translate, polish (if needed), and save.

    Flow:
    - The source language is detected with langdetect.
    - If the document is already in the target language, translation is skipped
      and the extracted text is saved as is. This avoids, for example, sending
      an Arabic document through Arabic -> English -> Arabic.
    - For Arabic output from a non-English source, the text is first translated
      to English, polished, then translated to Arabic.
    - Otherwise the text is translated directly; English output is polished.

    The result is written next to the input file as
    "<input name>_translated_to_<target_lang>.docx".

    Args:
        file_path (str): Input file path (.pdf, .docx, .txt)
        target_lang (str): Target language code ('en' or 'ar')
    """
    print(f"\n[+] Processing file: {file_path}")
    text = extract_text(file_path)
    detected_lang = detect(text)
    print(f"[*] Detected source language: {detected_lang}")

    if detected_lang == target_lang:
        print("[*] Source is already in the target language. Skipping translation.")
        final_translation = text
    elif target_lang == "ar" and detected_lang != "en":
        print("[*] Source is not English. Translating to English first...")
        text_in_english = translate_text(text, target_lang="en")
        text_in_english = improve_fluency(text_in_english)
        print("[*] Translating English to Arabic...")
        final_translation = translate_text(text_in_english, target_lang="ar")
    else:
        final_translation = translate_text(text, target_lang=target_lang)
        if target_lang == "en":
            final_translation = improve_fluency(final_translation)

    suffix = "_translated_to_" + target_lang
    output_path = os.path.splitext(file_path)[0] + suffix + ".docx"
    save_as_docx(final_translation, output_path)
    print(f"[✓] Saved translated file to: {output_path}\n")

# ----------------------------
# Example Usage
# ----------------------------
# Translates one PDF from the mounted Drive; the output DOCX is saved next to the input file.
if __name__ == "__main__":
    file_path = "/content/drive/MyDrive/osos_technical/csv_ouput_osos/other_language_files/osos_french.pdf"  # Replace with your actual path
    target_language = "ar"  # Change to 'en' for English output
    process_document(file_path, target_lang=target_language)


In [None]:
# improve evaluation metrics for algorithms

In [None]:
# evaluation metrics for translation task

In [None]:
# Sentence-embedding model used by calculate_translation_preservation_score below.
# NOTE(review): later cells rebind the global `embedding_model` to "all-MiniLM-L6-v2";
# re-run this cell before calling the translation metric again.
embedding_model = SentenceTransformer("distiluse-base-multilingual-cased-v2")


def calculate_translation_preservation_score(original_text, back_translated_text):
    """
    Calculates the semantic similarity between the original and back-translated texts
    to evaluate translation accuracy.

    Both texts are embedded with the module-level `embedding_model` and compared
    with cosine similarity. The score is printed and also returned so it can be
    logged or aggregated by the caller.

    Args:
        original_text (str): The original text before translation.
        back_translated_text (str): The back-translated version of the translated text.

    Returns:
        float: The translation preservation score (cosine similarity, between -1 and 1).
    """
    print("[*] Calculating Translation Preservation Score...")
    orig_embed = embedding_model.encode([original_text], convert_to_tensor=True)
    back_embed = embedding_model.encode([back_translated_text], convert_to_tensor=True)
    # cosine_similarity returns a 1x1 matrix for two single-row inputs.
    score = float(cosine_similarity(orig_embed.cpu().numpy(), back_embed.cpu().numpy())[0][0])
    print(f"[✓] Translation Preservation Score: {score:.4f}")
    return score



# Sample French source passage. Its wording mentions "en espagnol"; it is sample
# data and is left unchanged.
original_text = "Ceci est un document d’exemple contenant plusieurs phrases en espagnol. Le but de ce texte est de \
servir d’entrée pour les tests de traduction dans le système.  \
Les tests de traduction doivent être précis et efficaces. Assurez-vous que le contenu conserve sa \
structure et sa fluidité après la traduction. Ce document sera traduit en anglais ou en arabe selon ce qui \
est spécifié"

# English text to compare against the passage above; the score is printed by the call.
back_translated_text = "This is an example document containing several sentences in Spanish. The purpose of this text is to use input for translation tests in the system. Translation tests must be accurate and effective. Make sure that the content preserves its structure and fluidity after translation. This document will be translated into English or Arabic according to what is specified."
calculate_translation_preservation_score(original_text, back_translated_text)


In [None]:
# Chunking and embedding evaluation using a semantic-overlap metric.

In [None]:
# Load the Sentence-BERT model
# NOTE(review): this rebinds the global `embedding_model`; the two functions in this
# cell work on precomputed embeddings from a CSV and do not reference it.
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

def load_embeddings_from_csv(file_path):
    """
    Reads the 'embedding' column of a CSV file and parses each cell into a vector.

    Each cell is expected to be a stringified list such as "[0.1, 0.2, 0.3]":
    the first and last characters (the brackets) are dropped and the rest is
    parsed as comma-separated numbers.

    Args:
    - file_path (str): Path to the CSV file containing the embeddings.

    Returns:
    - np.ndarray: A 2D numpy array where each row is a vector embedding.
    """
    frame = pd.read_csv(file_path)
    vectors = [np.fromstring(cell[1:-1], sep=',') for cell in frame['embedding']]
    return np.array(vectors)

def calculate_semantic_overlap(embeddings):
    """
    Calculates the average cosine similarity between adjacent embedding vectors to estimate semantic coherence.

    All adjacent pairs are compared in one vectorised step. A zero-length vector
    is given similarity 0 with its neighbours instead of causing a division by zero.

    Args:
    - embeddings (np.ndarray): A 2D array of vector embeddings, one per chunk.

    Returns:
    - float: The average cosine similarity between adjacent chunks, or NaN when
      there are fewer than two embeddings (no adjacent pair exists).
    """
    vectors = np.asarray(embeddings, dtype=float)
    if len(vectors) < 2:
        # No adjacent pairs: the score is undefined. Return NaN explicitly
        # rather than triggering numpy's "Mean of empty slice" warning.
        return float("nan")

    norms = np.linalg.norm(vectors, axis=1)
    norms[norms == 0] = 1.0  # zero vectors then yield similarity 0

    dot_products = np.sum(vectors[:-1] * vectors[1:], axis=1)
    similarities = dot_products / (norms[:-1] * norms[1:])
    return float(np.mean(similarities))

# === Example Usage ===

# Path to the CSV file containing chunk embeddings
# (the file must have an 'embedding' column, which load_embeddings_from_csv reads)
embedding_csv_file = '/content/drive/MyDrive/osos_technical/csv_ouput_osos/embedding_csv/Dataset summaries and citations_chunked_text_details.csv_embedding_results.csv'  # Replace with your actual path

# Load the embeddings from the CSV
embeddings = load_embeddings_from_csv(embedding_csv_file)

# Calculate and print the semantic overlap score
# (mean cosine similarity of each chunk with the next one)
semantic_overlap_score = calculate_semantic_overlap(embeddings)
print(f"Average Semantic Overlap Score between Chunks: {semantic_overlap_score:.4f}")


In [None]:
# Summary evaluation metric: factual_consistency_score

In [None]:
from transformers import pipeline
import numpy as np

# Load a QA pipeline (extractive question answering; used by get_answers below)
qa_pipeline = pipeline("question-answering", model="distilbert-base-uncased-distilled-squad")

# Original text and summary
# NOTE: `original_text` and `summary` reuse names assigned by earlier cells in this notebook.
original_text = """
This is a long publication about advancements in AI. Artificial intelligence (AI) is the simulation of human intelligence in machines.
The goal is to create systems capable of performing tasks that typically require human intelligence. These tasks include speech recognition,
decision-making, and visual perception. In recent years, AI has seen significant growth in applications ranging from self-driving cars
to healthcare diagnostics. With advancements in deep learning and neural networks, AI is expected to revolutionize industries worldwide.
However, challenges like ethical considerations and data privacy remain.
"""

summary = """
Artificial intelligence is the simulation of human intelligence in machines. In recent years, AI has seen significant growth in applications
ranging from self-driving cars to healthcare diagnostics. With advancements in deep learning and neural networks, AI is expected to
revolutionize industries worldwide.
"""

# Generate key questions from the summary to evaluate consistency
# (hand-written here; each question is asked of both the original text and the summary)
questions = [
    "What is artificial intelligence?",
    "What are some applications of AI?",
    "What recent advancements are helping AI?",
    "What impact is AI expected to have on industries?",
]

def get_answers(text, questions):
    """
    Ask each question against the given text with the module-level QA pipeline.

    Parameters:
        text (str): The context the answers are extracted from.
        questions (list): Questions (str) to ask about the text.

    Returns:
        list: One answer string per question, in the same order.
    """
    answers = []
    for question in questions:
        result = qa_pipeline(question=question, context=text)
        answers.append(result["answer"])
    return answers

def factual_consistency_score(orig_ans, summ_ans):
    """
    Compute a simple factual consistency score between original and summary answers.

    Answers are compared pairwise, by position. A pair matches when, after
    trimming whitespace and lower-casing, both answers are non-empty and one
    contains the other. An empty answer never matches: the empty string is a
    substring of every string, so it would otherwise count as agreement.

    Parameters:
        orig_ans (list): Answers extracted from the original text.
        summ_ans (list): Answers extracted from the summary.

    Returns:
        float: Fraction of matching pairs (0 to 1). 1.0 means every pair
               matched; 0.0 is returned when there are no pairs to compare.
               If the lists differ in length, the extra answers are ignored.
    """
    matches = []
    for original_answer, summary_answer in zip(orig_ans, summ_ans):
        a1 = original_answer.strip().lower()
        a2 = summary_answer.strip().lower()
        matched = bool(a1) and bool(a2) and (a1 in a2 or a2 in a1)
        matches.append(1 if matched else 0)

    if not matches:
        return 0.0
    return sum(matches) / len(matches)

# Get answers from both original and summary
# (the same questions are asked of each text, so the answers line up by position)
original_answers = get_answers(original_text, questions)
summary_answers = get_answers(summary, questions)

# Calculate and print score
score = factual_consistency_score(original_answers, summary_answers)
print(f"\nFactual Consistency Score: {score:.2f}")


In [None]:
# RAG System evaluation

In [None]:
# Load embedding model
# (bound to the global `embedding_model`, which calculate_answer_grounding_score reads)
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

def calculate_answer_grounding_score(answer: str, retrieved_chunks: list) -> float:
    """
    Calculates how well a generated answer is grounded in the provided retrieved context using cosine similarity.

    The answer and every retrieved chunk are embedded with the module-level
    `embedding_model`; the score is the highest cosine similarity between the
    answer and any single chunk. The score is printed and returned.

    Args:
        answer (str): The generated answer whose grounding needs to be evaluated.
        retrieved_chunks (list of str): A list of text chunks retrieved from the context or knowledge base.

    Returns:
        float: The maximum cosine similarity score between the answer and any of the retrieved chunks.
               A score closer to 1 indicates stronger grounding. 0.0 is returned when
               `retrieved_chunks` is empty (there is nothing to ground the answer in).
    """
    print("[*] Calculating Answer Grounding Score...")

    # With no chunks there is nothing to compare against, and taking the max of
    # an empty similarity tensor would raise.
    if not retrieved_chunks:
        max_similarity_score = 0.0
        print(f"[✓] Answer Grounding Score: {max_similarity_score:.4f}")
        return max_similarity_score

    # Encoding the answer and the retrieved chunks to embeddings
    answer_embedding = embedding_model.encode(answer, convert_to_tensor=True)
    chunk_embeddings = embedding_model.encode(retrieved_chunks, convert_to_tensor=True)

    # Calculate cosine similarities between the answer and each retrieved chunk
    similarities = util.pytorch_cos_sim(answer_embedding, chunk_embeddings)[0]

    # Get the maximum similarity score, which indicates the best-grounded chunk
    max_similarity_score = similarities.max().item()

    print(f"[✓] Answer Grounding Score: {max_similarity_score:.4f}")
    return max_similarity_score

# Example Answer (the generated text whose grounding is being scored)
answer = "The basis for seasonality in the analysis of soil carbon dynamics in residential lawns is based on findings by Trammell et al. (2019). They showed that C3 species dominated lawns in Los Angeles, CA. The approach used here is different because it uses a different dataset and a different method to estimate SOC sequestration."

# Example Retrieved Chunks (Footnotes, Text, etc.)
# Each list element is compared with the answer separately.
retrieved_chunks = [
    "1. The total sequestration reported here differs from that reported by Br | au | n and Bremer (2019), because they reported mean rather than total SOC sequestration rate across depths.",
    "2. Campbell et al., (2014) reported 0-5 cm depth with a linear fit, and did not report the non-linear relationships for deeper intervals. We fit 3rd order polynomials to all intervals.",
    "3. Authors used linear regression in Huyler et al., 2014 and localized polynomial fitting in Huyler et al., 2017. We applied the most parsimonious polynomial regressions to each depth interval, which were a linear, 2nd order, and third order polynomial to the top, mid, and bottom interval, respectively.",
    "4. Raciti et al., (2011) applied a linear regression to the whole profile, but the 10-30, 30-70, and 70-100 cm intervals had non-linear trends. We fit these intervals with 3rd order polynomials, and fit a linear regression to the whole profile.",
    "5. Seasonality for these cities is based on findings by Trammell et al. (2019), https://doi.org/10.1002/eap.1884. Contrary to expectation based on climate, they showed C3 species dominated lawns in Los Angeles, CA.",
    "Raciti, S. M., Groffman, P. M., Jenkins, J. C., Pouyat, R. V., Fahey, T. J., Pickett, S. T. A., & Cadenasso, M. L. (2011). Accumulation of Carbon and Nitrogen in Residential Soils with Different Land-Use Histories. Ecosystems, 14 (2), 287–297. https://doi.org/10.1007/s10021-010-9409-3"
]


# Calculate the Answer Grounding Score
# (the function prints the score itself; the line below prints it a second time)
answer_grounding_score = calculate_answer_grounding_score(answer, retrieved_chunks)
print(f"Answer Grounding Score: {answer_grounding_score:.4f}")


In [None]:
# Implement creative algorithms for extracting text from tables and charts.

In [None]:
def extract_tables_from_pdf(pdf_path):
    """
    Extracts every table from a PDF file using pdfplumber.

    The first row of each table is treated as its header row. Header cells
    that are missing or blank are named "Column<i>", where <i> is the
    zero-based column position.

    Args:
        pdf_path (str): Path to the PDF file.

    Returns:
        list of dict: One dictionary per data row, across all tables on all
                      pages in document order, with column headers as keys.
                      An empty list is returned when no table is found.
    """
    tables = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            # extract_tables() yields every table on the page, whereas
            # extract_table() only returns the largest one and would silently
            # drop any additional tables.
            for table in page.extract_tables():
                if not table:
                    continue
                # Blank / whitespace-only / None headers all get a positional
                # name so they cannot collide on the empty-string key.
                headers = [
                    (h or "").strip() or f"Column{i}"
                    for i, h in enumerate(table[0])
                ]
                for row in table[1:]:
                    if not row:
                        continue
                    row_dict = {}
                    for i, cell in enumerate(row):
                        # A row wider than the header row gets a positional
                        # key instead of raising IndexError.
                        key = headers[i] if i < len(headers) else f"Column{i}"
                        row_dict[key] = cell
                    tables.append(row_dict)
    return tables


def extract_tables_from_docx(docx_path):
    """
    Extracts tables from a DOCX file using python-docx.

    The first row of each table supplies the column headers. Blank header
    cells are named "Column<i>" (zero-based position), matching the fallback
    used by the PDF table extractor, so that several blank headers do not
    overwrite each other under the empty-string key.

    Args:
        docx_path (str): Path to the DOCX file.

    Returns:
        list of dict: A list of dictionaries where each dictionary represents a row
                      from the extracted tables, with column headers as keys.
                      Rows whose cell count differs from the header row are
                      skipped.
    """
    doc = docx.Document(docx_path)
    table_data = []
    for table in doc.tables:
        rows = list(table.rows)
        if not rows:
            # A table without rows has no header row to read; skip it instead
            # of failing on rows[0].
            continue
        keys = [
            cell.text.strip() or f"Column{i}"
            for i, cell in enumerate(rows[0].cells)
        ]
        for row in rows[1:]:
            values = [cell.text.strip() for cell in row.cells]
            # Only rows that line up with the header row can be keyed reliably.
            if len(values) == len(keys):
                table_data.append(dict(zip(keys, values)))
    return table_data


def extract_text_from_csv(file_path):
    """
    Reads a CSV file and returns its contents as a list of dictionaries.

    Tries multiple common encodings to handle potential encoding issues:
    UTF-8 first, then cp1252, then ISO-8859-1.

    Args:
        file_path (str): Path to the CSV file.

    Returns:
        list of dict: A list of dictionaries representing rows in the CSV file.

    Raises:
        UnicodeDecodeError: If the file cannot be decoded using common encodings.
            ISO-8859-1 maps every byte to a character, so in practice this is
            only a safety net.
    """
    last_error = None
    # cp1252 must come before ISO-8859-1: ISO-8859-1 accepts every byte
    # sequence, so anything listed after it would never be tried.
    for encoding in ('utf-8', 'cp1252', 'ISO-8859-1'):
        try:
            df = pd.read_csv(file_path, encoding=encoding)
        except UnicodeDecodeError as exc:
            last_error = exc
            continue
        return df.to_dict(orient='records')
    # UnicodeDecodeError requires (encoding, object, start, end, reason);
    # constructing it from a single message string raises TypeError instead.
    raise UnicodeDecodeError(
        last_error.encoding,
        last_error.object,
        last_error.start,
        last_error.end,
        "Unable to decode CSV with common encodings.",
    ) from last_error


def extract_text_from_excel(file_path):
    """
    Loads an Excel workbook with pandas and converts it to row records.

    pandas.read_excel is called with its defaults, so only the first sheet
    (pandas' default ``sheet_name=0``) is read.

    Args:
        file_path (str): Location of the Excel file (.xls or .xlsx).

    Returns:
        list of dict: One dictionary per row, mapping column name to value.
    """
    # Load the sheet and turn each row into its own dict in a single step.
    return pd.read_excel(file_path).to_dict(orient='records')


def extract_structured_text(file_path):
    """
    Extracts structured table/chart data from various file formats including:
    PDF, DOCX, CSV, XLS, XLSX, XLSM.

    The file type is decided from the file extension, compared
    case-insensitively.

    Args:
        file_path (str): Path to the file.

    Returns:
        list of dict: Extracted structured data from the file.

    Raises:
        NotImplementedError: If attempting to process unsupported .txt files.
        ValueError: If the file extension is not supported.
    """
    ext = os.path.splitext(file_path)[-1].lower()

    if ext == '.pdf':
        return extract_tables_from_pdf(file_path)

    if ext == '.docx':
        return extract_tables_from_docx(file_path)

    if ext == '.csv':
        return extract_text_from_csv(file_path)

    # .xlsm (macro-enabled workbook) is one of the input types this notebook
    # is required to handle; it goes to the same reader as .xls/.xlsx.
    if ext in ('.xls', '.xlsx', '.xlsm'):
        return extract_text_from_excel(file_path)

    if ext == '.txt':
        # Plain text has no table structure to extract, so it is rejected with
        # a more specific error than other unsupported extensions.
        raise NotImplementedError("Text files are not supported for structured chart/table extraction.")

    raise ValueError(f"Unsupported file type: {ext}")


# --- Example Usage ---
# give file path here
# NOTE(review): this is an absolute Google Drive path; it only resolves after
# the drive.mount('/content/drive') cell at the top of the notebook has run.
file_path = "/content/drive/MyDrive/osos_technical_testing_folder/Dr.X Files/party budget1.xlsx"
# The ".xlsx" extension makes extract_structured_text use the Excel reader.
structured_output = extract_structured_text(file_path)

print("Extracted Structured Table/Chart Data:")
# Each extracted row is printed on its own line.
for row in structured_output:
  print(row)

