In [None]:
# Install the notebook's dependencies with %pip (one magic per line).
# NOTE(review): none of these installs is version-pinned — consider pinning for reproducibility.
%pip install -r requirements.txt
# -U upgrades qdrant_client to the newest available release.
%pip install -U qdrant_client
# CLIP is installed directly from its GitHub repository.
%pip install git+https://github.com/openai/CLIP.git
%pip install llama-index-embeddings-clip
%pip install llama-index-embeddings-openai

In [None]:
from pathlib import Path
from llama_index.multi_modal_llms.openai import OpenAIMultiModal
from llama_index.core import SimpleDirectoryReader
import re
import matplotlib.pyplot as plt
from PIL import Image
import pandas as pd
from docx import Document
import fitz  # PyMuPDF
import docx
import PyPDF2
from llama_index.core import PromptTemplate
from llama_index.core.query_engine import SimpleMultiModalQueryEngine
import os
from typing import List, Dict
from llama_index.core.indices import MultiModalVectorStoreIndex
from llama_index.vector_stores.qdrant import QdrantVectorStore
from llama_index.core import SimpleDirectoryReader, StorageContext
from llama_index.multi_modal_llms.openai import OpenAIMultiModal
from llama_index.llms.openai import OpenAI

import qdrant_client
from llama_index.core import SimpleDirectoryReader
# # Load it
from llama_index.core import load_index_from_storage
from dotenv import load_dotenv
import shutil
load_dotenv()



In [None]:
from docx2pdf import convert


In [4]:
# Read credentials/endpoints from the process environment.
# Each value is None when the corresponding variable is not set.
OPENAI_API_KEY, QDRANT_URL, QDRANT_API_KEY = (
    os.environ.get(variable_name)
    for variable_name in ("OPENAI_API_KEY", "QDRANT_URL", "QDRANT_API_KEY")
)

In [5]:

# Multi-modal LLM client used to describe the extracted images.
openai_mm_llm = OpenAIMultiModal(
    model="gpt-4o",
    api_key=OPENAI_API_KEY,
    max_new_tokens=1500,
)

In [6]:
# Source folders: documents for text extraction, and PDFs for image extraction.
input_folder, pdf_directory = "Proceduri_Interne", "pdfs"

# Extracted images are written to `output_folder`; `image_directory` is the
# "./"-prefixed spelling of the same folder name, used when reading them back.
output_folder = "iframe_image_extraction"
image_directory = "./iframe_image_extraction"

In [7]:

def extract_images_from_word(doc_path, output_folder):
    """Save the embedded images of a .docx file into ``output_folder``.

    :param doc_path: str, path to the .docx file
    :param output_folder: str, folder that receives the image files (created if missing)
    :return: list of dicts with keys ``image`` (written file path), ``text``
             (always an empty string) and ``origin`` (``doc_path``)
    """
    # Use the module-qualified constructor: the bare name ``Document`` is
    # re-bound to llama_index's Document by later cells of this notebook.
    doc = docx.Document(doc_path)
    os.makedirs(output_folder, exist_ok=True)
    source_name = os.path.basename(doc_path)

    word_image_list = []
    for i, rel in enumerate(doc.part.rels.values()):
        # Externally linked targets have no embedded part to read bytes from.
        if rel.is_external or "image" not in rel.target_ref:
            continue
        # The file is always named ".png"; the bytes are written unchanged.
        # NOTE(review): the embedded image may be in another format — confirm
        # that downstream readers detect the format from content.
        img_filename = os.path.join(output_folder, f"{source_name}_image_{i+1}.png")
        with open(img_filename, "wb") as f:
            f.write(rel.target_part.blob)
        word_image_list.append({"image": img_filename, "text": "", "origin": doc_path})

    return word_image_list

def extract_images_from_pdf(pdf_path, output_folder):
    """Save the images referenced by each page of a PDF into ``output_folder``.

    :param pdf_path: str, path to the PDF file
    :param output_folder: str, folder that receives the image files (created if missing)
    :return: list of dicts with keys ``image`` (written file path), ``text``
             (always an empty string) and ``origin`` (``pdf_path``)
    """
    os.makedirs(output_folder, exist_ok=True)
    source_name = os.path.basename(pdf_path)

    pdf_image_list = []
    # The context manager closes the document even if extraction fails midway.
    with fitz.open(pdf_path) as pdf_document:
        for page_num in range(len(pdf_document)):
            page = pdf_document.load_page(page_num)
            for img_index, img in enumerate(page.get_images(full=True)):
                xref = img[0]  # first tuple entry is the image's cross-reference number
                img_bytes = pdf_document.extract_image(xref)["image"]
                # The file is always named ".png"; the bytes are written unchanged.
                img_filename = os.path.join(
                    output_folder,
                    f"{source_name}_page_{page_num+1}_image_{img_index+1}.png",
                )
                with open(img_filename, "wb") as f:
                    f.write(img_bytes)
                pdf_image_list.append({"image": img_filename, "text": "", "origin": pdf_path})

    return pdf_image_list

def extract_images_from_folder(folder_path, output_folder):
    """Extract the images of every ``.pdf`` file found directly in ``folder_path``.

    Only PDFs are processed; .docx handling was commented out in the original
    code and stays disabled here.

    :param folder_path: str, folder containing the source documents
    :param output_folder: str, folder that receives the image files (created if missing)
    :return: list of image records (dicts) from all processed PDFs, ordered by file name
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(output_folder, exist_ok=True)

    extracted = []
    # Sorted so the result order does not depend on the file system's listing order.
    for filename in sorted(os.listdir(folder_path)):
        if filename.endswith(".pdf"):
            file_path = os.path.join(folder_path, filename)
            extracted.extend(extract_images_from_pdf(file_path, output_folder))

    return extracted
# Specify the folder containing the Word and PDF files and the output folder


# Put your local directory here


In [8]:


def is_nonsense_text(text):
    """Return True when ``text`` consists only of digits and whitespace.

    The earlier extra regex check (``^(\\d+\\n?)+$``) was removed: anything it
    matched is digits plus newlines, which the digit test below already
    reports, and its nested quantifiers backtracked exponentially on inputs
    such as a long digit run followed by a letter.

    :param text: str, text to inspect
    :return: bool, True for digit-only text (ignoring whitespace); False for
             empty or whitespace-only text and for anything containing other characters
    """
    # Drop every whitespace character (spaces, tabs, newlines) before testing.
    cleaned_text = ''.join(text.split())
    return cleaned_text.isdigit()

## Generate image descriptions with the GPT-4o multi-modal LLM

In [None]:
# Extract the images of the PDFs in `pdf_directory` into `output_folder`.
image_list = extract_images_from_folder(pdf_directory, output_folder)
# Display the extraction records (one dict per written image).
image_list

In [None]:

# Paths of the extracted images whose file name contains ".pdf".
image_paths = [
    str(os.path.join("./iframe_image_extraction", entry))
    for entry in os.listdir("./iframe_image_extraction")
    if ".pdf" in entry
]


def plot_images(image_paths):
    """Show up to six images from ``image_paths`` in a 2x3 grid.

    Entries that are not existing files are skipped without using a grid
    slot, so the first six existing files are shown.

    :param image_paths: iterable of str, candidate image file paths
    :return: None (draws on a new matplotlib figure)
    """
    n_rows, n_cols = 2, 3
    capacity = n_rows * n_cols  # the limit is derived from the grid, so they cannot disagree

    plt.figure(figsize=(16, 9))
    images_shown = 0
    for img_path in image_paths:
        if images_shown >= capacity:
            break
        if not os.path.isfile(img_path):
            continue

        plt.subplot(n_rows, n_cols, images_shown + 1)
        # Close the file handle once the image has been handed to matplotlib.
        with Image.open(img_path) as image:
            plt.imshow(image)
        plt.xticks([])
        plt.yticks([])

        images_shown += 1


# Preview the extracted images collected in `image_paths`.
plot_images(image_paths)

### Using GPT-4o to describe the extracted images

In [None]:


def process_images(directory):
    """Ask the multi-modal LLM for an alt-text description of every file in ``directory``.

    The loaded documents and each individual result are printed as they are produced.

    :param directory: str, folder read with SimpleDirectoryReader
    :return: list of dicts with keys ``filename`` (the document's ``file_path``
             metadata) and ``open_ai_description`` (the response text)
    """
    image_documents = SimpleDirectoryReader(directory).load_data()
    print(image_documents)

    def describe(image_document):
        # One LLM call per image document.
        response = openai_mm_llm.complete(
            prompt="Descrie imaginea de mai jos ca text alternativ:",
            image_documents=[image_document],
        )
        entry = {
            "filename": image_document.metadata["file_path"],
            "open_ai_description": response.text,
        }
        print(entry)
        return entry

    return [describe(image_document) for image_document in image_documents]

# Describe every image found in `image_directory`.
image_descriptions = process_images(image_directory)

# Echo each file name and description, followed by a 50-dash separator line.
for entry in image_descriptions:
    print(
        f"Filename: {entry['filename']}",
        f"Description: {entry['open_ai_description']}",
        "-" * 50,
        sep="\n",
    )

In [None]:
# Tabulate the descriptions, keeping only the base name of each image file,
# and persist the table to CSV.
image_descriptions_df = pd.DataFrame(image_descriptions).assign(
    filename=lambda frame: frame["filename"].map(os.path.basename)
)
image_descriptions_df.to_csv("image_descriptions.csv", index=False)
image_descriptions_df

In [None]:
# Extraction records (image file -> source document), keyed by image base name.
image_df_origin_df = pd.DataFrame(image_list)
image_df_origin_df["image"] = image_df_origin_df["image"].map(os.path.basename)

# Attach the source document to every description. The description's own
# file name becomes `image_path`, so a description whose image has no
# extraction record (how="left") still keeps its image name.
image_descriptions_df_2 = (
    image_descriptions_df
    .merge(image_df_origin_df, left_on="filename", right_on="image", how="left")
    .drop(columns=["image", "text"])
    .rename(columns={"filename": "image_path", "origin": "origin_filename"})
    [["open_ai_description", "image_path", "origin_filename"]]
)
# Unmatched rows have a missing origin; na_action="ignore" leaves them missing
# instead of passing NaN to os.path.basename (which raises TypeError).
image_descriptions_df_2["origin_filename"] = image_descriptions_df_2["origin_filename"].map(
    os.path.basename, na_action="ignore"
)
# Overwrites the CSV written from `image_descriptions_df`, now with origin information.
image_descriptions_df_2.to_csv("image_descriptions.csv", index=False)
image_descriptions_df_2

### Image descriptions already extracted — next, extract the raw document text

In [None]:


def extract_text_from_docx(file_path):
    """
    Collect the text of a .docx file: all paragraphs first, then all table cells.

    :param file_path: str, path to the .docx file
    :return: str, the collected pieces joined with newlines
    """
    document = docx.Document(file_path)

    paragraph_texts = [paragraph.text for paragraph in document.paragraphs]
    cell_texts = [
        cell.text
        for table in document.tables
        for row in table.rows
        for cell in row.cells
    ]

    return '\n'.join(paragraph_texts + cell_texts)


def extract_text_from_pdf(file_path):
    """
    Collect the text of a PDF file, page by page.

    :param file_path: str, path to the PDF file
    :return: str, the per-page text joined with newlines
    """
    with open(file_path, 'rb') as pdf_file:
        # Text must be extracted while the underlying file is still open.
        page_texts = [page.extract_text() for page in PyPDF2.PdfReader(pdf_file).pages]

    return '\n'.join(page_texts)


def process_folder(folder_path: str) -> List[Dict[str, str]]:
    """
    Extract the text of every .docx and .pdf file found directly in a folder.

    :param folder_path: str, path to the folder containing the files
    :return: List[Dict[str, str]], one dict per supported file with the keys
             'origin_filename' and 'extracted_raw_text'
    """
    records = []

    for filename in os.listdir(folder_path):
        # Anything that is neither .docx nor .pdf is ignored.
        if not filename.endswith(('.docx', '.pdf')):
            continue

        extractor = extract_text_from_docx if filename.endswith('.docx') else extract_text_from_pdf
        records.append({
            'origin_filename': filename,
            'extracted_raw_text': extractor(os.path.join(folder_path, filename)),
        })

    return records

# Extract the raw text of every supported document in `input_folder`.
extracted_text = process_folder(input_folder)
# One row per document: origin_filename, extracted_raw_text.
extracted_text_df = pd.DataFrame(extracted_text)
# Persist the table so later cells can reload it from disk.
extracted_text_df.to_csv("extracted_text.csv", index=False)
extracted_text_df

In [None]:
from pathlib import Path
import requests
import urllib.request


# Reload both intermediate tables from the CSV files written earlier.
image_descriptions_df = pd.read_csv("image_descriptions.csv")
extracted_text_df = pd.read_csv("extracted_text.csv")

# Outer join on the source document name: documents without images and
# images without extracted text are both kept.
merged_df = extracted_text_df.merge(
    image_descriptions_df, on="origin_filename", how="outer"
)
merged_df


In [None]:
# Concatenate all image descriptions of a source document into one string.
# Rows with a missing description are dropped first, because str.join raises
# on NaN values.
filename_image_descriptions_summed = (
    image_descriptions_df
    .dropna(subset=["open_ai_description"])
    .groupby("origin_filename")
    .agg(image_descriptions_summed=("open_ai_description", " ".join))
    .reset_index()
)

# Drop a previously merged column first so re-running this cell replaces it
# instead of producing `_x` / `_y` suffixed duplicates.
merged_df = (
    merged_df
    .drop(columns=["image_descriptions_summed"], errors="ignore")
    .merge(filename_image_descriptions_summed, on="origin_filename", how="left")
)
merged_df

In [None]:
# Replace missing text with "" and record the character count of each text column.
text_columns = ["extracted_raw_text", "open_ai_description", "image_descriptions_summed"]

for column in text_columns:
    merged_df[column] = merged_df[column].fillna("")

for column in text_columns:
    merged_df[f"{column}_length"] = merged_df[column].apply(len)

merged_df

In [None]:
# Mean text length and mean summed-description length per source document.
merged_df.groupby("origin_filename")[["extracted_raw_text_length","image_descriptions_summed_length"]].mean().reset_index()

### Build Multi-modal index and vector store to index both text and images (locally)

In [None]:
# Replace missing text with "" (updates the column of `extracted_text_df` in place).
extracted_text_df["extracted_raw_text"] = extracted_text_df["extracted_raw_text"].fillna("")
extracted_text_df

In [None]:
print(extracted_text_df.shape)

import pandas as pd
from llama_index.core.node_parser import TokenTextSplitter

# Single shared token-based splitter. These are the settings the chunking
# has effectively always used: the former module-level splitter (4000/400)
# was never called, because chunk_text built its own 1500/100 splitter on
# every invocation.
splitter = TokenTextSplitter(
    chunk_size=1500,
    chunk_overlap=100,
)


def chunk_text(text):
    """Split ``text`` into overlapping token-based chunks.

    Prints the character length of the input and the number of chunks produced.

    :param text: str, raw document text
    :return: list of str, the chunks returned by the shared ``splitter``
    """
    print(len(text))
    chunks = splitter.split_text(text)
    print(len(chunks))
    return chunks


# One list of chunks per document...
extracted_text_df['chunked_text'] = extracted_text_df['extracted_raw_text'].apply(chunk_text)

# ...then one row per chunk, with a fresh 0..n-1 index.
extracted_text_df_exploded = (
    extracted_text_df
    .explode('chunked_text')
    .reset_index(drop=True)
)

extracted_text_df_exploded


In [None]:
# Display the image-description table.
image_descriptions_df

### Uploading text to hosted Qdrant

In [None]:
from qdrant_client import QdrantClient
import openai
from qdrant_client.http import models

# OpenAI client; os.environ[...] raises KeyError when the key is not set.
openai_client = openai.Client(
    api_key=os.environ["OPENAI_API_KEY"]
)

# embedding_model_name = "text-embedding-3-small"

# Client for the hosted Qdrant instance.
# NOTE: this re-binds the name `qdrant_client`, which the import cell bound
# to the qdrant_client module.
qdrant_client = QdrantClient(
    url=QDRANT_URL, 
    api_key=QDRANT_API_KEY
)

# List the existing collections (also exercises the connection).
print(qdrant_client.get_collections())

# Also set the key on the openai module itself.
openai.api_key = os.environ["OPENAI_API_KEY"]

# Target collection for the uploads below.
collection_name = "finaco_proceduri_interne_text_only"

In [None]:

# Create the collection only when it is missing, so this cell can be re-run
# (an unconditional create_collection fails on an existing collection).
if not qdrant_client.collection_exists(collection_name):
    qdrant_client.create_collection(
        collection_name=collection_name,
        # NOTE(review): 1536 presumably matches the dimensionality of the
        # embedding model used for the upload — confirm before changing models.
        vectors_config=models.VectorParams(size=1536, distance=models.Distance.COSINE),
    )

In [None]:
# Show the column names of the two tables that are uploaded below.
print(image_descriptions_df.columns)
print(extracted_text_df_exploded.columns)

In [None]:
from llama_index.embeddings.openai import OpenAIEmbedding

# OpenAI embedding model with the library's default settings.
embedding_model = OpenAIEmbedding()
# Keep the model name for direct calls to the OpenAI embeddings API.
embedding_model_name = embedding_model.model_name
embedding_model

In [None]:
# Imported under an alias so the name `Document` (bound to python-docx's
# Document by the import cell) is not re-bound here.
from llama_index.core import Document as LlamaDocument, VectorStoreIndex, StorageContext
from llama_index.vector_stores.qdrant import QdrantVectorStore

# Step 1: Prepare the Documents
# Skip rows without a usable description: NaN (e.g. an empty CSV cell) is not
# valid document text. NOTE(review): empty strings are presumably rejected by
# the embedding API — confirm.
described_images_df = image_descriptions_df.dropna(subset=["open_ai_description"])
described_images_df = described_images_df[
    described_images_df["open_ai_description"].astype(str).str.strip() != ""
]

documents = [
    LlamaDocument(
        text=row["open_ai_description"],
        metadata={
            "image_path": row["image_path"],
            "origin_filename": row["origin_filename"],
        },
    )
    for _, row in described_images_df.iterrows()
]

# Step 2: Set Up the Qdrant Vector Store
vector_store = QdrantVectorStore(client=qdrant_client, collection_name=collection_name)

# Step 3: Create the Storage Context
storage_context = StorageContext.from_defaults(vector_store=vector_store)

# Step 4: Create the Index and Insert Documents
index = VectorStoreIndex.from_documents(
    documents,
    storage_context=storage_context,
)

# Step 5: Verify the Data Insertion (Optional)
retriever = index.as_retriever(similarity_top_k=5)
response = retriever.retrieve("Cum declari reprezentantul Fiscal?")
print(response)


In [None]:
# Imported under an alias so the name `Document` (bound to python-docx's
# Document by the import cell) is not re-bound here.
from llama_index.core import Document as LlamaDocument, VectorStoreIndex, StorageContext
from llama_index.vector_stores.qdrant import QdrantVectorStore

# Step 1: Prepare the Documents
# Skip rows without a usable chunk: explode() yields NaN for a document whose
# chunk list is empty. NOTE(review): empty strings are presumably rejected by
# the embedding API — confirm.
text_chunks_df = extracted_text_df_exploded.dropna(subset=["chunked_text"])
text_chunks_df = text_chunks_df[
    text_chunks_df["chunked_text"].astype(str).str.strip() != ""
]

documents = [
    LlamaDocument(
        text=row["chunked_text"],
        metadata={
            "origin_filename": row["origin_filename"],
        },
    )
    for _, row in text_chunks_df.iterrows()
]

# Step 2: Set Up the Qdrant Vector Store
vector_store = QdrantVectorStore(client=qdrant_client, collection_name=collection_name)

# Step 3: Create the Storage Context
storage_context = StorageContext.from_defaults(vector_store=vector_store)

# Step 4: Create the Index and Insert Documents
index = VectorStoreIndex.from_documents(
    documents,
    storage_context=storage_context,
)

# Step 5: Verify the Data Insertion (Optional)
retriever = index.as_retriever(similarity_top_k=5)
response = retriever.retrieve("Cum declari reprezentantul Fiscal?")
print(response)

### Retrieve data

In [None]:
from qdrant_client import QdrantClient
import os
import openai
from qdrant_client.http import models
# from llama_index import VectorStoreIndex, StorageContext 
from llama_index.vector_stores.qdrant import QdrantVectorStore
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
from llama_index.core import StorageContext
# from llama_index.vector_stores import QdrantVectorStore
from llama_index.embeddings.openai import OpenAIEmbedding

# Re-create the clients so the retrieval cells do not need the upload cells
# to have been run in this kernel.
openai_client = openai.Client(
    api_key=os.environ["OPENAI_API_KEY"]
)

# QDRANT_URL / QDRANT_API_KEY still come from the configuration cell.
qdrant_client = QdrantClient(
    url=QDRANT_URL, 
    api_key=QDRANT_API_KEY
)

collection_name = "finaco_proceduri_interne_text_only"

# Vector store bound to the existing collection.
vector_store = QdrantVectorStore(client=qdrant_client, collection_name=collection_name)

print(qdrant_client.get_collections())

openai.api_key = os.environ["OPENAI_API_KEY"]

# Embedding model the index uses for queries.
embed_model = OpenAIEmbedding(api_key=os.environ["OPENAI_API_KEY"])

# Build the index on top of the existing vector store.
index = VectorStoreIndex.from_vector_store(vector_store=vector_store, embed_model=embed_model)
index

In [None]:
query_text = "Cum declari reprezentantul Fiscal?"

# Embed the query with the model configured in the retrieval setup cell
# (`embed_model`), so this cell does not depend on `embedding_model_name`
# from the upload section.
query_vector = openai_client.embeddings.create(
    input=query_text, model=embed_model.model_name
).data[0].embedding

# query_points replaces the deprecated `search`; `.points` is the list of
# scored points.
qdrant_client.query_points(
    collection_name=collection_name,
    query=query_vector,
).points

In [None]:
# Retrieve the five most similar nodes for the sample question via the index.
retriever = index.as_retriever(similarity_top_k=5)
response = retriever.retrieve("Cum declari reprezentantul Fiscal?")
response

In [None]:
# Ask the sample question through a chat engine built on the index.
query_str = "Cum declari reprezentantul Fiscal?"

llm = OpenAI(api_key=os.environ["OPENAI_API_KEY"])
chat_engine = index.as_chat_engine(llm=llm)
response = chat_engine.chat(query_str)

response

### 2. Multi-Modal RAG Querying

In [14]:
from llama_index.core import PromptTemplate
from llama_index.core.query_engine import SimpleMultiModalQueryEngine

# Romanian QA prompt: context block, instruction to answer from the context
# only, then the question.
qa_tmpl_str = "".join(
    [
        "Informațiile de context sunt mai jos.\n",
        "---------------------\n",
        "{context_str}\n",
        "---------------------\n",
        "Având în vedere informațiile din context și nu cunoștințe anterioare, ",
        "răspunde la întrebare.\n",
        "Întrebare: {query_str}\n",
        "Răspuns: ",
    ]
)
qa_tmpl = PromptTemplate(qa_tmpl_str)

llm = OpenAI(api_key=os.environ["OPENAI_API_KEY"])

# Query engine over the index: top-5 retrieval, answers built with the prompt above.
query_engine = index.as_query_engine(
    llm=llm,
    text_qa_template=qa_tmpl,
    similarity_top_k=5,
)

query_str = "Cum declari reprezentantul Fiscal?"
response = query_engine.query(query_str)

In [None]:
# Display the query engine's response.
response

In [None]:
# Nodes that were retrieved as context for the response.
source_nodes = response.source_nodes
source_nodes

In [None]:
# Collect the image files referenced by the retrieved nodes (only nodes that
# carry an "image_path" metadata entry), then preview them.
images = []
for node in source_nodes:
    if "image_path" not in node.metadata:
        continue
    image_name = node.metadata["image_path"]
    print(image_name)
    images.append("./iframe_image_extraction/" + image_name)

print(images)

plot_images(images)


In [None]:
# Plots the same `images` list again (the previous cell already calls this).
plot_images(images)
