In [11]:
import os
from google.colab import drive
drive.mount('/content/drive', force_remount=True)
os.chdir('/content/drive/MyDrive/surya/')

Mounted at /content/drive


In [12]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [14]:
!pip install transformers faiss-cpu PyMuPDF pdf2image pillow
!pip install langchain
!pip install -U langchain-community
!apt-get install -y poppler-utils


Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
poppler-utils is already the newest version (22.02.0-2ubuntu0.5).
0 upgraded, 0 newly installed, 0 to remove and 49 not upgraded.


In [15]:
import fitz  # PyMuPDF for PDF text extraction
from pdf2image import convert_from_path
from PIL import Image

# Path to your PDF file (upload to Colab first)
pdf_path = "/content/drive/MyDrive/surya/AR_24797_ZOMATO_2023_2024_03082024162627-32_Pages.pdf"

# Initialize lists to hold text and image data
pdf_texts = []
pdf_images = []

# Extract text and images from each page
with fitz.open(pdf_path) as pdf:
    for page_num in range(pdf.page_count):
        page = pdf[page_num]
        pdf_texts.append(page.get_text())  # Extract text from page

        # Convert page to an image
        images = convert_from_path(pdf_path, first_page=page_num + 1, last_page=page_num + 1)
        pdf_images.extend(images)  # Store images


In [17]:
from transformers import CLIPProcessor, CLIPModel
import torch
import numpy as np

# Initialize CLIP model and processor for embeddings
clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

# Function to split long text into chunks that fit the model's limit
def split_text(text, chunk_size=77):
    words = text.split()
    return [' '.join(words[i:i + chunk_size]) for i in range(0, len(words), chunk_size)]

# Prepare text data by splitting long texts into chunks of 77 tokens or less
chunked_texts = []
for text in pdf_texts:  # pdf_texts contains extracted text from the PDF
    chunked_texts.extend(split_text(text))

# Generate embeddings for each text chunk
text_inputs = processor(text=chunked_texts, return_tensors="pt", padding=True, truncation=True, max_length=77)
text_embeddings = clip_model.get_text_features(**text_inputs)
text_embeddings = text_embeddings / text_embeddings.norm(dim=-1, keepdim=True)  # Normalize embeddings

# Process and generate image embeddings
image_inputs = processor(images=pdf_images, return_tensors="pt")  # pdf_images contains images from PDF
image_embeddings = clip_model.get_image_features(**image_inputs)
image_embeddings = image_embeddings / image_embeddings.norm(dim=-1, keepdim=True)  # Normalize embeddings

# Convert embeddings to numpy arrays for FAISS compatibility
text_embeddings = text_embeddings.detach().cpu().numpy()
image_embeddings = image_embeddings.detach().cpu().numpy()


IndexError: list index out of range

In [21]:
from transformers import CLIPProcessor, CLIPModel
import torch
import numpy as np

# Check if CUDA is available and use GPU if possible
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Clear any cached memory from previous sessions
torch.cuda.empty_cache()

# Attempt to load a model, handling memory issues gracefully
clip_model = None

try:
    # Initialize CLIP model and processor for embeddings
    clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to(device)  # Move model to GPU
except Exception as e:
    if "CUDA out of memory" in str(e):
        print("Out of memory! Trying a smaller model.")
        clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch16").to(device)  # Try smaller model
    else:
        raise  # Raise any other exceptions

processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

# Debug: Check the extracted PDF texts
print(f"Number of raw texts extracted from PDF: {len(pdf_texts)}")
if len(pdf_texts) == 0:
    raise ValueError("No text extracted from PDF. Please check the extraction process.")

# Function to split long text into chunks that fit the model's limit
def split_text(text, chunk_size=77):
    words = text.split()
    return [' '.join(words[i:i + chunk_size]) for i in range(0, len(words), chunk_size)]

# Prepare text data by splitting long texts into chunks of 77 tokens or less
chunked_texts = []
for text in pdf_texts:  # pdf_texts contains extracted text from the PDF
    if text:  # Ensure text is not empty
        chunks = split_text(text)
        chunked_texts.extend(chunks)

# Debugging: Check the number of chunked texts
print(f"Number of text chunks: {len(chunked_texts)}")
if len(chunked_texts) == 0:
    raise ValueError("No text chunks found after processing. Please check the PDF extraction and splitting.")

# Generate embeddings for each text chunk
text_inputs = processor(text=chunked_texts, return_tensors="pt", padding=True, truncation=True, max_length=77)

# Move inputs to GPU
text_inputs = {key: value.to(device) for key, value in text_inputs.items()}

# Generate text embeddings on GPU
text_embeddings = clip_model.get_text_features(**text_inputs)
text_embeddings = text_embeddings / text_embeddings.norm(dim=-1, keepdim=True)  # Normalize embeddings

# Process and generate image embeddings
image_inputs = processor(images=pdf_images, return_tensors="pt")  # pdf_images contains images from PDF

# Move image inputs to GPU
image_inputs = {key: value.to(device) for key, value in image_inputs.items()}

# Generate image embeddings on GPU
image_embeddings = clip_model.get_image_features(**image_inputs)
image_embeddings = image_embeddings / image_embeddings.norm(dim=-1, keepdim=True)  # Normalize embeddings

# Convert embeddings to numpy arrays for FAISS compatibility (move to CPU for conversion)
text_embeddings = text_embeddings.detach().cpu().numpy()
image_embeddings = image_embeddings.detach().cpu().numpy()


Number of raw texts extracted from PDF: 0


ValueError: No text extracted from PDF. Please check the extraction process.

In [22]:
!apt-get install tesseract-ocr
!pip install pytesseract


Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  tesseract-ocr-eng tesseract-ocr-osd
The following NEW packages will be installed:
  tesseract-ocr tesseract-ocr-eng tesseract-ocr-osd
0 upgraded, 3 newly installed, 0 to remove and 49 not upgraded.
Need to get 4,816 kB of archives.
After this operation, 15.6 MB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr-eng all 1:4.00~git30-7274cfa-1.1 [1,591 kB]
Get:2 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr-osd all 1:4.00~git30-7274cfa-1.1 [2,990 kB]
Get:3 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr amd64 4.1.1-2.1build1 [236 kB]
Fetched 4,816 kB in 2s (2,674 kB/s)
Selecting previously unselected package tesseract-ocr-eng.
(Reading database ... 123652 files and directories currently installed.)
Preparing to unpack .../tesseract-ocr-

In [24]:
from transformers import CLIPProcessor, CLIPModel
import torch
import numpy as np
from pdf2image import convert_from_path
from pytesseract import image_to_string
import os

# Set device to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load PDF file
pdf_path = "/content/drive/MyDrive/surya/AR_24797_ZOMATO_2023_2024_03082024162627-32_Pages.pdf"

# Convert PDF pages to images
pdf_images = convert_from_path(pdf_path)

# Extract text from images using OCR
pdf_texts = []
for image in pdf_images:
    text = image_to_string(image)
    pdf_texts.append(text)

# Check if any text was extracted
print(f"Number of raw texts extracted from PDF: {len(pdf_texts)}")
if len(pdf_texts) == 0:
    raise ValueError("No text extracted from PDF. Please check the extraction process.")

# Initialize CLIP model and processor for embeddings
try:
    clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to(device)  # Move model to GPU
    processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
except OutOfMemoryError:
    print("Out of memory! Trying a smaller model.")
    clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch16").to(device)  # Try smaller model

# Function to split long text into chunks that fit the model's limit
def split_text(text, chunk_size=77):
    words = text.split()
    return [' '.join(words[i:i + chunk_size]) for i in range(0, len(words), chunk_size)]

# Prepare text data by splitting long texts into chunks of 77 tokens or less
chunked_texts = []
for text in pdf_texts:
    chunked_texts.extend(split_text(text))

print(f"Number of text chunks: {len(chunked_texts)}")

# Generate embeddings for each text chunk
text_inputs = processor(text=chunked_texts, return_tensors="pt", padding=True, truncation=True, max_length=77)
text_inputs = {k: v.to(device) for k, v in text_inputs.items()}  # Move inputs to GPU

text_embeddings = clip_model.get_text_features(**text_inputs)
text_embeddings = text_embeddings / text_embeddings.norm(dim=-1, keepdim=True)  # Normalize embeddings

# Process and generate image embeddings
image_inputs = processor(images=pdf_images, return_tensors="pt")  # pdf_images contains images from PDF
image_inputs = {k: v.to(device) for k, v in image_inputs.items()}  # Move inputs to GPU

image_embeddings = clip_model.get_image_features(**image_inputs)
image_embeddings = image_embeddings / image_embeddings.norm(dim=-1, keepdim=True)  # Normalize embeddings

# Convert embeddings to numpy arrays for FAISS compatibility
text_embeddings = text_embeddings.detach().cpu().numpy()
image_embeddings = image_embeddings.detach().cpu().numpy()

# Combine text and image embeddings
combined_texts = chunked_texts + ["Image"] * len(pdf_images)  # Text labels for each embedding
combined_embeddings = np.concatenate((text_embeddings, image_embeddings), axis=0)  # Combined embeddings

# Initialize FAISS with precomputed embeddings and metadata
from langchain.vectorstores import FAISS

# Assuming `metadatas` is a list of dictionaries containing metadata for each embedding
metadatas = [{"text": text} for text in combined_texts]  # Create metadata

# Create FAISS vector store
vector_store = FAISS.from_embeddings(
    text_embeddings=[(text, embedding) for text, embedding in zip(combined_texts, combined_embeddings)],
    embedding=combined_embeddings
)

print("FAISS vector store created successfully.")

# Now you can query the vector store using text or images as input.


PDFPageCountError: Unable to get page count.
Syntax Error: Couldn't find trailer dictionary
Syntax Error: Couldn't find trailer dictionary
Syntax Error: Couldn't read xref table


In [22]:
from langchain.vectorstores import FAISS
import numpy as np

# Combine text and image embeddings and metadata
combined_texts = chunked_texts + ["Image"] * len(pdf_images)  # Text labels for each embedding
combined_embeddings = np.concatenate((text_embeddings, image_embeddings), axis=0)  # Combined embeddings

# Initialize FAISS with precomputed embeddings and metadata
vector_store = FAISS.from_embeddings(
    text_embeddings=[(text, embedding) for text, embedding in zip(combined_texts, combined_embeddings)],
    embedding=combined_embeddings
)

print("FAISS vector store created successfully.")




FAISS vector store created successfully.


In [23]:
import torch

# Function to generate embedding for a query
def generate_text_embedding(query):
    text_input = processor(text=query, return_tensors="pt", padding=True, truncation=True, max_length=77)
    text_embedding = clip_model.get_text_features(**text_input)
    text_embedding = text_embedding / text_embedding.norm(dim=-1, keepdim=True)  # Normalize embedding
    return text_embedding.detach().cpu().numpy()

def generate_image_embedding(image):
    image_input = processor(images=image, return_tensors="pt")
    image_embedding = clip_model.get_image_features(**image_input)
    image_embedding = image_embedding / image_embedding.norm(dim=-1, keepdim=True)  # Normalize embedding
    return image_embedding.detach().cpu().numpy()

In [25]:
# Example 1: Text Query
text_query = "Who Is Suryakanta Karan"  # Example query text
text_query_embedding = generate_text_embedding(text_query)
text_results = vector_store.similarity_search_by_vector(text_query_embedding[0], k=5)  # Retrieve top 5 matches
print("Text Query Results:")
for result in text_results:
    print(result.page_content)  # Display the retrieved text descriptions


Text Query Results:
Suryakanta Karan M: +91 8770228646 Email: suryakantakaran93@gmail.com LinkedIn: in/suryakanta-karan-595b4b34/ Professional Summary • Seasoned Sr. Lead Data Scientist with over 10+ years of expertise in developing, fine-tuning, and deploying Large Language Models (LLMs) such as GPT and LLaMA. Proven ability to deliver scalable NLP solutions using cutting-edge techniques including Retrieval-Augmented Generation (RAG), Reinforcement Learning from Human Feedback (RLHF), and advanced prompt engineering. Demonstrated leadership in managing cross-functional teams, mentoring engineers, and driving AI innovations that improve model accuracy
Sampling) and Top-K Sampling techniques in cloud environments to optimize text generation models, balancing response quality and computational efficiency
• Trained machine learning models (e.g., Decision Trees, Random Forest, and Neural Networks) to classify fraudulent and legitimate transactions, achieving high precision and recall score

In [26]:
# Example 2: Image Query
image_query = pdf_images[0]  # Example image from PDF
image_query_embedding = generate_image_embedding(image_query)
image_results = vector_store.similarity_search_by_vector(image_query_embedding[0], k=5)  # Retrieve top 5 matches
print("\nImage Query Results:")
for result in image_results:
    print(result.page_content)


Image Query Results:
Image
Image
Image
Image
KPIT Technology Ltd. : Data Analyst 16th Apr 2018 – 8th Feb 2019 Responsibilities: • Performed data analysis and visualization to support decision-making in automotive and manufacturing projects. • Extracted insights from large datasets, leading to improved efficiency in production processes and supply chain management. IL&FS Securities Ltd.: Developer May 2, 2017 – Mar 30, 2018 Responsibilities: • Automated processes using Unix and developed shell scripts to streamline operations. • Implemented Python for file transfers and integrated


In [28]:
# Function to generate an embedding for the text query
def generate_text_embedding(query):
    text_input = processor(text=query, return_tensors="pt", padding=True, truncation=True, max_length=77)
    text_embedding = clip_model.get_text_features(**text_input)
    text_embedding = text_embedding / text_embedding.norm(dim=-1, keepdim=True)  # Normalize embedding
    return text_embedding.detach().cpu().numpy()

# Example text query asking for an image
text_query = "Provide me an image of biodata."  # Example text query
text_query_embedding = generate_text_embedding(text_query)

# Search the vector store using the text embedding
results = vector_store.similarity_search_by_vector(text_query_embedding[0], k=10)  # Retrieve top 10 matches

print("\nResults for Query About an Image:")
for result in results:
    # Check if the result is an image based on metadata
    if result.metadata.get("type") == "image":
        display_image(result.page_content)  # Replace with actual display logic for the image
    else:
        print("Description:", result.page_content)  # Show text description for non-image results



Results for Query About an Image:
Description: Sampling) and Top-K Sampling techniques in cloud environments to optimize text generation models, balancing response quality and computational efficiency
Description: • Trained machine learning models (e.g., Decision Trees, Random Forest, and Neural Networks) to classify fraudulent and legitimate transactions, achieving high precision and recall scores. • Employed advanced anomaly detection techniques to identify deviations in transaction patterns, flagging potential fraud cases for review. • Integrated real-time fraud detection mechanisms into banking platforms, ensuring the system could detect and prevent fraudulent transactions as they occur. • Tuned model hyperparameters and adjusted detection thresholds to reduce false positives and
Description: and Entity Recognition: • Developed predictive models using machine learning algorithms (e.g., XGBoost, LSTM) to forecast commodity prices, leading to a 15% improvement in decision-making. • 