In [9]:
# Step 1: Mount Google Drive
import os
from google.colab import drive
drive.mount('/content/drive', force_remount=True)
os.chdir('/content/drive/MyDrive/surya/')

Mounted at /content/drive


In [10]:
# Step 2: Install Required Libraries
!pip install transformers faiss-cpu PyMuPDF pdf2image pillow
!apt-get install -y poppler-utils
!pip install langchain
!pip install -U langchain-community

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
poppler-utils is already the newest version (22.02.0-2ubuntu0.5).
0 upgraded, 0 newly installed, 0 to remove and 49 not upgraded.


In [11]:
# Step 3: Extract Text and Images from PDF
import fitz  # PyMuPDF for PDF text extraction
from pdf2image import convert_from_path
from PIL import Image

# Path to your PDF file
pdf_path = "/content/drive/MyDrive/surya/Surya_Resume_AI.pdf"

# Initialize lists to hold text and image data
pdf_texts = []
pdf_images = []

# Extract text and images from each page
with fitz.open(pdf_path) as pdf:
    for page_num in range(pdf.page_count):
        page = pdf[page_num]
        pdf_texts.append(page.get_text())  # Extract text from page

        # Convert page to an image
        images = convert_from_path(pdf_path, first_page=page_num + 1, last_page=page_num + 1)
        pdf_images.extend(images)  # Store images

In [12]:
# Step 4: Initialize CLIP Model and Processor
from transformers import CLIPProcessor, CLIPModel
import torch
import numpy as np

clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

# Function to split long text into chunks for embedding
def split_text(text, chunk_size=77):
    words = text.split()
    return [' '.join(words[i:i + chunk_size]) for i in range(0, len(words), chunk_size)]

# Prepare text data by splitting into chunks
chunked_texts = []
for text in pdf_texts:
    chunked_texts.extend(split_text(text))

# Generate embeddings for text chunks
text_inputs = processor(text=chunked_texts, return_tensors="pt", padding=True, truncation=True, max_length=77)
text_embeddings = clip_model.get_text_features(**text_inputs)
text_embeddings = text_embeddings / text_embeddings.norm(dim=-1, keepdim=True)
text_embeddings = text_embeddings.detach().cpu().numpy()

# Generate embeddings for images
image_inputs = processor(images=pdf_images, return_tensors="pt")
image_embeddings = clip_model.get_image_features(**image_inputs)
image_embeddings = image_embeddings / image_embeddings.norm(dim=-1, keepdim=True)
image_embeddings = image_embeddings.detach().cpu().numpy()



In [17]:
from langchain.vectorstores import FAISS
import numpy as np

# Combine text and image embeddings and metadata
combined_texts = chunked_texts + ["Image"] * len(pdf_images)  # Text labels for each embedding
combined_embeddings = np.concatenate((text_embeddings, image_embeddings), axis=0)  # Combined embeddings

# Create a list of tuples containing text and corresponding embeddings
text_embedding_pairs = [(text, embedding) for text, embedding in zip(combined_texts, combined_embeddings)]

# Initialize FAISS with precomputed embeddings and metadata
vector_store = FAISS.from_embeddings(
    text_embeddings=text_embedding_pairs,  # Pass the text-embedding pairs directly
    embedding=combined_embeddings,  # This is usually the embedding model
    metadatas=[{"type": t} for t in ["text"] * len(chunked_texts) + ["image"] * len(pdf_images)]  # Include metadata
)

print("FAISS vector store created successfully.")




FAISS vector store created successfully.


In [18]:
# Step 6: Define Query Functions for Text or Image-Based Output

# Function to check if a query is asking for an image
def is_image_query(query):
    image_keywords = ["image", "picture", "photo", "screenshot", "diagram", "biodata"]
    return any(keyword in query.lower() for keyword in image_keywords)

# Function to generate an embedding for a text query
def generate_text_embedding(query):
    text_input = processor(text=query, return_tensors="pt", padding=True, truncation=True, max_length=77)
    text_embedding = clip_model.get_text_features(**text_input)
    text_embedding = text_embedding / text_embedding.norm(dim=-1, keepdim=True)
    return text_embedding.detach().cpu().numpy()

In [20]:
# Step 7: Query the Vector Store and Display Results Based on Query Type
#query = "Provide me an image of biodata."  # Example query; adjust as needed
query = "Who is Suryakanta Karan"  # Example query; adjust as needed
query_embedding = generate_text_embedding(query)

# Perform the similarity search
results = vector_store.similarity_search_by_vector(query_embedding[0], k=10)  # Retrieve top 10 matches

# Display results based on query type
print("\nResults for Query:")
image_query = is_image_query(query)
for result in results:
    if image_query and result.metadata.get("type") == "image":
        print("Image Result:")
        display(result.page_content)  # Replace with actual display code for images in Colab
    elif not image_query and result.metadata.get("type") == "text":
        print("Text Description:", result.page_content)


Results for Query:
Text Description: Suryakanta Karan M: +91 8770228646 Email: suryakantakaran93@gmail.com LinkedIn: in/suryakanta-karan-595b4b34/ Professional Summary • Seasoned Sr. Lead Data Scientist with over 10+ years of expertise in developing, fine-tuning, and deploying Large Language Models (LLMs) such as GPT and LLaMA. Proven ability to deliver scalable NLP solutions using cutting-edge techniques including Retrieval-Augmented Generation (RAG), Reinforcement Learning from Human Feedback (RLHF), and advanced prompt engineering. Demonstrated leadership in managing cross-functional teams, mentoring engineers, and driving AI innovations that improve model accuracy
Text Description: Sampling) and Top-K Sampling techniques in cloud environments to optimize text generation models, balancing response quality and computational efficiency
Text Description: • Trained machine learning models (e.g., Decision Trees, Random Forest, and Neural Networks) to classify fraudulent and legitimate t