In [6]:
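# One-time setup: uncomment and run these lines to install the notebook's dependencies.
# %pip install langchain-community pypdf
# %pip install --upgrade pip
# Later cells also import from: langchain-text-splitters, langchain-google-genai, python-dotenv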

In [2]:
from langchain_community.document_loaders import PyPDFLoader

# Location of the source PDF. Set the METRO_PDF_PATH environment variable to run
# the notebook against a copy stored elsewhere; the default is the original
# author's local file, so existing behaviour is unchanged.
import os

PDF_PATH = os.environ.get(
    "METRO_PDF_PATH",
    "/Users/saimammahi/Documents/Work/Interview/GCP-GenAI:ML/Metro AG – Wikipedia.pdf",
)

loader = PyPDFLoader(PDF_PATH)

# In the recorded run this produced 8 documents for an 8-page PDF (one per page).
docs = loader.load()

# Fail with a readable message instead of an IndexError on docs[0] below.
if not docs:
    raise ValueError(f"No pages could be loaded from {PDF_PATH!r}")

first_page = docs[0]

print("-----------Model length--------------------")
print(len(docs))
print("-----------Model content--------------------")
print(first_page.page_content[:100])  # preview only the first 100 characters
print("-----------Model metadata--------------------")
print(first_page.metadata)
print("-----------Model dump json--------------------")
print(first_page.model_dump_json())
print("-----------Model fields set--------------------")
print(first_page.model_fields_set)
print("-----------Model json schema--------------------")
# The original cell printed this header with nothing after it; the schema is a
# class-level property of the document model, so it is requested on the type.
print(type(first_page).model_json_schema())

-----------Model length--------------------
8
-----------Model content--------------------
METRO AG
Legal form stock corporation
Founding 2017
seat Düsseldorf , Germany
  
Line Steffen Greube
-----------Model metadata--------------------
{'producer': 'PDFium', 'creator': 'PDFium', 'creationdate': 'D:20250816005411', 'source': '/Users/saimammahi/Documents/Work/Interview/GCP-GenAI:ML/Metro AG – Wikipedia.pdf', 'total_pages': 8, 'page': 0, 'page_label': '1'}
-----------Model dump json--------------------
{"id":null,"metadata":{"producer":"PDFium","creator":"PDFium","creationdate":"D:20250816005411","source":"/Users/saimammahi/Documents/Work/Interview/GCP-GenAI:ML/Metro AG – Wikipedia.pdf","total_pages":8,"page":0,"page_label":"1"},"page_content":"METRO AG\nLegal form stock corporation\nFounding 2017\nseat Düsseldorf , Germany\n  \nLine Steffen Greubel , ( Chairman of\nthe Board ) [ 1 ]\nGuillaume Deruyter\nChristiane Giesen\nEric Riegger\nRoman Šilha, ( Chairman of the\nSupervisory Board 

In [3]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Chunking configuration for semantic search: small chunks with a little
# overlap between neighbours.
splitter_options = {
    "chunk_size": 100,    # Smaller, focused chunks
    "chunk_overlap": 10,  # Minimal overlap
}
text_splitter = RecursiveCharacterTextSplitter(**splitter_options)

# Only the first three loaded documents are split here.
pages_to_split = docs[:3]
splitted_docs = text_splitter.split_documents(pages_to_split)

# Report how many chunks were produced and show the first one as a sanity check.
chunk_count = len(splitted_docs)
first_chunk_text = splitted_docs[0].page_content
print(f"Created {chunk_count} chunks")
print("\nFirst chunk:")
print(first_chunk_text)
print("\n" + "=" * 50)

Created 126 chunks

First chunk:
METRO AG
Legal form stock corporation
Founding 2017
seat Düsseldorf , Germany



In [5]:
import os
import getpass
from dotenv import load_dotenv

# Load environment variables via python-dotenv; a later cell reads GOOGLE_API_KEY
# with os.getenv. As the cell's last expression, the call's return value is
# displayed (True in the recorded run).
load_dotenv()

True

In [None]:
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain_core.vectorstores import InMemoryVectorStore
import time

# Resolve the API key: use the environment when it is set, otherwise prompt for
# it once (getpass keeps the typed key out of the notebook output). Previously a
# missing key was passed through as None.
google_api_key = os.getenv("GOOGLE_API_KEY")
if not google_api_key:
    google_api_key = getpass.getpass("Enter your Google API key: ")

embeddings = GoogleGenerativeAIEmbeddings(model="models/gemini-embedding-001", api_key=google_api_key)

# Fresh in-memory store that embeds documents with the model configured above.
vectorstore = InMemoryVectorStore(embedding=embeddings)

print(f"Adding {len(splitted_docs)} improved chunks to vector store...")

# Embed in small batches with a pause in between to avoid ResourceExhausted
# (rate-limit) errors from the embedding API.
batch_size = 2        # documents sent per add_documents call
pause_seconds = 20    # wait between consecutive batches
# Ceiling division, computed once instead of on every iteration.
total_batches = (len(splitted_docs) + batch_size - 1) // batch_size

for batch_number, start in enumerate(range(0, len(splitted_docs), batch_size), start=1):
    batch = splitted_docs[start:start + batch_size]
    vectorstore.add_documents(documents=batch)
    print(f"Processed batch {batch_number}/{total_batches}")
    # No request follows the final batch, so skip the trailing wait.
    if batch_number < total_batches:
        time.sleep(pause_seconds)

print("✅ Vector store created successfully with improved chunks!")


Adding 126 improved chunks to vector store...
Processed batch 1/63
Processed batch 2/63


In [None]:
# DIAGNOSTIC: look at the raw chunk text, then run one basic query against the
# vector store to see what comes back.
print("=== DIAGNOSTIC: EXAMINING OUR CHUNKS ===\n")

# Print up to the first three chunks in full.
print("📋 First 3 chunks:")
for position, chunk in enumerate(splitted_docs[:3], start=1):
    print(f"\n--- Chunk {position} ---")
    print(chunk.page_content)
    print("-" * 40)

# Run a single similarity query and preview each hit.
print("\n🔍 TESTING BASIC SEARCH:")
print("Query: 'Metro AG'")
test_results = vectorstore.similarity_search("Metro AG", k=3)

for rank, hit in enumerate(test_results, start=1):
    print(f"\nResult {rank}:")
    # Preview the first 200 characters; the ellipsis is always appended.
    print(f"{hit.page_content[:200]}...")

print("\n" + "=" * 60)


=== DIAGNOSTIC: EXAMINING OUR CHUNKS ===

📋 First 3 chunks:

--- Chunk 1 ---
METRO AG
Legal form stock corporation
Founding 2017
seat Düsseldorf , Germany
  
Line Steffen Greubel , ( Chairman of
the Board ) [ 1 ]
Guillaume Deruyter
Christiane Giesen
Eric Riegger
Roman Šilha, ( Chairman of the
Supervisory Board ) [ 2 ]
Number of
employees
87,810 (2023/24) [ 3 ]
Sales volume 31.029 billion euros (2023/24) [ 3 ]
Industry Trade
Website www.metroag.de (https://www.metr
oag.de)
As of September 30, 2024
Logo of the wholesale markets in
Germany
Metro AG
Metro AG is a German wholesale group
headquartered in Düsseldorf . It operates 624
----------------------------------------

--- Chunk 2 ---
headquartered in Düsseldorf . It operates 624
stores in over 30 countries, over 100 [ 4 ] of
which are in Germany.
Today's Metro was created in 2017 through
the spin-off of the Metro Cash & Carry and
Real retail chains from the old Metro into
Metro Wholesale & Food Specialist AG ,
which later renamed itsel

In [None]:
# SIMPLE TEST: direct questions with clear answers, driven from one table.
print("=== SIMPLE SEMANTIC SEARCH TESTS ===\n")

# Each case: (question shown to the reader, query sent to the store, extra
# keyword arguments for similarity_search). The first case passes no k, so the
# store's own default result count applies, exactly as before.
search_cases = [
    ("❓ What company is this about?", "company name", {}),
    ("❓ Where is it located?", "location headquarters", {"k": 1}),
    ("❓ What is the revenue?", "revenue sales money", {"k": 1}),
]

for case_number, (question, query, search_kwargs) in enumerate(search_cases, start=1):
    print(question)
    result = vectorstore.similarity_search(query, **search_kwargs)
    print("💡 Answer:")
    print([r.page_content for r in result])
    # A separator goes between cases only; nothing follows the last case.
    if case_number < len(search_cases):
        print("\n" + "=" * 50 + "\n")

print("\n✅ If these look good, your semantic search is working!")
print("❌ If these are weird/wrong, we need to investigate further.")


=== SIMPLE SEMANTIC SEARCH TESTS ===

❓ What company is this about?
💡 Answer:
['company\'s services in the real estate sector, logistics, IT, advertising, and procurement. [ 109 ]\nGroup structure of Metro AG 2019/2020\nmetro\nMetro\nGermany\nMetro\nWestern\nEurope\n(excluding\nGermany)\nMetro\nRussia\nMetro\nEastern\nEurope\n(excluding\nRussia)\nMetro\nAsia\nother:\nDISH\nDigital\nSolutions\nMetro\nMarkets\nMetro\nProperties\nother\nservice\ncompanies\nIn various proceedings, Metro attempted to secure its rights to use the term "Metro". The Lower\nSaxony railway company MetroRail was forced to change its name to Metronom', 'Eisenbahngesellschaft  mbH , but is still allowed to call its trains Metro . The Metrorapid\nKey figures\nCorporate structure\nRights to the name “Metro”\n14/08/2025, 22:52 Metro AG – Wikipedia\nhttps://de.wikipedia.org/wiki/Metro_AG 8/18', 'Other developments from 2019\n14/08/2025, 22:52 Metro AG – Wikipedia\nhttps://de.wikipedia.org/wiki/Metro_AG 6/18', 'METRO AG

In [None]:
# Similarity-search checks phrased as fuller questions / keyword queries.
def _print_results(results):
    """Print each document's page_content as a numbered 'Result N: ...' entry."""
    for rank, doc in enumerate(results, start=1):
        print(f"Result {rank}: {doc.page_content}...\n")


print("=== IMPROVED SIMILARITY SEARCH TESTS ===\n")

# Test 1: company name (single best hit)
print("1. What is the company name?")
result1 = vectorstore.similarity_search("What is the company name?", k=1)
_print_results(result1)

# Test 2: revenue / financial information (no k passed: store default count)
print("2. What is the company's revenue?")
result2 = vectorstore.similarity_search("revenue sales financial figures")
_print_results(result2)

# Test 3: location (no k passed: store default count)
print("3. Where is the company located?")
result3 = vectorstore.similarity_search("headquarters location Düsseldorf")
_print_results(result3)


=== IMPROVED SIMILARITY SEARCH TESTS ===

1. What is the company name?
Result 1: Other developments from 2019
14/08/2025, 22:52 Metro AG – Wikipedia
https://de.wikipedia.org/wiki/Metro_AG 6/18...

2. What is the company's revenue?
Result 1: The company’s key figures developed as follows:
Year 2016/17
[ 101 ]
2017/18
[ 102 ]
2018/19
[ 103 ]
2019/20
[ 104 ]
2020/21
[ 105 ]
2021/22
[ 106 ]
2022/23
[ 107 ]
2023/24
[ 108 ]
Sales (in
million euros) 37,140 29,476 27,082 25,632 24,765 29,754 30,551 31,029
Proﬁt after tax
(in million
euros)
345 348 −115 471 −45 −331 439 −125
Number of
employees 155,082 117,078 101,654 97,639 95,141 94,944 91,201 87,810
Since June 2017, Metro AG has served as the central management holding company for the Metro
Group. The core wholesale business, with 678 stores (as of May 2020), is managed by the Metro...

Result 2: was divided into four sales divisions. [ 36 ] The largest revenue generators are shown in the
following table (with sales figures for 2011): [ 37 ]