# Retrieval-Augmented Generation (RAG)

In [1]:
%pip install --quiet --upgrade langchain langchain-community langchain-openai langgraph faiss-cpu

Note: you may need to restart the kernel to use updated packages.


In [2]:
from dotenv import load_dotenv
load_dotenv()
import pandas as pd
from langchain_core.documents import Document
from langchain_openai import ChatOpenAI
import os
from dotenv import load_dotenv
load_dotenv()

api_key = os.getenv("OPENAI_API_KEY")

from langchain_openai import OpenAIEmbeddings
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_community.vectorstores import FAISS
import faiss

from langchain import hub
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langgraph.graph import START, StateGraph
from typing_extensions import List, TypedDict

llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0, api_key=api_key)
embeddings = OpenAIEmbeddings(model="text-embedding-3-large")

In [3]:
# FAISS
embedding_dim = len(embeddings.embed_query("hello world"))
index = faiss.IndexFlatL2(embedding_dim)

vector_store = FAISS(
    embedding_function=embeddings,
    index=index,
    docstore=InMemoryDocstore(),
    index_to_docstore_id={},
)

In [4]:
'''
Test Prompt
'''
# Add a document
doc = Document(page_content="LangChain is a framework for developing applications powered by LLMs.")

# Embed and add to vector store
vector_store.add_documents([doc])

# Similarity search
query = "What is LangChain?"
results = vector_store.similarity_search(query, k=1)

# Print result
print("Top result:", results[0].page_content)

Top result: LangChain is a framework for developing applications powered by LLMs.


### Loading Document

In [5]:
df = pd.read_csv("cura_db.digital_footprint_data.csv")

#--------------

df.head()

Unnamed: 0,_id,name,websiteUrl,products_services,industry,country,size,country_iso_2,company_footprint,isValidUrl
0,6836e41076d2558cf288833e,Plug and Play,https://www.plugandplayapac.com,Global open innovation platform connecting sta...,"Innovation platform, Venture Capital, Startup ...",United States,enterprise,US,"Announced a new location in Phnom Penh, Cambod...",True
1,6836e41176d2558cf288833f,Ministry of Post and Telecommunications (MPTC),https://mptc.gov.kh/en/,Management and regulation of Cambodia's postal...,"Government Ministry, Telecommunications",Cambodia,medium,KH,Became a founding anchor partner for Plug and ...,True
2,6836e41476d2558cf2888340,American University of Phnom Penh Technology C...,,"Technology startup incubation, mentorship, res...","Education, Technology Incubator",Cambodia,small,KH,Serving as an advisory member and collaboratin...,False
3,6836e41876d2558cf2888341,Tuya Smart,https://www.tuya.com,"AI cloud platform services, smart home ecosyst...","Artificial Intelligence, Cloud Computing, Inte...",China,enterprise,CN,Named to the '2025 Forbes China AI Tech Enterp...,True
4,6836e46576d2558cf2888342,Itential,,"Network automation and orchestration platform,...","Computer Software, Network Automation",United States,medium,US,Announced the launch of the Itential MCP Serve...,False


In [6]:
documents = []

for _, row in df.iterrows():
    content = f"""
    ID: {row.get('_id','')}
    Name: {row.get('name', '')}
    Website URL: {row.get('websiteUrl','')}
    Products & Services: {row.get('products_services', '')}
    Industry: {row.get('industry', '')}
    Country: {row.get('country', '')}
    Size: {row.get('size', '')}
    Country ISO 2: {row.get('country_iso_2','')}
    Company Footprint: {row.get('company_footprint','')}
    Valid URL: {row.get('isValidUrl','')}
    """
    documents.append(Document(page_content=content.strip()))

#---------------

print(documents[3].page_content)

ID: 6836e41876d2558cf2888341
    Name: Tuya Smart
    Website URL: https://www.tuya.com
    Products & Services: AI cloud platform services, smart home ecosystem solutions, IoT platform
    Industry: Artificial Intelligence, Cloud Computing, Internet of Things
    Country: China
    Size: enterprise
    Country ISO 2: CN
    Company Footprint: Named to the '2025 Forbes China AI Tech Enterprises Top 50' list by Forbes China, recognized for robust technological capabilities and contributions to AI accessibility and commercialization; hosted the 2025 TUYA Global Developer Summit in Shenzhen.
    Valid URL: True


### Splitting Document

In [7]:
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=800,  # chunk size (characters)
    chunk_overlap=100,  # chunk overlap (characters)
    add_start_index=True,  # track index in original document
)
all_splits = text_splitter.split_documents(documents)

#------------

print(f"Split blog post into {len(all_splits)} sub-documents.")

Split blog post into 14 sub-documents.


### Storing Document in FAISS

In [8]:
document_ids = vector_store.add_documents(documents=all_splits)

#------------

print(document_ids[:3])

['c4274038-b6b9-4e2b-82b6-4eee2d04dee2', '73ceecf3-0961-4f1f-bfaf-2132d56c28a9', 'd032954e-417c-4f17-bd74-1c13d09c1335']


### Retrieval and Generation

In [9]:
prompt = hub.pull("rlm/rag-prompt")



### Tracking prompts

In [10]:
class State(TypedDict):
    question: str
    context: List[Document]
    answer: str

### Application Step

In [11]:
def retrieve(state: State):
    retrieved_docs = vector_store.similarity_search(state["question"])
    return {"context": retrieved_docs}

def generate(state: State):
    docs_content = "\n\n".join(doc.page_content for doc in state["context"])
    messages = prompt.invoke({"question": state["question"], "context": docs_content})
    response = llm.invoke(messages)
    return {"answer": response.content}

### Control Flow

In [12]:
graph_builder = StateGraph(State).add_sequence([retrieve, generate])
graph_builder.add_edge(START, "retrieve")
graph = graph_builder.compile()

### Sample Prompts

In [13]:
response = graph.invoke({"question": "Which companies announced partnerships?"})

answer = response["answer"]
print("Answer:", answer)

Answer: Zilliz announced partnerships with leading global e-commerce platforms for its Milvus vector database solutions. Plug and Play announced collaboration with the Cambodian Ministry of Post and Telecommunications and American University of Phnom Penh Technology Center. Autonomous Inc. announced the availability of Brainy, a local GPU server, for accelerating AI and deep learning innovation.


In [14]:
response = graph.invoke({"question": "Which companies announced partnerships with Schneider Electric?"})

answer = response["answer"]
print("Answer:", answer)

Answer: Schneider Electric announced partnerships with Advanced Power Technology (APT) to deploy a new modular containerised data centre at The Pirbright Institute, recognized at the DCS Awards 2025. Schneider Electric also announced a new location in Phnom Penh, Cambodia, collaborating with the Cambodian Ministry of Post and Telecommunications and American University of Phnom Penh Technology Center.


In [15]:
response = graph.invoke({"question": "Which company expanded to a new region?"})

answer = response["answer"]
print("Answer:", answer)

Answer: Plug and Play expanded to a new region by launching the Plug and Play Cambodia Program in Phnom Penh, Cambodia. They are collaborating with the Cambodian Ministry of Post and Telecommunications and the American University of Phnom Penh Technology Center.


In [16]:
response = graph.invoke({"question": "List all companies that partnered with Plug and Play."})

answer = response["answer"]
print("Answer:", answer)

Answer: The companies that partnered with Plug and Play are the American University of Phnom Penh Technology Center (ATC) and the Ministry of Post and Telecommunications (MPTC) in Cambodia. They collaborated to support the Plug and Play Cambodia Program in fostering digital innovation and entrepreneurship in Cambodia.


In [17]:
response = graph.invoke({"question": "List companies from the UK involved in international projects."})

answer = response["answer"]
print("Answer:", answer)

Answer: The companies from the UK involved in international projects are Advanced Power Technology (APT) and FARFETCH. Advanced Power Technology partnered with Schneider Electric to deploy a new modular containerised data centre at The Pirbright Institute. FARFETCH adopted Zilliz's vector database solutions for faster search and smarter recommendations.


In [18]:
response = graph.invoke({"question": "Which projects won awards or were recognized in 2025?"})

answer = response["answer"]
print("Answer:", answer)

Answer: Schneider Electric won the 'Data Centre Consolidation/Upgrade Project of the Year' category at the DCS Awards 2025 for deploying a new modular data centre at The Pirbright Institute in partnership with Advanced Power Technology (APT). Advanced Power Technology (APT) partnered with Schneider Electric to deploy a new modular containerised data centre at The Pirbright Institute, which was recognized at the DCS Awards 2025.
