In [1]:
import pandas as pd

In [2]:
# from datasets import load_dataset

# data = load_dataset('squad', split='train')
# data

In [3]:
# Load the news articles from the CSV; the trailing expression displays the
# column labels of the resulting frame.
data = pd.read_csv('news_data.csv')
data.columns

Index(['Unnamed: 0', 'id', 'heading', 'impact', 'sector', 'publish_date',
       'link', 'summary', 'content', 'startDate', 'duration', 'reasoning',
       'image'],
      dtype='object')

In [4]:
# data = data.to_pandas()
# data.head()

In [5]:
# data.drop_duplicates(subset='context', keep='first', inplace=True)
# data.head()

In [6]:
# Display the dtype pandas assigned to each column of the loaded frame.
data.dtypes

Unnamed: 0       int64
id               int64
heading         object
impact          object
sector          object
publish_date    object
link            object
summary         object
content         object
startDate       object
duration        object
reasoning       object
image           object
dtype: object

In [7]:
# Preview the 'summary' column (the text that the upsert loop later embeds).
data['summary']

0     The Best Tea Harvester Competition, hosted by ...
1     The AgriTech Conference 2025, held at the Univ...
2     The Korean Textile Trading Association (KTTA) ...
3     Brandix Apparel has been named the Overall Bes...
4     Excel World Entertainment Park in Colombo has ...
5     Chill Colombo has opened in Port City Colombo,...
6     Dialog Enterprise, the ICT solutions arm of Di...
7     ASUS has launched its 2025 AI-powered laptop l...
8     Prime Group launched PrimeMax on 16 March in C...
9     Industry experts at the Lanka Property Show 20...
10    Taj Samudra, Colombo celebrated Earth Hour by ...
11    The Sri Lanka Tourism Promotion Bureau, alongs...
Name: summary, dtype: object

In [8]:
# data2 = pd.read_csv('articles/art.csv')
# data2.dtypes

In [9]:
import os
from getpass import getpass
from langchain.embeddings.openai import OpenAIEmbeddings

# OpenAI credentials: prefer the OPENAI_API_KEY environment variable and only
# prompt interactively when it is unset or empty (keys are available from the
# top-right dropdown on the OpenAI website).
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
if not OPENAI_API_KEY:
    OPENAI_API_KEY = getpass("Enter your OpenAI API key: ")

# Embedding model used for both documents and queries.
model_name = 'text-embedding-ada-002'

embed = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY, model=model_name)

  embed = OpenAIEmbeddings(


In [10]:
from pinecone import Pinecone

# Pinecone credentials (keys are issued at app.pinecone.io): use the
# PINECONE_API_KEY environment variable when set, otherwise prompt for it.
api_key = os.getenv("PINECONE_API_KEY")
if not api_key:
    api_key = getpass("Enter your Pinecone API key: ")

# Client handle used by the index management calls in the following cells.
pc = Pinecone(api_key=api_key)

  from tqdm.autonotebook import tqdm


In [11]:
from pinecone import ServerlessSpec

# Deployment target passed to create_index: serverless on AWS us-east-1.
spec = ServerlessSpec(region="us-east-1", cloud="aws")

In [12]:
# Start from a clean slate by dropping any previous copy of the index, so the
# next cell recreates it empty. The existence check keeps a fresh run (where
# the index has never been created) from failing on a missing index.
if 'doc-index' in [index_info["name"] for index_info in pc.list_indexes()]:
    pc.delete_index('doc-index')

In [13]:
import time

index_name = "doc-index"

# Names of the indexes that currently exist for this client.
existing_indexes = [info["name"] for info in pc.list_indexes()]

# Create the index only when it is missing (expected on a first run).
if index_name not in existing_indexes:
    pc.create_index(
        index_name,
        dimension=1536,  # dimensionality of ada 002
        metric='dotproduct',
        spec=spec
    )
    # Poll once per second until the new index reports that it is ready.
    while True:
        if pc.describe_index(index_name).status['ready']:
            break
        time.sleep(1)

# Open a handle to the index, pause briefly, then display its stats.
index = pc.Index(index_name)
time.sleep(1)
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}

In [14]:
# Cast the integer 'id' column to strings; these values are later passed to
# index.upsert as the vector IDs.
data['id'] = data['id'].astype(str)

In [15]:
from tqdm.auto import tqdm

# Number of rows embedded and upserted per request.
batch_size = 100

# Embed the article summaries batch by batch and store them in Pinecone.
for i in tqdm(range(0, len(data), batch_size)):
    # get end of batch
    i_end = min(len(data), i + batch_size)
    batch = data.iloc[i:i_end]

    # Convert the columns to plain Python lists. A sliced Series keeps the
    # original row labels, so `series[0]` is a label lookup rather than a
    # positional one; the embedding and upsert APIs are documented to take
    # lists, so hand them lists instead of relying on Series duck-typing.
    # IDs are cast to str here as well so this cell does not depend on an
    # earlier cast having been run.
    ids = batch['id'].astype(str).tolist()
    documents = batch['summary'].tolist()

    # Metadata stored next to each vector: the headline plus the summary text
    # (the 'text' field is what the vector store later reads back).
    metadatas = [
        {'title': heading, 'text': summary}
        for heading, summary in zip(batch['heading'], documents)
    ]

    # create document embeddings
    embeds = embed.embed_documents(documents)

    # Materialise the (id, vector, metadata) tuples: a bare zip object is a
    # single-use iterator, whereas a list can be sliced or re-read safely.
    index.upsert(vectors=list(zip(ids, embeds, metadatas)))

  0%|          | 0/1 [00:04<?, ?it/s]


APIConnectionError: Connection error.

In [None]:
# Re-check the index stats after the upsert cell; total_vector_count is
# expected to be non-zero once that upsert has completed successfully.
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}

In [None]:
# Imported under an alias: the bare name `Pinecone` is already bound to the
# Pinecone client class used to build `pc`, and re-binding it to LangChain's
# vector store class would silently shadow that client class.
from langchain.vectorstores import Pinecone as PineconeVectorStore

text_field = "text"  # the metadata field that contains our text

# Wrap the Pinecone index as a LangChain vector store; queries are embedded
# with the same embedding object that produced the stored vectors.
vectorstore = PineconeVectorStore(
    index, embed.embed_query, text_field
)

  vectorstore = Pinecone(


In [None]:
# Question used to exercise retrieval; later cells reuse `query`.
query = "Is Gusta an online  conference or a meeting?"

# Display the three stored documents most similar to the question.
vectorstore.similarity_search(query, k=3)

[Document(metadata={'title': 'Specialty food platform Gusta upgrades online experience and opens Ahangama store'}, page_content='Gusta is an online platform connecting specialty food providers to discerning customers, offering high-quality foods such as cheeses, meats, fish, and baked goods. The website has attracted a diverse customer base and recently launched its second physical store in Ahangama, catering to tourists and locals alike.'),
 Document(metadata={'title': 'AgriTech Conference 2025 brings innovation and entrepreneurship in agriculture to Jaffna'}, page_content='The AgriTech Conference 2025, held at the University of Jaffna, brought together academics, industry leaders, and students to discuss technology and entrepreneurship in agriculture. The event featured keynote addresses, a problem-pitching session, and an exhibition showcasing innovations. It emphasized the importance of agripreneurship and collaboration for a sustainable agricultural future.'),
 Document(metadata={

In [None]:
from langchain.chat_models import ChatOpenAI
from langchain.chains.conversation.memory import ConversationBufferWindowMemory
from langchain.chains import RetrievalQA

# Chat model that generates the answers (temperature 0.0).
llm = ChatOpenAI(
    model_name='gpt-4o-mini',
    temperature=0.0,
    openai_api_key=OPENAI_API_KEY,
)

# Windowed conversation memory (k=5), exposed under the 'chat_history' key
# and returned as message objects.
conversational_memory = ConversationBufferWindowMemory(
    k=5,
    memory_key='chat_history',
    return_messages=True,
)

# Question-answering chain: the vector store acts as the retriever and the
# chat model above produces the answer, using the "stuff" chain type.
qa = RetrievalQA.from_chain_type(
    retriever=vectorstore.as_retriever(),
    chain_type="stuff",
    llm=llm,
)

  llm = ChatOpenAI(
  conversational_memory = ConversationBufferWindowMemory(


In [None]:
# Ask the retrieval chain the question. `Chain.run` is deprecated in favour of
# `invoke`, which returns a dict; pull out "result" so the cell still displays
# just the answer string.
qa.invoke({"query": query})["result"]

  qa.run(query)


'Gusta is not an online conference or meeting; it is an online platform that connects specialty food providers to customers.'

In [None]:
from langchain.agents import Tool

# Single tool available to the agent: it forwards a question to the
# retrieval QA chain and returns the chain's answer.
tools = [
    Tool(
        func=qa.run,
        name='Knowledge Base',
        description='use this tool when answering general knowledge queries to get more information about the topic',
    ),
]

In [None]:
from langchain.agents import initialize_agent

# Conversational agent built from the chat model, the tool list and the
# windowed memory; capped at three iterations, with verbose tracing enabled.
agent = initialize_agent(
    tools=tools,
    llm=llm,
    agent='chat-conversational-react-description',
    memory=conversational_memory,
    max_iterations=3,
    early_stopping_method='generate',
    verbose=True,
)

  agent = initialize_agent(


In [None]:
# Run the agent on the stored question. Calling the executor directly is
# deprecated; `invoke` returns the same input / chat_history / output dict.
agent.invoke({"input": query})

  agent(query)




[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m```json
{
    "action": "Knowledge Base",
    "action_input": "Is Gusta an online conference or a meeting?"
}
```[0m
Observation: [36;1m[1;3mGusta is not an online conference or meeting; it is an online platform that connects specialty food providers to customers.[0m
Thought:[32;1m[1;3m```json
{
    "action": "Final Answer",
    "action_input": "Gusta is not an online conference or meeting; it is an online platform that connects specialty food providers to customers."
}
```[0m

[1m> Finished chain.[0m


{'input': 'Is Gusta an online  conference or a meeting?',
 'chat_history': [],
 'output': 'Gusta is not an online conference or meeting; it is an online platform that connects specialty food providers to customers.'}

In [None]:
# Follow-up question in the same conversation. Uses `invoke` rather than the
# deprecated direct call on the executor; the returned dict is the same.
agent.invoke({"input": "what does gusta mean in any spanish?"})



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m```json
{
    "action": "Final Answer",
    "action_input": "In Spanish, 'gusta' is a form of the verb 'gustar,' which means 'to like' or 'to please.' It is used to express that someone likes something. For example, 'Me gusta el chocolate' means 'I like chocolate.'"
}
```[0m

[1m> Finished chain.[0m


{'input': 'what does gusta mean in any spanish?',
 'chat_history': [HumanMessage(content='Is Gusta an online  conference or a meeting?', additional_kwargs={}, response_metadata={}),
  AIMessage(content='Gusta is not an online conference or meeting; it is an online platform that connects specialty food providers to customers.', additional_kwargs={}, response_metadata={})],
 'output': "In Spanish, 'gusta' is a form of the verb 'gustar,' which means 'to like' or 'to please.' It is used to express that someone likes something. For example, 'Me gusta el chocolate' means 'I like chocolate.'"}

In [None]:
# Ask the agent to condense the conversation so far. Uses `invoke` rather than
# the deprecated direct call on the executor; the returned dict is the same.
agent.invoke({"input": "can you summarize these facts in two short sentences"})