In [None]:
!pip install -q "langchain==0.3.27" "langchain-community==0.3.31" faiss-cpu sentence-transformers unstructured libmagic python-magic langchain-groq

In [2]:
import os
import pickle
import time
import langchain
from langchain_groq import ChatGroq
from langchain.chains import RetrievalQAWithSourcesChain
from langchain.chains.qa_with_sources.loading import load_qa_with_sources_chain
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import UnstructuredURLLoader
from langchain.vectorstores import FAISS

In [5]:
# Look up the value stored under "GROQ_API_KEY" through google.colab's userdata
# helper, so the key itself never appears in the notebook source.
from google.colab import userdata
GROQ_API_KEY = userdata.get("GROQ_API_KEY")

In [10]:
# Chat model served by Groq; it is handed to the QA chain further down.
llm = ChatGroq(
    model="openai/gpt-oss-120b",
    api_key=GROQ_API_KEY,
)

In [3]:
# Articles that form the knowledge base for the retrieval-QA chain.
SOURCE_URLS = [
    "https://blogs.worldbank.org/en/opendata/gold-shines-amid-uncertainty",
    "https://www.jpmorgan.com/insights/global-research/commodities/gold-prices",
]

loaders = UnstructuredURLLoader(urls=SOURCE_URLS)
data = loaders.load()

# By default UnstructuredURLLoader only logs a URL that fails to download or
# parse and carries on, so `data` can silently be missing a source. Check that
# every requested URL produced a document before building the index on it.
loaded_sources = {doc.metadata.get("source") for doc in data}
missing_urls = [url for url in SOURCE_URLS if url not in loaded_sources]
assert not missing_urls, f"Failed to load: {missing_urls}"

len(data)

2

### Create chunks

In [7]:
# Cut the loaded pages into chunks of at most 1000 characters, with 200
# characters shared between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
docs = text_splitter.split_documents(data)

In [8]:
# Number of chunks the splitter produced from the loaded documents.
len(docs)

37

### Create embeddings for these chunks and save them to a FAISS index

In [None]:
from langchain_community.embeddings import HuggingFaceEmbeddings

# Sentence-transformers model used to embed every chunk.
# NOTE(review): "Alibaba-NLP/gte-base-en-v1.5" was noted next to this name,
# presumably as an alternative model to try — confirm.
model_name = "sentence-transformers/all-MiniLM-L6-v2"
embeddings = HuggingFaceEmbeddings(model_name=model_name)

# Embed all chunks and build a FAISS vector store from them.
vectorindex_hf = FAISS.from_documents(documents=docs, embedding=embeddings)

In [11]:
# Serialise the freshly built vector store to disk with pickle.
file_path = "vector_index.pkl"
with open(file_path, mode="wb") as index_file:
    pickle.dump(vectorindex_hf, index_file)

In [12]:
# Reload the pickled vector store from disk.
# SECURITY: pickle.load can execute arbitrary code embedded in the file, so only
# load an index file that this notebook (or another trusted source) wrote.
if not os.path.exists(file_path):
    # Fail here with a clear message instead of leaving `vectorIndex` undefined
    # (or stale from an earlier run) and breaking in a later cell.
    raise FileNotFoundError(
        f"Vector index file not found: {file_path!r}. Run the cell that saves it first."
    )

with open(file_path, "rb") as f:
    vectorIndex = pickle.load(f)

In [13]:
# Expose the reloaded vector store as a retriever and combine it with the LLM
# in a question-answering chain that also reports its sources.
retriever = vectorIndex.as_retriever()
chain = RetrievalQAWithSourcesChain.from_llm(llm=llm, retriever=retriever)

In [None]:
query = "How much percentage increased gold in 2025?"

# Calling the chain object directly (Chain.__call__) is deprecated and emits a
# LangChainDeprecationWarning; invoke() is the supported entry point and accepts
# the same input dict and return_only_outputs flag.
chain.invoke({"question": query}, return_only_outputs=True)

  chain({"question": query}, return_only_outputs=True)


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (2000 > 1024). Running this sequence through the model will result in indexing errors


{'answer': 'Gold is expected to rise sharply in\u202f2025.  According to a World Bank analysis, gold prices are projected to increase by **about\u202f35\u202fpercent** over the full year 2025 (year‑over‑year)【',
 'sources': 'https://blogs.worldbank.org/en/opendata/gold-shines-amid-uncertainty】.  A separate observation notes that gold “surged nearly\u202f25\u202fpercent during the first half of\u202f2025,” confirming a strong upward trend in the first half of the year【'}