In [1]:
!pip install -U datasets huggingface_hub fsspec python-dotenv pinecone

Collecting fsspec
  Using cached fsspec-2025.7.0-py3-none-any.whl.metadata (12 kB)


In [2]:
!pip install -U sentence-transformers

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=1.11.0->sentence-transformers)
 

### Connect to Pinecone

In [5]:
from pinecone.grpc import PineconeGRPC as Pinecone
from dotenv import load_dotenv
import os

# Load environment variables from a local .env file (keep that file out of version control).
load_dotenv()

# Expected .env entry (placeholder only -- never paste real key material here):
# PINECONE_API_KEY="<your-pinecone-api-key>"

# Retrieve the API key and fail fast with a clear message instead of an
# opaque client error later in the notebook.
pinecone_api_key = os.getenv('PINECONE_API_KEY')
if not pinecone_api_key:
    raise RuntimeError(
        "PINECONE_API_KEY is not set; add it to your .env file or environment."
    )

pc = Pinecone(api_key=pinecone_api_key)

In [6]:
# List the indexes visible to this client as a quick connection check.
response = pc.list_indexes()
print(response)

[{
    "name": "chatbot-index",
    "metric": "cosine",
    "host": "chatbot-index-5risz3f.svc.aped-4627-b74a.pinecone.io",
    "spec": {
        "serverless": {
            "cloud": "aws",
            "region": "us-east-1"
        }
    },
    "status": {
        "ready": true,
        "state": "Ready"
    },
    "vector_type": "dense",
    "dimension": 384,
    "deletion_protection": "disabled",
    "tags": null
}]


### Load In Embedding Model

In [7]:
from sentence_transformers import SentenceTransformer

# Load the sentence-embedding model; the bare `model` expression below
# displays its module stack as the cell output.
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
model

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False, 'architecture': 'BertModel'})
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
)

In [8]:
# Smoke test: encode two sample sentences and inspect the resulting shape.
sentences = ["This is an example sentence", "Each sentence is converted"]

embeddings = model.encode(sentences)
print(embeddings.shape)  # (2, 384) in the recorded run

(2, 384)


## Building Vector DB

### Load Dataset

In [9]:
from datasets import load_dataset
from itertools import islice

# Open the train split in streaming mode and materialize only the first
# 10,000 records into a list.
ds_iter = load_dataset("abisee/cnn_dailymail", "3.0.0", split="train", streaming=True)
ds_10k = list(islice(ds_iter, 10_000))

README.md: 0.00B [00:00, ?B/s]

In [10]:
# Peek at the raw text of the first article.
ds_10k[0]['article']

'LONDON, England (Reuters) -- Harry Potter star Daniel Radcliffe gains access to a reported £20 million ($41.1 million) fortune as he turns 18 on Monday, but he insists the money won\'t cast a spell on him. Daniel Radcliffe as Harry Potter in "Harry Potter and the Order of the Phoenix" To the disappointment of gossip columnists around the world, the young actor says he has no plans to fritter his cash away on fast cars, drink and celebrity parties. "I don\'t plan to be one of those people who, as soon as they turn 18, suddenly buy themselves a massive sports car collection or something similar," he told an Australian interviewer earlier this month. "I don\'t think I\'ll be particularly extravagant. "The things I like buying are things that cost about 10 pounds -- books and CDs and DVDs." At 18, Radcliffe will be able to gamble in a casino, buy a drink in a pub or see the horror film "Hostel: Part II," currently six places below his number one movie on the UK box office chart. Details o

In [11]:
!pip install langchain



In [12]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Splitter configuration: target chunk length and overlap between neighbours.
chunk_size = 400
chunk_overlap = 80
splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap
)

# One record per chunk: every original field is kept (shallow copy) and the
# chunk text plus its position within the article are added.
chunked_docs = [
    {**doc, 'chunk': chunk, 'chunk_idx': chunk_idx}
    for doc in ds_10k
    for chunk_idx, chunk in enumerate(splitter.split_text(doc['article']))
]

print(f"Total chunked docs: {len(chunked_docs)}")
print(chunked_docs[0])

Total chunked docs: 117918
{'article': 'LONDON, England (Reuters) -- Harry Potter star Daniel Radcliffe gains access to a reported £20 million ($41.1 million) fortune as he turns 18 on Monday, but he insists the money won\'t cast a spell on him. Daniel Radcliffe as Harry Potter in "Harry Potter and the Order of the Phoenix" To the disappointment of gossip columnists around the world, the young actor says he has no plans to fritter his cash away on fast cars, drink and celebrity parties. "I don\'t plan to be one of those people who, as soon as they turn 18, suddenly buy themselves a massive sports car collection or something similar," he told an Australian interviewer earlier this month. "I don\'t think I\'ll be particularly extravagant. "The things I like buying are things that cost about 10 pounds -- books and CDs and DVDs." At 18, Radcliffe will be able to gamble in a casino, buy a drink in a pub or see the horror film "Hostel: Part II," currently six places below his number one movi

### Insert Docs into VectorDB

In [13]:
# Only the first 10,000 chunk records are embedded here.
doc_text = list(map(lambda record: record['chunk'], chunked_docs[:10000]))
doc_embeddings = model.encode(doc_text)
print(doc_embeddings.shape)

(10000, 384)


In [14]:
# Store each embedding (converted to a plain list) on the chunk record at the
# same position.
for position in range(len(doc_embeddings)):
    chunked_docs[position]['embedding'] = doc_embeddings[position].tolist()

In [15]:
from pinecone import ServerlessSpec
import time

def create_pinecone_index(index_name, embed_dim, client, spec, metric='dotproduct'):
    """Create the Pinecone index if it is missing, then connect to it.

    Args:
        index_name: Name of the index to create or reuse.
        embed_dim: Dimensionality of the vectors the index will store; it
            should match the output size of the embedding model in use.
        client: Pinecone client exposing ``list_indexes``, ``create_index``,
            ``describe_index`` and ``Index``.
        spec: Deployment spec forwarded unchanged to ``client.create_index``.
        metric: Similarity metric used when a new index is created. Defaults
            to ``'dotproduct'`` (the value that used to be hard-coded). It has
            no effect when the index already exists.

    Returns:
        Tuple ``(index, stats)``: the connected index handle and the result of
        ``index.describe_index_stats()``.
    """
    # Only create the index when it does not exist yet (e.g. on a first run).
    if index_name not in client.list_indexes().names():
        client.create_index(
            index_name,
            dimension=embed_dim,
            metric=metric,
            spec=spec
        )
        # Poll until the freshly created index reports ready.
        while not client.describe_index(index_name).status['ready']:
            time.sleep(1)

    # Connect to the index.
    index = client.Index(index_name)
    time.sleep(1)  # short pause before requesting stats

    stats = index.describe_index_stats()

    return index, stats



# Target index settings; the vector size is read off the first embedding row.
index_name = "cnn-dailymail-news"  # choose a unique name
metric = "dotproduct"
dimension = len(doc_embeddings[0])
spec = ServerlessSpec(cloud="aws", region="us-east-1")

# NOTE(review): `metric` is defined above but is not passed in the call below.
index, stats = create_pinecone_index(index_name, dimension, pc, spec)

In [16]:
# Build (id, values, metadata) tuples; the id is the chunk's position as a
# string and the metadata carries the chunk text, its index, and the article.
vectors = [
    (
        str(position),
        embedding.tolist(),
        {
            field: chunked_docs[position][field]
            for field in ('chunk', 'chunk_idx', 'article')
        },
    )
    for position, embedding in enumerate(doc_embeddings)
]

In [17]:
batch_size = 100  # adjust as needed

# Upsert in consecutive slices so each request carries at most `batch_size` vectors.
for start in range(0, len(vectors), batch_size):
    index.upsert(vectors=vectors[start:start + batch_size])

## Search


In [18]:
# Get a handle to the index by name; this re-binds the global `index` that
# retrieve_results queries.
index_name = "cnn-dailymail-news"
index = pc.Index(index_name)

In [19]:
def retrieve_results(query, top_k=5):
    """Embed ``query`` and return the closest matches from the Pinecone index.

    Relies on the notebook-level ``model`` (embedding model) and ``index``
    (Pinecone index handle).

    Args:
        query: Text to search for.
        top_k: Number of matches to request. Defaults to 5, the value that
            used to be hard-coded.

    Returns:
        list: The entries of the response's ``'matches'`` field; metadata is
        requested via ``include_metadata=True``.
    """
    query_embedding = model.encode(query).tolist()

    response = index.query(
        vector=query_embedding,
        top_k=top_k,
        include_metadata=True
    )

    # Copy into a plain list so callers always receive a list.
    return list(response['matches'])

In [20]:
# Example search: print the score and chunk text of every returned match.
query = "latest news in fashion"
results = retrieve_results(query)

for match in results:
    print(
        f"Score: {match['score']:.4f}",
        f"Text: {match['metadata']['chunk']}\n",
        sep="\n",
    )

Score: 0.5849
Text: stand for something. Fashion is about establishing an image that consumers can adapt to their own individuality. And it's an image that can change, that can evolve. It doesn't reinvent itself every two years." However, with a media that is insatiable for the new, the now and the next, being steadfast doesn't always make for good copy. "The spotlight is always going to search for the newcomer,"

Score: 0.5300
Text: thanks to the tome Ralph Lauren (Rizzoli), celebrating his 40-years-and-growing career. But far from giving his customary over-the-head wave and riding off into his Colorado-ranch sunset, the designer is going even more global. "Americans have a real inferiority about their own style. We've brought sportswear to the world, and yet we have a long way to go." Already in Milan, London, Paris and

Score: 0.4800
Text: Sinatra, Cary Grant and Astaire, the ones who last the longest are the ones whose style has a consistency, whose naturalness is part of their exc

## RAG

In [23]:
import openai
from dotenv import load_dotenv
import os

load_dotenv()

# Expected .env entry (placeholder only -- never paste real key material here):
# OPENAI_API_KEY="<your-openai-api-key>"
openai_api_key = os.getenv('OPENAI_API_KEY')
if not openai_api_key:
    raise RuntimeError(
        "OPENAI_API_KEY is not set; add it to your .env file or environment."
    )

# Initialize the OpenAI client with the key loaded above.
client = openai.OpenAI(
        api_key=openai_api_key,
)

In [45]:
def generate_rag_prompt(user_message):
    """Build the RAG prompt for ``user_message``, print it, and return it.

    The chunk text of every match returned by ``retrieve_results`` is joined
    with blank lines and placed under a context heading, after the user
    message and a separator line.
    """
    retrieved_chunks = [
        match['metadata']['chunk'] for match in retrieve_results(user_message)
    ]
    context = "\n\n".join(retrieved_chunks)
    prompt = f"""

User Message:
{user_message}

{"="*100}

Context:
{context}
                """

    print(prompt)
    return prompt


def chat_openai(user_message, model_name="gpt-4.1-nano"):
    """Answer ``user_message`` with a chat model, grounded in retrieved context.

    The user turn is the prompt built by ``generate_rag_prompt`` (which also
    prints it); the system turn instructs the model to stay within the
    provided context. Uses the notebook-level OpenAI ``client``.

    Args:
        user_message: The user's question or request.
        model_name: Chat model identifier passed to the API. Defaults to
            ``"gpt-4.1-nano"``, the value that used to be hard-coded. Named
            ``model_name`` so it does not shadow the global embedding ``model``.

    Returns:
        The content of the first choice's message.

    Errors raised by the API call (such as the RateLimitError in the recorded
    run) are not caught here and propagate to the caller.
    """
    response = client.chat.completions.create(
        model=model_name,
        messages=[
            {"role": "system", "content": """You are an assistant that will receive a user message along with additional context information. Your task is to generate a helpful and accurate response to the user message.
                                             Important: Always base your response on the context provided—make sure your answer is directly supported by and grounded in the given context. If the context does not contain enough information to fully answer the user's message, let the user know or ask for clarification."""
            },
            {"role": "user", "content": generate_rag_prompt(user_message)}
        ]
    )
    # Grab the assistant's reply
    return response.choices[0].message.content

In [46]:
# Ask a sample question. chat_openai builds the prompt via generate_rag_prompt,
# which prints it before the model's reply is shown below.
user_message = "What is the latest news in sports?"
response = chat_openai(user_message)
print("="*100, "\n", "Response:\n\n")
print(response)



User Message:
What is the latest news in sports?


Context:
by the football's English Premier League to play matches overseas and underline the growing trend of globalization in major sports leagues. This was further evidenced by Super Bowl champions New York Giants playing a regular NFL season game against the Miami Dolphins at Wembley Stadium in London, but the NBA's leaked plans go far beyond that. A report on the Sports Illustrated Web site, says NBA

became an official Olympic sport. Copyright 2008 CNN. All rights reserved.This material may not be published, broadcast, rewritten, or redistributed. Associated Press contributed to this report.

plans go far beyond that. A report on the Sports Illustrated Web site, says NBA commisioner David Stern will reveal his proposals on the eve of the All-Star Game in New Orleans this weekend. The plans are understood to include the formation of a European division with five new teams in major markets. The teams would play a full 82-game sche

RateLimitError: Error code: 429 - {'error': {'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.', 'type': 'insufficient_quota', 'param': None, 'code': 'insufficient_quota'}}

## Recommender Systems

In [32]:
# Get a handle to the index by name; this re-binds the global `index` that
# retrieve_results queries.
index_name = "cnn-dailymail-news"
index = pc.Index(index_name)

In [33]:
# Free-text reader profiles; a profile string is passed to retrieve_results as
# the query in the next cell.
user_profiles = [
    "A college-educated professional in their 30s who checks CNN for political and global news on weekday mornings, often sharing trending stories with coworkers.",
    "A retiree who reads Daily Mail on a tablet, following celebrity gossip, royal family updates, and lighthearted human interest stories each afternoon.",
    "A busy parent in their 40s who scans CNN headlines on a smartphone during commutes, interested in US politics, health news, and educational trends.",
    "A recent college graduate who prefers Daily Mail for viral stories, entertainment news, and quirky international events, mostly browsing during lunch breaks.",
    "A mid-level manager who starts the day with CNN business updates and international affairs, occasionally commenting on articles and saving major stories to read later."
]

In [34]:
# Use the last profile as the query and print the score and chunk text of
# every returned match.
user_profile = user_profiles[len(user_profiles) - 1]
print("User Profile:", user_profile, "\n\n")

results = retrieve_results(user_profile)
print("Results:\n")

for match in results:
    print(
        f"Score: {match['score']:.4f}",
        f"Text: {match['metadata']['chunk']}\n",
        sep="\n",
    )

User Profile: A mid-level manager who starts the day with CNN business updates and international affairs, occasionally commenting on articles and saving major stories to read later. 


Results:

Score: 0.4642
Text: (CNN) -- As part of an effort to share the best practices of modern business among organizations across the globe, CNN is talking to some of the world's top executives. In line with this, CNN will be hosting three events looking at modern strategies in today's business arena. The first of these master classes will be based at the China Europe International Business School in Shanghai and will be

Score: 0.4144
Text: a site in which Chinese bloggers criticize CNN's coverage. In a statement, CNN said, "We have provided comprehensive coverage of all sides of this story," adding that the network's "reputation is based on reporting global news accurately and impartially." Read the full statement . Earlier this week, China offered some media organizations -- not including CNN -- a