# Lesson 3 - Recommender Systems

### Import the Needed Packages

In [6]:
import warnings
warnings.filterwarnings('ignore')

In [11]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
from openai import OpenAI
from pinecone import Pinecone, ServerlessSpec
from tqdm.auto import tqdm, trange

import uuid

import pandas as pd
import time
import os

from dotenv import load_dotenv
load_dotenv()

True

In [12]:
PINECONE_API_KEY = os.getenv('PINECONE_API_KEY')
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')

### Load the Dataset

**Note:** To access the dataset outside of this course, just copy the following two lines of code and run it (remember to uncomment them first before executing):

!wget -q --show-progress -O all-the-news-3.zip "https://www.dropbox.com/scl/fi/wruzj2bwyg743d0jzd7ku/all-the-news-3.zip?rlkey=rgwtwpeznbdadpv3f01sznwxa&dl=1"

!unzip all-the-news-3.zip

In [13]:
with open('./data/all-the-news-3.csv', 'r') as f:
    header = f.readline()
    print(header)

date,year,month,day,author,title,article,url,section,publication



In [14]:
df = pd.read_csv('./data/all-the-news-3.csv', nrows=99)
df.head()

Unnamed: 0,date,year,month,day,author,title,article,url,section,publication
0,2016-12-09 18:31:00,2016,12.0,9,Lee Drutman,We should take concerns about the health of li...,"This post is part of Polyarchy, an independent...",https://www.vox.com/polyarchy/2016/12/9/138983...,,Vox
1,2016-10-07 21:26:46,2016,10.0,7,Scott Davis,Colts GM Ryan Grigson says Andrew Luck's contr...,The Indianapolis Colts made Andrew Luck the h...,https://www.businessinsider.com/colts-gm-ryan-...,,Business Insider
2,2018-01-26 00:00:00,2018,1.0,26,,Trump denies report he ordered Mueller fired,"DAVOS, Switzerland (Reuters) - U.S. President ...",https://www.reuters.com/article/us-davos-meeti...,Davos,Reuters
3,2019-06-27 00:00:00,2019,6.0,27,,France's Sarkozy reveals his 'Passions' but in...,PARIS (Reuters) - Former French president Nico...,https://www.reuters.com/article/france-politic...,World News,Reuters
4,2016-01-27 00:00:00,2016,1.0,27,,Paris Hilton: Woman In Black For Uncle Monty's...,Paris Hilton arrived at LAX Wednesday dressed ...,https://www.tmz.com/2016/01/27/paris-hilton-mo...,,TMZ


### Setup Pinecone

In [16]:
openai_client = OpenAI(api_key=OPENAI_API_KEY)
INDEX_NAME = f'dl-ai-{str(uuid.uuid4())}'
pinecone = Pinecone(api_key=PINECONE_API_KEY)

if INDEX_NAME in [index.name for index in pinecone.list_indexes()]:
  pinecone.delete_index(INDEX_NAME)

pinecone.create_index(name=INDEX_NAME, dimension=1536, metric='cosine',
  spec=ServerlessSpec(cloud='aws', region='us-east-1'))

index = pinecone.Index(INDEX_NAME)

### 1.  Create Embeddings of the News Titles

In [17]:
def get_embeddings(articles, model="text-embedding-ada-002"):
   return openai_client.embeddings.create(input = articles, model=model)

In [18]:
CHUNK_SIZE=400
TOTAL_ROWS=10000
progress_bar = tqdm(total=TOTAL_ROWS)
chunks = pd.read_csv('./data/all-the-news-3.csv', chunksize=CHUNK_SIZE, 
                     nrows=TOTAL_ROWS)
chunk_num = 0
for chunk in chunks:
    titles = chunk['title'].tolist()
    embeddings = get_embeddings(titles)
    prepped = [{'id':str(chunk_num*CHUNK_SIZE+i), 'values':embeddings.data[i].embedding,
                'metadata':{'title':titles[i]},} for i in range(0,len(titles))]
    chunk_num = chunk_num + 1
    if len(prepped) >= 200:
      index.upsert(prepped)
      prepped = []
    progress_bar.update(len(chunk))

100%|██████████| 10000/10000 [05:17<00:00, 31.76it/s]

100%|██████████| 10000/10000 [05:30<00:00, 31.76it/s]

In [19]:
index.describe_index_stats()

{'_response_info': {'raw_headers': {'connection': 'keep-alive',
                                    'content-length': '190',
                                    'content-type': 'application/json',
                                    'date': 'Mon, 29 Dec 2025 04:12:48 GMT',
                                    'grpc-status': '0',
                                    'server': 'envoy',
                                    'x-envoy-upstream-service-time': '35',
                                    'x-pinecone-request-id': '8172543766958925435',
                                    'x-pinecone-request-latency-ms': '35',
                                    'x-pinecone-response-duration-ms': '37'}},
 'dimension': 1536,
 'index_fullness': 0.0,
 'memoryFullness': 0.0,
 'metric': 'cosine',
 'namespaces': {'__default__': {'vector_count': 10000}},
 'storageFullness': 0.0,
 'total_vector_count': 10000,
 'vector_type': 'dense'}

### Build the Recommender System

In [20]:
def get_recommendations(pinecone_index, search_term, top_k=10):
  embed = get_embeddings([search_term]).data[0].embedding
  res = pinecone_index.query(vector=embed, top_k=top_k, include_metadata=True)
  return res

In [21]:
reco = get_recommendations(index, 'obama')
for r in reco.matches:
    print(f'{r.score} : {r.metadata["title"]}')

0.849975586 : Barack Obama just stepped off the sidelines to defend Obamacare
0.848938 : “Our democracy is at stake”: Obama delivers his first post-presidency campaign speech
0.848388672 : Obama: if you were fine with big government until it served black people, rethink your biases
0.846557617 : President Obama has a new plan to fight the opioid epidemic
0.846313477 : President Obama: Michelle & I Are Gonna Be Renters
0.84375 : Vox Sentences: Obama got a warmer welcome in Hiroshima than the Japanese prime minister
0.843078613 : Obama meets with national security team on Syria, Islamic State
0.842041 : Barack Obama in talks to create shows for Netflix: New York Times
0.84198004 : Watch President Obama dance the tango in Argentina
0.840942383 : Obama and Supreme Court Tag Team on Juvenile Justice Reform


### 2.  Create Embeddings of All News Content

In [22]:
if INDEX_NAME in [index.name for index in pinecone.list_indexes()]:
  pinecone.delete_index(name=INDEX_NAME)

pinecone.create_index(name=INDEX_NAME, dimension=1536, metric='cosine',
  spec=ServerlessSpec(cloud='aws', region='us-east-1'))
articles_index = pinecone.Index(INDEX_NAME)

In [23]:
def embed(embeddings, title, prepped, embed_num):
  for embedding in embeddings.data:
    prepped.append({'id':str(embed_num), 'values':embedding.embedding, 'metadata':{'title':title}})
    embed_num += 1
    if len(prepped) >= 100:
        articles_index.upsert(prepped)
        prepped.clear()
  return embed_num

<p style="background-color:#fff1d7; padding:15px; "> <b>(Note: <code>news_data_rows_num = 100</code>):</b> In this lab, we've initially set <code>news_data_rows_num</code> to 100 for speedier results, allowing you to observe the outcomes faster. Once you've done an initial run, consider increasing this value to 200, 400, 700, and 1000. You'll likely notice better and more relevant results.</p>

In [24]:
news_data_rows_num = 100

embed_num = 0 #keep track of embedding number for 'id'
text_splitter = RecursiveCharacterTextSplitter(chunk_size=400, 
    chunk_overlap=20) # how to chunk each article
prepped = []
df = pd.read_csv('./data/all-the-news-3.csv', nrows=news_data_rows_num)
articles_list = df['article'].tolist()
titles_list = df['title'].tolist()

for i in range(0, len(articles_list)):
    print(".",end="")
    art = articles_list[i]
    title = titles_list[i]
    if art is not None and isinstance(art, str):
      texts = text_splitter.split_text(art)
      embeddings = get_embeddings(texts)
      embed_num = embed(embeddings, title, prepped, embed_num)

....................................................................................................

In [25]:
articles_index.describe_index_stats()

{'_response_info': {'raw_headers': {'connection': 'keep-alive',
                                    'content-length': '188',
                                    'content-type': 'application/json',
                                    'date': 'Mon, 29 Dec 2025 04:17:43 GMT',
                                    'grpc-status': '0',
                                    'server': 'envoy',
                                    'x-envoy-upstream-service-time': '40',
                                    'x-pinecone-request-id': '844627059822415430',
                                    'x-pinecone-request-latency-ms': '39',
                                    'x-pinecone-response-duration-ms': '41'}},
 'dimension': 1536,
 'index_fullness': 0.0,
 'memoryFullness': 0.0,
 'metric': 'cosine',
 'namespaces': {'__default__': {'vector_count': 1000}},
 'storageFullness': 0.0,
 'total_vector_count': 1000,
 'vector_type': 'dense'}

### Build the Recommender System

In [29]:
reco = get_recommendations(articles_index, 'trump', top_k=10)
seen = {}
for r in reco.matches:
    title = r.metadata['title']
    if title not in seen:
        print(f'{round(r.score,2)} : {title}')
        seen[title] = '.'

0.84 : Trump keeping options open as Republican feud rages
0.83 : Trump tells anti-abortion marchers he will support them
0.83 : The government official in charge of ethics just harshly condemned Trump’s plan
0.83 : Trump denies report he ordered Mueller fired
0.83 : How the Clinton campaign is making #ThatMexicanThing a thing, explained
0.83 : Why Jews Are Getting Themselves Arrested at ICE Centers Around the Country
0.82 : The most revealing Republican ad of the election is an attack ad against Tim Kaine
