<a href="https://colab.research.google.com/github/Pavun-KumarCH/Research-Notebooks/blob/main/Recommender_System_VDB.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Recommender System with Vector Database

In [None]:
#@title requirements
%pip install --q langchain openai pinecone

In [None]:
# Load Dependencies
import os
import time
import pandas as pd
from openai import OpenAI
from tqdm.auto import tqdm
from pinecone import Pinecone, ServerlessSpec
from IPython.display import Markdown, display
from langchain.text_splitter import RecursiveCharacterTextSplitter

import warnings
warnings.filterwarnings('ignore')

# Environment
# Both API keys are read through google.colab's `userdata` helper, so nothing
# secret is hardcoded in the notebook. This import only works inside Colab.
from google.colab import userdata
OPENAI_API_KEY = userdata.get('OPENAI_API_KEY')
PINECONE_API_KEY = userdata.get('PINECONE_API_KEY')

In [None]:
# Download the dataset
#!wget -q --show-progress -O all-the-news-3.zip "https://www.dropbox.com/scl/fi/wruzj2bwyg743d0jzd7ku/all-the-news-3.zip?rlkey=rgwtwpeznbdadpv3f01sznwxa&dl=1"

#!unzip all-the-news-3.zip

In [None]:
# Peek at the CSV header row only, without loading the whole file into memory.
csv_path = 'all-the-news-3.csv'
with open(csv_path, mode='r') as csv_file:
  header = csv_file.readline()
print(header)

In [None]:
# Load only the first rows of the dataset to inspect its columns.
sample_rows = 99
df = pd.read_csv('all-the-news-3.csv', nrows=sample_rows)
df.head()

In [None]:
def create_dlai_index_name(index_name):
  """Build a per-user Pinecone index name from a prefix and the OpenAI key.

  The suffix is derived from a SHA-256 digest of the OpenAI API key, so the
  name is stable for a given key and differs between users, but it never
  contains any characters of the key itself. (Index names are visible in
  dashboards, logs and URLs, so raw key material must not be embedded.)

  Args:
    index_name: Prefix for the index name, e.g. "rsdl-ai".

  Returns:
    A string of the form "<index_name>-<36 lowercase hex characters>".

  Raises:
    ValueError: If no OpenAI API key could be found.
  """
  import hashlib  # local import keeps this cell runnable on its own

  openai_key = ""
  try:
    # For Google Colab
    from google.colab import userdata
    openai_key = userdata.get("OPENAI_API_KEY")
  except ImportError:
    # For Jupyter or other environments
    openai_key = os.getenv("OPENAI_API_KEY")

  # Ensure openai_key is not empty
  if not openai_key:
    raise ValueError("OpenAI API key is missing.")

  # Hex digits are lowercase alphanumerics; 36 of them keeps the suffix the
  # same length as before.
  key_digest = hashlib.sha256(openai_key.encode("utf-8")).hexdigest()
  return f'{index_name}-{key_digest[:36]}'

In [None]:
# Setup PineCone

# Connect to Pinecone with the key loaded at the top of the notebook.
pinecone = Pinecone(api_key=PINECONE_API_KEY)

INDEX_NAME = create_dlai_index_name("rsdl-ai")

# Start from a clean slate: drop any index already registered under this name.
existing_names = {existing.name for existing in pinecone.list_indexes()}
if INDEX_NAME in existing_names:
  pinecone.delete_index(INDEX_NAME)

# Create a serverless cosine-similarity index with 1536-dimensional vectors,
# then open a handle to it.
serverless_spec = ServerlessSpec(cloud='aws', region='us-east-1')
pinecone.create_index(name=INDEX_NAME,
                      dimension=1536,
                      metric='cosine',
                      spec=serverless_spec)
index = pinecone.Index(INDEX_NAME)
index

In [None]:
#@title Create Embeddings of the News Titles
openai_client = OpenAI(api_key=OPENAI_API_KEY)

def get_embeddings(articles, model="text-embedding-ada-002"):
  """Request embeddings for `articles` from the OpenAI embeddings endpoint.

  Args:
    articles: The text input(s) forwarded as `input` to the API.
    model: Name of the embedding model to use.

  Returns:
    The response object returned by `openai_client.embeddings.create`;
    callers read the vectors from its `.data[i].embedding` entries.
  """
  response = openai_client.embeddings.create(model=model, input=articles)
  return response

In [None]:
#@title Prepare the Embeddings and Upsert(upload) to Pinecone

# Stream the CSV in chunks, embed the titles of each chunk and upsert them.
chunk_size = 400
total_rows = 10000
progress_bar = tqdm(total = total_rows)
chunks = pd.read_csv('all-the-news-3.csv',
                     chunksize = chunk_size,
                     nrows = total_rows)

prepared = []
for chunk in chunks:
  # Rows without a title cannot be embedded (NaN is not text), so skip them.
  titled = chunk.dropna(subset = ['title'])
  titles = titled['title'].tolist()
  if titles:
    embeddings = get_embeddings(titles)
    # Use the CSV row number (the chunk's running index) as the vector id so
    # ids stay stable even when some rows are skipped. The embeddings are
    # matched to titles by position, as returned by the API call.
    prepared = [{'id': str(row_id),
                 'values': item.embedding,
                 'metadata': {'title': title}}
                for row_id, title, item in zip(titled.index, titles, embeddings.data)]
    # Upsert every chunk, whatever its size; a short final chunk must not be
    # dropped.
    index.upsert(vectors = prepared)
    prepared = []
  # Advance by the rows actually read so a short final chunk is not over-counted.
  progress_bar.update(len(chunk))
progress_bar.close()

In [None]:
# Display the statistics Pinecone reports for the titles index after the upsert.
index.describe_index_stats()

### Build the Recommender System

In [None]:
def get_recommendations(pinecone_index, search_term, top_k = 5):
  """Query an index for the entries closest to `search_term`.

  The search term is embedded with `get_embeddings` and the resulting vector
  is used to query `pinecone_index`.

  Args:
    pinecone_index: Index handle exposing a `query` method.
    search_term: Text to search for.
    top_k: Number of matches to request.

  Returns:
    The query response, with metadata included for every match.
  """
  query_vector = get_embeddings([search_term]).data[0].embedding
  return pinecone_index.query(vector = query_vector,
                              top_k = top_k,
                              include_metadata = True)

In [None]:
# Print the score and title of each match for a sample query on the titles index.
reco = get_recommendations(index, "trump")
for match in reco.matches:
  score, title = match.score, match.metadata["title"]
  print(f'{score} : {title}')

## Create Embeddings of ALL News Content


In [None]:
# The article embeddings reuse INDEX_NAME, so any index currently registered
# under that name is deleted before a fresh one is created.
registered_names = {registered.name for registered in pinecone.list_indexes()}
if INDEX_NAME in registered_names:
  pinecone.delete_index(name=INDEX_NAME)

articles_spec = ServerlessSpec(cloud='aws', region='us-east-1')
pinecone.create_index(name=INDEX_NAME,
                      dimension=1536,
                      metric='cosine',
                      spec=articles_spec)
articles_index = pinecone.Index(INDEX_NAME)

In [None]:
def embed(embeddings, title, prepared, embed_num):
  """Queue one article's chunk embeddings and flush full batches to Pinecone.

  Each embedding in `embeddings.data` becomes a record whose id is the
  running counter (as a string) and whose metadata carries `title`. Records
  are appended to `prepared` in place; whenever it holds 100 or more records
  they are upserted into `articles_index` and the list is emptied. Records
  left over below that threshold stay in `prepared` for the caller.

  Args:
    embeddings: Response object whose `.data` items expose `.embedding`.
    title: Title stored as metadata on every record.
    prepared: List used as the pending batch; mutated in place.
    embed_num: Id to assign to the first record.

  Returns:
    The next unused id (embed_num plus the number of embeddings processed).
  """
  next_id = embed_num
  for item in embeddings.data:
    record = {'id': str(next_id),
              'values': item.embedding,
              'metadata': {'title': title}}
    prepared.append(record)
    next_id += 1
    if len(prepared) < 100:
      continue
    articles_index.upsert(prepared)
    prepared.clear()
  return next_id

In [None]:
# Split every article into overlapping text chunks, embed the chunks and
# upsert them (tagged with the article title) into articles_index.
news_data_rows_num = 100

embed_num = 0 #keep track of embedding number for 'id'
text_splitter = RecursiveCharacterTextSplitter(chunk_size=400,
    chunk_overlap=20) # how to chunk each article
prepped = []
df = pd.read_csv('all-the-news-3.csv', nrows=news_data_rows_num)
articles_list = df['article'].tolist()
titles_list = df['title'].tolist()

for art, title in tqdm(zip(articles_list, titles_list),
                       total=len(articles_list), desc="articles"):
  # Missing articles are read as NaN (a float), so only strings are processed.
  if not isinstance(art, str):
    continue
  texts = text_splitter.split_text(art)
  if not texts:
    # Nothing to embed for an empty article.
    continue
  embeddings = get_embeddings(texts)
  # Pass the batch list defined in this cell and carry the counter forward so
  # ids keep increasing across articles instead of restarting at 0.
  embed_num = embed(embeddings, title, prepped, embed_num)

# embed() only flushes full batches; upload whatever is still pending.
if prepped:
  articles_index.upsert(prepped)
  prepped.clear()

### Build the Recommender System

In [None]:
# Several chunks of one article can match, so print each title only once
# (the first, i.e. highest-ranked, occurrence).
reco = get_recommendations(articles_index, "trump")
seen = set()
for match in reco.matches:
  title = match.metadata['title']
  if title in seen:
    continue
  print(f'{match.score} : {title}')
  seen.add(title)