In [1]:
import openai
import langchain
from langchain import OpenAI
from langchain.chains import RetrievalQAWithSourcesChain
from langchain.chains.qa_with_sources.loading import load_qa_with_sources_chain

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import UnstructuredURLLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS

import pinecone
from langchain.vectorstores import Pinecone

import os
import sys
import streamlit as st
import pickle
import time
import pandas as pd
import numpy
from dotenv import load_dotenv
load_dotenv()

In [2]:
df = pd.read_excel(r"books_data.xlsx")
df.head()

Unnamed: 0,ISBN,Genre,Title,Description
0,978-9364499739,Kids,My First 100 Learnings,This is a box set of 5 early learning board bo...
1,978-9388369999,Kids,My First Five Minutes Fairy Tales,This amazing box containing a set of 20 beauti...
2,978-9399391746,Kids,The Universe within Space,Ever looked up into the sky and wondered what’...
3,978-1639969993,Health,Natural Cures,Hundreds of thousands of readers have relied o...
4,978-1639999161,Health,If Your Mouth Could Talk with me,Your mouth is the gateway to your body and is ...


In [3]:
df['text'] = 'genre: ' + df['Genre'] +  \
            '; title: ' + df['Title'] + \
            '; description: ' + df['Description']
df.head()

Unnamed: 0,ISBN,Genre,Title,Description,text
0,978-9364499739,Kids,My First 100 Learnings,This is a box set of 5 early learning board bo...,genre: Kids; title: My First 100 Learnings; de...
1,978-9388369999,Kids,My First Five Minutes Fairy Tales,This amazing box containing a set of 20 beauti...,genre: Kids; title: My First Five Minutes Fair...
2,978-9399391746,Kids,The Universe within Space,Ever looked up into the sky and wondered what’...,genre: Kids; title: The Universe within Space;...
3,978-1639969993,Health,Natural Cures,Hundreds of thousands of readers have relied o...,genre: Health; title: Natural Cures; descripti...
4,978-1639999161,Health,If Your Mouth Could Talk with me,Your mouth is the gateway to your body and is ...,genre: Health; title: If Your Mouth Could Talk...


In [4]:
df['text'][0]

'genre: Kids; title: My First 100 Learnings; description: This is a box set of 5 early learning board books, each comprising 100 well-researched and attractive images. These books will help your child build vocabulary, observation skills, and prepare them for school'

In [None]:
"""
# To text into chuncks

from langchain.text_splitter import CharacterTextSplitter

splitter = CharacterTextSplitter(
    separator = "\n",
    chunk_size=200,
    chunk_overlap=0
)

"""

In [5]:
openai.api_key = os.getenv("OPENAI_API_KEY")


In [6]:
def get_embedding(column_name):
    result = openai.Embedding.create(model= "text-embedding-ada-002", input=[column_name])
    embeddings = result["data"][0]["embedding"]
    
    return embeddings

In [13]:
# To prevent RateLimitError or TimeLimitError

import time
import random

embeddings = []

for i in range(len(df)): 
    temp_encoding = get_embedding(df['text'][i])
    embeddings.append(temp_encoding)
    
    time.sleep(random.uniform(6,15))

len(embeddings)

10

In [24]:
df['embeddings'] = embeddings
df.head()

Unnamed: 0,ISBN,Genre,Title,Description,text,embeddings
0,978-9364499739,Kids,My First 100 Learnings,This is a box set of 5 early learning board bo...,genre: Kids; title: My First 100 Learnings; de...,"[0.005151285789906979, 0.011917251162230968, -..."
1,978-9388369999,Kids,My First Five Minutes Fairy Tales,This amazing box containing a set of 20 beauti...,genre: Kids; title: My First Five Minutes Fair...,"[-0.0037359504494816065, -0.01687755063176155,..."
2,978-9399391746,Kids,The Universe within Space,Ever looked up into the sky and wondered what’...,genre: Kids; title: The Universe within Space;...,"[0.040042463690042496, 0.005222073290497065, -..."
3,978-1639969993,Health,Natural Cures,Hundreds of thousands of readers have relied o...,genre: Health; title: Natural Cures; descripti...,"[0.011477301828563213, 0.026183856651186943, -..."
4,978-1639999161,Health,If Your Mouth Could Talk with me,Your mouth is the gateway to your body and is ...,genre: Health; title: If Your Mouth Could Talk...,"[0.0029444003012031317, 0.0050220368430018425,..."


In [25]:
# Saving embeddings for future use to prevent OpenAI cost

from uuid import uuid4

df['id'] = [str(uuid4()) for i in range(len(df))]
df.head()


Unnamed: 0,ISBN,Genre,Title,Description,text,embeddings,id
0,978-9364499739,Kids,My First 100 Learnings,This is a box set of 5 early learning board bo...,genre: Kids; title: My First 100 Learnings; de...,"[0.005151285789906979, 0.011917251162230968, -...",fb5307ff-0a6a-400e-8d7d-cf35234f13b1
1,978-9388369999,Kids,My First Five Minutes Fairy Tales,This amazing box containing a set of 20 beauti...,genre: Kids; title: My First Five Minutes Fair...,"[-0.0037359504494816065, -0.01687755063176155,...",fd782260-acad-4f0e-93e6-93c59172e3bb
2,978-9399391746,Kids,The Universe within Space,Ever looked up into the sky and wondered what’...,genre: Kids; title: The Universe within Space;...,"[0.040042463690042496, 0.005222073290497065, -...",81e0f931-12fa-478c-ba14-424255119ac1
3,978-1639969993,Health,Natural Cures,Hundreds of thousands of readers have relied o...,genre: Health; title: Natural Cures; descripti...,"[0.011477301828563213, 0.026183856651186943, -...",9ca5fbfb-113d-4dff-8d05-6516abea76fc
4,978-1639999161,Health,If Your Mouth Could Talk with me,Your mouth is the gateway to your body and is ...,genre: Health; title: If Your Mouth Could Talk...,"[0.0029444003012031317, 0.0050220368430018425,...",e6903527-6b61-4811-9ea5-ff5caff80e3e


In [76]:
# It is better to save df into pickle file rather than excel file as excel may not store embedding of size 1536

import os
# //.
directory_path = 'Embeddings_dir'
os.makedirs(directory_path, exist_ok=True)

path = os.path.join(directory_path, "books_embeddings.pkl")

In [31]:
with open(path, "wb") as f:
    pickle.dump(df, f)    
    
with open(path, "rb") as f:
    df = pickle.load(f)

In [32]:
df.head()

Unnamed: 0,ISBN,Genre,Title,Description,text,embeddings,id
0,978-9364499739,Kids,My First 100 Learnings,This is a box set of 5 early learning board bo...,genre: Kids; title: My First 100 Learnings; de...,"[0.005151285789906979, 0.011917251162230968, -...",fb5307ff-0a6a-400e-8d7d-cf35234f13b1
1,978-9388369999,Kids,My First Five Minutes Fairy Tales,This amazing box containing a set of 20 beauti...,genre: Kids; title: My First Five Minutes Fair...,"[-0.0037359504494816065, -0.01687755063176155,...",fd782260-acad-4f0e-93e6-93c59172e3bb
2,978-9399391746,Kids,The Universe within Space,Ever looked up into the sky and wondered what’...,genre: Kids; title: The Universe within Space;...,"[0.040042463690042496, 0.005222073290497065, -...",81e0f931-12fa-478c-ba14-424255119ac1
3,978-1639969993,Health,Natural Cures,Hundreds of thousands of readers have relied o...,genre: Health; title: Natural Cures; descripti...,"[0.011477301828563213, 0.026183856651186943, -...",9ca5fbfb-113d-4dff-8d05-6516abea76fc
4,978-1639999161,Health,If Your Mouth Could Talk with me,Your mouth is the gateway to your body and is ...,genre: Health; title: If Your Mouth Could Talk...,"[0.0029444003012031317, 0.0050220368430018425,...",e6903527-6b61-4811-9ea5-ff5caff80e3e


In [36]:
import pinecone
from langchain.vectorstores import Pinecone

pinecone.init(
    api_key = os.getenv("PINECONE_API_KEY"),
    environment = "gcp-starter"
)

index_name = "test-1"

if index_name not in pinecone.list_indexes():
    pinecone.create_index(index_name, dimension = 1536, metric='cosine')
    print(f"Pinecone index: {index_name} is created")
    
index = pinecone.Index(index_name)

Pinecone index: test-1 is created


In [None]:
# To Insert items into the Pinecone index

#items = [{'id': str(idx), 'values': embedding} for idx, embedding in zip(df['id'], df['embeddings'])]
#index.upsert(items)

In [37]:
# To Insert items into the Pinecone index

items = [{'id': str(idx), 
          'values': embedding, 
          'metadata': {'isbn': str(isbn), 'genre': str(genre), 'title': str(title)}}
         for idx, embedding, isbn, genre, title in zip(df['id'], df['embeddings'], df['ISBN'], df['Genre'], df['Title'])]

index.upsert(items)


{'upserted_count': 10}

In [39]:
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0001,
 'namespaces': {'': {'vector_count': 10}},
 'total_vector_count': 10}

In [40]:
# To retrieve the vector embeddings from Pinecone index

query = "Improve communication and build confidence"

query_embedding = get_embedding(query)
query_embedding

[-0.01622501201927662,
 0.0192844420671463,
 0.024061668664216995,
 -0.017228104174137115,
 0.0014058960368856788,
 0.010701737366616726,
 -0.018369121477007866,
 -0.012839577160775661,
 0.011817676946520805,
 -0.03392958641052246,
 -0.01425644475966692,
 0.013015118427574635,
 -0.0008518444956280291,
 0.014569910243153572,
 -0.008237892761826515,
 -0.004648704081773758,
 0.03571007400751114,
 -0.012168759480118752,
 0.006407249718904495,
 -0.021955175325274467,
 -0.009121867828071117,
 -0.011786330491304398,
 0.004090734291821718,
 0.004814841318875551,
 -0.034431129693984985,
 -0.014532295055687428,
 0.021980252116918564,
 -0.015159226953983307,
 -0.014996225014328957,
 -0.013830129988491535,
 0.024262286722660065,
 -0.009172022342681885,
 0.009172022342681885,
 -0.01892082206904888,
 0.005498197861015797,
 -0.0019638659432530403,
 -0.010275423526763916,
 0.01085847057402134,
 0.013403816148638725,
 -0.009780146181583405,
 0.02318396233022213,
 -0.006607868243008852,
 0.0043853926472

In [45]:
query_path = os.path.join(directory_path,"query_embedding_1.pkl")

with open(query_path, "wb") as f:
    pickle.dump(query_embedding, f)
    

with open(query_path, "rb") as f:
    query_embedding = pickle.load(f)

In [84]:
results = index.query(query_embedding, top_k=3)
results

{'matches': [{'id': '53d3cd29-88a3-4cac-930c-114b7a1a9b44',
              'score': 0.803779304,
              'values': []},
             {'id': 'b67ef3d6-5f8a-4139-b449-c0d7f2995ee7',
              'score': 0.792869449,
              'values': []},
             {'id': 'e6903527-6b61-4811-9ea5-ff5caff80e3e',
              'score': 0.763074,
              'values': []}],
 'namespace': ''}

In [47]:
results['matches'][0]['id']

'53d3cd29-88a3-4cac-930c-114b7a1a9b44'

In [65]:
metadata = index.fetch([results['matches'][0]['id']])
metadata

{'namespace': '',
 'vectors': {'53d3cd29-88a3-4cac-930c-114b7a1a9b44': {'id': '53d3cd29-88a3-4cac-930c-114b7a1a9b44',
                                                      'metadata': {'genre': 'Self-Learning',
                                                                   'isbn': '978-1619625601',
                                                                   'title': 'How '
                                                                            'to '
                                                                            'Talk '
                                                                            'to '
                                                                            'Anyone '
                                                                            'at '
                                                                            'anytime'},
                                                      'values': [-0.00721780909,
                            

In [70]:
isbn = metadata['vectors'][results['matches'][0]['id']]['metadata']['isbn']
title = metadata['vectors'][results['matches'][0]['id']]['metadata']['title']


print(f'{isbn} ----> {title}')

978-1619625601 ----> How to Talk to Anyone at anytime


In [72]:
for match in results['matches']:
    metadata = index.fetch([match['id']])
    isbn = metadata['vectors'][match['id']]['metadata']['isbn']
    title = metadata['vectors'][match['id']]['metadata']['title']
    
    print(f'{isbn} ----> {title}')
    

978-1619625601 ----> How to Talk to Anyone at anytime
978-1619693184 ----> How to Win Friends and Influence People
978-1639999161 ----> If Your Mouth Could Talk with me


In [74]:
# By using Pandas Dataframe

df[df['id'] == results['matches'][0]['id']]

Unnamed: 0,ISBN,Genre,Title,Description,text,embeddings,id
8,978-1619625601,Self-Learning,How to Talk to Anyone at anytime,The author has spent her career teaching peopl...,genre: Self-Learning; title: How to Talk to An...,"[-0.007217809092253447, 0.02839965559542179, 0...",53d3cd29-88a3-4cac-930c-114b7a1a9b44


In [75]:
for result in results['matches']:  
    isbn = df[df['id'] == result['id']]['ISBN'].values[0]
    title = df[df['id'] == result['id']]['Title'].values[0]

    print(str(isbn) + " ---> " + str(title))

978-1619625601 ---> How to Talk to Anyone at anytime
978-1619693184 ---> How to Win Friends and Influence People
978-1639999161 ---> If Your Mouth Could Talk with me
