## Libraries

In [93]:
# System Utilities
import os
from dotenv import load_dotenv
from IPython.display import Markdown, display

# LLM Utilities
from llama_index.core import Document
from llama_index.core import Settings
from llama_index.core import SimpleDirectoryReader
from llama_index.core import StorageContext
from llama_index.core import VectorStoreIndex
from llama_index.llms.gemini import Gemini

# Scraping
import re
from llama_index.readers.web import SimpleWebPageReader
from bs4 import BeautifulSoup

# Vector Database
import chromadb
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.embeddings.gemini import GeminiEmbedding

## Environment Variables

In [4]:
# Load variables from a local .env file (if any) into the process environment.
load_dotenv()

# API key for the Gemini models; this is None when the variable is not set.
GEMINI_API_KEY = os.environ.get('GEMINI_API_KEY')

## Application Logic

### Scraping Website Data

LlamaIndex can be used to load data from different sources. In this case we can use the `SimpleWebPageReader` to retrieve data from a webpage. For more information please visit the [LlamaIndex documentation](https://docs.llamaindex.ai/en/stable/understanding/loading/loading/).

In [84]:
# Page that lists the Scene+ credit cards.
scene_plus_urls = ['https://www.scotiabank.com/ca/en/personal/credit-cards/sceneplus.html']

# Download the page(s); only the first returned document is used below.
web_documents = SimpleWebPageReader().load_data(urls=scene_plus_urls)

# Text of the first document, parsed as HTML in the next cell.
html_content = web_documents[0].text

Now that we have the HTML content, we need to parse it and extract the relevant information.

In [85]:
# Parse the raw HTML string into a navigable BeautifulSoup tree.
soup = BeautifulSoup(markup=html_content, features='html.parser')

# The information we're looking for is structured in cards
# (<div class="card-content">), so we need to find them first.
card_content_div = soup.find_all('div', class_='card-content')

credit_cards = []  # card names, one entry per card
information = []   # list of <p> tags for the card at the same index

# Inside a card, the name is in <span class="subtitle-1"> and the details
# are in <p> tags.
for card in card_content_div:
    title_tag = card.find('span', class_='subtitle-1')
    if title_tag is None:
        # find() returns None when nothing matches. Skip cards without a
        # title so `credit_cards` and `information` stay index-aligned
        # instead of failing with AttributeError on `.text`.
        continue
    credit_cards.append(title_tag.text)
    information.append(card.find_all('p'))

In [86]:
# Display the scraped card names to check the extraction.
credit_cards

['Scotiabank® Scene+™ Visa* Card ',
 'Scotiabank Passport® Visa Infinite* Card ',
 'Scotiabank Gold American Express® Card  ',
 'Scotiabank American Express® Card ',
 'Scotiabank Platinum American Express® Card ',
 'Scotiabank® Scene+™ Visa* Card (for students) ']

The following code joins the text of all `<p>` tags into a single string for each card and collects those strings in a list.

In [88]:
# Build one string per card: the text of every <p> tag, each followed by a
# newline, with non-breaking spaces ('\xa0') replaced by regular spaces.
#
# The iteration variable must not be called `list`: at notebook top level that
# rebinds the builtin for the rest of the kernel session and breaks any later
# `list(...)` call.
information_text = [
    ''.join(tag.text + '\n' for tag in paragraph_tags).replace('\xa0', ' ')
    for paragraph_tags in information
]

In [89]:
# Display the cleaned per-card text to check the result.
information_text

['Earn up to 7,500 bonus Scene+ points within your first 3 months.2\n \nEarn 2 Scene+ points4 on every $1 you spend at Sobeys, Safeway, Foodland & Participating Co-ops, FreshCo and more.\n \nEarn 2 Scene+ points4 on every $1 you spend at Home Hardware.\n \nAnnual fee: $0\nInterest rates: 20.99% purchases / 22.99% cash advances\n',
 'Earn up to $1,300+ in value in the first 12 months, including up to 40,000 bonus Scene+ points and first year annual fee waived.‡\n \nEarn 3 Scene+ points1 on every $1 you spend at Sobeys, Safeway, IGA, Foodland & Participating Co-ops and more.\n \nAnnual fee: $150\nInterest rates: 20.99% purchases / 22.99% cash advances\n',
 'Earn up to $650* in value in the first 12 months, including up to 40,000 bonus Scene+ points.1\n \nEarn 6 Scene+ points2 on every $1 CAD you spend in Canada at Sobeys, Safeway, FreshCo, Foodland and more.\n \nAnnual fee: $120\nInterest rates: 20.99% purchases / 22.99% cash advances\n',
 'Earn up to 7,500 bonus Scene+ points within you

The following code concatenates each card name with its information text into a single string, and then wraps that string in a LlamaIndex `Document` object.

In [91]:
# Concatenate every card name with the details stored at the same index,
# producing one long string that covers all cards.
text_content = ''.join(
    card_name + information_text[position]
    for position, card_name in enumerate(credit_cards)
)

# Wrap the combined text in a single Document and show the resulting list.
documents = [Document(text=text_content)]
documents

[Document(id_='c416a861-4f0d-4741-a21a-642b15413bd4', embedding=None, metadata={}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, text='Scotiabank® Scene+™ Visa* Card Earn up to 7,500 bonus Scene+ points within your first 3 months.2\n \nEarn 2 Scene+ points4 on every $1 you spend at Sobeys, Safeway, Foodland & Participating Co-ops, FreshCo and more.\n \nEarn 2 Scene+ points4 on every $1 you spend at Home Hardware.\n \nAnnual fee: $0\nInterest rates: 20.99% purchases / 22.99% cash advances\nScotiabank Passport® Visa Infinite* Card Earn up to $1,300+ in value in the first 12 months, including up to 40,000 bonus Scene+ points and first year annual fee waived.‡\n \nEarn 3 Scene+ points1 on every $1 you spend at Sobeys, Safeway, IGA, Foodland & Participating Co-ops and more.\n \nAnnual fee: $150\nInterest rates: 20.99% purchases / 22.99% cash advances\nScotiabank Gold American Express® Card  Earn up to $650* in value in the first 12 months, including up to

### Initializing Models

Embeddings Model (**Embedding-001**)

In [95]:
# Gemini embedding client configured for the embedding-001 model.
gemini_embedding_model = GeminiEmbedding(
    model_name='models/embedding-001',
    api_key=GEMINI_API_KEY,
)

LLM (**Gemini Pro**)

In [96]:
# Gemini LLM client configured for the gemini-pro model.
llm = Gemini(
    model_name='models/gemini-pro',
    api_key=GEMINI_API_KEY,
)

### Store Data with Chroma