In [10]:
# Had to pip install jupyter first
!pip install python-dotenv



In [1]:
# Reload imported modules automatically before each cell runs, so edits to
# local .py files are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2


In [20]:

import os
import sys
from dotenv import load_dotenv
load_dotenv()

import pandas as pd


from langchain.chains import RetrievalQA
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_text_splitters import Language
from langchain_openai import OpenAIEmbeddings, ChatOpenAI, OpenAI
from langchain_community.vectorstores import Chroma
from langchain.schema.document import Document

from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.callbacks.streaming_stdout_final_only import FinalStreamingStdOutCallbackHandler
from langchain.chains import LLMChain
from langchain.llms import GPT4All
from langchain.prompts import PromptTemplate


from langchain.embeddings import HuggingFaceEmbeddings

# Data

In [3]:
# Load the review dataset; the first CSV column is used as the DataFrame index.
df_reviews = pd.read_csv("dummy_data_new.csv", index_col=0)

In [4]:
df_reviews.head()

Unnamed: 0,Product Name,Product Description,Review Text,Rating
0,iPhone 15,The Apple iPhone 15 redefines smartphone innov...,The iPhone 15 is a masterpiece! The sleek desi...,"{""durability"": 5, ""ease of use"": 5, ""pleasant ..."
1,MacBook Pro 2023,Experience the ultimate in computing power wit...,The MacBook Pro 2023 is a game-changer! The pe...,"{""durability"": 5, ""ease of use"": 5, ""pleasant ..."
2,Kindle Paperwhite,"Enjoy reading your favorite books anytime, any...",The Kindle Paperwhite is a must-have for book ...,"{""durability"": 5, ""ease of use"": 5, ""pleasant ..."
3,Canon EOS R5,Capture life's moments in stunning detail with...,The Canon EOS R5 is a game-changer! The image ...,"{""durability"": 5, ""ease of use"": 5, ""pleasant ..."
4,Nike Air Zoom Pegasus 38,Experience unparalleled comfort and performanc...,The Nike Air Zoom Pegasus 38 is a game-changer...,"{""durability"": 5, ""ease of use"": 4, ""pleasant ..."


In [5]:
df_reviews.shape

(109, 4)

In [6]:
# Sample one random product name to get an idea of what to use as input
df_reviews["Product Name"].sample(1).iloc[0]

'Game of Thrones: A Song of Ice and Fire'

# Criteria generation

### A) LangChain - OpenAI

In [7]:
# !pip --quiet install langchain langchain-community langchain-openai chromadb

In [8]:

def get_text_chunks(text, chunk_size=500, chunk_overlap=100):
    """Split raw text into overlapping chunks wrapped as LangChain documents.

    Args:
        text: The string to split.
        chunk_size: Chunk size passed to ``RecursiveCharacterTextSplitter``
            (defaults to 500).
        chunk_overlap: Overlap between consecutive chunks passed to the
            splitter (defaults to 100).

    Returns:
        A list of ``Document`` objects, one per chunk, in split order.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return [Document(page_content=chunk) for chunk in text_splitter.split_text(text)]


def embed_texts_openai(texts, openai_api_key):
    """Embed documents with OpenAI and index them in a new Chroma store.

    Args:
        texts: Documents to embed (passed to ``Chroma.from_documents``).
        openai_api_key: API key handed to ``OpenAIEmbeddings``.

    Returns:
        The Chroma vector store populated with ``texts``.
    """
    print(f"Embedding {len(texts)} texts...", end=' ')
    # Build the embedder inline and let Chroma embed + index the documents.
    vector_store = Chroma.from_documents(
        texts,
        OpenAIEmbeddings(openai_api_key=openai_api_key),
    )
    print("✅")
    return vector_store


def run_qa_openai(doc_search, prompt, openai_api_key):
    """Answer ``prompt`` with an OpenAI chat model over a vector store.

    Args:
        doc_search: Vector store whose retriever supplies the context
            (at most 1 document is retrieved).
        prompt: Query string passed to the retrieval QA chain.
        openai_api_key: API key handed to ``ChatOpenAI``.

    Returns:
        The value stored under the ``"result"`` key of the chain output.
    """
    print("Running QA...", end=' ')

    # Retrieval QA
    # - chain_type="stuff": the model 'stuffs' all our texts into a single prompt (sufficiently small)
    # - model: GPT-3.5-Turbo, selected through the `model` keyword. The `name`
    #   keyword only labels the runnable and does not choose the model.
    qa = RetrievalQA.from_chain_type(
        llm=ChatOpenAI(model="gpt-3.5-turbo", api_key=openai_api_key),
        chain_type="stuff",
        retriever=doc_search.as_retriever(search_kwargs={"k": 1}),  # 1 doc to return max
    )

    answer = qa.invoke(prompt)
    print("✅")
    return answer["result"]

In [9]:
# Read the OpenAI key from the environment (None if the variable is not set).
OPEN_API_KEY = os.environ.get('OPENAI_API_KEY')
# Product title used as the example input in the cells below.
PRODUCT_INPUT = "Nike Men's Revolution 5 Running Shoes"


# Wrap the product title as a list of documents for the vector store.
chunks = get_text_chunks(PRODUCT_INPUT)
chunks

[Document(page_content="Nike Men's Revolution 5 Running Shoes")]

In [10]:
# Index the product-title document in Chroma using OpenAI embeddings
# (no persist directory is passed for this store).
doc_search = embed_texts_openai(chunks, OPEN_API_KEY)
doc_search

Embedding 1 texts... ✅


<langchain_community.vectorstores.chroma.Chroma at 0x7f67badd74f0>

In [11]:
# The question sent to the QA chain. The product title is not part of this
# string; it reaches the model through the document retrieved from `doc_search`.
prompt = """
Given this product title, please select between 3 and 6 criteria to rate in order to compose a product review.
"""
answer = run_qa_openai(doc_search, prompt, OPEN_API_KEY)

print(f"Product: {PRODUCT_INPUT}\n")
print(f"Some rating criteria:\n{answer}")

Running QA... ✅
Product: Nike Men's Revolution 5 Running Shoes

Some rating criteria:
1. Comfort
2. Durability
3. Fit
4. Cushioning
5. Design
6. Breathability


### B) LangChain - GPT4All: products and reviews

### Model

Pick a model from the "Model Explorer" section on the [GPT4All page](https://gpt4all.io/index.html).

In [12]:
#MODEL_NAME = 'orca-2-7b.Q4_0.gguf'  # Change here
#MODEL_NAME = 'mistral-7b-openorca.gguf2.Q4_0.gguf'
MODEL_NAME = 'orca-mini-3b-gguf2-q4_0.gguf'
MODEL_PATH = '../models/' + MODEL_NAME

# -C - option to continue transfer automatically (so reuse file if already downloaded)
!curl -C - -o {MODEL_PATH} https://gpt4all.io/models/gguf/{MODEL_NAME}
!ls -lh ../models

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 1888M  100 1888M    0     0  16.9M      0  0:01:51  0:01:51 --:--:-- 12.5MM    0     0  17.4M      0  0:01:48  0:00:20  0:01:28 16.7M
total 5.5G
-rw-r--r-- 1 shahab shahab 3.6G Jun 14 02:07 orca-2-7b.Q4_0.gguf
-rw-r--r-- 1 shahab shahab 1.9G Jun 14 14:58 orca-mini-3b-gguf2-q4_0.gguf


In [13]:
# Callback that supports token-wise streaming but will only return the final output
# rather than intermediary steps
callbacks = [FinalStreamingStdOutCallbackHandler()]

# Load the local GPT4All model file referenced by MODEL_PATH.
# verbose=True is required for the callback manager
llm = GPT4All(model=MODEL_PATH, callbacks=callbacks, verbose=True)

Failed to load libllamamodel-mainline-cuda.so: dlopen: libcudart.so.12: cannot open shared object file: No such file or directory
Failed to load libllamamodel-mainline-cuda-avxonly.so: dlopen: libcudart.so.12: cannot open shared object file: No such file or directory


### LLM chain

In [None]:
# Instruction block reused by both the LLM chain and the retrieval QA chain.
original_prompt = """
For this product, please produce exactly 6 criteria that could be rated by a user for a review. No more details needed. Each criterion should be numbered and described in detail. Here’s an example format:

1. Comfort - How comfortable are these shoes?
2. Performance - Are these shoes good at providing performance?
3. Style - Do these shoes have a stylish design?
4. Durability - Can these shoes withstand wear and tear?
5. Price - How much are these shoes compared to similar products in the market?
6. Size - Are these shoes available in different sizes?
"""

# Prefix the instructions with the product line; `{product_type}` is the
# placeholder that PromptTemplate fills in when the chain runs.
template = "Product: '{product_type}'\n" + original_prompt
prompt = PromptTemplate(input_variables=["product_type"], template=template)

In [None]:
# Create the LLM chain: the `prompt` template combined with the local `llm`.
llm_chain = LLMChain(prompt=prompt, llm=llm, return_final_only=True)
llm_chain

LLMChain(prompt=PromptTemplate(input_variables=['product_type'], template="Product: '{product_type}'\n\nFor this product, please produce between 3 and 6 criteria, with at least 3, that could be rated by a user for a review. do not rate it. No more details needed just only the criteria. Each criterion should be numbered and described in detail.\n\n"), llm=GPT4All(verbose=True, callbacks=[<langchain.callbacks.streaming_stdout_final_only.FinalStreamingStdOutCallbackHandler object at 0x7f67badd7fd0>], model='../models/orca-mini-3b-gguf2-q4_0.gguf', client=<gpt4all.gpt4all.GPT4All object at 0x7f67badd4730>))

In [None]:
%%time
# Fill the template's `product_type` with the example product and run the chain.
res = llm_chain.run(product_type=PRODUCT_INPUT)
# Note that the result needs to be printed explicitly to be shown properly since
# it contains line returns
print(res)

1. Comfort - How comfortable are these shoes? (5-10)
2. Performance - Are these shoes good at providing performance? (8-10)
3. Style - Do these shoes have a stylish design? (9-10)
4. Durability - Can these shoes withstand wear and tear? (7-10)
5. Price - How much are these shoes compared to similar products in the market? (6-8)
6. Size - Are these shoes available in different sizes? (1-2)
CPU times: user 4min 5s, sys: 7.29 s, total: 4min 12s
Wall time: 1min 4s


### Retrieval QA chain
#### Prompt only


In [25]:
#!pip install sentence-transformers --quiet

In [26]:
EMBEDDING_MODEL = 'sentence-transformers/all-MiniLM-L6-v2'
PERSIST_DIRECTORY = '../db/chroma_3/'


def embed_texts_hg(texts):
    """Embed documents with a HuggingFace model and store them in Chroma.

    The embedding model name comes from ``EMBEDDING_MODEL`` and the store is
    created with ``PERSIST_DIRECTORY`` as its persist directory.

    Args:
        texts: Documents to embed (passed to ``Chroma.from_documents``).

    Returns:
        The Chroma vector store built from ``texts``.
    """
    print(f"Embedding {len(texts)} texts...", end=' ')

    hf_embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)
    # NOTE(review): with a persist directory, re-running this presumably adds
    # the same documents to the stored collection again — confirm.
    store = Chroma.from_documents(
        texts,
        hf_embeddings,
        persist_directory=PERSIST_DIRECTORY,
    )
    print("✅")
    return store

In [27]:
chunks


[Document(page_content="Nike Men's Revolution 5 Running Shoes")]

In [28]:
# Rebinds `doc_search` (previously the OpenAI-embedded store) to a store
# built with the HuggingFace embedder.
doc_search = embed_texts_hg(chunks)
doc_search

Embedding 1 texts... 

  warn_deprecated(
  from tqdm.autonotebook import tqdm, trange


✅


<langchain_community.vectorstores.chroma.Chroma at 0x7f66238debf0>

In [29]:
%%time

def run_qa_hg(doc_search, prompt):
    """Answer ``prompt`` with the notebook-level ``llm`` over a vector store.

    Args:
        doc_search: Vector store; its retriever is configured with k=1.
        prompt: Query string passed to the retrieval QA chain.

    Returns:
        The value stored under the ``"result"`` key of the chain output.
    """
    print("Running QA...", end=' ')

    retriever = doc_search.as_retriever(search_kwargs={"k": 1})
    qa_chain = RetrievalQA.from_chain_type(
        llm=llm,
        # "stuff": the model 'stuffs' all our texts into a single prompt (sufficiently small)
        chain_type="stuff",
        retriever=retriever,
        return_source_documents=False,
        verbose=False,
    )

    response = qa_chain.invoke(prompt)
    print("✅")
    return response["result"]

# Pass the raw instruction string (`original_prompt`), not the `prompt`
# PromptTemplate object built for the LLM chain.
answer = run_qa_hg(doc_search, original_prompt)
print(answer)

Running QA... ✅

1. Comfort - The shoes provide excellent comfort due to their lightweight design and air cushion sole.
2. Performance - The shoes are designed for long-distance running, providing adequate support and cushioning for the feet.
3. Style - The shoes have a sleek and modern design that is both stylish and functional.
4. Durability - The shoes are made with high-quality materials that make them durable enough to withstand regular use and wear.
5. Price - The shoes are competitively priced, making them an affordable option for those looking for a reliable running shoe.
CPU times: user 39min 25s, sys: 1min 40s, total: 41min 6s
Wall time: 18min 33s


In [40]:
# Sample paragraph used to try out the splitter on a longer text.
text = " In the heart of a bustling city, amidst the towering skyscrapers and the constant hum of traffic, there lies a hidden gem—a small, serene park known to few but cherished by those who do. The park, with its lush greenery and tranquil ponds, offers a stark contrast to the surrounding urban jungle. Tall trees provide shade and a sense of calm, while birds chirp melodiously, creating a soothing symphony that drowns out the city's noise. Winding paths lead to quaint benches and secluded spots, perfect for contemplation or a quiet read. It is a place where time seems to slow down, allowing visitors to escape the rush and reconnect with nature, even if just for a moment."
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100, )
page = text_splitter.split_text(text)
# Print every chunk rather than indexing page[0]/page[1], which would raise
# IndexError whenever the text fits in a single chunk.
for chunk in page:
    print(chunk)
docs = [Document(page_content=x) for x in page]
# Chunk count goes last so the notebook actually displays it.
len(page)

In the heart of a bustling city, amidst the towering skyscrapers and the constant hum of traffic, there lies a hidden gem—a small, serene park known to few but cherished by those who do. The park, with its lush greenery and tranquil ponds, offers a stark contrast to the surrounding urban jungle. Tall trees provide shade and a sense of calm, while birds chirp melodiously, creating a soothing symphony that drowns out the city's noise. Winding paths lead to quaint benches and secluded spots,
symphony that drowns out the city's noise. Winding paths lead to quaint benches and secluded spots, perfect for contemplation or a quiet read. It is a place where time seems to slow down, allowing visitors to escape the rush and reconnect with nature, even if just for a moment.


2

In [51]:

# Re-reads the OpenAI key from the environment (same lookup as earlier in the notebook).
OPEN_API_KEY = os.environ.get('OPENAI_API_KEY')
# Embed the park-text chunks (`docs`) with OpenAI and index them in Chroma;
# this rebinds `doc_search` once more.
embedder = OpenAIEmbeddings(openai_api_key=OPEN_API_KEY)
doc_search = Chroma.from_documents(docs, embedder)
doc_search

<langchain_community.vectorstores.chroma.Chroma at 0x7f7bb4d38d90>

In [58]:
# prompt = """
# Given this product title, please select between 3 and 6 criteria to rate in order to compose a product review.
# """
# NOTE(review): this question is unrelated to the indexed park text; presumably
# a check of how the chain answers when the retrieved document is irrelevant — confirm.
prompt = 'which city is the capital of Iran'
# Same retrieval QA setup as run_qa_openai, built inline.
qa = RetrievalQA.from_chain_type(
        llm=ChatOpenAI(model_name="gpt-3.5-turbo", openai_api_key=OPEN_API_KEY),
        chain_type="stuff",
        retriever=doc_search.as_retriever(search_kwargs={"k": 1})  # 1 doc to return max
    )

# Display the full chain output dict (query and result) rather than only the result.
answer = qa.invoke(prompt)
answer

{'query': 'which city is the capital of Iran',
 'result': 'The capital of Iran is Tehran.'}