In [10]:
# Had to pip install jupyter first
#!pip install python-dotenv



In [1]:
%load_ext autoreload
%autoreload 2


In [39]:

import os
import sys
from dotenv import load_dotenv
load_dotenv()

import pandas as pd


from langchain.chains import RetrievalQA
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_text_splitters import Language
from langchain_openai import OpenAIEmbeddings, ChatOpenAI, OpenAI
from langchain_community.vectorstores import Chroma
from langchain.schema.document import Document

from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.callbacks.streaming_stdout_final_only import FinalStreamingStdOutCallbackHandler
from langchain.chains import LLMChain
from langchain.llms import GPT4All
from langchain.prompts import PromptTemplate


#from langchain.embeddings import HuggingFaceEmbeddings
from langchain_huggingface import HuggingFaceEmbeddings


from langchain_community.document_loaders import TextLoader

# Data

In [3]:
df_reviews = pd.read_csv("dummy_data_new.csv", index_col=0)

In [4]:
df_reviews.head()

Unnamed: 0,Product Name,Product Description,Review Text,Rating
0,iPhone 15,The Apple iPhone 15 redefines smartphone innov...,The iPhone 15 is a masterpiece! The sleek desi...,"{""durability"": 5, ""ease of use"": 5, ""pleasant ..."
1,MacBook Pro 2023,Experience the ultimate in computing power wit...,The MacBook Pro 2023 is a game-changer! The pe...,"{""durability"": 5, ""ease of use"": 5, ""pleasant ..."
2,Kindle Paperwhite,"Enjoy reading your favorite books anytime, any...",The Kindle Paperwhite is a must-have for book ...,"{""durability"": 5, ""ease of use"": 5, ""pleasant ..."
3,Canon EOS R5,Capture life's moments in stunning detail with...,The Canon EOS R5 is a game-changer! The image ...,"{""durability"": 5, ""ease of use"": 5, ""pleasant ..."
4,Nike Air Zoom Pegasus 38,Experience unparalleled comfort and performanc...,The Nike Air Zoom Pegasus 38 is a game-changer...,"{""durability"": 5, ""ease of use"": 4, ""pleasant ..."


In [5]:
df_reviews.shape

(109, 4)

In [6]:
# Check out some descriptions to use as input
df_reviews["Product Name"].sample(1).iloc[0]

'Cuisinart ICE-21 1.5 Quart Frozen Yogurt-Ice Cream Maker'

# Criteria generation

### A) Langchain - OpenAI

In [7]:
# !pip --quiet install langchain langchain-community langchain-openai chromadb

In [8]:

def get_text_chunks(text):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100, )
    docs = [Document(page_content=x) for x in text_splitter.split_text(text)]
    return docs


def embed_texts_openai(texts, openai_api_key):
    print(f"Embedding {len(texts)} texts...", end=' ')
    # Instantiate an embedder
    embedder = OpenAIEmbeddings(openai_api_key=openai_api_key)
    # Use the embedder to populate a Chroma vector store with our texts.
    doc_search = Chroma.from_documents(texts, embedder)
    print("✅")
    return doc_search


def run_qa_openai(doc_search, prompt, openai_api_key):
    print(f"Running QA...", end=' ')

    # Retrieval QA
    # - chain_type="stuff": the model 'stuffs' all our texts into a single prompt (sufficiently small)
    # - model: latest GPT-3.5-Turbo model.
    qa = RetrievalQA.from_chain_type(
        llm=ChatOpenAI(name="gpt-3.5-turbo", api_key=openai_api_key),
        chain_type="stuff",
        retriever=doc_search.as_retriever(search_kwargs={"k": 1})  # 1 doc to return max
    )

    answer = qa.invoke(prompt)
    print("✅")
    return answer["result"]

In [32]:
PRODUCT_INPUT = "Nike Men's Revolution 5 Running Shoes"


In [9]:
OPEN_API_KEY = os.environ.get('OPENAI_API_KEY')


chunks = get_text_chunks(PRODUCT_INPUT)
chunks

[Document(page_content="Nike Men's Revolution 5 Running Shoes")]

In [10]:
doc_search = embed_texts_openai(chunks, OPEN_API_KEY)
doc_search

Embedding 1 texts... ✅


<langchain_community.vectorstores.chroma.Chroma at 0x7f071c7cbe50>

In [11]:
prompt = """
Given this product title, please select between 3 and 6 criteria to rate in order to compose a product review.
"""
answer = run_qa_openai(doc_search, prompt, OPEN_API_KEY)

print(f"Product: {PRODUCT_INPUT}\n")
print(f"Some rating criteria:\n{answer}")

Running QA... ✅
Product: Nike Men's Revolution 5 Running Shoes

Some rating criteria:
1. Comfort: How comfortable are the Nike Men's Revolution 5 Running Shoes to wear for long periods of time?
2. Durability: How well do the shoes hold up over time with regular use?
3. Fit: How true to size are the shoes and how well do they fit?
4. Performance: How do the shoes perform during running or other physical activities?
5. Style: What is your opinion on the design and style of the Nike Men's Revolution 5 Running Shoes?
6. Value for Money: Are the shoes worth the price based on their quality and performance?


### B) Langchain - GPT4 all products and reviews

### Model

Pick a model from the "Model Explorer" section on the [GPT4All page](https://gpt4all.io/index.html).

In [12]:
#MODEL_NAME = 'orca-2-7b.Q4_0.gguf'  # Change here
#MODEL_NAME = 'mistral-7b-openorca.gguf2.Q4_0.gguf'
MODEL_NAME = 'orca-mini-3b-gguf2-q4_0.gguf'
MODEL_PATH = '../models/' + MODEL_NAME

# -C - option to continue transfer automatically (so reuse file if already downloaded)
!curl -C - -o {MODEL_PATH} https://gpt4all.io/models/gguf/{MODEL_NAME}
!ls -lh ../models

** Resuming transfer from byte position 1979946720
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 27242    0 24606    0     0  60404      0 --:--:-- --:--:-- --:--:-- 60308  0 27242    0     0  66694      0 --:--:-- --:--:-- --:--:-- 66606
total 5.5G
-rw-r--r-- 1 shahab shahab 3.6G Jun 14 02:07 orca-2-7b.Q4_0.gguf
-rw-r--r-- 1 shahab shahab 1.9G Jun 14 14:58 orca-mini-3b-gguf2-q4_0.gguf


In [33]:
# Callback that supports token-wise streaming but will only return the final output
# rather than intermediary steps
callbacks = [FinalStreamingStdOutCallbackHandler()]

# verbose=True is required for the callback manager
llm = GPT4All(model=MODEL_PATH, callbacks=callbacks, verbose=True)

### LLM chain

In [34]:
original_prompt = """
For this product, please produce exactly 6 criteria that could be rated by a user for a review. No more details needed. Each criterion should be numbered and described in detail.

"""

template = f"""Product: '{{product_type}}'
{original_prompt}"""
prompt = PromptTemplate(template=template, input_variables=["product_type"])

In [36]:
prompt

PromptTemplate(input_variables=['product_type'], template="Product: '{product_type}'\n\nFor this product, please produce exactly 6 criteria that could be rated by a user for a review. No more details needed. Each criterion should be numbered and described in detail.\n\n")

In [57]:
# Create the LLM chain
llm_chain = LLMChain(prompt=prompt, llm=llm, return_final_only=True)
llm_chain

LLMChain(prompt=PromptTemplate(input_variables=['product_type'], template="Product: '{product_type}'\n\nFor this product, please produce exactly 6 criteria that could be rated by a user for a review. No more details needed. Each criterion should be numbered and described in detail.\n\n"), llm=GPT4All(verbose=True, callbacks=[<langchain.callbacks.streaming_stdout_final_only.FinalStreamingStdOutCallbackHandler object at 0x7f05a80b1750>], model='../models/orca-mini-3b-gguf2-q4_0.gguf', client=<gpt4all.gpt4all.GPT4All object at 0x7f05a80b19f0>))

In [20]:
%%time
res = llm_chain.run(product_type=PRODUCT_INPUT)
# Note that the result needs to be printed explicitly to be shown properly since
# it contains line returns
print(res)

1. Comfort - How well the shoes fit and provide comfort during long runs or workouts.
2. Performance - How well the shoes perform on the feet, such as cushioning, support, and stability.
3. Style - The appearance of the shoes, including color, design, and materials used.
4. Durability - How long the shoes can withstand wear and tear, such as abrasion resistance and waterproofing.
5. Price - The cost of the shoes in relation to their quality and features.
6. Environmental Impact - The shoes' impact on the environment, including sustainability practices and eco-friendly materials used.
CPU times: user 2min 34s, sys: 939 ms, total: 2min 35s
Wall time: 40.2 s


### Retrieval QA chain
#### Prompt only


In [25]:
#!pip install sentence-transformers --quiet

In [49]:
EMBEDDING_MODEL = 'sentence-transformers/all-MiniLM-L6-v2'
PERSIST_DIRECTORY = '../db/chroma_3/'


def embed_texts_hg(texts):
    print(f"Embedding {len(texts)} texts...", end=' ')

    embedder = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)
    vector_db = Chroma.from_documents(
        documents=texts,
        embedding=embedder,
        persist_directory=PERSIST_DIRECTORY

    )
    print("✅")
    return vector_db

In [50]:
chunks


[Document(page_content="Nike Men's Revolution 5 Running Shoes")]

In [51]:
#!pip install -U langchain-huggingface

In [53]:
doc_search = embed_texts_hg(chunks)
doc_search

Embedding 1 texts... ✅


<langchain_community.vectorstores.chroma.Chroma at 0x7f05a806ba60>

In [54]:
%%time

def run_qa_hg(doc_search, prompt):
    print(f"Running QA...", end=' ')

    qa = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",  # the model 'stuffs' all our texts into a single prompt (sufficiently small)
        retriever=doc_search.as_retriever(search_kwargs={"k": 1}),
        return_source_documents=False,
        verbose=False,
    )

    answer = qa.invoke(prompt)
    print("✅")
    return answer["result"]

answer = run_qa_hg(doc_search, original_prompt)
print(answer)

Running QA... ✅

1. Ease of Use - The shredder is easy to use and can handle up to six sheets of paper at once, making it convenient for those who need to shred frequently.
2. Speed - The shredder has a fast shredding speed, which makes it ideal for quickly disposing of sensitive documents.
3. Quality - The Amazon Basics 6-Sheet Cross-Cut Paper and Credit Card Shredder is made with high-quality materials and is designed to last for many years.
4. Price - At $29.99, the shredder offers excellent value for money and provides a cost-effective solution for those who need to shred frequently.
5. Size - The small bin capacity of 6 sheets makes it convenient for those who need to shred smaller amounts of paper.
6. Maintenance - The shredder is easy to maintain, with removable parts that can be easily cleaned and sanitized after use.
CPU times: user 8min 16s, sys: 9.89 s, total: 8min 26s
Wall time: 2min 11s


## More context

In [25]:

df_reviews.head()

Unnamed: 0,Product Name,Product Description,Review Text,Rating
0,iPhone 15,The Apple iPhone 15 redefines smartphone innov...,The iPhone 15 is a masterpiece! The sleek desi...,"{""durability"": 5, ""ease of use"": 5, ""pleasant ..."
1,MacBook Pro 2023,Experience the ultimate in computing power wit...,The MacBook Pro 2023 is a game-changer! The pe...,"{""durability"": 5, ""ease of use"": 5, ""pleasant ..."
2,Kindle Paperwhite,"Enjoy reading your favorite books anytime, any...",The Kindle Paperwhite is a must-have for book ...,"{""durability"": 5, ""ease of use"": 5, ""pleasant ..."
3,Canon EOS R5,Capture life's moments in stunning detail with...,The Canon EOS R5 is a game-changer! The image ...,"{""durability"": 5, ""ease of use"": 5, ""pleasant ..."
4,Nike Air Zoom Pegasus 38,Experience unparalleled comfort and performanc...,The Nike Air Zoom Pegasus 38 is a game-changer...,"{""durability"": 5, ""ease of use"": 4, ""pleasant ..."


In [26]:
CONTEXT_REVIEWS_FILE = 'context_reviews.txt'

# records = df_reviews.drop(columns='Rating').to_json(orient='records')

# List of dictionaries
records = df_reviews.drop(columns='Rating').to_dict(orient='records')
with open(CONTEXT_REVIEWS_FILE, 'w') as f:
    f.writelines(map(lambda d: str(d) + '\n', records))

!head -3 {CONTEXT_REVIEWS_FILE}

{'Product Name': 'iPhone 15', 'Product Description': 'The Apple iPhone 15 redefines smartphone innovation with its cutting-edge features and design.', 'Review Text': 'The iPhone 15 is a masterpiece! The sleek design, powerful performance, and advanced features make it the best smartphone on the market. Highly recommend!'}
{'Product Name': 'MacBook Pro 2023', 'Product Description': 'Experience the ultimate in computing power with the MacBook Pro 2023, featuring blazing-fast performance and stunning Retina display.', 'Review Text': 'The MacBook Pro 2023 is a game-changer! The performance is lightning-fast, the Retina display is breathtaking, and the build quality is top-notch. Absolutely love it!'}
{'Product Name': 'Kindle Paperwhite', 'Product Description': 'Enjoy reading your favorite books anytime, anywhere with the Kindle Paperwhite e-reader.', 'Review Text': 'The Kindle Paperwhite is a must-have for book lovers! The adjustable lighting, crisp display, and long battery life make it p

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [28]:
# Loading reviews rather than just one product name




def load_file(file_path) -> list[Document]:
    assert os.path.exists(file_path), f"File not found: {file_path}"

    print(f"Loading {file_path}...", end=' ')
    documents = TextLoader(file_path).load()

    # A) Recursive splitter
    splitter = RecursiveCharacterTextSplitter(
        separators=['\n'],
        chunk_size=500, chunk_overlap=100
    )
    # B) Text splitter
    # splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)

    texts = splitter.split_documents(documents)
    print("✅")
    return texts

review_chunks = load_file(CONTEXT_REVIEWS_FILE)
type(review_chunks[0]), len(review_chunks)

Loading context_reviews.txt... ✅


(langchain_core.documents.base.Document, 106)

In [56]:
review_chunks[0]

Document(page_content="{'Product Name': 'iPhone 15', 'Product Description': 'The Apple iPhone 15 redefines smartphone innovation with its cutting-edge features and design.', 'Review Text': 'The iPhone 15 is a masterpiece! The sleek design, powerful performance, and advanced features make it the best smartphone on the market. Highly recommend!'}", metadata={'source': 'context_reviews.txt'})

In [29]:
doc_search_2 = embed_texts_hg(review_chunks)

Embedding 106 texts... 



✅


In [30]:

template_filled = f"""Product: '{{product_text}}'
{original_prompt}""".format(product_text=PRODUCT_INPUT)
template_filled

"Product: 'Nike Men's Revolution 5 Running Shoes'\n\nFor this product, please produce exactly 6 criteria that could be rated by a user for a review. No more details needed. Each criterion should be numbered and described in detail.\n\n"

In [31]:
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=doc_search_2.as_retriever(search_kwargs={"k": 1}),
    return_source_documents=False,
    verbose=False,
)
res = qa_chain.invoke(template_filled)
print(res['result'])



1. Comfort - The Nike Men's Revolution 5 Running Shoes are designed to provide comfort and support, with a breathable upper and cushioning that absorbs impact.
2. Performance - These shoes are optimized for speed and agility, featuring a responsive sole and lightweight design.
3. Style - The sleek black and white color scheme gives these running shoes a modern look, while the reflective details add visibility in low-light conditions.
4. Durability - The Revolution 5 is made with high-quality materials that are durable and long-lasting, making them ideal for athletes who demand reliable performance.
5. Fit - These shoes offer a secure fit, with a locked-down sole and responsive cushioning that supports the foot during every stride.
6. Price - The Nike Men's Revolution 5 Running Shoes are priced competitively, making them an affordable option for runners of all levels.
