## Prepare and Inspect Your Dataset

In [3]:
import pathlib
import polars as pl

def prepare_car_reviews_data(
    data_path: pathlib.Path, vehicle_years: "list[int] | None" = None
):
    """Prepare the car reviews dataset for ChromaDB.

    Lazily scans the CSV data at ``data_path``, derives ``Vehicle_Year`` and
    ``Vehicle_Model`` from the first two space-separated tokens of
    ``Vehicle_Title``, keeps only the requested years, and sorts the result by
    model and rating before materialising it.

    Args:
        data_path: Location of the CSV data; passed unchanged to
            ``pl.scan_csv``.
        vehicle_years: Vehicle years to keep. ``None`` (the default) selects
            ``[2017]``.

    Returns:
        A dict with three parallel lists: ``"ids"`` (``"review0"``,
        ``"review1"``, ... in sorted row order), ``"documents"`` (the
        ``Review`` texts) and ``"metadatas"`` (one dict per row with the
        remaining selected columns).
    """
    # Resolve the default inside the body: a list literal in the signature is
    # created once and shared by every call.
    if vehicle_years is None:
        vehicle_years = [2017]

    # Define the schema to ensure proper data types are enforced
    dtypes = {
        # Unnamed first column; presumably a row index from the export — confirm.
        "": pl.Int64,
        "Review_Date": pl.Utf8,
        "Author_Name": pl.Utf8,
        "Vehicle_Title": pl.Utf8,
        "Review_Title": pl.Utf8,
        "Review": pl.Utf8,
        "Rating": pl.Float64,
    }

    # Scan the car reviews dataset(s) lazily; nothing is read until .collect()
    car_reviews = pl.scan_csv(data_path, dtypes=dtypes)

    # Both derived columns come from the same split of the title, so build the
    # expression once: token 0 is used as the year, token 1 as the model.
    title_tokens = pl.col("Vehicle_Title").str.split(by=" ")

    car_review_db_data = (
        car_reviews.with_columns(
            [
                title_tokens.list.get(0).cast(pl.Int64).alias("Vehicle_Year"),
                title_tokens.list.get(1).alias("Vehicle_Model"),
            ]
        )
        # Filter on selected years
        .filter(pl.col("Vehicle_Year").is_in(vehicle_years))
        .select(["Review_Title", "Review", "Rating", "Vehicle_Year", "Vehicle_Model"])
        .sort(["Vehicle_Model", "Rating"])
        .collect()
    )

    # Create ids, documents, and metadatas data in the format chromadb expects
    ids = [f"review{i}" for i in range(car_review_db_data.height)]
    documents = car_review_db_data["Review"].to_list()
    # Everything except the review text itself travels as metadata
    metadatas = car_review_db_data.drop("Review").to_dicts()

    return {"ids": ids, "documents": documents, "metadatas": metadatas}

In [4]:
# Glob pattern: everything under data/archive/ is scanned as CSV input
DATA_PATH = "data/archive/*"

# Build the ids / documents / metadatas payload with the default year filter
chroma_car_reviews_dict = prepare_car_reviews_data(data_path=DATA_PATH)

In [7]:
# Sanity check: the payload should expose exactly the three lists Chroma needs
chroma_car_reviews_dict.keys()

dict_keys(['ids', 'documents', 'metadatas'])

In [8]:
# Number of reviews that survived the year filter (one id per review)
len(chroma_car_reviews_dict["ids"])

5870

## Create a Collection and Add Reviews

In [9]:
import pathlib
import chromadb
from chromadb.utils import embedding_functions
from more_itertools import batched

def build_chroma_collection(
    chroma_path: pathlib.Path,
    collection_name: str,
    embedding_func_name: str,
    ids: list[str],
    documents: list[str],
    metadatas: list[dict],
    distance_func_name: str = "cosine",
    batch_size: int = 166,
):
    """Create a ChromaDB collection and add every document to it in batches.

    Args:
        chroma_path: Directory handed to ``chromadb.PersistentClient``.
        collection_name: Name of the collection to create.
        embedding_func_name: Model name passed to
            ``SentenceTransformerEmbeddingFunction``.
        ids: One id per document.
        documents: The texts to embed and store.
        metadatas: One metadata dict per document.
        distance_func_name: Value stored under the ``"hnsw:space"`` collection
            metadata key.
        batch_size: Maximum number of documents sent per ``collection.add``
            call. The default of 166 is the value previously hard-coded here;
            presumably it keeps each call under a Chroma limit — confirm
            before raising it.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1, or if ``ids``,
            ``documents`` and ``metadatas`` do not all have the same length.
    """
    # Validate before touching the persistent store so bad input cannot leave
    # a partially filled collection behind.
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")
    if not (len(ids) == len(documents) == len(metadatas)):
        raise ValueError(
            "ids, documents and metadatas must all have the same length "
            f"(got {len(ids)}, {len(documents)}, {len(metadatas)})"
        )

    chroma_client = chromadb.PersistentClient(chroma_path)

    embedding_func = embedding_functions.SentenceTransformerEmbeddingFunction(
        model_name=embedding_func_name
    )

    # NOTE(review): create_collection is expected to fail when the collection
    # already exists under chroma_path, so re-running against the same
    # directory likely needs the old collection removed first — confirm.
    collection = chroma_client.create_collection(
        name=collection_name,
        embedding_function=embedding_func,
        metadata={"hnsw:space": distance_func_name},
    )

    # Step through the data in windows of batch_size. The slice end is
    # exclusive, so start + batch_size covers the whole window, and slicing
    # past the end of the lists simply yields the shorter final batch.
    for start_idx in range(0, len(documents), batch_size):
        end_idx = start_idx + batch_size

        collection.add(
            ids=ids[start_idx:end_idx],
            documents=documents[start_idx:end_idx],
            metadatas=metadatas[start_idx:end_idx],
        )

In [10]:
import chromadb
from chromadb.utils import embedding_functions

# Persistent Chroma store: directory on disk and the collection inside it
COLLECTION_NAME = "car_reviews"
CHROMA_PATH = "chroma_data"

# Model name handed to the sentence-transformers embedding function
EMBEDDING_FUNC_NAME = "multi-qa-MiniLM-L6-cos-v1"

# Glob pattern for the raw review CSV input
DATA_PATH = "data/archive/*"

In [11]:
# Rebuild the payload (same call as in the inspection section) so this section runs on its own
chroma_car_reviews_dict = prepare_car_reviews_data(DATA_PATH)

In [12]:
# Create the collection on disk and load every prepared review into it.
# The distance function is left at the function's default.
build_chroma_collection(
    chroma_path=CHROMA_PATH,
    collection_name=COLLECTION_NAME,
    embedding_func_name=EMBEDDING_FUNC_NAME,
    ids=chroma_car_reviews_dict["ids"],
    documents=chroma_car_reviews_dict["documents"],
    metadatas=chroma_car_reviews_dict["metadatas"],
)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/11.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/383 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [13]:
# Embedding function built from the same model name used when the collection was created
embedding_func = embedding_functions.SentenceTransformerEmbeddingFunction(
    model_name=EMBEDDING_FUNC_NAME
)

# Open the on-disk Chroma store
client = chromadb.PersistentClient(CHROMA_PATH)

In [14]:
# Look up the existing collection by name, attaching the embedding function for query texts
collection = client.get_collection(COLLECTION_NAME, embedding_function=embedding_func)

In [15]:
# Free-text query: ask for the 5 closest reviews and request the review
# text, the distances and the stored metadata in the result.
great_reviews = collection.query(
    query_texts=["Find me some positive reviews that discuss the car's performance"],
    n_results=5,
    include=["documents", "distances", "metadatas"],
)

In [16]:
# Show the matched review texts (a list of lists: one inner list per query text)
great_reviews["documents"]

[[' Great all around car with great balance of performance and comfort. Terrific technology too.',
  " Have had the car for less than a month but, I am impressed with it's acceleration & responsivness. Car holds the road very well. Electronics are nice but not spectacular,(everything works fine but I am not crazy about the volume control for the radio), other than that, I am very happy with it's performance & looks.",
  ' Excellent car',
  " Im not into writing long reviews. my run down and i got one of the first new Imprezas. Super quiet ride, all wheel drive, very safe and great sounding stereo. If you don't buy this car in that price range your an idiot.",
  " I don't normally review anything, but I feel compelled to write about this car.  With a daily round trip commute of 108 miles, I was tired of the constant trips to gas stations and the expense.  After just a month with this car, I am blown away.  It is the most economical, practical, comfortable commuter vehicle that you can b

## Connect to an LLM Service

In [18]:
from dotenv import load_dotenv, find_dotenv, dotenv_values
from openai import OpenAI


def hide_api_key(s):
    """Mask a secret string, keeping only its first and last three characters.

    Strings shorter than six characters are not masked; a fixed explanatory
    message is returned in their place.
    """
    min_length = 6
    if len(s) >= min_length:
        head, tail = s[:3], s[-3:]
        return f"{head}...{tail}"
    return "Input string must be at least 6 characters long"

# Locate the .env file (raising if there is none) and load it, letting its
# values override anything already present in the environment.
dotenv_path = find_dotenv(filename=".env", raise_error_if_not_found=True)
load_dotenv(dotenv_path, override=True)

# Echo what was read; only variables whose name contains OPENAI_API_KEY are masked.
env_vars = dotenv_values(dotenv_path)
for k, v in env_vars.items():
    print(k, hide_api_key(v) if "OPENAI_API_KEY" in k else v, sep="=")

OPENAI_API_KEY=sk-...TBW


In [21]:
# Prompt pieces: the system message sets the persona, the user message asks the question
question = "What's the key to great customer satisfaction?"
context = "You are a customer success employee at a large car dealership."

# NOTE: this rebinds `client`, which earlier referred to the Chroma PersistentClient
client = OpenAI(api_key=env_vars["OPENAI_API_KEY"])

completion = client.chat.completions.create(
    model="gpt-4o",
    temperature=0,
    messages=[
        {"role": "system", "content": context},
        {"role": "user", "content": question},
    ],
)

# Display the text of the first returned choice
completion.choices[0].message.content

'Great customer satisfaction in a car dealership hinges on several key factors:\n\n1. **Exceptional Customer Service**: This includes being friendly, approachable, and genuinely interested in helping customers. Staff should be well-trained to handle inquiries, provide detailed information, and assist with any issues that arise.\n\n2. **Transparency**: Be honest and upfront about pricing, financing options, and any additional costs. Customers appreciate transparency and it builds trust.\n\n3. **Personalization**: Tailor the experience to meet the individual needs and preferences of each customer. This can involve remembering past interactions, understanding their preferences, and making personalized recommendations.\n\n4. **Efficient Processes**: Streamline the buying process to make it as smooth and quick as possible. This includes everything from test drives to financing and paperwork.\n\n5. **Quality Products**: Ensure that the vehicles you sell are of high quality and well-maintaine

### Provide Context to the LLM

In [33]:
# User question for the positive-review experiment
question = """
What's the key to great customer satisfaction
based on detailed positive reviews?
"""

# System prompt template; the {} placeholder is filled with review text via str.format
context = """
You are a customer success employee at a large
car dealership. Use the following car reviews
to answer questions: {}
"""

In [34]:
# Fetch the 10 reviews closest to the question, restricted to ratings above 4.5
good_reviews = collection.query(
    query_texts=[question],
    where={"Rating": {"$gt": 4.5}},
    include=["documents"],
    n_results=10,
)

# Only one query text was sent, so its documents are at index 0; join them into one string
reviews_str = ",".join(good_reviews["documents"][0])

In [35]:
# Ask the model the same question, this time with the retrieved reviews
# substituted into the system prompt template.
good_review_summaries = client.chat.completions.create(
    model="gpt-4o",
    temperature=0,
    messages=[
        {"role": "system", "content": context.format(reviews_str)},
        {"role": "user", "content": question},
    ],
)

# Display the text of the first returned choice
good_review_summaries.choices[0].message.content

'Based on the detailed positive reviews provided, the key to great customer satisfaction in the car dealership industry includes several critical factors:\n\n1. **Value for Money**: Customers appreciate getting more features and equipment for their money compared to competitors. This includes aspects like warranties and additional equipment.\n\n2. **Reliability and Quality**: High marks for reliability and build quality are crucial. Customers value vehicles that are dependable and have fewer recalls or issues.\n\n3. **Comfort and Ergonomics**: Comfortable seating, good ergonomics, and a quiet ride are highly valued. Customers want a vehicle that is pleasant to drive, especially for long commutes.\n\n4. **Fuel Efficiency**: High fuel efficiency and economical operation are significant factors, especially for customers with long daily commutes. Vehicles that offer good mileage and hybrid options are particularly appreciated.\n\n5. **Technology and Features**: Advanced technology and feat

#### See the power of using ChromaDB (note the question below)

In [36]:
# User question for the poor-review experiment
question = """
Which of these poor reviews has the
worst implications about our dealership?
Explain why.
"""

# System prompt template; the {} placeholder is filled with review text via str.format
context = """
You are a customer success employee at a large
car dealership. Use the following car reviews
to answer questions: {}
"""

# Fetch the 5 reviews closest to the question, restricted to ratings of 3 or lower
poor_reviews = collection.query(
    query_texts=[question],
    where={"Rating": {"$lte": 3}},
    include=["documents"],
    n_results=5,
)

# Only one query text was sent, so its documents are at index 0; join them into one string
poor_reviews_str = ",".join(poor_reviews["documents"][0])

In [38]:

# Send the poor reviews to the model inside the system prompt and ask the question
poor_review_analysis = client.chat.completions.create(
    model="gpt-4o",
    temperature=0,
    messages=[
        {"role": "system", "content": context.format(poor_reviews_str)},
        {"role": "user", "content": question},
    ],
)

# Display the text of the first returned choice
poor_review_analysis.choices[0].message.content

'The review with the worst implications about your dealership is the first one:\n\n"I have been to the dealership four times and I still have unresolved electrical issues, from the Bluetooth, backup camera, truck wont open, black screen every other trip turning off, clock wont work, seatbelts NOT WORKING !!!!!!!! which really pisses me off as I have a child in the car. So many issues in a short period of time it has become the worst vehicle I have owned unfortunately and to the point where I will drop the car off back at the dealership and purchase something else. I don\'t have the time to deal with a vehicle of 53k value as if it were pieced together from spare parts from a junkyard. I always read reviews before buying vehicles and I have stuck with Nissan & Infiniti for the past ten years for their workhorse engines as two of my Nissans went to 289k & 377k miles but after this SUV I am jumping ship as soon as possible."\n\n### Explanation:\n1. **Frequency of Visits**: The customer ha

Wow! Check what happens if you leave `context.format(reviews_str)` (the positive reviews) in the system message instead of `context.format(poor_reviews_str)`: GPT will notice that all of the reviews are positive ...