In [1]:
import pandas as pd


In [2]:
data = pd.read_csv('vehicles.csv')

In [3]:
# Specify the columns you want to keep
columns_to_keep = ['Year', 'Make', 'Model', 'Type', 'SellingPrice', 'Miles']

# Create a new DataFrame with only the specified columns
filtered_data = data[columns_to_keep]

In [4]:
filtered_data

Unnamed: 0,Year,Make,Model,Type,SellingPrice,Miles
0,2019,Mazda,CX-9,Used,18498,60081
1,2024,Maserati,Levante,Used,86998,75
2,2019,Volkswagen,Tiguan,Used,14498,72266
3,2022,INFINITI,Q50,Used,29998,42796
4,2019,Hyundai,Elantra,Used,10498,71265
...,...,...,...,...,...,...
2225,2020,Toyota,Corolla,New,14497,0
2226,2021,Dodge,Durango,Used,25998,52798
2227,2021,Alfa Romeo,Giulia,New,16997,0
2228,2024,Nissan,Sentra,New,16998,0


In [24]:
initial_rows = len(filtered_data)
print(f"Initial number of rows: {initial_rows}")

Initial number of rows: 2230


In [25]:
# Fill missing values for 'Year', 'SellingPrice', 'Miles' with the average value and assign back
filtered_data['Year'] = filtered_data['Year'].fillna(filtered_data['Year'].mean())
filtered_data['SellingPrice'] = filtered_data['SellingPrice'].fillna(filtered_data['SellingPrice'].mean())
filtered_data['Miles'] = filtered_data['Miles'].fillna(filtered_data['Miles'].mean())


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_data['Year'] = filtered_data['Year'].fillna(filtered_data['Year'].mean())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_data['SellingPrice'] = filtered_data['SellingPrice'].fillna(filtered_data['SellingPrice'].mean())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_data['

In [26]:
# Step 2: Drop rows where any of ['Make', 'Model', 'Type'] is missing
filtered_data.dropna(subset=['Make', 'Model', 'Type'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_data.dropna(subset=['Make', 'Model', 'Type'], inplace=True)


In [28]:
# Checking the number of rows after the operation
final_rows = len(filtered_data)
print(f"Final number of rows: {final_rows}")

Final number of rows: 2230


In [29]:
# Display the modified DataFrame
print(filtered_data)

      Year        Make       Model  Type  SellingPrice  Miles
0     2019       Mazda        CX-9  Used         18498  60081
1     2024    Maserati     Levante  Used         86998     75
2     2019  Volkswagen      Tiguan  Used         14498  72266
3     2022    INFINITI         Q50  Used         29998  42796
4     2019     Hyundai     Elantra  Used         10498  71265
...    ...         ...         ...   ...           ...    ...
2225  2020      Toyota     Corolla   New         14497      0
2226  2021       Dodge     Durango  Used         25998  52798
2227  2021  Alfa Romeo      Giulia   New         16997      0
2228  2024      Nissan      Sentra   New         16998      0
2229  2020      Toyota  Tacoma 2WD  Used         18998  57923

[2230 rows x 6 columns]


In [30]:
# Create descriptions using the specified columns
filtered_data['Description'] = filtered_data.apply(
    lambda x: f"This {x['Type'].lower()} {x['Year']} {x['Make']} {x['Model']} is available with {x['Miles']} miles on it, priced at ${x['SellingPrice']}.",
    axis=1
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_data['Description'] = filtered_data.apply(


In [31]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('all-MiniLM-L6-v2')
car_descriptions = filtered_data['Description'].tolist()
embeddings = model.encode(car_descriptions)  # Tokenization happens inside this method

In [12]:
%pip install qdrant-client --quiet

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


You should consider upgrading via the '/Library/Developer/CommandLineTools/usr/bin/python3 -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.


In [32]:
from qdrant_client import QdrantClient

# Connect to Qdrant
qdrant_client = QdrantClient(host="localhost", port=6333)

# Recreate collection in Qdrant
qdrant_client.recreate_collection(
    collection_name="car_inventory",
    vectors_config={
        "size": embeddings.shape[1],  # Set the size of the embeddings
        "distance": "Cosine"
    }
)

# Insert embeddings and payload data (the car metadata)
payloads = filtered_data.to_dict(orient='records')

for idx, embedding in enumerate(embeddings):
    qdrant_client.upsert(
        collection_name="car_inventory",
        points=[{
            "id": idx,
            "vector": embedding,
            "payload": payloads[idx]  # Metadata like make, model, year, etc.
        }]
    )


  qdrant_client.recreate_collection(


In [33]:
user_query = "I want a car made in 2020"
query_embedding = model.encode([user_query])[0]

In [34]:
results = qdrant_client.search(
    collection_name="car_inventory",
    query_vector=query_embedding,
    limit=20  
)

# Print results
for result in results:
    car_details = result.payload
    print(f"Match: {car_details['Year']} {car_details['Make']} {car_details['Model']}, Price: ${car_details['SellingPrice']}")

Match: 2020 Mercedes-Benz C-Class, Price: $18998
Match: 2020 Hyundai Veloster N, Price: $19498
Match: 2020 Volkswagen Atlas, Price: $18498
Match: 2023 Dodge Challenger, Price: $25998
Match: 2020 Mercedes-Benz C-Class, Price: $39998
Match: 2020 Chevrolet Camaro, Price: $21898
Match: 2020 Lexus IS, Price: $25498
Match: 2020 Chevrolet Camaro, Price: $15498
Match: 2020 Mercedes-Benz GLE, Price: $31498
Match: 2023 BMW 3 Series, Price: $29998
Match: 2020 Nissan Altima, Price: $16998
Match: 2023 Nissan Altima, Price: $20298
Match: 2021 Mercedes-Benz GLA, Price: $0
Match: 2022 Toyota Corolla, Price: $18998
Match: 2021 Porsche Cayenne, Price: $40898
Match: 2023 Mercedes-Benz EQE, Price: $0
Match: 2020 Mercedes-Benz GLC, Price: $22998
Match: 2020 Acura TLX, Price: $17498
Match: 2020 Honda Civic Sedan, Price: $0
Match: 2020 Dodge Challenger, Price: $30997


In [16]:
%pip install transformers torch


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Defaulting to user installation because normal site-packages is not writeable
You should consider upgrading via the '/Library/Developer/CommandLineTools/usr/bin/python3 -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.


In [17]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer, pipeline

In [35]:
# Load GPT-2 model and tokenizer
model_name = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)


In [43]:
# Create text generation pipeline for GPT-2
generator = pipeline("text-generation", model=model, tokenizer=tokenizer)

# Assuming we have retrieved the top 20 candidates
results = qdrant_client.search(
    collection_name="car_inventory",
    query_vector=query_embedding,
    limit=20  
)

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


In [52]:
# Get the payloads from the results
candidate_data = [result.payload for result in results]

candidate_data = candidate_data[:5]

# Create a description of the top 5 cars
candidate_descriptions = [
    f"{idx+1}. {car['Year']} {car['Make']} {car['Model']}, {car['Miles']} miles, priced at ${car['SellingPrice']}."
    for idx, car in enumerate(candidate_data)
]

In [55]:
# Construct the prompt for GPT-2
prompt = (
    "Here are some great car options I found for you:\n"
    + "\n".join(candidate_descriptions)
    + "\n\nDo any of these cars sound interesting to you? Or do you have more preferences you'd like me to consider?"
)



# Generate a conversational response using GPT-2
response = generator(prompt, max_length=150, num_return_sequences=1, pad_token_id=tokenizer.eos_token_id)

# Output the response
print(response[0]['generated_text'])

Here are some great car options I found for you:
1. 2020 Mercedes-Benz C-Class, 70231 miles, priced at $18998.
2. 2020 Hyundai Veloster N, 54107 miles, priced at $19498.
3. 2020 Volkswagen Atlas, 70402 miles, priced at $18498.
4. 2023 Dodge Challenger, 20576 miles, priced at $25998.
5. 2020 Mercedes-Benz C-Class, 38404 miles, priced at $39998.

Do any of these cars sound interesting to you? Or do you have more preferences you'd like me to consider? Send us your queries below.
