# The Price is Right

Today we build a more complex solution for estimating prices of goods.

Use Random Forest pricer and Create a Ensemble pricer that allows contributions from all the pricers.

In [1]:
# imports

import os
import re
import math
import json
from tqdm import tqdm
import random
from dotenv import load_dotenv
from huggingface_hub import login
import numpy as np
import pickle
from openai import OpenAI
from sentence_transformers import SentenceTransformer
from datasets import load_dataset
import chromadb
from items import Item
from testing import Tester
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import joblib


In [2]:
# CONSTANTS

QUESTION = "How much does this cost to the nearest dollar?\n\n"
DB = "products_vectorstore"

In [3]:
# environment

load_dotenv(override=True)
os.environ['OPENAI_API_KEY'] = os.getenv('OPENAI_API_KEY', 'your-key-if-not-using-env')
os.environ['HF_TOKEN'] = os.getenv('HF_TOKEN', 'your-key-if-not-using-env')

In [4]:
# Load in the test pickle file:

with open('test.pkl', 'rb') as file:
    test = pickle.load(file)

In [5]:
client = chromadb.PersistentClient(path=DB)
collection = client.get_or_create_collection('products')

In [8]:
result = collection.get(include=['embeddings', 'documents', 'metadatas'])
vectors_all = np.array(result['embeddings'])
documents_all = result['documents']
prices_all = [metadata['price'] for metadata in result['metadatas']]

In [11]:
# Sample 10,000 from 400,000
import random
random.seed(42)
sampled_indices = random.sample([i for i in range(len(vectors_all))], 10000)
print(sampled_indices)

[1824, 409, 4506, 4012, 3657, 2286, 1679, 8935, 1424, 9674, 6912, 520, 488, 1535, 3582, 3811, 8279, 9863, 434, 9195, 3257, 8928, 6873, 3611, 7359, 9654, 4557, 106, 2615, 6924, 5574, 4552, 2547, 3527, 5514, 1674, 1519, 6224, 1584, 5881, 5635, 9891, 4333, 711, 7527, 8785, 2045, 6201, 1291, 9044, 4803, 5925, 9459, 3150, 1139, 750, 3733, 4741, 1307, 3814, 1654, 6227, 4554, 7428, 5977, 2664, 6065, 5820, 3432, 4374, 1169, 2803, 8751, 4010, 2677, 7573, 6216, 4422, 9125, 3598, 5313, 916, 3752, 525, 5168, 6572, 4386, 1084, 3456, 9292, 5155, 3483, 8179, 6482, 7517, 2340, 4339, 2287, 4040, 9197, 8830, 4304, 9577, 7019, 9560, 6543, 5930, 3593, 2266, 8348, 8085, 1489, 771, 1796, 2504, 2621, 6916, 9771, 1040, 6304, 6252, 9763, 7668, 8669, 4119, 9064, 188, 1876, 8797, 4371, 5573, 1827, 4808, 7123, 2591, 7433, 53, 4315, 8201, 2927, 8317, 1743, 4889, 9859, 3258, 9885, 6126, 2646, 8837, 8689, 9, 9813, 5310, 8005, 319, 1832, 5947, 5038, 3923, 949, 3946, 9295, 1290, 1403, 7962, 1133, 8727, 2060, 2103, 778

In [14]:
vectors  = vectors_all[sampled_indices]
documents = np.array(documents_all)[sampled_indices]
prices = np.array(prices_all)[sampled_indices]

# Random Forest

We will now train a Random Forest model with the vectors as inputs we already have in Chroma, from the SentenceTransformer model.

In [15]:
# This next line takes an hour on my M1 Mac!

rf_model = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
rf_model.fit(vectors, prices)

In [16]:
# Save the model to a file

joblib.dump(rf_model, 'random_forest_model.pkl')

['random_forest_model.pkl']

In [17]:
# Load it back in again

rf_model = joblib.load('random_forest_model.pkl')

In [18]:
from agents.specialist_agent import SpecialistAgent
from agents.frontier_agent import FrontierAgent
from agents.random_forest_agent import RandomForestAgent

In [None]:
specialist = SpecialistAgent()
frontier = FrontierAgent(collection)
random_forest = RandomForestAgent()

In [20]:
def description(item):
    return item.prompt.split("to the nearest dollar?\n\n")[1].split("\n\nPrice is $")[0]

In [21]:
def rf(item):
    return random_forest.price(description(item))

In [25]:
#Tester.test(rf, test)

In [23]:
product = "Quadcast HyperX condenser mic for high quality audio for podcasting"

In [24]:
#print(specialist.price(product))
#print(frontier.price(product))
print(random_forest.price(product))

202.42160000000004


In [None]:
specialists = []
frontiers = []
random_forests = []
prices = []
for item in tqdm(test[1000:1250]):
    text = description(item)
    specialists.append(specialist.price(text))
    frontiers.append(frontier.price(text))
    random_forests.append(random_forest.price(text))
    prices.append(item.price)

In [None]:
mins = [min(s,f,r) for s,f,r in zip(specialists, frontiers, random_forests)]
maxes = [max(s,f,r) for s,f,r in zip(specialists, frontiers, random_forests)]

X = pd.DataFrame({
    'Specialist': specialists,
    'Frontier': frontiers,
    'RandomForest': random_forests,
    'Min': mins,
    'Max': maxes,
})

# Convert y to a Series
y = pd.Series(prices)

In [None]:
# Train a Linear Regression
np.random.seed(42)

lr = LinearRegression()
lr.fit(X, y)

feature_columns = X.columns.tolist()

for feature, coef in zip(feature_columns, lr.coef_):
    print(f"{feature}: {coef:.2f}")
print(f"Intercept={lr.intercept_:.2f}")

In [None]:
joblib.dump(lr, 'ensemble_model.pkl')

In [None]:
from agents.ensemble_agent import EnsembleAgent
ensemble = EnsembleAgent(collection)

In [None]:
ensemble.price(product)

In [None]:
def ensemble_pricer(item):
    return max(0,ensemble.price(description(item)))

In [None]:
Tester.test(ensemble_pricer, test)

# WHAT A DAY!

We got so much done - a Fronter RAG pipeline, a Random Forest model using transformer-based encodings, and an Ensemble model.

You can do better, for sure!

Tweak this, and try adding components into the ensemble, to beat my performance.