In [1]:
import os
import random
from dotenv import load_dotenv
from huggingface_hub import login
from datasets import load_dataset, Dataset, DatasetDict
import matplotlib.pyplot as plt
from collections import Counter, defaultdict
import numpy as np
import pickle
import sys
sys.path.append("../../llm_engineering")
from api_clients import create_clients

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
clients = create_clients()
hf_token = os.environ['HF_API_KEY']
login(hf_token, add_to_git_credential=True)

In [7]:
sys.path.append("week6/")
from loaders import ItemLoader
from items import Item

In [8]:
%matplotlib inline

In [9]:
items = ItemLoader("Appliances").load(workers=6)

Loading dataset Appliances


100%|██████████████████████████████████████████████████████████████████████████████████| 95/95 [00:54<00:00,  1.74it/s]


Completed Appliances with 28,625 datapoints in 1.0 mins


In [15]:
slots = defaultdict(list)
for item in items:
    slots[round(item.price)].append(item)

In [18]:
np.random.seed(42)
random.seed(42)

MAX_PER_SLOT = 200  
sample = []

for i in range(len(slots)):
    slot = slots[i]
    if len(slot) <= MAX_PER_SLOT:
        sample.extend(slot)  # take all items from small slots
    else:
        # randomly sample from large slots
        selected = random.sample(slot, MAX_PER_SLOT)
        sample.extend(selected)

print(f"There are {len(sample):,} items in the sample")

There are 15,379 items in the sample


In [23]:
def report(item):
    prompt = item.prompt
    tokens = Item.tokenizer.encode(item.prompt)
    print(prompt)
    print(tokens[-10:])
    print(Item.tokenizer.batch_decode(tokens[-10:]))

In [29]:
report(sample[0])

How much does this cost to the nearest dollar?

OR Dryer Rear Bearing Support
or Dryer Rear Drum Bearing Support Cup. The bearing support cup helps the drum rotate smoothly and quietly. This part works with the following brands Frigidaire, White Westinghouse, Kelvinator, Gibson, Kenmore, Sears, Tappan, & Electrolux. Replaces Old Numbers 175427 774936 85-951 DE727 or Dryer Rear Drum Bearing Support Cup. The bearing support cup helps the drum rotate smoothly and quietly. This part works with the following brands Frigidaire, White Westinghouse, Kelvinator, Gibson, Kenmore, Sears, Tappan, & Electrolux. Replaces Old Numbers; 175427 774936 85-951 DE727 Manufacturer ERP, Part Weight 

Price is $1.00
[11, 3744, 16923, 4815, 7117, 374, 400, 16, 13, 410]
[',', ' Part', ' Weight', ' \n\n', 'Price', ' is', ' $', '1', '.', '00']


In [39]:
random.seed(42)
random.shuffle(sample)
train = sample[:12000]
test = sample[12000:15378]
print(f"Divided into a training set of {len(train):,} items and test set of {len(test):,} items")

Divided into a training set of 12,000 items and test set of 3,378 items


In [40]:
print(train[0].prompt)

How much does this cost to the nearest dollar?

Valore Cascade 30 Contemporary Under Cabinet Range Hood
The Valore Cascade under-cabinet range hood’s thoughtful design transitions seamlessly into a homeowner’s most distinctive kitchen. The Valore Cascade offers exceptional value with 400 CFM operation to efficiently remove airborne residues, easy-to-use two speed mechanical controls, and dual level halogen lighting. The sleek design of the Cascade easily installs under the most common kitchen cabinets and provides proficient ventilation for all your cooking needs. The low profile body style and lustrous stainless steel construction leave a sharp impression as an integral piece to your kitchen. Dual halogen lamps provide for exceptional task and ambient lighting. The Cascade’s multi-layered aluminum mesh filters also incorporate highly effective performance with a well polished aesthetic to accent the range hood. The Cascade is conveniently available in a 

Price is $270.00


In [41]:
print(test[0].test_prompt())

How much does this cost to the nearest dollar?

Refresh Replacement Refrigerator Water Filter for GE GSWF SmartWater and Kenmore 9914 (3 Pack)
Buy with confidence from Refresh and refresh your water today! Up to 50% less than comparable manufactuer part! TESTED & CERTIFIED to NSF/ANSI 42 standards. Removes Chlorine, Taste & Odor. Doesn't remove beneficial minerals that support overall health. Fast Flow filter meets or exceeds OEM filter specifications. Capacity 300 gallons Compatible with the following GE fridge filters GSWF, GSWFDS, Also fits for these Kenmore models Kenmore 9914 Also fits for these models AQF-GSWF-P, AQF-GSWF-D, WDS-GSWF, WDP-GSWF, AMZN-GSWF-S, Also

Price is $


In [42]:
train_prompts = [item.prompt for item in train]
train_prices = [item.price for item in train]
test_prompts = [item.test_prompt() for item in test]
test_prices = [item.price for item in test]

In [43]:
train_dataset = Dataset.from_dict({"text": train_prompts, "price": train_prices})
test_dataset = Dataset.from_dict({"text": test_prompts, "price": test_prices})
dataset = DatasetDict({
    "train": train_dataset,
    "test": test_dataset
})

In [None]:
HF_USER = "ed-donner"
DATASET_NAME = f"{HF_USER}/pricer-data"
dataset.push_to_hub(DATASET_NAME, private=True)

In [44]:
with open('train.pkl', 'wb') as file:
    pickle.dump(train, file)

with open('test.pkl', 'wb') as file:
    pickle.dump(test, file)