In [None]:
import os
import random
from dotenv import load_dotenv
from huggingface_hub import login
from datasets import Dataset, DatasetDict
from items import Item
from loaders import ItemLoader
import matplotlib.pyplot as plt
from collections import Counter, defaultdict
import numpy as np
import pickle

In [None]:
# Render matplotlib figures inline, directly in the notebook output.
%matplotlib inline

In [None]:

# Pull variables from a local .env file (if one exists) into the process environment.
load_dotenv()

# Make sure every credential used by this notebook is present in os.environ.
# Only write back values that exist: assigning None to os.environ raises
# TypeError, which would hide which variable is actually missing.
for _key in ("OPENAI_API_KEY", "ANTHROPIC_API_KEY", "HF_TOKEN"):
    _value = os.getenv(_key)
    if _value is None:
        print(f"Warning: {_key} is not set; cells that depend on it will fail.")
    else:
        os.environ[_key] = _value

In [None]:
# Authenticate with the Hugging Face Hub using the token from the environment,
# and store it as a git credential as well.
login(token=os.environ["HF_TOKEN"], add_to_git_credential=True)

In [None]:
# Names of the product-category datasets to load, in load order.
dataset_names = """
    Automotive
    Electronics
    Office_Products
    Tools_and_Home_Improvement
    Cell_Phones_and_Accessories
    Toys_and_Games
    Appliances
    Musical_Instruments
""".split()

In [None]:
# Load every dataset in turn and gather all of their items into one flat list.
items = []
for category_name in dataset_names:
    items += ItemLoader(category_name).load()

In [None]:
# Report how many items were loaded across all datasets (thousands-separated).
print(f"A grand total of {len(items):,} items")

In [None]:
# Histogram of the token count of every loaded item.

tokens = [entry.token_count for entry in items]
mean_tokens = sum(tokens) / len(tokens)
max_tokens = max(tokens)

plt.figure(figsize=(15, 6))
plt.hist(tokens, rwidth=0.7, color="skyblue", bins=range(0, 300, 10))
plt.title(f"Token Counts: Avg {mean_tokens:,.1f} and highest {max_tokens:,} \n")
plt.xlabel("Length (tokens)")
plt.ylabel("Count")
plt.show()

In [None]:
# Histogram of the price of every loaded item.

# Read the price from each element (`item`), not from the `items` list itself.
prices = [item.price for item in items]
plt.figure(figsize=(15, 6))
plt.title(f"Prices: Avg {sum(prices)/len(prices):,.1f} and highest {max(prices):,}")
plt.xlabel("Price ($)")
plt.ylabel("Count")
# `rwidth` (relative bar width) and `bins` are the hist() keyword names.
# Bins span 0-1000 to match the price range used by the sampling and
# sample-histogram cells, so prices above $300 are not silently dropped.
plt.hist(prices, rwidth=0.7, color="blue", bins=range(0, 1000, 10))
plt.show()

In [None]:
# Bar chart of how many loaded items fall into each category.

category_counts = Counter(item.category for item in items)

# Materialise the keys so matplotlib receives a plain sequence of labels.
categories = list(category_counts.keys())
counts = [category_counts[category] for category in categories]

plt.figure(figsize=(15, 6))
plt.title("Category Count")
plt.xlabel("Categories")
plt.ylabel("Count")
plt.bar(categories, counts, color="magenta")
plt.xticks(rotation=30, ha='right')

# Annotate each bar with its count; the f-prefix is required so the value
# is interpolated rather than printing the literal text "{v:,}".
for i, v in enumerate(counts):
    plt.text(i, v, f"{v:,}", ha='center', va='bottom')

plt.show()


In [None]:
# Rebalance the dataset by resampling: cap crowded low-price buckets and
# down-weight the 'Automotive' category when drawing from them.

MAX_PER_SLOT = 1200      # maximum items kept from a crowded price bucket
UNCAPPED_FROM = 240      # buckets at or above this price are always kept whole
AUTOMOTIVE_WEIGHT = 1    # relative sampling weight for 'Automotive' items
OTHER_WEIGHT = 4         # relative sampling weight for every other category

# Bucket items by price rounded to the nearest whole number.
# defaultdict needs the `list` factory so missing buckets start out empty.
slots = defaultdict(list)
for item in items:
    slots[round(item.price)].append(item)

# Seed both RNGs so the sample is reproducible.
np.random.seed(42)
random.seed(42)
sample = []

for price in range(0, 1000):
    slot = slots[price]

    if price >= UNCAPPED_FROM or len(slot) < MAX_PER_SLOT:
        # Expensive or sparsely populated bucket: keep everything.
        sample.extend(slot)
    else:
        # Crowded bucket: draw MAX_PER_SLOT items without replacement,
        # favouring non-Automotive items via normalised weights.
        weights = np.array([
            AUTOMOTIVE_WEIGHT if item.category == 'Automotive' else OTHER_WEIGHT
            for item in slot
        ])
        weights = weights / np.sum(weights)
        selected_indices = np.random.choice(
            len(slot), size=MAX_PER_SLOT, replace=False, p=weights
        )
        sample.extend(slot[idx] for idx in selected_indices)

print(f"There are {len(sample):,} items in the sample")

In [None]:
# Histogram of the price distribution within the sample.

prices = [float(entry.price) for entry in sample]
mean_price = sum(prices) / len(prices)
top_price = max(prices)

plt.figure(figsize=(15, 6))
plt.hist(prices, rwidth=0.7, color='darkblue', bins=range(0, 1000, 10))
plt.title(f"Avg {mean_price:.2f} and highest {top_price:,.2f}\n")
plt.xlabel("Price ($)")
plt.ylabel("Count")
plt.show()

In [None]:
# Bar chart of how many sampled items fall into each category.

category_counts = Counter(entry.category for entry in sample)

categories = category_counts.keys()
counts = list(category_counts.values())

# Draw the bars, then decorate the axes.
plt.figure(figsize=(15, 6))
plt.bar(categories, counts, color="lightgreen")
plt.xticks(rotation=30, ha='right')
plt.xlabel('Categories')
plt.ylabel('Count')
plt.title('How many in each category')

# Write each bar's count just above it.
for position, count in enumerate(counts):
    plt.text(position, count, f"{count:,}", ha='center', va='bottom')

plt.show()

In [None]:
# Scatter plot of price against the character length of each item's prompt
# (hypothesis: expensive things may have longer descriptions).
sizes = [len(item.prompt) for item in sample]
prices = [item.price for item in sample]

plt.figure(figsize=(15, 6))
plt.scatter(sizes, prices, s=0.2, color='teal')
plt.xlabel("Size")
plt.ylabel("Price")
plt.title("Prices against Description Length")
# Render explicitly, like the other plot cells, so the cell does not end by
# echoing the Text object returned by plt.title().
plt.show()

In [None]:
def report(item):
    """Print an item's prompt and the tail of its tokenization.

    Prints three things in order: the prompt text, the last ten entries of
    ``Item.tokenizer.encode(prompt)``, and those same entries passed through
    ``Item.tokenizer.batch_decode``.
    """
    text = item.prompt
    tail = Item.tokenizer.encode(text)[-10:]
    print(text)
    print(tail)
    print(Item.tokenizer.batch_decode(tail))


In [None]:
# Spot-check a single item; index 49990 requires the sample to hold more than 49,990 items.
report(sample[49990])

In [None]:
# Split the sample into training and test sets.
TRAIN_SIZE = 400_000  # number of shuffled items assigned to the training set

# Shuffle a copy rather than `sample` itself: the cell is then idempotent,
# so re-running it always produces the same split from the same sample.
shuffled = list(sample)
random.seed(42)
random.shuffle(shuffled)

train = shuffled[:TRAIN_SIZE]
test = shuffled[TRAIN_SIZE:]  # everything left over after the training slice
print(f"Divided into a training set of {len(train):,} items and test set of {len(test):,} items")

In [None]:
# Preview the first training prompt (followed by a blank line) and the first test prompt.
print(train[0].prompt, end="\n\n")
print(test[0].test_prompt())

In [None]:
# Assemble the train/test splits into a DatasetDict for upload to Hugging Face.

# Training rows use the item's prompt; test rows use test_prompt().
train_prompts = [entry.prompt for entry in train]
train_prices = [entry.price for entry in train]
test_prompts = [entry.test_prompt() for entry in test]
test_prices = [entry.price for entry in test]

train_columns = {"text": train_prompts, "price": train_prices}
test_columns = {"text": test_prompts, "price": test_prices}

train_dataset = Dataset.from_dict(train_columns)
test_dataset = Dataset.from_dict(test_columns)
dataset = DatasetDict({"train": train_dataset, "test": test_dataset})

In [None]:
# Upload the dataset to the Hugging Face Hub as a private repository.
HF_USER = "SidiBoi"
DATASET_NAME = HF_USER + "/pricer-data"
dataset.push_to_hub(repo_id=DATASET_NAME, private=True)

In [None]:
# Save both splits to local pickle files.
for pkl_path, split in (('train.pkl', train), ('test.pkl', test)):
    with open(pkl_path, 'wb') as file:
        pickle.dump(split, file)