In [1]:
import os
import random
from dotenv import load_dotenv
from huggingface_hub import login
from datasets import load_dataset, Dataset, DatasetDict
import matplotlib.pyplot as plt
from collections import Counter, defaultdict
import numpy as np
import pickle
from sklearn.model_selection import train_test_split

In [2]:
# Read key/value pairs from a .env file into the process environment.
load_dotenv()

# Fail fast with a readable message when the token is absent. Re-assigning
# os.getenv('HF_TOKEN') into os.environ is a no-op when the variable is set and
# raises "TypeError: str expected, not NoneType" when it is not.
if not os.getenv('HF_TOKEN'):
    raise RuntimeError(
        "HF_TOKEN is not set. Add HF_TOKEN=<your token> to a .env file "
        "or export it in the environment before running this notebook."
    )

In [3]:
# Authenticate this session against the Hugging Face Hub.
# `hf_token` stays a notebook-level name so later cells can reuse it.
hf_token = os.environ['HF_TOKEN']
login(
    token=hf_token,
    add_to_git_credential=True,  # also ask huggingface_hub to store the token for git
)

Token has not been saved to git credential helper.
Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


[1m[31mCannot authenticate through git-credential as no helper is defined on your machine.
You might have to re-authenticate when pushing to the Hugging Face Hub.
Run the following command in your terminal in case you want to set the 'store' credential helper as default.

git config --global credential.helper store

Read https://git-scm.com/book/en/v2/Git-Tools-Credential-Storage for more details.[0m


In [4]:
from items import Item
from loaders import ItemLoader

In [5]:
# Dataset categories handed to ItemLoader in the next cell.
# Only "Appliances" is active; to widen the run, move names from the
# disabled list below back into `dataset_names`.
dataset_names = ["Appliances"]

# Currently disabled:
#   "Automotive"
#   "Electronics"
#   "Office_Products"
#   "Tools_and_Home_Improvement"
#   "Cell_Phones_and_Accessories"
#   "Toys_and_Games"
#   "Musical_Instruments"

In [6]:
# Load every selected category and flatten the results into one list,
# preserving the order of `dataset_names`.
items = [
    item
    for name in dataset_names
    for item in ItemLoader(name).load()
]

Loading dataset Appliances


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 95/95 [00:20<00:00,  4.56it/s]

Completed Appliances with 28,625 datapoints in 0.4 mins





In [7]:
# Report how many items were loaded across all selected categories.
item_count = len(items)
print(f"A grand total of {item_count:,} items")

A grand total of 28,625 items


In [8]:
# Hold out a fixed share of the items for testing; the seed keeps the split
# reproducible across runs.
holdout_fraction = 0.2
split_seed = 42
train_items, test_items = train_test_split(
    items,
    test_size=holdout_fraction,
    random_state=split_seed,
)

In [9]:
# Display the first held-out item as a quick sanity check of the split.
test_items[0]

<Discount Parts Direct 316075104 Oven Bake Element Heating Element for Frigidaire Kenmore, Replaces 316282600, 09990062, 1465763, 316075100, 316075102, 316075103, 3203534, AH2332301, EA2332301, F83-455, PS2332301 = $26.97>

In [12]:
# Training split: the stored `prompt` attribute paired with each item's price.
train_prompts = [entry.prompt for entry in train_items]
train_prices = [entry.price for entry in train_items]

# Test split: prompts come from `test_prompt()` rather than `prompt`
# (presumably the prompt with the answer withheld; confirm in items.Item).
test_prompts = [entry.test_prompt() for entry in test_items]
test_prices = [entry.price for entry in test_items]

In [13]:
# Column layout shared by both splits: prompt text plus its price.
train_columns = {"text": train_prompts, "price": train_prices}
test_columns = {"text": test_prompts, "price": test_prices}

train_dataset = Dataset.from_dict(train_columns)
test_dataset = Dataset.from_dict(test_columns)

# Bundle the two splits under the usual "train"/"test" keys.
splits = {"train": train_dataset, "test": test_dataset}
dataset = DatasetDict(splits)

In [14]:
# Persist both splits as pickle files under ./../data (relative to the
# notebook's working directory).
data_dir = os.path.join('.', '..', 'data')

# open(..., 'wb') does not create missing parent directories, so make sure the
# target directory exists instead of failing with FileNotFoundError.
os.makedirs(data_dir, exist_ok=True)

for filename, split in (
    ('app_train_dataset.pkl', train_dataset),
    ('app_test_dataset.pkl', test_dataset),
):
    with open(os.path.join(data_dir, filename), 'wb') as f:
        pickle.dump(split, f)

# NOTE: pickle files can execute arbitrary code when loaded; only unpickle
# these files from a location you trust.