In [None]:
import re

def clean_price(data):
    """
    Cleans and converts the 'price' field from a string to a float.

    The first number found in the text is used. Thousands separators are
    removed, a leading-decimal price such as "$.99" is read as 0.99, and all
    fractional digits are kept. Missing, empty or unparseable values print a
    warning and leave 'price' set to None.

    :param data: Dictionary containing the 'price' field (string or number).
    :return: The same dictionary, with 'price' replaced by a float or None.
    """
    try:
        price_str = str(data.get("price", "")).strip()
        if not price_str:
            raise ValueError("Empty price value")

        # Two alternatives: digits with optional commas and fraction, or a
        # bare fraction like ".99". Requiring a leading digit in the first
        # form keeps a lone comma from counting as a match.
        price_match = re.search(r"\d[\d,]*(?:\.\d+)?|\.\d+", price_str)
        if not price_match:
            raise ValueError(f"Invalid price format: {price_str}")

        data["price"] = float(price_match.group().replace(",", ""))
    except (ValueError, TypeError) as e:
        print(f"Warning: {e}. Setting price to None.")
        data["price"] = None

    return data

In [None]:
def clean_condition(data):
    """
    Standardizes the 'condition' field to a single canonical label.

    The text is searched case-insensitively for the first matching keyword
    among "Unopened", "Sealed", "Opened", "New" and "Used" (in that priority
    order). If none is found, or the field is missing or None, the label is
    "Unknown". Only the 'condition' key is written; no separate flag fields
    are produced.

    :param data: Dictionary containing the 'condition' field.
    :return: The same dictionary, with 'condition' replaced by the label.
    """
    raw_condition = data.get("condition")
    # A None or non-string value must not crash the whole cleaning run.
    condition_text = "" if raw_condition is None else str(raw_condition)
    # Lower-case once here instead of on every keyword comparison.
    condition_text = condition_text.replace("\n", " ").strip().lower()

    # Order matters: "Unopened" has to be tested before "Opened" because the
    # shorter word is a substring of the longer one.
    condition_keywords = ["Unopened", "Sealed", "Opened", "New", "Used"]

    data["condition"] = next(
        (kw for kw in condition_keywords if kw.lower() in condition_text),
        "Unknown",
    )

    return data


In [None]:
import math

def clean_quantity(data, apply_log=False):
    """
    Converts the 'quantity' field to an integer.

    Surrounding whitespace and thousands separators are ignored, and a
    whole-number decimal such as "3.0" is accepted as 3. Anything else
    (missing, empty, fractional or non-numeric) prints a warning and sets
    the quantity to 0.

    :param data: Dictionary containing the 'quantity' field.
    :param apply_log: If True, replace a positive quantity with its natural
        logarithm rounded to 4 decimal places; non-positive values are left
        unchanged.
    :return: The same dictionary, with 'quantity' cleaned.
    """
    text = str(data.get("quantity", "")).strip().replace(",", "")

    try:
        quantity = int(text)
    except ValueError as err:
        # Fall back to float parsing so "3.0" (or a float 3.0 in the source
        # data) is not thrown away, but reject real fractions, inf and nan.
        try:
            as_float = float(text)
        except ValueError:
            as_float = None

        if as_float is not None and as_float.is_integer():
            quantity = int(as_float)
        else:
            print(f"Warning: {err}. Setting quantity to 0.")
            quantity = 0

    if apply_log and quantity > 0:
        quantity = round(math.log(quantity), 4)

    data["quantity"] = quantity
    return data


In [None]:
from datetime import datetime

def clean_date(data):
    """
    Converts the 'date' field from a string like "2/15/25" to a datetime object.
    Also extracts additional features such as day_of_year and month.

    Both two-digit ("2/15/25") and four-digit ("2/15/2025") years are
    accepted. A value that is already a datetime is kept as-is, so calling
    this function twice on the same record does not erase the date. On any
    failure a warning is printed and 'date', 'day_of_year' and 'month' are
    all set to None.

    :param data: Dictionary containing the 'date' field.
    :return: Updated dictionary with 'date' as a datetime object and extra date features.
    """
    raw_date = data.get("date", "")

    try:
        if isinstance(raw_date, datetime):
            # Already parsed (e.g. the record was cleaned before).
            dt = raw_date
        else:
            date_str = raw_date.strip()
            if not date_str:
                raise ValueError("Empty date string")

            try:
                dt = datetime.strptime(date_str, "%m/%d/%y")
            except ValueError as short_year_error:
                try:
                    dt = datetime.strptime(date_str, "%m/%d/%Y")
                except ValueError:
                    # Report the failure against the primary format.
                    raise short_year_error from None
    except (AttributeError, TypeError, ValueError) as e:
        # AttributeError covers None / non-string values lacking .strip().
        print(f"Warning: {e}. Date conversion failed for: {data.get('date')}")
        data["date"] = None
        data["day_of_year"] = None
        data["month"] = None
    else:
        data["date"] = dt
        data["day_of_year"] = dt.timetuple().tm_yday
        data["month"] = dt.month

    return data

In [None]:

def cleaning_process(market_history):
    """
    Runs every field cleaner over each record of a market history.

    Each record is passed, in order, to clean_price, clean_condition,
    clean_quantity (with its default arguments) and clean_date, all of which
    update the record in place.

    :param market_history: Iterable of record dictionaries.
    :return: The same market_history object that was passed in.
    """
    for record in market_history:
        for cleaner in (clean_price, clean_condition, clean_quantity, clean_date):
            cleaner(record)
    return market_history

In [None]:
import pandas as pd
import json

# Load and prepare data
def read_json_file(filepath):
    """
    Read and parse a JSON file.

    The file is decoded as UTF-8 rather than with the platform default
    encoding, so non-ASCII text loads the same way on every system.

    :param filepath: Path of the JSON file to read.
    :return: The parsed JSON content as returned by json.load.
    """
    with open(filepath, 'r', encoding='utf-8') as f:
        return json.load(f)

# Load both scrape dumps and merge the second into the first.
A = read_json_file('scraped_results.json')
B = read_json_file('scraped_results2.json')
A.extend(B)

# Clean each item's records in place.
for item in A:
    cleaning_process(item)

# Tag every record with the index of the item it belongs to, then collect
# all records into one flat list.
flattened_data = []
for item_index, item_data in enumerate(A):
    for record in item_data:
        record["item_id"] = item_index
    flattened_data.extend(item_data)

# Convert to DataFrame, parse dates and order rows chronologically.
df = (
    pd.DataFrame(flattened_data)
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
    .sort_values('date')
)
# Re-encode item ids as category codes.
df['item_id'] = df['item_id'].astype('category').cat.codes
