In [None]:
import re

def clean_price(data):
    """
    Cleans and converts the 'price' field from a string to a float.

    The first number found in the value is used: thousands separators
    (commas) are removed and at most two decimal digits are kept, so
    "$1,234.50 USD" becomes 1234.5. A bare fraction such as "$.99" is read
    as 0.99. If the field is missing, empty, or contains no number, a
    warning is printed and 'price' is set to None.

    :param data: Dictionary containing the 'price' field (string or number).
    :return: The same dictionary, updated in place, with 'price' as a float
        or None.
    """
    try:
        price_str = str(data.get("price", "")).strip()
        if not price_str:
            raise ValueError("Empty price value")

        # A number has to start with a digit, or be a free-standing ".dd"
        # fraction. Matching any run of digits/commas would accept a stray
        # "," (hiding a valid price later in the text) and would read ".99"
        # as 99.
        price_match = re.search(
            r"\d[\d,]*(?:\.\d{1,2})?|(?<!\w)\.\d{1,2}", price_str
        )
        if not price_match:
            raise ValueError(f"Invalid price format: {price_str}")

        data["price"] = float(price_match.group().replace(",", ""))
    except (ValueError, TypeError) as e:
        print(f"Warning: {e}. Setting price to None.")
        data["price"] = None

    return data

In [None]:
def clean_condition(data):
    """
    Standardizes the 'condition' field to one of a fixed set of labels.

    The text is searched case-insensitively for the first of "Unopened",
    "Sealed", "Opened", "New", "Used" that occurs in it as a substring; that
    label replaces the original value. If none occurs, or the field is
    missing or None, 'condition' is set to "Unknown". No other keys are
    added to the dictionary.

    :param data: Dictionary containing the 'condition' field
    :return: The same dictionary, updated in place, with the standardized
        'condition'
    """
    raw_condition = data.get("condition")
    # Scraped records may carry None (or a non-string) here; coerce instead
    # of failing on .replace().
    condition_text = "" if raw_condition is None else str(raw_condition)
    condition_text = condition_text.replace("\n", " ").strip().lower()

    # Order matters: "Unopened" must be tested before "Opened", which is a
    # substring of it.
    condition_keywords = ("Unopened", "Sealed", "Opened", "New", "Used")

    data["condition"] = next(
        (
            keyword
            for keyword in condition_keywords
            if keyword.lower() in condition_text
        ),
        "Unknown",
    )
    return data


In [None]:
import math

def clean_quantity(data, apply_log=False):
    """
    Converts the 'quantity' field to an integer.

    Surrounding whitespace and thousands separators (commas) are ignored,
    so " 1,200 " becomes 1200. If the field is missing or cannot be parsed
    as an integer, a warning is printed and the quantity is set to 0.
    Optionally applies a natural-log transform to positive quantities; in
    that case the stored value is a float rounded to 4 decimal places.
    Quantities of 0 (including failed conversions) are never transformed.

    :param data: Dictionary containing the 'quantity' field.
    :param apply_log: Boolean to indicate whether to apply a logarithm transform.
    :return: The same dictionary, updated in place, with cleaned 'quantity'.
    """
    quantity_str = str(data.get("quantity", "")).strip().replace(",", "")
    try:
        quantity = int(quantity_str)
    except (ValueError, TypeError) as e:
        print(f"Warning: {e}. Setting quantity to 0.")
        quantity = 0

    # log is undefined for non-positive values, so those are left untouched.
    if apply_log and quantity > 0:
        quantity = round(math.log(quantity), 4)

    data["quantity"] = quantity
    return data


In [None]:
from datetime import datetime

def clean_date(data):
    """
    Converts the 'date' field from a string like "2/15/25" to a datetime object.
    Also extracts additional features such as day_of_year and month.

    A value that is already a datetime is kept as is (so cleaning a record
    twice does not lose its date). If the field is missing, empty, or does
    not match the month/day/two-digit-year format, a warning is printed and
    'date', 'day_of_year' and 'month' are all set to None.

    :param data: Dictionary containing the 'date' field.
    :return: Updated dictionary with 'date' as a datetime object and extra date features.
    """
    raw_date = data.get("date")
    try:
        if isinstance(raw_date, datetime):
            dt = raw_date
        else:
            date_str = "" if raw_date is None else str(raw_date).strip()
            if not date_str:
                raise ValueError("Empty date string")
            dt = datetime.strptime(date_str, "%m/%d/%y")

        # Computed inside the try so a datetime-like value that cannot
        # report these fields falls through to the failure path below.
        day_of_year = dt.timetuple().tm_yday
        month = dt.month
    except (ValueError, TypeError) as e:
        print(f"Warning: {e}. Date conversion failed for: {raw_date}")
        dt = day_of_year = month = None

    data["date"] = dt
    data["day_of_year"] = day_of_year
    data["month"] = month
    return data

In [None]:

def cleaning_process(market_history):
    """Run every field cleaner over each record of one market history.

    Each record is passed, in order, to the price, condition, quantity
    (without log transform) and date cleaners, which update it in place.

    :param market_history: Iterable of record dictionaries.
    :return: The same ``market_history`` object that was passed in.
    """
    for record in market_history:
        for clean_field in (clean_price, clean_condition, clean_quantity, clean_date):
            clean_field(record)
    return market_history

In [None]:
import pandas as pd
import json

# Load and prepare data
def read_json_file(filepath):
    """Read and parse the JSON document stored at ``filepath``.

    :param filepath: Path of the JSON file to read.
    :return: The decoded JSON value (whatever ``json.load`` produces).
    """
    # Decode explicitly as UTF-8 rather than with the platform's locale
    # default, so non-ASCII text reads the same on every machine.
    with open(filepath, 'r', encoding='utf-8') as f:
        return json.load(f)

# Load both scrape dumps; the second is merged into the first in place.
A = read_json_file('scraped_results.json')
B = read_json_file('scraped_results2.json')
A.extend(B)

# Clean every record of every entry in place.
for item in A:
    cleaning_process(item)

# Flatten to one row per record, tagging each record with the position of
# the entry it came from.
flattened_data = []
for item_index, item_data in enumerate(A):
    for record in item_data:
        record["item_id"] = item_index
    flattened_data.extend(item_data)

# Convert to DataFrame, ordered by date, with item_id re-encoded as
# category codes.
df = pd.DataFrame(flattened_data)
df['date'] = pd.to_datetime(df['date'])
df = df.sort_values('date')
df['item_id'] = df['item_id'].astype('category').cat.codes


In [None]:
import numpy as np

def add_temporal_features(df):
    """Add calendar and cyclical-seasonality columns derived from df['date'].

    Writes day_of_year, month, day_of_week and week_of_year (ISO week), then
    sine/cosine encodings of day_of_week (period 7) and day_of_year
    (period 365). The frame is modified in place and returned.
    """
    when = df['date'].dt
    calendar_parts = {
        'day_of_year': when.dayofyear,
        'month': when.month,
        'day_of_week': when.dayofweek,
        'week_of_year': when.isocalendar().week,
    }
    for column, values in calendar_parts.items():
        df[column] = values

    # Fourier terms for seasonality: (output prefix, source column, period)
    cycles = (
        ('weekly', 'day_of_week', 7),
        ('yearly', 'day_of_year', 365),
    )
    for prefix, source, period in cycles:
        angle = 2 * np.pi * df[source] / period
        df[f'{prefix}_sin'] = np.sin(angle)
        df[f'{prefix}_cos'] = np.cos(angle)

    return df

def add_lag_features(df):
    """Add each item's price from 1, 2 and 7 rows earlier as price_lag_N.

    Lags are taken within each item_id group in the frame's current row
    order; rows without enough history get NaN. Modifies and returns df.
    """
    for lag in (1, 2, 7):
        df[f'price_lag_{lag}'] = df.groupby('item_id')['price'].shift(lag)
    return df

def add_differenced_features(df):
    """Add per-item price changes over 1 and 7 rows, plus 1-row percent change.

    Differences are taken within each item_id group in the frame's current
    row order. Modifies and returns df.
    """
    # Absolute change versus 1 row back and 7 rows back
    for periods in (1, 7):
        df[f'price_diff_{periods}'] = df.groupby('item_id')['price'].diff(periods)

    # Relative change versus the previous row
    df['price_pct_change_1'] = df.groupby('item_id')['price'].pct_change(1)
    return df

def add_rolling_features(df):
    """Add per-item rolling mean and standard deviation of price.

    For each window of 3, 7, 14 and 30 rows, writes rolling_avg_N and
    rolling_std_N, computed within each item_id group with min_periods=1
    (the window includes the current row). Modifies and returns df.
    """
    def _rolling_stat(span, stat):
        # Build the per-group function computing one statistic over `span` rows.
        def _per_item(prices):
            return getattr(prices.rolling(window=span, min_periods=1), stat)()
        return _per_item

    for span in (3, 7, 14, 30):
        for label, stat in (('avg', 'mean'), ('std', 'std')):
            df[f'rolling_{label}_{span}'] = df.groupby('item_id')['price'].transform(
                _rolling_stat(span, stat)
            )
    return df

def add_mean_reversion_features(df):
    """Add columns measuring how far price sits from its recent levels.

    Requires the rolling_avg_7 and rolling_avg_30 columns to exist already.
    Writes deviation_from_7day, deviation_from_30day, price_zscore and
    price_percentile_30; the last two are computed within each item_id
    group over a 30-row window. Modifies and returns df.
    """
    # Deviation from rolling averages
    for span in (7, 30):
        df[f'deviation_from_{span}day'] = df['price'] - df[f'rolling_avg_{span}']

    def _rolling_zscore(prices):
        # Current price standardized by the trailing 30-row mean and std.
        recent = prices.rolling(window=30, min_periods=1)
        return (prices - recent.mean()) / recent.std()

    def _share_below_latest(window_prices):
        # Fraction of the window strictly below its latest value; a
        # one-row window has nothing to compare against and gets 0.5.
        if len(window_prices) <= 1:
            return 0.5
        return np.sum(window_prices < window_prices.iloc[-1]) / len(window_prices)

    def _rolling_percentile(prices):
        return prices.rolling(window=30, min_periods=1).apply(_share_below_latest)

    # Z-score of current price
    df['price_zscore'] = df.groupby('item_id')['price'].transform(_rolling_zscore)

    # Price position within recent range
    df['price_percentile_30'] = df.groupby('item_id')['price'].transform(
        _rolling_percentile
    )
    return df

# Run the feature-engineering stages in order. The mean-reversion stage
# reads the rolling averages, so it has to come after the rolling stage.
df = (
    df.pipe(add_temporal_features)
    .pipe(add_lag_features)
    .pipe(add_differenced_features)
    .pipe(add_rolling_features)
    .pipe(add_mean_reversion_features)
    # Lag, difference and rolling-std columns are NaN where an item has too
    # little history; drop every row that has a NaN in any column.
    .dropna()
)


In [None]:
from sklearn.model_selection import TimeSeriesSplit
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import joblib
# Model inputs, grouped by the feature-engineering stage that produces them.
feature_columns = (
    ['quantity']
    # calendar fields
    + ['day_of_year', 'month', 'day_of_week', 'week_of_year']
    # cyclical encodings
    + ['weekly_sin', 'weekly_cos', 'yearly_sin', 'yearly_cos']
    # lagged prices
    + ['price_lag_1', 'price_lag_2', 'price_lag_7']
    # price changes
    + ['price_diff_1', 'price_diff_7', 'price_pct_change_1']
    # rolling means (all four windows)
    + ['rolling_avg_3', 'rolling_avg_7', 'rolling_avg_14', 'rolling_avg_30']
    # rolling standard deviations (7- and 30-row windows only)
    + ['rolling_std_7', 'rolling_std_30']
    # mean-reversion measures
    + ['deviation_from_7day', 'deviation_from_30day']
    + ['price_zscore', 'price_percentile_30']
)

# Column the model is trained to predict
target_column = 'price'

# Feature matrix and target vector, in the frame's current row order
X = df.loc[:, feature_columns]
y = df[target_column]

# Time-Based Validation with prediction adjustment
def time_based_validation(X, y, n_splits=5):
    """Perform time-based validation using walk-forward splits.

    Rows are split with ``TimeSeriesSplit`` in their existing order, so X and
    y must already be sorted chronologically. On every fold a fresh random
    forest is fitted on the training rows and scored on the test rows twice:
    once with its raw predictions and once after a mean-reversion
    adjustment that multiplies each prediction by
    ``1 - tanh(deviation_from_7day / max(rolling_std_7, 0.1))``, a factor
    between 0 and 2.

    NOTE(review): if 'deviation_from_7day' is computed from the same row's
    target value (as the feature code elsewhere in this file appears to do),
    the reported scores are optimistic - confirm before relying on them.

    :param X: Feature DataFrame; must contain the 'deviation_from_7day' and
        'rolling_std_7' columns.
    :param y: Target Series aligned row-for-row with X.
    :param n_splits: Number of walk-forward folds (default 5).
    :return: Tuple ``(results, model, last_test_index)`` - a list with two
        dicts per fold ('raw' then 'adjusted', each holding fold, type, mae,
        mse, r2 and up_percentage), the model fitted on the final fold, and
        the positional test indices of the final fold.
    """
    tscv = TimeSeriesSplit(n_splits=n_splits)
    results = []
    # Only the most recent fold's model is returned, so earlier ones are
    # not kept alive.
    model = None
    last_test_index = None

    for fold, (train_index, test_index) in enumerate(tscv.split(X), start=1):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        # Train Random Forest model
        model = RandomForestRegressor(
            n_estimators=200,
            max_depth=10,
            min_samples_split=5,
            random_state=42,
            n_jobs=-1
        )
        model.fit(X_train, y_train)

        # Make predictions
        y_pred = model.predict(X_test)

        # Apply mean reversion adjustment: shrink predictions for rows above
        # their 7-row average and inflate those below it. tanh is odd, so a
        # single expression covers both signs of the deviation. The 0.1
        # floor keeps the ratio bounded when recent prices barely moved.
        deviation = X_test['deviation_from_7day'].to_numpy()
        volatility = X_test['rolling_std_7'].clip(lower=0.1).to_numpy()
        adjustment_factor = 1 - np.tanh(deviation / volatility)
        y_pred_adjusted = y_pred * adjustment_factor

        # Evaluate both raw and adjusted predictions
        for pred, label in ((y_pred, 'raw'), (y_pred_adjusted, 'adjusted')):
            results.append({
                'fold': fold,
                'type': label,
                'mae': mean_absolute_error(y_test, pred),
                'mse': mean_squared_error(y_test, pred),
                'r2': r2_score(y_test, pred),
                # Share of consecutive test-row predictions that increase
                'up_percentage': np.mean(np.diff(pred) > 0)
            })

        last_test_index = test_index

    return results, model, last_test_index

# Run time-based validation; `model` is the one fitted on the final fold.
validation_results, model, last_test_index = time_based_validation(X, y)

# Print validation results. Entries come in (raw, adjusted) pairs, one pair
# per fold, so walk the even and odd positions side by side.
print("\nValidation Results:")
for raw, adj in zip(validation_results[0::2], validation_results[1::2]):
    print(f"\nFold {raw['fold']}:")
    print(f"  Raw:      MAE = {raw['mae']:.2f}, MSE = {raw['mse']:.2f}, R² = {raw['r2']:.4f}, Up% = {raw['up_percentage']:.2f}")
    print(f"  Adjusted: MAE = {adj['mae']:.2f}, MSE = {adj['mse']:.2f}, R² = {adj['r2']:.4f}, Up% = {adj['up_percentage']:.2f}")

# Save the trained model
joblib.dump(model, "pokemon_forecaster_v3.pkl")
