In [1]:
from __future__ import annotations


import argparse
import datetime as dt
import logging
import os
from dataclasses import dataclass
from pathlib import Path
from typing import Dict, Iterable, List, Optional, Tuple


import numpy as np
import pandas as pd
from scipy import sparse
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

In [2]:
# -----------------------------------------------------------------------------
# Logging
# -----------------------------------------------------------------------------


# force=True (Python 3.8+) removes any handlers already attached to the root
# logger before applying this configuration. Without it, basicConfig() is a
# silent no-op whenever the root logger is already configured -- e.g. when this
# cell is re-run after editing the level/format, or when another library has
# installed a handler first.
logging.basicConfig(
	level=logging.INFO,
	format="%(asctime)s [%(levelname)s] %(message)s",
	force=True,
)
# Module-level logger; named after the executing module (``__main__`` when the
# notebook is run interactively).
LOGGER = logging.getLogger(__name__)

In [None]:
# -----------------------------------------------------------------------------
# Data Loading & Basic Cleaning
# -----------------------------------------------------------------------------


# Each entry is a single label made of two column names joined by one space.
# NOTE(review): presumably these are the headers that fixed-width parsing fuses
# together when adjacent columns are not separated widely enough -- confirm
# against the code that consumes this list.
EXPECTED_STUCK_COLS: List[str] = [
	"OrderGroupId CustomerId",
	"MarketId LineItemId",
	"Epi_TaxCategoryId OrderCount",
]


def load_raw_dataframe(file_path: Path) -> pd.DataFrame:
	"""Load the raw data file, trying fixed-width parsing before CSV.

	``pd.read_fwf`` is attempted first to match the original notebook. If it
	raises, or if it yields at most one column (a sign that the file is not
	really fixed-width), the file is re-read with ``pd.read_csv``.

	Args:
		file_path: Location of the raw file. A ``pathlib.Path`` is expected;
			plain strings and other path-like objects are coerced to ``Path``.

	Returns:
		The parsed DataFrame from whichever reader was used.

	Raises:
		FileNotFoundError: If ``file_path`` does not point to an existing
			regular file (the path is missing or is a directory).
		Exception: Whatever ``pd.read_csv`` raises if the fallback fails too.
	"""
	# Coerce so that callers passing a plain string do not crash on `.is_file()`.
	path = Path(file_path)
	# `is_file()` rather than `exists()`: a directory cannot be parsed either,
	# so reject it here with a clear message instead of failing inside pandas.
	if not path.is_file():
		raise FileNotFoundError(
			f"File not found: {path}. Please check the path and ensure the file exists."
		)

	try:
		df_fwf = pd.read_fwf(path, encoding="utf-8")
	except Exception as exc:  # noqa: BLE001 - any FWF failure triggers the CSV fallback
		fallback_reason = str(exc)
	else:
		# Heuristic: a delimited file read as fixed-width usually collapses
		# into a single column, so more than one column counts as success.
		if df_fwf.shape[1] > 1:
			LOGGER.info("Loaded data via read_fwf: %s rows, %s cols", *df_fwf.shape)
			return df_fwf
		fallback_reason = "FWF produced single column; falling back to CSV."

	# The fallback runs outside the `except` block so that a CSV failure is
	# reported on its own rather than chained to the earlier FWF error.
	LOGGER.warning("read_fwf failed (%s). Falling back to read_csv...", fallback_reason)
	df_csv = pd.read_csv(path, encoding="utf-8")
	LOGGER.info("Loaded data via read_csv: %s rows, %s cols", *df_csv.shape)
	return df_csv