In [24]:
import pandas as pd
import numpy as np

In [34]:
raw_df = pd.read_csv('./test_datasets/house_prices.csv')

In [35]:
def detect_time_dependency(df, date_min_unique=3):
    datetime_cols = []

    # Identify datetime columns
    for col in df.columns:
        try:
            pd.to_datetime(df[col], errors='raise')
            datetime_cols.append(col)
        except:
            continue

    if not datetime_cols:
        return {
            "is_time_dependent": False,
            "reason": "No datetime-like columns found.",
            "datetime_columns": []
        }

    # Analyze primary datetime column
    time_col = datetime_cols[0]
    converted = pd.to_datetime(df[time_col], errors="coerce")

    unique_dates = converted.nunique()
    total_rows = len(df)

    # RULE 1 — dataset MUST have more than 1 date
    if unique_dates <= date_min_unique:
        return {
            "is_time_dependent": False,
            "reason": f"Datetime column '{time_col}' has too few unique dates ({unique_dates}).",
            "datetime_columns": datetime_cols
        }

    # RULE 2 — detect monotonic progression within groups (multi-location)
    monotonic_ratio = None
    if "Location" in df.columns:
        monotonic_flags = []
        for loc, group in df.groupby("Location"):
            series = pd.to_datetime(group[time_col], errors="coerce")
            ratio = (series.diff().dropna() >= pd.Timedelta(0)).mean()
            monotonic_flags.append(ratio)
        monotonic_ratio = np.mean([x for x in monotonic_flags if pd.notnull(x)])

    return {
        "is_time_dependent": True,
        "reason": (
            f"Dataset contains datetime '{time_col}' with {unique_dates} unique dates. "
            "Repeated timestamps detected → likely multi-location or panel time series."
        ),
        "datetime_columns": datetime_cols,
        "monotonic_ratio": monotonic_ratio
    }

In [36]:
detect_time_dependency(raw_df)

  pd.to_datetime(df[col], errors='raise')
  pd.to_datetime(df[col], errors='raise')
  pd.to_datetime(df[col], errors='raise')
  pd.to_datetime(df[col], errors='raise')
  pd.to_datetime(df[col], errors='raise')
  pd.to_datetime(df[col], errors='raise')


{'is_time_dependent': True,
 'reason': "Dataset contains datetime 'date' with 70 unique dates. Repeated timestamps detected → likely multi-location or panel time series.",
 'datetime_columns': ['date',
  'price',
  'bedrooms',
  'bathrooms',
  'sqft_living',
  'sqft_lot',
  'floors',
  'waterfront',
  'view',
  'condition',
  'sqft_above',
  'sqft_basement',
  'yr_built',
  'yr_renovated',
  'Phone Number'],
 'monotonic_ratio': None}