In [1]:
import os
from pathlib import Path
import json
import polars as pl
import pandas as pd
import numpy as np

# Directory layout: everything is resolved relative to the parent of the
# current working directory, so the result depends on where the kernel runs.
ROOT = Path.cwd().parent
DATA_DIR = ROOT.joinpath("data")
OUTPUT_DIR = ROOT.joinpath("output")


In [2]:
# Items whose popularity_score exceeds this value are treated as "high engagement".
MIN_POPULARITY_SCORE = 50_000

item_stats = pl.read_parquet(OUTPUT_DIR / 'item_popularity_stats.parquet')

# Find items with high engagement but 0 orders, highest popularity_score first
suspicious_items = item_stats.filter(
    (pl.col('orders') == 0) &
    (pl.col('popularity_score') > MIN_POPULARITY_SCORE)
).sort('popularity_score', descending=True)

print(f"Found {len(suspicious_items)} items with high engagement but 0 orders\n")
print("Top 10 suspicious items:")
print(suspicious_items.head(10))

# Indexing row 0 of an empty frame fails with an opaque out-of-bounds error;
# stop here with an explicit message instead, since every later step needs
# a concrete item_id to investigate.
if suspicious_items.is_empty():
    raise ValueError(
        "No items with 0 orders and popularity_score > "
        f"{MIN_POPULARITY_SCORE}; nothing to investigate."
    )

# Focus on the top suspicious item (first row after the descending sort)
top_suspicious_item = suspicious_items[0, 'item_id']
print(f"\n\nInvestigating item: {top_suspicious_item}")

suspicious_item_stats = item_stats.filter(pl.col('item_id') == top_suspicious_item)
print("\nDetailed stats:")
print(suspicious_item_stats)

Found 6 items with high engagement but 0 orders

Top 10 suspicious items:
shape: (6, 6)
┌─────────┬────────────────────┬────────┬───────┬────────┬──────────────────┐
│ item_id ┆ total_interactions ┆ clicks ┆ carts ┆ orders ┆ popularity_score │
│ ---     ┆ ---                ┆ ---    ┆ ---   ┆ ---    ┆ ---              │
│ i64     ┆ u32                ┆ u32    ┆ u32   ┆ u32    ┆ u32              │
╞═════════╪════════════════════╪════════╪═══════╪════════╪══════════════════╡
│ 485256  ┆ 126836             ┆ 97154  ┆ 29682 ┆ 0      ┆ 156518           │
│ 1502122 ┆ 78203              ┆ 73805  ┆ 4398  ┆ 0      ┆ 82601            │
│ 322370  ┆ 74027              ┆ 66729  ┆ 7298  ┆ 0      ┆ 81325            │
│ 152547  ┆ 51153              ┆ 32617  ┆ 18536 ┆ 0      ┆ 69689            │
│ 95488   ┆ 62185              ┆ 62023  ┆ 162   ┆ 0      ┆ 62347            │
│ 33343   ┆ 43161              ┆ 28435  ┆ 14726 ┆ 0      ┆ 57887            │
└─────────┴────────────────────┴────────┴───────┴─────

In [3]:
# Lazy scan: no rows are read until .collect() is called below.
df = pl.scan_parquet(DATA_DIR / 'train_full_processed.parquet')

# Materialise only the events that belong to the item under investigation
item_events = df.filter(pl.col('item_id') == top_suspicious_item).collect()

print(f"Total events for item {top_suspicious_item}: {len(item_events):,}\n")

# Number of events per event_type, most frequent first
event_breakdown = (
    item_events
    .group_by('event_type')
    .agg(pl.len().alias('count'))
    .sort('count', descending=True)
)

print("Event breakdown:", event_breakdown, sep="\n")

# How many distinct sessions touched this item
unique_sessions = item_events.get_column('session_id').n_unique()
print(f"\nUnique sessions that interacted with this item: {unique_sessions:,}")

# Show the first 20 raw events for a quick look at the data
print("\nSample events:", item_events.head(20), sep="\n")

Total events for item 485256: 126,836

Event breakdown:
shape: (2, 2)
┌────────────┬───────┐
│ event_type ┆ count │
│ ---        ┆ ---   │
│ cat        ┆ u32   │
╞════════════╪═══════╡
│ clicks     ┆ 97154 │
│ carts      ┆ 29682 │
└────────────┴───────┘

Unique sessions that interacted with this item: 27,497

Sample events:
shape: (20, 4)
┌────────────┬─────────┬─────────────────────────┬────────────┐
│ session_id ┆ item_id ┆ event_time              ┆ event_type │
│ ---        ┆ ---     ┆ ---                     ┆ ---        │
│ i64        ┆ i64     ┆ datetime[ms]            ┆ cat        │
╞════════════╪═════════╪═════════════════════════╪════════════╡
│ 87         ┆ 485256  ┆ 2022-08-23 07:47:36.695 ┆ clicks     │
│ 87         ┆ 485256  ┆ 2022-08-23 07:48:15.085 ┆ clicks     │
│ 87         ┆ 485256  ┆ 2022-08-23 07:48:17.243 ┆ clicks     │
│ 87         ┆ 485256  ┆ 2022-08-23 07:48:24.735 ┆ carts      │
│ 87         ┆ 485256  ┆ 2022-08-23 07:48:32.950 ┆ clicks     │
│ …          ┆ …   

In [4]:
# NOTE(review): this cell repeats the per-item event breakdown already run in
# cell In [3] (same inputs, same printed output). It is kept behaviour-identical
# here; consider deleting it if nothing downstream needs the re-run.

# --- compute -----------------------------------------------------------------
df = pl.scan_parquet(DATA_DIR / 'train_full_processed.parquet')

# Collect only the rows for the item being investigated
item_events = (
    df
    .filter(pl.col('item_id') == top_suspicious_item)
    .collect()
)

# Event counts grouped by type, largest count first
event_breakdown = item_events.group_by('event_type').agg(
    pl.len().alias('count')
).sort('count', descending=True)

# Distinct session_id values among this item's events
unique_sessions = item_events.select(pl.col('session_id').n_unique()).item()

# --- report ------------------------------------------------------------------
print(f"Total events for item {top_suspicious_item}: {len(item_events):,}\n")

print("Event breakdown:")
print(event_breakdown)

print(f"\nUnique sessions that interacted with this item: {unique_sessions:,}")

print("\nSample events:")
print(item_events.head(20))

Total events for item 485256: 126,836

Event breakdown:
shape: (2, 2)
┌────────────┬───────┐
│ event_type ┆ count │
│ ---        ┆ ---   │
│ cat        ┆ u32   │
╞════════════╪═══════╡
│ clicks     ┆ 97154 │
│ carts      ┆ 29682 │
└────────────┴───────┘

Unique sessions that interacted with this item: 27,497

Sample events:
shape: (20, 4)
┌────────────┬─────────┬─────────────────────────┬────────────┐
│ session_id ┆ item_id ┆ event_time              ┆ event_type │
│ ---        ┆ ---     ┆ ---                     ┆ ---        │
│ i64        ┆ i64     ┆ datetime[ms]            ┆ cat        │
╞════════════╪═════════╪═════════════════════════╪════════════╡
│ 87         ┆ 485256  ┆ 2022-08-23 07:47:36.695 ┆ clicks     │
│ 87         ┆ 485256  ┆ 2022-08-23 07:48:15.085 ┆ clicks     │
│ 87         ┆ 485256  ┆ 2022-08-23 07:48:17.243 ┆ clicks     │
│ 87         ┆ 485256  ┆ 2022-08-23 07:48:24.735 ┆ carts      │
│ 87         ┆ 485256  ┆ 2022-08-23 07:48:32.950 ┆ clicks     │
│ …          ┆ …   