# Online Order Data Exploration
## Focus: Order Fulfillments & Order Tags Analysis

This notebook explores the Shopify online order data, specifically:
- **Order Fulfillments**: Stored as nested dictionary strings - need parsing
- **Order Tags**: Cryptic codes that need decoding

In [None]:
import ast
import json
import re
import warnings
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
# Suppress every warning for the rest of the session
warnings.filterwarnings('ignore')

# Notebook-wide pandas display options (None removes the respective limit)
for option_name, option_value in (
    ('display.max_columns', None),
    ('display.max_rows', 100),
    ('display.width', None),
):
    pd.set_option(option_name, option_value)

# Plot styling defaults
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (14, 6)

## 1. Order Fulfillments - Parsing Nested Data

The fulfillment data is stored as Python dictionary strings. We need to:
1. Parse the string into actual dictionaries
2. Extract key fields (status, dates, tracking info)
3. Analyze shipping performance

In [None]:
# Read the raw fulfillment export and show a quick overview of it.
# 'utf-8-sig' strips a leading byte-order mark if the file has one.
print("Loading order fulfillments data...")
fulfillments_csv = 'data/input/Online/order_fulfillments.csv'
fulfillments_raw = pd.read_csv(fulfillments_csv, encoding='utf-8-sig', low_memory=False)

raw_row_count = len(fulfillments_raw)
raw_column_names = fulfillments_raw.columns.tolist()
print(f"Total records: {raw_row_count:,}")
print(f"Columns: {raw_column_names}")
print("\nFirst few rows (raw):")
fulfillments_raw.head()

In [None]:
# Helpers to safely parse the fulfillment data string.
#
# The payload is the repr() of a Python dict, so it embeds constructor calls
# that ast.literal_eval cannot evaluate. The patterns are compiled once here
# because parse_fulfillment runs on every row.
#
# datetime's repr omits trailing zero second/microsecond fields, so a call
# can carry anywhere from 3 to 7 positional integers, not always 6.
_DATETIME_REPR = re.compile(r'datetime\.datetime\(\s*(\d+(?:\s*,\s*\d+){2,6})\s*\)')
_ARRAY_OPEN = re.compile(r'array\(\[')
_ARRAY_CLOSE = re.compile(r'\],\s*dtype=object\)')


def _datetime_repr_to_literal(match):
    """Turn one matched ``datetime.datetime(...)`` call into a quoted timestamp.

    Missing hour/minute/second fields default to 0. A microsecond field, when
    present, is dropped so every timestamp shares the same
    ``'YYYY-MM-DD HH:MM:SS'`` layout.
    """
    fields = [int(part) for part in match.group(1).split(',')]
    year, month, day, hour, minute, second = (fields + [0, 0, 0])[:6]
    return f"'{year}-{month:02d}-{day:02d} {hour:02d}:{minute:02d}:{second:02d}'"


def parse_fulfillment(data_str):
    """Parse the Python dict string into structured data.

    Parameters
    ----------
    data_str : str or missing value
        Text of a Python literal in which datetimes appear as
        ``datetime.datetime(...)`` and arrays as ``array([...], dtype=object)``.

    Returns
    -------
    object
        The evaluated literal (a dict for the expected payloads), with
        datetimes replaced by ``'YYYY-MM-DD HH:MM:SS'`` strings and arrays by
        lists. Returns ``{}`` when the input is missing, is not a string, or
        cannot be parsed.
    """
    # Covers None/NaN/pd.NA as well as any other non-text value.
    if not isinstance(data_str, str):
        return {}

    cleaned = _DATETIME_REPR.sub(_datetime_repr_to_literal, data_str)

    # Replace array([...], dtype=object) with a plain list
    cleaned = _ARRAY_OPEN.sub('[', cleaned)
    cleaned = _ARRAY_CLOSE.sub(']', cleaned)

    try:
        # literal_eval only accepts literals, so no code in the payload can run
        return ast.literal_eval(cleaned)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # Best effort: an unparseable payload becomes an empty dict
        return {}

# Parse every payload, then report how many non-missing payloads came back
# empty. parse_fulfillment returns {} on failure, so without this check
# unparseable rows would only show up later as all-None fields.
print("Parsing fulfillment data (this may take a moment)...")
fulfillments_raw['parsed_data'] = fulfillments_raw['fulfillment_data'].apply(parse_fulfillment)

empty_result_mask = (
    fulfillments_raw['fulfillment_data'].notna()
    & fulfillments_raw['parsed_data'].map(lambda parsed: parsed == {})
)
empty_result_count = int(empty_result_mask.sum())
if empty_result_count:
    print(f"WARNING: {empty_result_count:,} of {len(fulfillments_raw):,} records "
          f"parsed to an empty result (unparseable or empty payload)")

In [None]:
# Extract fields from parsed data
print("Extracting fields from parsed data...")


def _payload_field(payload, key, default=None):
    """Return ``payload[key]`` when payload is a dict that has it, else ``default``."""
    return payload.get(key, default) if isinstance(payload, dict) else default


def _first_tracking_value(tracking_info, key):
    """Return ``key`` from the first tracking entry, or None when unavailable."""
    if isinstance(tracking_info, list) and len(tracking_info) > 0 and isinstance(tracking_info[0], dict):
        return tracking_info[0].get(key, None)
    return None


# Output column -> key inside the parsed payload (insertion order = column order)
PAYLOAD_FIELDS = {
    'fulfillment_id': 'id',
    'status': 'status',
    'display_status': 'displayStatus',
    'total_quantity': 'totalQuantity',
    'created_at': 'createdAt',
    'updated_at': 'updatedAt',
}

parsed_payloads = fulfillments_raw['parsed_data']
fulfillments = pd.DataFrame({'order_id': fulfillments_raw['order_id']})
for column_name, payload_key in PAYLOAD_FIELDS.items():
    fulfillments[column_name] = parsed_payloads.apply(_payload_field, args=(payload_key,))
# A fresh list per row so rows never share one default object
fulfillments['tracking_info'] = parsed_payloads.apply(
    lambda payload: _payload_field(payload, 'trackingInfo', [])
)

# Parse dates (values that cannot be parsed become NaT)
for date_column in ('created_at', 'updated_at'):
    fulfillments[date_column] = pd.to_datetime(fulfillments[date_column], errors='coerce')

# Extract tracking details from the first tracking entry, if any
fulfillments['has_tracking'] = fulfillments['tracking_info'].apply(
    lambda info: len(info) > 0 if isinstance(info, list) else False
)
fulfillments['tracking_number'] = fulfillments['tracking_info'].apply(_first_tracking_value, args=('number',))
fulfillments['carrier'] = fulfillments['tracking_info'].apply(_first_tracking_value, args=('company',))

# Calculate processing time (created to updated), in hours
fulfillments['processing_hours'] = (
    (fulfillments['updated_at'] - fulfillments['created_at']).dt.total_seconds() / 3600
)

print(f"✓ Extracted {len(fulfillments):,} fulfillment records")
fulfillments.head(10)

In [None]:
# Summary statistics: date range, status breakdowns, tracking coverage,
# carriers and quantity distribution.
print("FULFILLMENT DATA OVERVIEW")
print("=" * 70)

created_times = fulfillments['created_at']
first_created = created_times.min()
last_created = created_times.max()
print("\nDate Range:")
print(f"  Earliest fulfillment: {first_created}")
print(f"  Latest fulfillment: {last_created}")
print(f"  Span: {(last_created - first_created).days} days")

for heading, status_column in (
    ("Fulfillment Status", 'status'),
    ("Display Status", 'display_status'),
):
    print(f"\n{heading}:")
    print(fulfillments[status_column].value_counts())

print("\nTracking Information:")
tracked_count = fulfillments['has_tracking'].sum()
tracking_rate = 100 * tracked_count / len(fulfillments)
print(f"  Orders with tracking: {tracked_count:,} ({tracking_rate:.1f}%)")

print("\nCarriers Used:")
top_carriers = fulfillments['carrier'].value_counts().head(10)
print(top_carriers)

print("\nQuantity Distribution:")
quantity_summary = fulfillments['total_quantity'].describe()
print(quantity_summary)

In [None]:
# Processing time analysis
print("FULFILLMENT TIMING ANALYSIS")
print("="*70)

# Compute each statistic once instead of once per use
processing_hours = fulfillments['processing_hours']
mean_hours = processing_hours.mean()
median_hours = processing_hours.median()
min_hours = processing_hours.min()
max_hours = processing_hours.max()

print("\nProcessing Time (Created → Updated):")
print(f"  Mean: {mean_hours:.1f} hours ({mean_hours/24:.1f} days)")
print(f"  Median: {median_hours:.1f} hours ({median_hours/24:.1f} days)")
print(f"  Min: {min_hours:.1f} hours")
print(f"  Max: {max_hours:.1f} hours ({max_hours/24:.1f} days)")

# Categorize speed
same_day = (processing_hours <= 24).sum()
one_two_days = ((processing_hours > 24) & (processing_hours <= 48)).sum()
three_five_days = ((processing_hours > 48) & (processing_hours <= 120)).sum()
slow = (processing_hours > 120).sum()
# Rows without a processing time match none of the comparisons above; they are
# listed separately so the percentages still add up to 100%.
missing_timing = processing_hours.isna().sum()

speed_rows = [
    ("Same/Next Day (≤24h)", same_day),
    ("1-2 Days (24-48h)", one_two_days),
    ("3-5 Days (48-120h)", three_five_days),
    (">5 Days (>120h)", slow),
]
if missing_timing > 0:
    speed_rows.append(("Unknown (no processing time)", missing_timing))

total_fulfillments = len(fulfillments)
print("\nSpeed Categories:")
for label, count in speed_rows:
    print(f"  {label}: {count:,} ({100*count/total_fulfillments:.1f}%)")

In [None]:
# Visualizations: 2x2 overview of fulfillment timing, volume, carriers, status
MAX_HOURS_SHOWN = 168  # histogram cut-off: one week


def _plot_counts(counts, ax, **plot_kwargs):
    """Plot ``counts`` on ``ax``, or write 'No data' when the Series is empty.

    Plotting an empty Series raises in pandas, which would abort the whole
    figure when e.g. no fulfillment has carrier information.
    """
    if counts.empty:
        ax.text(0.5, 0.5, 'No data', ha='center', va='center', transform=ax.transAxes)
        return
    counts.plot(ax=ax, **plot_kwargs)


fig, axes = plt.subplots(2, 2, figsize=(16, 12))

# 1. Processing time distribution (rows without a processing time drop out here)
processing_capped = fulfillments[fulfillments['processing_hours'] <= MAX_HOURS_SHOWN]['processing_hours']
capped_median = processing_capped.median()
axes[0, 0].hist(processing_capped, bins=50, edgecolor='black', alpha=0.7, color='steelblue')
axes[0, 0].axvline(capped_median, color='red', linestyle='--',
                   label=f'Median: {capped_median:.1f}h')
axes[0, 0].set_xlabel('Processing Time (hours)')
axes[0, 0].set_ylabel('Count')
axes[0, 0].set_title('Fulfillment Processing Time Distribution (≤1 week)')
axes[0, 0].legend()

# 2. Fulfillments over time
monthly_fulfillments = fulfillments.groupby(fulfillments['created_at'].dt.to_period('M')).size()
_plot_counts(monthly_fulfillments, axes[0, 1], kind='bar', color='coral')
axes[0, 1].set_xlabel('Month')
axes[0, 1].set_ylabel('Number of Fulfillments')
axes[0, 1].set_title('Fulfillments Over Time')
axes[0, 1].tick_params(axis='x', rotation=45)

# 3. Carrier distribution
carrier_counts = fulfillments['carrier'].value_counts().head(10)
_plot_counts(carrier_counts, axes[1, 0], kind='barh', color='lightgreen')
axes[1, 0].set_xlabel('Number of Shipments')
axes[1, 0].set_title('Top 10 Carriers')
axes[1, 0].invert_yaxis()  # most frequent carrier on top

# 4. Status distribution
status_counts = fulfillments['display_status'].value_counts()
if status_counts.empty:
    axes[1, 1].text(0.5, 0.5, 'No data', ha='center', va='center', transform=axes[1, 1].transAxes)
else:
    axes[1, 1].pie(status_counts.values, labels=status_counts.index, autopct='%1.1f%%', startangle=90)
axes[1, 1].set_title('Fulfillment Status Distribution')

plt.tight_layout()
plt.show()

## 2. Order Tags Analysis

Tags appear to be cryptic codes. Let's analyze their patterns and frequency.

In [None]:
# Read the order tag export and print its basic cardinalities.
# 'utf-8-sig' strips a leading byte-order mark if the file has one.
print("Loading order tags data...")
order_tags_csv = 'data/input/Online/order_tags.csv'
order_tags = pd.read_csv(order_tags_csv, encoding='utf-8-sig', low_memory=False)

tag_row_count = len(order_tags)
tagged_order_count = order_tags['order_id'].nunique()
distinct_tag_count = order_tags['tag'].nunique()
print(f"Total tag records: {tag_row_count:,}")
print(f"Unique orders with tags: {tagged_order_count:,}")
print(f"Unique tags: {distinct_tag_count:,}")
print("\nFirst 20 rows:")
order_tags.head(20)

In [None]:
# Analyze tag patterns: frequency ranking of the 50 most common tags
print("ORDER TAGS ANALYSIS")
print("="*70)

print("\nTop 50 Most Common Tags:")
tag_counts = order_tags['tag'].value_counts().head(50)
total_tag_rows = len(order_tags)
for rank, (tag, count) in enumerate(tag_counts.items(), start=1):
    pct = 100 * count / total_tag_rows
    # str(): the ':30s' format spec raises on non-string values, and the
    # other tag cells already coerce tags to text the same way
    print(f"{rank:3d}. '{str(tag):30s}' | {count:>8,} orders ({pct:>5.2f}%)")

In [None]:
# Analyze tag characteristics
print("\nTAG PATTERN ANALYSIS")
print("="*70)

tag_values = order_tags['tag']
tag_text = tag_values.astype(str)
tag_present = tag_values.notna()

# Tag length distribution.
# astype(str) turns a missing tag into the text 'nan'; masking with
# tag_present keeps those rows from being measured as 3-character tags.
order_tags['tag_length'] = tag_text.str.len().where(tag_present)
print("\nTag Length Statistics:")
print(f"  Mean: {order_tags['tag_length'].mean():.1f} characters")
print(f"  Median: {order_tags['tag_length'].median():.0f} characters")
print(f"  Min: {order_tags['tag_length'].min():.0f} characters")
print(f"  Max: {order_tags['tag_length'].max():.0f} characters")

# Identify tag types
print("\nTag Type Patterns:")

# Short codes (1-3 chars)
short_codes = order_tags[order_tags['tag_length'] <= 3]
# Descriptive tags (>10 chars)
descriptive = order_tags[order_tags['tag_length'] > 10]
# Tags with special characters
special_char_tags = order_tags[tag_values.str.contains(r'[.-]', na=False)]
# Numeric tags. Testing the text form yields a pure boolean mask; the bare
# .str.isnumeric() yields NaN for missing tags, which boolean indexing rejects.
numeric_tags = order_tags[tag_text.str.isnumeric() & tag_present]

# (heading, matching rows, label of the examples line, number of examples)
tag_groups = [
    ("  Short codes (1-3 chars)", short_codes, "Examples", 10),
    ("\n  Descriptive tags (>10 chars)", descriptive, "Examples", 10),
    ("\n  Tags with special chars (. or -)", special_char_tags, "Examples", 10),
    ("\n  Numeric tags", numeric_tags, "Most common", 5),
]
total_tag_rows = len(order_tags)
for heading, subset, examples_label, top_n in tag_groups:
    print(f"{heading}: {len(subset):,} ({100*len(subset)/total_tag_rows:.1f}%)")
    print(f"    {examples_label}: {subset['tag'].value_counts().head(top_n).to_dict()}")

In [None]:
# Try to categorize tags by pattern
print("\nTAG CATEGORIZATION (Hypothesis-based)")
print("="*70)

PROMO_KEYWORDS = ('DN-', 'Welcome', 'Subscription', 'Promo', 'Sale', 'Offer')


def _categorize_tag(tag):
    """Return the hypothesised category name for a single tag value.

    Rules are checked in order; the first match wins.
    """
    tag_str = str(tag)
    # Campaign/Promo patterns
    if any(keyword in tag_str for keyword in PROMO_KEYWORDS):
        return 'Campaign/Promo'
    # Location codes (start with dot)
    if tag_str.startswith('.'):
        return 'Location Codes'
    # Short alphabetic codes (1-3 letters)
    if len(tag_str) <= 3 and tag_str.isalpha():
        return 'Initials/Staff Codes'
    # Everything else
    return 'Numeric/Other'


categories = {
    'Initials/Staff Codes': [],  # Short 1-3 letter codes (ah, cs, hkb, etc.)
    'Location Codes': [],  # Codes starting with a dot (.lk, .jh, .nc, .kr)
    'Campaign/Promo': [],  # Contains 'DN-', 'Welcome', 'Subscription', ...
    'Numeric/Other': []  # Numbers like '0' and anything unmatched
}

# dropna(): a missing tag would otherwise become the text 'nan', be filed as
# a 3-letter staff code, and make isin() below match every untagged row.
for tag in order_tags['tag'].dropna().unique():
    categories[_categorize_tag(tag)].append(tag)

# Display categorization with counts
for category, tags in categories.items():
    if tags:
        tag_orders = order_tags[order_tags['tag'].isin(tags)]
        print(f"\n{category} ({len(tags)} unique tags, {len(tag_orders):,} orders):")
        print("-" * 70)
        tag_freq = tag_orders['tag'].value_counts().head(15)
        for tag, count in tag_freq.items():
            # str(): the ':20s' format spec raises on non-string values
            print(f"  '{str(tag):20s}': {count:,} orders")

In [None]:
# Visualize tag distribution: most frequent tags next to per-category totals
fig, axes = plt.subplots(1, 2, figsize=(16, 6))
top_tags_ax, category_ax = axes

# Left panel: top 20 tags, most frequent at the top
top_20_tags = order_tags['tag'].value_counts().head(20)
top_20_tags.plot(kind='barh', ax=top_tags_ax, color='steelblue')
top_tags_ax.set_xlabel('Number of Orders')
top_tags_ax.set_title('Top 20 Most Common Order Tags')
top_tags_ax.invert_yaxis()

# Right panel: number of tag rows per non-empty category
category_counts = {}
for category_name, category_tags in categories.items():
    if not category_tags:
        continue
    in_category = order_tags['tag'].isin(category_tags)
    category_counts[category_name] = int(in_category.sum())
category_series = pd.Series(category_counts).sort_values()
category_series.plot(kind='barh', ax=category_ax, color='coral')
category_ax.set_xlabel('Number of Orders')
category_ax.set_title('Orders by Tag Category')

plt.tight_layout()
plt.show()

## 3. Cross-Analysis: Tags + Fulfillments

In [None]:
# Merge tags with fulfillments
print("CROSS-ANALYSIS: Tags vs Fulfillment Performance")
print("="*70)

MIN_ORDERS_PER_TAG = 20  # tags with fewer rows are too noisy to compare
TOP_N_TAGS = 15          # number of tags shown in the charts

# Merge on order_id; the inner join keeps only orders present in both tables
merged = fulfillments.merge(order_tags, on='order_id', how='inner')

print(f"\nMerged dataset: {len(merged):,} records")
print(f"Orders with both fulfillment and tag data: {merged['order_id'].nunique():,}")

if len(merged) > 0:
    # Analyze processing time by tag
    print(f"\nAverage Processing Time by Tag (min {MIN_ORDERS_PER_TAG} orders):")
    print("="*70)

    tag_performance = merged.groupby('tag').agg({
        'processing_hours': ['mean', 'median', 'count'],
        'has_tracking': 'mean'
    })
    tag_performance.columns = ['Avg Hours', 'Median Hours', 'Order Count', 'Tracking %']

    # Round only after scaling: rounding the 0-1 fraction to two decimals
    # first would limit 'Tracking %' to whole percentages.
    tag_performance['Tracking %'] = (tag_performance['Tracking %'] * 100).round(1)
    tag_performance[['Avg Hours', 'Median Hours']] = tag_performance[['Avg Hours', 'Median Hours']].round(2)

    tag_performance = tag_performance[tag_performance['Order Count'] >= MIN_ORDERS_PER_TAG]
    tag_performance = tag_performance.sort_values('Avg Hours', ascending=False)

    print(tag_performance.head(20))

    # Visualize
    if len(tag_performance) > 0:
        fig, axes = plt.subplots(1, 2, figsize=(16, 6))
        slowest_tags = tag_performance.head(TOP_N_TAGS)

        # Processing time by tag
        slowest_tags['Avg Hours'].plot(kind='barh', ax=axes[0], color='coral')
        axes[0].set_xlabel('Average Processing Time (hours)')
        axes[0].set_title(f'Fulfillment Speed by Tag (Top {TOP_N_TAGS} slowest, min {MIN_ORDERS_PER_TAG} orders)')
        axes[0].invert_yaxis()

        # Tracking rate by tag (same tags, same order as the left panel)
        slowest_tags['Tracking %'].plot(kind='barh', ax=axes[1], color='lightgreen')
        axes[1].set_xlabel('Tracking Rate (%)')
        axes[1].set_title(f'Tracking Rate by Tag (Top {TOP_N_TAGS} slowest tags)')
        axes[1].invert_yaxis()

        plt.tight_layout()
        plt.show()

## 4. Key Insights & AI Opportunities

In [None]:
# Final summary: headline numbers from the fulfillment and tag analyses,
# followed by a static list of follow-up ideas.
print("="*70)
print("KEY INSIGHTS - ONLINE OPERATIONS")
print("="*70)

# Compute each headline figure once
mean_hours = fulfillments['processing_hours'].mean()
median_hours = fulfillments['processing_hours'].median()
carrier_modes = fulfillments['carrier'].mode()
primary_carrier = carrier_modes.iloc[0] if len(carrier_modes) > 0 else 'N/A'

print("\n📦 FULFILLMENT INSIGHTS:")
print(f"  • Total fulfillments analyzed: {len(fulfillments):,}")
print(f"  • Average processing time: {mean_hours:.1f} hours ({mean_hours/24:.1f} days)")
print(f"  • Median processing time: {median_hours:.1f} hours ({median_hours/24:.1f} days)")
print(f"  • Orders with tracking: {100*fulfillments['has_tracking'].mean():.1f}%")
print(f"  • Primary carrier: {primary_carrier}")

tag_frequency = order_tags['tag'].value_counts()

print("\n🏷️  ORDER TAG INSIGHTS:")
print(f"  • Total unique tags: {order_tags['tag'].nunique():,}")
# An order can appear on several tag rows, so distinct orders and tag rows
# are reported separately instead of labelling the row count as orders.
print(f"  • Total tagged orders: {order_tags['order_id'].nunique():,}")
print(f"  • Total tag records: {len(order_tags):,}")
if len(tag_frequency) > 0:
    print(f"  • Most common tag: '{tag_frequency.index[0]}' ({tag_frequency.iloc[0]:,} orders)")
else:
    # value_counts() is empty when there are no tags; indexing it would raise
    print("  • Most common tag: N/A")
print("  • Tag categories identified:")
for category, tags in categories.items():
    if tags:
        tag_count = len(order_tags[order_tags['tag'].isin(tags)])
        print(f"    - {category}: {len(tags)} unique tags, {tag_count:,} orders")

print("\n💡 AI AGENT OPPORTUNITIES:")
print("  1. Smart Fulfillment Predictor")
print("     - Predict processing time based on order characteristics")
print("     - Flag orders likely to be delayed")
print("     - Optimize carrier selection based on destination")
print("\n  2. Tag Intelligence Agent")
print("     - Decode cryptic tags (ah, cs, hkb) - likely staff initials")
print("     - Auto-tag new orders based on patterns")
print("     - Identify which staff/locations have fastest fulfillment")
print("\n  3. Unified Retail+Online Dashboard")
print("     - Combine retail POS data with online fulfillment")
print("     - Cross-channel customer view")
print("     - Inventory sync alerts (online sale = reduce retail stock)")

print("\n" + "="*70)