In [5]:
# ChainEats Analytics - Day 2: Data Exploration & Quality Assessment
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

print("ChainEats Data Quality Assessment")
print("=" * 50)

# Load all datasets
print("Loading datasets...")
# The four CSVs are read in a fixed order and unpacked straight into the
# frames used by the rest of this cell.
locations_df, menu_df, sales_df, weather_df = (
    pd.read_csv(csv_name)
    for csv_name in (
        'locations.csv',
        'menu_items.csv',
        'sales_data.csv',
        'weather_data.csv',
    )
)

# Convert date columns (frame, column) pair by pair so the datetime
# accessors used later in the cell work on real timestamps.
for frame, date_column in (
    (sales_df, 'date'),
    (weather_df, 'date'),
    (locations_df, 'opening_date'),
):
    frame[date_column] = pd.to_datetime(frame[date_column])

print("All datasets loaded successfully!")

# STEP 1: Dataset Overview
print("\nDATASET OVERVIEW")
print("-" * 30)

# Display label -> frame registry; the missing-values step iterates it too.
datasets = dict(zip(
    ('Locations', 'Menu Items', 'Sales', 'Weather'),
    (locations_df, menu_df, sales_df, weather_df),
))

bytes_per_mb = 1024**2
for label, frame in datasets.items():
    # deep=True also counts the contents of object (string) columns.
    footprint_mb = frame.memory_usage(deep=True).sum() / bytes_per_mb
    print(
        f"\n{label}:",
        f"  Rows: {len(frame):,}",
        f"  Columns: {len(frame.columns)}",
        f"  Memory: {footprint_mb:.1f} MB",
        sep="\n",
    )

# STEP 2: Missing Values Check
print("\nMISSING VALUES ANALYSIS")
print("-" * 30)

for label, frame in datasets.items():
    # Per-column count of null cells in this frame.
    null_counts = frame.isnull().sum()
    has_missing = null_counts.sum() > 0
    if not has_missing:
        print(f"\n{label}: No missing values")
        continue

    # Only list the columns that actually contain nulls, with their share
    # of the frame's rows.
    print(f"\n{label} - Missing values:")
    row_total = len(frame)
    for column, n_missing in null_counts[null_counts > 0].items():
        print(f"  {column}: {n_missing} ({n_missing/row_total*100:.1f}%)")

# STEP 3: Data Quality Issues Detection
print("\nDATA QUALITY ISSUES")
print("-" * 30)

# Check locations data
rent = locations_df['monthly_rent']
print("\n1. LOCATIONS ANALYSIS:")
print(f"  Cities: {locations_df['city'].nunique()}")
print(f"  Location types: {locations_df['location_type'].nunique()}")
print(f"  Rent range: ${rent.min():,.0f} - ${rent.max():,.0f}")

# Flag rents outside the 1,000 - 15,000 band as unrealistic
problematic_rent = locations_df[rent.lt(1000) | rent.gt(15000)]
if not problematic_rent.empty:
    print(f"  {len(problematic_rent)} locations with unrealistic rent")

# Check menu data
print("\n2. MENU ANALYSIS:")
print(f"  Total items: {len(menu_df)}")
print(f"  Categories: {menu_df['category'].nunique()}")

# Items whose profit margin is zero or negative
negative_margin = menu_df[menu_df['profit_margin'].le(0)]
if not negative_margin.empty:
    print(f"  {len(negative_margin)} items with negative/zero profit margin")
    print("  Items:", negative_margin['item_name'].tolist())

# Check sales data
sale_dates = sales_df['date']
print("\n3. SALES ANALYSIS:")
print(f"  Date range: {sale_dates.min()} to {sale_dates.max()}")
print(f"  Total transactions: {len(sales_df):,}")
print(f"  Unique locations: {sales_df['location_id'].nunique()}")

# Count transactions above the quantity (5) and revenue ($100) thresholds
unusual_qty = sales_df[sales_df['quantity'].gt(5)]
print(f"  {len(unusual_qty)} transactions with quantity > 5")

unusual_revenue = sales_df[sales_df['revenue'].gt(100)]
print(f"  {len(unusual_revenue)} transactions with revenue > $100")

# Check weather data
temperatures = weather_df['temperature']
rainy_flags = weather_df['is_rainy']
print("\n4. WEATHER ANALYSIS:")
print(f"  Temperature range: {temperatures.min():.1f}°F to {temperatures.max():.1f}°F")
print(f"  Rainy days: {rainy_flags.sum():,} ({rainy_flags.mean()*100:.1f}%)")

# STEP 4: Business Logic Validation
print("\nBUSINESS LOGIC VALIDATION")
print("-" * 30)

# Every location_id seen in sales should also appear in the locations table
sales_locations = set(sales_df['location_id'].unique())
master_locations = set(locations_df['location_id'].unique())
missing_locations = sales_locations.difference(master_locations)
print(
    f"{len(missing_locations)} sales locations not in master data"
    if missing_locations
    else "All sales locations exist in master data"
)

# Every item_id seen in sales should also appear in the menu table
sales_items = set(sales_df['item_id'].unique())
menu_items = set(menu_df['item_id'].unique())
missing_items = sales_items.difference(menu_items)
print(
    f"{len(missing_items)} sales items not in menu data"
    if missing_items
    else "All sales items exist in menu data"
)

# STEP 5: Quick Business Insights
print("\nQUICK BUSINESS INSIGHTS")
print("-" * 30)

# Top performing locations
location_performance = sales_df.groupby('location_id')['revenue'].sum().sort_values(ascending=False)

# Build the location -> city lookup once instead of re-filtering locations_df
# for every printed row. drop_duplicates keeps the first row per location_id,
# matching a "first match" lookup. A sales location that is absent from the
# locations table is reported as 'Unknown' rather than raising IndexError.
city_by_location = (
    locations_df.drop_duplicates('location_id')
    .set_index('location_id')['city']
    .to_dict()
)
print("\nTop 5 locations by revenue:")
for loc, revenue in location_performance.head().items():
    city = city_by_location.get(loc, 'Unknown')
    print(f"  {loc} ({city}): ${revenue:,.0f}")

# Category performance
category_performance = sales_df.groupby('category')['revenue'].sum().sort_values(ascending=False)
print("\nCategory performance:")
for cat, revenue in category_performance.items():
    print(f"  {cat}: ${revenue:,.0f}")

# Monthly trends
# NOTE: 'month' is the calendar month number (1-12), so the same month from
# different years is pooled into one bucket.
sales_df['month'] = sales_df['date'].dt.month
monthly_sales = sales_df.groupby('month')['revenue'].sum()
if monthly_sales.empty:
    # idxmax/idxmin raise on an empty series; report the situation instead.
    print("\nNo sales records available for monthly comparison")
else:
    best_month = monthly_sales.idxmax()
    worst_month = monthly_sales.idxmin()
    print(f"\nBest month: {best_month} (${monthly_sales[best_month]:,.0f})")
    print(f"Worst month: {worst_month} (${monthly_sales[worst_month]:,.0f})")

print("\nData exploration complete!")
print("Summary: Our data is mostly clean with minor issues to address.")


ChainEats Data Quality Assessment
Loading datasets...
All datasets loaded successfully!

DATASET OVERVIEW
------------------------------

Locations:
  Rows: 50
  Columns: 7
  Memory: 0.0 MB

Menu Items:
  Rows: 20
  Columns: 6
  Memory: 0.0 MB

Sales:
  Rows: 13,605,203
  Columns: 11
  Memory: 3692.0 MB

Weather:
  Rows: 3,650
  Columns: 5
  Memory: 0.3 MB

MISSING VALUES ANALYSIS
------------------------------

Locations: No missing values

Menu Items: No missing values

Sales: No missing values

Weather: No missing values

DATA QUALITY ISSUES
------------------------------

1. LOCATIONS ANALYSIS:
  Cities: 5
  Location types: 5
  Rent range: $3,098 - $12,795

2. MENU ANALYSIS:
  Total items: 20
  Categories: 5

3. SALES ANALYSIS:
  Date range: 2022-01-01 00:00:00 to 2023-12-31 00:00:00
  Total transactions: 13,605,203
  Unique locations: 50
  0 transactions with quantity > 5
  0 transactions with revenue > $100

4. WEATHER ANALYSIS:
  Temperature range: 20.0°F to 100.0°F
  Rainy days

In [7]:
# ChainEats Analytics - Day 2: Data Cleaning Pipeline
import pandas as pd
import numpy as np

print("ChainEats Data Cleaning Pipeline")
print("=" * 50)

# Load datasets: the four CSVs are read in a fixed order and unpacked into
# the frames the cleaning steps below operate on.
locations_df, menu_df, sales_df, weather_df = (
    pd.read_csv(csv_name)
    for csv_name in (
        'locations.csv',
        'menu_items.csv',
        'sales_data.csv',
        'weather_data.csv',
    )
)

# Convert dates, one (frame, column) pair at a time
for frame, date_column in (
    (sales_df, 'date'),
    (weather_df, 'date'),
    (locations_df, 'opening_date'),
):
    frame[date_column] = pd.to_datetime(frame[date_column])

print("Original data loaded")

# CLEANING STEP 1: Fix Menu Items with Negative Margins
print("\nSTEP 1: Fixing Menu Items")
print("-" * 30)

# Snapshot the rows whose profit margin is zero or negative
poor_margin_mask = menu_df['profit_margin'] <= 0
problematic_items = menu_df[poor_margin_mask].copy()
print(f"Found {len(problematic_items)} items with poor margins")

# For each flagged row, reset cost to 80% of price and store the resulting
# price - cost difference as the new profit_margin.
for idx in problematic_items.index:
    price = menu_df.at[idx, 'price']
    old_cost = menu_df.at[idx, 'cost']
    new_cost = price * 0.8  # 20% margin
    menu_df.at[idx, 'cost'] = new_cost
    menu_df.at[idx, 'profit_margin'] = price - new_cost
    print(f"  {problematic_items.at[idx, 'item_name']}: Cost ${old_cost:.2f} → ${new_cost:.2f}")

print("All menu items now have positive margins")

# CLEANING STEP 2: Handle Unusual Sales Transactions
print("\nSTEP 2: Cleaning Sales Data")
print("-" * 30)

original_sales_count = len(sales_df)

# Remove transactions with unrealistic quantities (>5 items same product).
# The explicit .copy() makes the filtered frame independent of the original,
# so the in-place assignments below are unambiguous and do not raise
# SettingWithCopyWarning when rows were actually dropped.
unusual_qty = sales_df[sales_df['quantity'] > 5]
print(f"Removing {len(unusual_qty)} transactions with quantity > 5")
sales_df = sales_df[sales_df['quantity'] <= 5].copy()

# Cap individual transaction revenue at $50 (reasonable for fast food) by
# forcing the quantity of offending rows down to 1. The mask is computed once
# from the pre-cap revenue. Revenue itself is not patched here because the
# recalculation below rewrites it for every row from quantity * price.
over_cap = sales_df['revenue'] > 50
high_revenue = sales_df[over_cap]
print(f"Capping {len(high_revenue)} transactions above $50")
sales_df.loc[over_cap, 'quantity'] = 1

# Recalculate revenue and profit after cleaning
sales_df['revenue'] = sales_df['quantity'] * sales_df['price']
sales_df['total_cost'] = sales_df['quantity'] * sales_df['cost']
sales_df['profit'] = sales_df['revenue'] - sales_df['total_cost']

print(f"Sales records: {original_sales_count:,} → {len(sales_df):,}")

# CLEANING STEP 3: Add Business-Relevant Columns
print("\nSTEP 3: Adding Business Columns")
print("-" * 30)

# Add time-based features to sales, all derived from the 'date' column
sale_dates = sales_df['date'].dt
sales_df['year'] = sale_dates.year
sales_df['month'] = sale_dates.month
sales_df['day_of_week'] = sale_dates.day_name()
# weekday is 0 (Monday) .. 6 (Sunday); 5 and 6 are Saturday/Sunday
sales_df['is_weekend'] = (sale_dates.weekday >= 5).astype(int)

# Add season
def get_season(month):
    """Return the season name for a calendar month number.

    12, 1, 2 -> 'Winter'; 3-5 -> 'Spring'; 6-8 -> 'Summer'; any other
    value (including 9-11) -> 'Fall'.
    """
    season_months = (
        ('Winter', (12, 1, 2)),
        ('Spring', (3, 4, 5)),
        ('Summer', (6, 7, 8)),
    )
    for season, months in season_months:
        if month in months:
            return season
    # Everything not matched above falls through to 'Fall'.
    return 'Fall'

# Only twelve distinct month numbers exist, so resolve their season labels
# once and map them onto the column instead of calling get_season for every
# sales row. A missing month stays missing rather than being labelled 'Fall'.
season_by_month = {month_number: get_season(month_number) for month_number in range(1, 13)}
sales_df['season'] = sales_df['month'].map(season_by_month)

# Add location details to sales.
# validate='many_to_one' makes the merge fail loudly if location_id is
# duplicated in the locations table, which would otherwise silently multiply
# sales rows and inflate every downstream total.
location_details = locations_df[['location_id', 'city', 'location_type']].copy()
sales_df = sales_df.merge(
    location_details,
    on='location_id',
    how='left',
    validate='many_to_one',
)

print("Added time features and location details to sales data")

# CLEANING STEP 4: Create Summary Tables
print("\nSTEP 4: Creating Summary Tables")
print("-" * 30)

# Daily sales summary by location: one row per date/location combination,
# with the summed columns named directly via named aggregation.
daily_keys = ['date', 'location_id', 'city', 'location_type']
daily_summary = (
    sales_df.groupby(daily_keys)
    .agg(
        daily_revenue=('revenue', 'sum'),
        daily_profit=('profit', 'sum'),
        daily_items_sold=('quantity', 'sum'),
    )
    .reset_index()
)

print(f"Created daily summary: {len(daily_summary):,} records")

# Monthly summary by location: one row per year/month/location combination,
# with the summed columns named directly via named aggregation.
monthly_keys = ['year', 'month', 'location_id', 'city', 'location_type']
monthly_summary = (
    sales_df.groupby(monthly_keys)
    .agg(
        monthly_revenue=('revenue', 'sum'),
        monthly_profit=('profit', 'sum'),
        monthly_items_sold=('quantity', 'sum'),
    )
    .reset_index()
)

# Add location operating costs: attach each location's monthly rent and
# subtract it from that month's profit.
location_costs = locations_df[['location_id', 'monthly_rent']].copy()
monthly_summary = monthly_summary.merge(location_costs, on='location_id', how='left')
monthly_summary['net_profit'] = monthly_summary['monthly_profit'].sub(monthly_summary['monthly_rent'])

print(f"Created monthly summary: {len(monthly_summary):,} records")

# CLEANING STEP 5: Weather Data Enhancement
# Section banner: title line followed by a 30-character rule.
print("\nSTEP 5: Enhancing Weather Data", "-" * 30, sep="\n")

# Add temperature categories
def categorize_temperature(temp):
    """Bucket a temperature reading into 'Cold', 'Mild' or 'Hot'.

    Readings below 40 are 'Cold', readings from 40 up to (but excluding) 70
    are 'Mild', and everything from 70 upward is 'Hot'.

    A missing reading (None, NaN, pd.NA) returns None. Without this guard a
    NaN fails both comparisons and would be labelled 'Hot'.
    """
    if pd.isna(temp):
        return None
    if temp < 40:
        return 'Cold'
    elif temp < 70:
        return 'Mild'
    else:
        return 'Hot'

# Label every temperature reading element by element.
temperature_readings = weather_df['temperature']
weather_df['temp_category'] = temperature_readings.map(categorize_temperature)

# Add weather impact score (business metric)
def weather_impact_score(row):
    """Score one weather record around a neutral value of 1.0.

    Reads ``row['temp_category']`` and ``row['is_rainy']``:
    'Mild' adds 0.1, 'Cold' subtracts 0.05, any other category leaves the
    score unchanged; a truthy ``is_rainy`` subtracts a further 0.15.
    The result is rounded to two decimals.
    """
    # Temperature impact (mild weather boosts sales, cold weather dampens them)
    temperature_adjustment = {'Mild': 0.1, 'Cold': -0.05}
    score = 1.0 + temperature_adjustment.get(row['temp_category'], 0.0)

    # Rain impact (rain hurts sales)
    rain_penalty = 0.15 if row['is_rainy'] else 0.0

    return round(score - rain_penalty, 2)

# Score each weather record row by row, then attach the scores as a column.
impact_scores = weather_df.apply(weather_impact_score, axis=1)
weather_df['weather_impact_score'] = impact_scores

print("Added temperature categories and weather impact scores")

# CLEANING STEP 6: Save Cleaned Data
print("\nSTEP 6: Saving Cleaned Data")
print("-" * 30)

# Output file -> frame, in write order: the four cleaned datasets first,
# then the two summary tables. All are written without the index column.
csv_outputs = {
    'locations_cleaned.csv': locations_df,
    'menu_items_cleaned.csv': menu_df,
    'sales_data_cleaned.csv': sales_df,
    'weather_data_cleaned.csv': weather_df,
    'daily_sales_summary.csv': daily_summary,
    'monthly_sales_summary.csv': monthly_summary,
}
for csv_path, frame in csv_outputs.items():
    frame.to_csv(csv_path, index=False)

print("All cleaned datasets saved!")

# FINAL DATA QUALITY REPORT
print("\nFINAL DATA QUALITY REPORT")
print("=" * 50)

# Measure the quality claims printed below instead of asserting them, so the
# report cannot say "clean" when the frames are not.
report_frames = (locations_df, menu_df, sales_df, weather_df, daily_summary, monthly_summary)
total_cells = sum(frame.size for frame in report_frames)
missing_cells = sum(int(frame.isna().sum().sum()) for frame in report_frames)
missing_pct = (missing_cells / total_cells * 100) if total_cells else 0.0

# A margin that is NaN counts as "not positive".
non_positive_margins = int((~(menu_df['profit_margin'] > 0)).sum())

# Sales rows whose location_id / item_id has no match in the reference tables.
orphan_location_rows = int((~sales_df['location_id'].isin(locations_df['location_id'])).sum())
orphan_item_rows = int((~sales_df['item_id'].isin(menu_df['item_id'])).sum())

print(f"Locations: {len(locations_df)} restaurants across {locations_df['city'].nunique()} cities")
if non_positive_margins == 0:
    print(f"Menu: {len(menu_df)} items, all with positive margins")
else:
    print(f"Menu: {len(menu_df)} items, {non_positive_margins} with non-positive margins")
print(f"Sales: {len(sales_df):,} clean transactions")
print(f"Weather: {len(weather_df):,} daily records with business impact scores")
print(f"Daily Summary: {len(daily_summary):,} location-day combinations")
print(f"Monthly Summary: {len(monthly_summary):,} location-month combinations")

# Data quality metrics
print("\nData Quality Metrics:")
print(f"  Missing values: {missing_pct:.2f}%")
print(f"  Date range: {sales_df['date'].min().strftime('%Y-%m-%d')} to {sales_df['date'].max().strftime('%Y-%m-%d')}")
print(f"  Revenue range: ${sales_df['revenue'].min():.2f} - ${sales_df['revenue'].max():.2f}")
if orphan_location_rows == 0 and orphan_item_rows == 0:
    print("  All business relationships maintained")
else:
    print(
        f"  Broken business relationships: {orphan_location_rows:,} sales rows with unknown location, "
        f"{orphan_item_rows:,} with unknown item"
    )

print("Data is clean and ready for SQL analysis.")

ChainEats Data Cleaning Pipeline
Original data loaded

STEP 1: Fixing Menu Items
------------------------------
Found 0 items with poor margins
All menu items now have positive margins

STEP 2: Cleaning Sales Data
------------------------------
Removing 0 transactions with quantity > 5
Capping 0 transactions above $50
Sales records: 13,605,203 → 13,605,203

STEP 3: Adding Business Columns
------------------------------
Added time features and location details to sales data

STEP 4: Creating Summary Tables
------------------------------
Created daily summary: 36,500 records
Created monthly summary: 1,200 records

STEP 5: Enhancing Weather Data
------------------------------
Added temperature categories and weather impact scores

STEP 6: Saving Cleaned Data
------------------------------
All cleaned datasets saved!

FINAL DATA QUALITY REPORT
Locations: 50 restaurants across 5 cities
Menu: 20 items, all with positive margins
Sales: 13,605,203 clean transactions
Weather: 3,650 daily record