# Sales ETL and Data Analysis Notebook

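This notebook walks through the Sales ETL and Data Analysis program end to end: extracting the raw sales data, transforming it, analyzing it, and exporting the results, with visualizations rendered inline along the way.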


## 1. Import Libraries and Configure Plotting

In [None]:
# Import the ETL analyzer
from sales_etl_analyzer import SalesETLAnalyzer
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Enable inline plotting
%matplotlib inline

# Set style
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

print("📊 Sales ETL and Data Analysis Notebook")
print("="*50)

## 2. Initialize the Analyzer

In [None]:
# Create the single SalesETLAnalyzer instance that the cells below call into
analyzer = SalesETLAnalyzer()
print("✅ Analyzer initialized")

## 3. Extract Data

In [None]:
# Extract step: have the analyzer read the sample CSV (path is relative to the
# notebook's working directory)
raw_data = analyzer.extract_data('sample_sales_data.csv')

# Preview the head of the raw frame
raw_preview = raw_data.head()
print("\n📋 First 5 rows of raw data:")
display(raw_preview)

# Report the frame's dimensions and its column names
raw_shape = raw_data.shape
raw_columns = list(raw_data.columns)
print(f"\n📊 Data shape: {raw_shape}")
print(f"📋 Columns: {raw_columns}")

## 4. Transform Data

In [None]:
# Transform step: run the analyzer's cleaning/transformation and keep the
# result; the plotting and analysis cells below read from clean_data.
clean_data = analyzer.transform_data()

# Summarise the transformed frame.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() -- doing so appends a stray "None" line.
print("\n📋 Transformed data info:")
clean_data.info()

# Preview the cleaned rows with the notebook's rich table rendering
print("\n📊 First 5 rows of clean data:")
display(clean_data.head())

## 5. Data Analysis

In [None]:
# Analysis step: the returned mapping is read by key here and in the
# "Detailed Analysis Tables" cells further down.
results = analyzer.analyze_data()

# Headline figures from the 'basic_stats' entry
print("📈 BASIC STATISTICS")
print("="*30)
stats = results['basic_stats']
print(f"Total Records: {stats['total_records']:,}")
print(f"Total Revenue: ${stats['total_revenue']:,.2f}")
print(f"Average Order Value: ${stats['average_order_value']:,.2f}")
print(f"Total Quantity Sold: {stats['total_quantity_sold']:,}")

# 'date_range' holds 'start' and 'end' values that support strftime
period = stats['date_range']
period_start = period['start'].strftime('%Y-%m-%d')
period_end = period['end'].strftime('%Y-%m-%d')
print(f"Date Range: {period_start} to {period_end}")

## 6. Interactive Visualizations

In [None]:
# Horizontal bar chart: total sales per category
fig, ax = plt.subplots(figsize=(10, 6))

# Sorted ascending so the largest category ends up as the top bar
category_sales = (
    clean_data
    .groupby('category')['total_sales']
    .sum()
    .sort_values(ascending=True)
)
category_sales.plot(kind='barh', ax=ax, color='skyblue', edgecolor='black')

ax.set_title('Total Sales by Category', fontsize=16, fontweight='bold')
ax.set_xlabel('Total Sales ($)', fontsize=12)
ax.set_ylabel('Category', fontsize=12)
ax.grid(axis='x', alpha=0.3)
fig.tight_layout()
plt.show()

In [None]:
# Pie chart: each region's share of total sales
fig, ax = plt.subplots(figsize=(10, 8))

region_sales = clean_data.groupby('region')['total_sales'].sum()
ax.pie(
    region_sales.values,
    labels=region_sales.index,
    autopct='%1.1f%%',   # one-decimal percentage label on each wedge
    startangle=90,
)
ax.set_title('Sales Distribution by Region', fontsize=16, fontweight='bold')
ax.axis('equal')         # equal aspect ratio keeps the pie circular
fig.tight_layout()
plt.show()

In [None]:
# Line chart: total sales summed per distinct value of the 'date' column
fig, ax = plt.subplots(figsize=(14, 6))

daily_sales = clean_data.groupby('date')['total_sales'].sum()
ax.plot(
    daily_sales.index,
    daily_sales.values,
    marker='o',
    linewidth=2,
    markersize=4,
)

ax.set_title('Daily Sales Trend', fontsize=16, fontweight='bold')
ax.set_xlabel('Date', fontsize=12)
ax.set_ylabel('Total Sales ($)', fontsize=12)
ax.grid(True, alpha=0.3)
# Slant the x tick labels so neighbouring dates do not overlap
plt.setp(ax.get_xticklabels(), rotation=45)
fig.tight_layout()
plt.show()

In [None]:
# Horizontal bar chart: the ten products with the highest total sales
fig, ax = plt.subplots(figsize=(12, 8))

product_totals = clean_data.groupby('product_name')['total_sales'].sum()
# Ascending sort + tail(10) keeps the ten largest, with the best seller last
# so it is drawn as the top bar
top_products = product_totals.sort_values(ascending=True).tail(10)
top_products.plot(kind='barh', ax=ax, color='lightcoral', edgecolor='black')

ax.set_title('Top 10 Products by Sales', fontsize=16, fontweight='bold')
ax.set_xlabel('Total Sales ($)', fontsize=12)
ax.set_ylabel('Product', fontsize=12)
ax.grid(axis='x', alpha=0.3)
fig.tight_layout()
plt.show()

In [None]:
# Heatmap: total sales for every category/region combination
fig, ax = plt.subplots(figsize=(10, 6))

# Rows are categories, columns are regions; combinations with no sales get 0
pivot_options = dict(
    values='total_sales',
    index='category',
    columns='region',
    aggfunc='sum',
    fill_value=0,
)
heatmap_data = clean_data.pivot_table(**pivot_options)

sns.heatmap(
    heatmap_data,
    ax=ax,
    annot=True,          # write the value inside each cell
    fmt='.0f',           # ...formatted without decimals
    cmap='YlOrRd',
    cbar_kws={'label': 'Total Sales ($)'},
)
ax.set_title('Sales Heatmap: Category vs Region', fontsize=16, fontweight='bold')
fig.tight_layout()
plt.show()

## 7. Detailed Analysis Tables

In [None]:
# Per-category table taken from the analysis results
print("📊 SALES BY CATEGORY", "=" * 40, sep="\n")
display(results['sales_by_category'])

In [None]:
# Per-region table taken from the analysis results
print("🌍 SALES BY REGION", "=" * 40, sep="\n")
display(results['sales_by_region'])

In [None]:
# Top-products table taken from the analysis results
print("🏆 TOP 10 PRODUCTS", "=" * 40, sep="\n")
display(results['top_products'])

## 8. Custom Analysis

In [None]:
# Custom analysis - sales performance by weekday
# NOTE(review): assumes the 'weekday' column holds English day names, as the
# chart's reindex below already did -- confirm against transform_data().
weekday_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

# Build the grouping once and reuse it for both the table and the chart
sales_by_weekday = clean_data.groupby('weekday')['total_sales']

weekday_sales = sales_by_weekday.agg(['sum', 'mean', 'count']).round(2)
weekday_sales.columns = ['Total Sales', 'Average Sales', 'Number of Transactions']

# groupby returns its keys sorted, which for day-name strings is alphabetical
# (Friday, Monday, ...). Reorder the table to calendar order so it matches the
# chart; labels not in weekday_order are kept at the end so no row is dropped.
calendar_days = [day for day in weekday_order if day in weekday_sales.index]
other_labels = [label for label in weekday_sales.index if label not in weekday_order]
weekday_sales = weekday_sales.loc[calendar_days + other_labels]

print("📅 SALES BY WEEKDAY")
print("="*40)
display(weekday_sales)

# Plot weekday sales: every day Monday..Sunday gets a bar, and days without
# any transactions are shown as 0
fig, ax = plt.subplots(figsize=(10, 6))
weekday_sales_ordered = sales_by_weekday.sum().reindex(weekday_order, fill_value=0)
weekday_sales_ordered.plot(kind='bar', ax=ax, color='lightgreen', edgecolor='black')
ax.set_title('Sales by Weekday', fontsize=16, fontweight='bold')
ax.set_xlabel('Weekday', fontsize=12)
ax.set_ylabel('Total Sales ($)', fontsize=12)
plt.setp(ax.get_xticklabels(), rotation=45)
ax.grid(axis='y', alpha=0.3)
fig.tight_layout()
plt.show()

## 9. Export Results

In [None]:
# Run the analyzer's three export steps in order.
# NOTE(review): what each call writes is defined in sales_etl_analyzer; the
# descriptions below follow the notebook's own wording -- confirm there.
analyzer.load_data()              # save the cleaned data (the ETL "load" step)
analyzer.generate_report()        # generate the comprehensive report
analyzer.create_visualizations()  # create all visualizations

# Tell the reader where to look for the exported artifacts
export_notice = (
    "✅ All results exported!",
    "📁 Check the 'output' directory for:",
    "   📊 Plots: output/plots/",
    "   📋 Reports: output/reports/",
    "   💾 Data: output/",
)
for notice_line in export_notice:
    print(notice_line)

## 10. Summary

In [None]:
# Closing step: ask the analyzer to display its final summary of the run
analyzer.display_summary()