# Data Validation After Cleaning
## Quality Assurance & Verification

**Objective**: Validate the cleaned dataset meets all quality standards and requirements.

**Validation Criteria**:
1. No missing values in any column (including Customer IDs)
2. All prices > 0
3. All quantities > 0 (returns removed)
4. Retention rate within the acceptable 50-80% range (target: 60-70%)
5. Date range consistency
6. No duplicate transactions

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json

# Notebook-wide plot defaults: seaborn's white grid theme and 12x6 inch figures
sns.set_style(style='whitegrid')
plt.rc('figure', figsize=(12, 6))

## 1. Load Cleaned Data & Statistics

In [None]:
# Read the cleaned transactions produced by the cleaning step
df_clean = pd.read_csv('../data/processed/cleaned_transactions.csv')
print(
    f"Cleaned dataset shape: {df_clean.shape}",
    f"Total cleaned transactions: {df_clean.shape[0]:,}",
    sep="\n",
)

# Read the summary statistics that were recorded while cleaning
with open('../data/processed/cleaning_statistics.json') as f:
    stats = json.loads(f.read())

# Echo the row counts and retention rate stored in the statistics file
print(
    f"\nOriginal rows: {stats['original_rows']:,}",
    f"Final rows: {stats['final_rows']:,}",
    f"Retention rate: {stats['retention_rate']*100:.2f}%",
    sep="\n",
)

## 2. Validate No Missing Values

In [None]:
# Tally null entries column by column
missing = df_clean.isna().sum()
print("Missing values per column:", missing, sep="\n")

# The cleaned dataset must not contain a single null cell
assert not missing.any(), "ERROR: Found missing values in cleaned data!"
print("\n✅ PASS: No missing values detected")

## 3. Validate Price & Quantity Rules

In [None]:
# Rows with a zero or negative unit price break the pricing rule
invalid_prices = df_clean.loc[df_clean['UnitPrice'].le(0)]
print(f"Transactions with price <= 0: {len(invalid_prices)}")
assert invalid_prices.shape[0] == 0, "ERROR: Found invalid prices!"
print("✅ PASS: All prices are positive")

# Rows with a zero or negative quantity break the quantity rule
invalid_qty = df_clean.loc[df_clean['Quantity'].le(0)]
print(f"\nTransactions with quantity <= 0: {len(invalid_qty)}")
assert invalid_qty.shape[0] == 0, "ERROR: Found invalid quantities!"
print("✅ PASS: All quantities are positive")

## 4. Validate Retention Rate

In [None]:
# Retention rate as recorded in the cleaning statistics (a fraction, not a percent)
retention_rate = stats['retention_rate']
print(f"Data retention rate: {retention_rate*100:.2f}%")

# Hard requirement: between 50% and 80% of the original rows survive cleaning
assert retention_rate >= 0.50 and retention_rate <= 0.80, f"ERROR: Retention rate {retention_rate} outside acceptable range!"
print("✅ PASS: Retention rate within acceptable range (50-80%)")

# The narrower 60-70% band is the target; hitting it is reported but not required
if retention_rate >= 0.60 and retention_rate <= 0.70:
    print("✅ EXCELLENT: Retention rate within target range (60-70%)")

## 5. Validate Customer & Transaction Counts

In [None]:
# Number of distinct customers present in the cleaned data
unique_customers = df_clean['CustomerID'].nunique()
print(f"Unique customers: {unique_customers:,}")
print(f"Expected range: 3,000 - 5,000")

# The count must fall inside the expected 3,000 - 5,000 band
assert unique_customers >= 3000 and unique_customers <= 5000, "ERROR: Customer count outside expected range!"
print("✅ PASS: Customer count within expected range")

# Rows per customer, summarised by mean, median and maximum
txn_per_customer = df_clean.groupby('CustomerID').size()
print(
    f"\nAverage transactions per customer: {txn_per_customer.mean():.2f}",
    f"Median transactions per customer: {txn_per_customer.median():.0f}",
    f"Max transactions per customer: {txn_per_customer.max():.0f}",
    sep="\n",
)

## 6. Validate Date Range

In [None]:
# Parse the invoice timestamps so that date arithmetic is possible
df_clean['InvoiceDate'] = pd.to_datetime(df_clean['InvoiceDate'])

# Earliest and latest invoice, and the number of whole days between them
min_date, max_date = df_clean['InvoiceDate'].min(), df_clean['InvoiceDate'].max()
date_range = (max_date - min_date).days

print(f"Date range: {min_date.date()} to {max_date.date()}")
print(f"Total days covered: {date_range} days ({date_range/30:.1f} months)")

# The data is expected to span roughly one year (350 to 380 days)
assert date_range >= 350 and date_range <= 380, "ERROR: Date range unexpected!"
print("\n✅ PASS: Date range covers expected period (~12 months)")

## 7. Visual Validation: Before vs After

In [None]:
from pathlib import Path

# Before/after overview drawn on a 2x2 grid:
#   [0, 0] row counts before vs after cleaning   [0, 1] daily transaction volume
#   [1, 0] unit price histogram                  [1, 1] quantity histogram
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# Plot 1: Transaction count comparison
categories = ['Original', 'After Cleaning']
values = [stats['original_rows'], stats['final_rows']]
axes[0, 0].bar(categories, values, color=['lightcoral', 'lightgreen'])
axes[0, 0].set_title('Transaction Count: Before vs After')
axes[0, 0].set_ylabel('Number of Transactions')
# Write the exact count just above each bar
for i, v in enumerate(values):
    axes[0, 0].text(i, v, f'{v:,}', ha='center', va='bottom')

# Plot 2: Daily transaction distribution
# Parse the dates here as well so this cell does not depend on an earlier cell
# having already converted InvoiceDate; parsing an already-parsed column is a no-op.
invoice_days = pd.to_datetime(df_clean['InvoiceDate']).dt.date
daily_txns = df_clean.groupby(invoice_days).size()
axes[0, 1].plot(daily_txns.index, daily_txns.values)
axes[0, 1].set_title('Daily Transaction Volume (Cleaned Data)')
axes[0, 1].set_xlabel('Date')
axes[0, 1].set_ylabel('Transactions')
axes[0, 1].tick_params(axis='x', rotation=45)

# Plot 3: Price distribution
# Bin inside the displayed window. Binning over the full data range and then
# clipping the view would let a few extreme prices stretch the bins so far that
# only one or two bars remain visible. Values outside the window are not drawn.
axes[1, 0].hist(df_clean['UnitPrice'], bins=50, range=(0, 20), edgecolor='black')
axes[1, 0].set_title('Unit Price Distribution')
axes[1, 0].set_xlabel('Unit Price (£)')
axes[1, 0].set_ylabel('Frequency')
axes[1, 0].set_xlim(0, 20)  # Focus on main range

# Plot 4: Quantity distribution (same windowed binning as the price plot)
axes[1, 1].hist(df_clean['Quantity'], bins=50, range=(0, 100), edgecolor='black')
axes[1, 1].set_title('Quantity Distribution')
axes[1, 1].set_xlabel('Quantity')
axes[1, 1].set_ylabel('Frequency')
axes[1, 1].set_xlim(0, 100)

plt.tight_layout()

# Make sure the output folder exists; savefig does not create missing directories
output_path = Path('../eda/data_validation_comparison.png')
output_path.parent.mkdir(parents=True, exist_ok=True)
plt.savefig(output_path, dpi=300, bbox_inches='tight')
plt.show()

## 8. Final Validation Report

In [None]:
# Evaluate every check against the current data instead of hard-coding True, so
# the report stays truthful even if an earlier cell was skipped or asserts are
# disabled. bool(...) converts numpy booleans, which json cannot serialise.
validations = {
    'no_missing_values': bool(df_clean.isnull().sum().sum() == 0),
    'all_prices_positive': bool(not (df_clean['UnitPrice'] <= 0).any()),
    'all_quantities_positive': bool(not (df_clean['Quantity'] <= 0).any()),
    # Fully identical rows are treated as duplicate transactions
    'no_duplicate_transactions': bool(not df_clean.duplicated().any()),
    'retention_rate_acceptable': bool(0.50 <= retention_rate <= 0.80),
    'retention_rate_target': bool(0.60 <= retention_rate <= 0.70),
    'customer_count_acceptable': bool(3000 <= unique_customers <= 5000),
    'date_range_acceptable': bool(350 <= date_range <= 380)
}

# The 60-70% retention band is a target, not a requirement, so it is reported
# but does not influence the overall verdict.
optional_checks = {'retention_rate_target'}
failed_checks = [
    name for name, passed in validations.items()
    if name not in optional_checks and not passed
]
all_checks_passed = not failed_checks

validation_report = {
    'validation_date': pd.Timestamp.now().isoformat(),
    'original_rows': stats['original_rows'],
    'cleaned_rows': stats['final_rows'],
    'retention_rate': retention_rate,
    'unique_customers': unique_customers,
    'date_range_days': date_range,
    'validations': validations,
    'all_checks_passed': all_checks_passed
}

# Save validation report
with open('../data/processed/validation_report.json', 'w') as f:
    json.dump(validation_report, f, indent=4)

print("="*60)
print("FINAL VALIDATION REPORT")
print("="*60)
print(f"Original dataset: {stats['original_rows']:,} transactions")
print(f"Cleaned dataset: {stats['final_rows']:,} transactions")
print(f"Retention rate: {retention_rate*100:.2f}%")
print(f"Unique customers: {unique_customers:,}")
print(f"Date coverage: {date_range} days")
print("\n" + "="*60)
# Only announce success when every required check actually passed
if all_checks_passed:
    print("ALL VALIDATION CHECKS PASSED ✅")
else:
    print(f"VALIDATION FAILED ❌: {', '.join(failed_checks)}")
print("="*60)
if all_checks_passed:
    print("\nData is ready for feature engineering.")
else:
    print("\nData is NOT ready for feature engineering; fix the failed checks first.")