In [3]:
# BIT2053 E-commerce Data Analysis Project
# Python Implementation for Data Processing and Analysis

import warnings
from pathlib import Path

import pandas as pd

# Suppress every Python warning for the rest of the session, including any
# emitted by pandas, so the printed analysis output stays uncluttered.
warnings.filterwarnings('ignore')

class EcommerceAnalyzer:
    """Load, clean, analyse and export an online-retail transaction dataset.

    The methods are meant to be called in order: ``load_data``, then
    ``preprocess_data``, then any of ``perform_analysis``,
    ``export_processed_data`` and ``generate_summary_report``.

    The raw data must provide the columns ``Invoice``, ``StockCode``,
    ``Description``, ``Quantity``, ``InvoiceDate``, ``Price``,
    ``Customer ID`` and ``Country``.
    """

    def __init__(self, data_path):
        """Remember where the raw workbook lives; no I/O happens here.

        Args:
            data_path: Path of the Excel file passed to ``pd.read_excel``.
        """
        self.data_path = data_path
        # Raw data as read by ``load_data``; None until it succeeds.
        self.df = None
        # Cleaned data produced by ``preprocess_data``; None until then.
        self.processed_df = None

    def load_data(self):
        """Read the raw Excel workbook into ``self.df``.

        Returns:
            True when the file was read, False when reading failed (the error
            is printed rather than raised so the caller can simply branch).
        """
        print("Loading dataset...")
        try:
            self.df = pd.read_excel(self.data_path)
            print(f"Dataset loaded successfully: {self.df.shape}")
            return True
        except Exception as e:
            # Deliberate best-effort boundary: report and signal failure.
            print(f"Error loading data: {e}")
            return False

    def explore_data(self):
        """Print shape, columns, dtypes, missing counts, head and statistics."""
        print("\n=== DATA EXPLORATION ===")

        if self.df is None:
            print("Please load data first!")
            return

        print(f"Dataset shape: {self.df.shape}")
        print(f"\nColumn names: {list(self.df.columns)}")
        print(f"\nData types:\n{self.df.dtypes}")
        print(f"\nMissing values:\n{self.df.isnull().sum()}")
        print(f"\nFirst 5 rows:\n{self.df.head()}")

        # Basic statistics
        print(f"\nBasic statistics:\n{self.df.describe()}")

    def preprocess_data(self, start_date='2010-01-01', end_date='2011-12-31'):
        """Clean the raw data and derive the analysis columns.

        Steps: drop duplicate rows, drop rows without ``Customer ID`` or
        ``Description``, keep only positive ``Quantity`` and ``Price``,
        normalise dtypes, add ``TotalAmount``/``Year``/``Month``/``Quarter``/
        ``DayOfWeek`` and restrict the rows to the requested date window.

        Args:
            start_date: First calendar day to keep (inclusive).
            end_date: Last calendar day to keep; the whole day is included,
                not just its midnight instant.

        Returns:
            The processed DataFrame (also stored in ``self.processed_df``),
            or None when no data has been loaded yet.
        """
        print("\n=== DATA PREPROCESSING ===")

        if self.df is None:
            print("Please load data first!")
            return None

        # Work on a copy so the raw frame stays untouched.
        df = self.df.copy()
        print(f"Original dataset shape: {df.shape}")

        # Step 1: Remove duplicates
        original_size = len(df)
        df = df.drop_duplicates()
        print(f"Removed {original_size - len(df)} duplicate records")

        # Step 2: Handle missing values
        print(f"Missing values before cleaning:\n{df.isnull().sum()}")
        df = df.dropna(subset=['Customer ID', 'Description'])

        # Step 3: Remove cancelled orders (negative quantities) and
        # non-positive prices.  The explicit copy makes the column
        # assignments below operate on an independent frame instead of a
        # slice of the previous one.
        df = df[(df['Quantity'] > 0) & (df['Price'] > 0)].copy()

        # Step 4: Convert data types
        df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate'])
        # NOTE(review): when the IDs were loaded as floats they stringify
        # with a trailing ".0" (e.g. "13085.0"); kept as-is so existing
        # exports keep the same key format.
        df['Customer ID'] = df['Customer ID'].astype(str)

        # Step 5: Create calculated fields
        df['TotalAmount'] = df['Quantity'] * df['Price']
        df['Year'] = df['InvoiceDate'].dt.year
        df['Month'] = df['InvoiceDate'].dt.month
        df['Quarter'] = df['InvoiceDate'].dt.quarter
        df['DayOfWeek'] = df['InvoiceDate'].dt.day_name()

        # Step 6: Filter for the valid date range.  The upper bound is the
        # midnight *after* end_date so that timestamps during the final day
        # are retained.
        window_start = pd.Timestamp(start_date)
        window_end = pd.Timestamp(end_date).normalize() + pd.Timedelta(days=1)
        in_window = (df['InvoiceDate'] >= window_start) & (df['InvoiceDate'] < window_end)
        df = df[in_window].copy()

        self.processed_df = df

        print(f"Final processed dataset shape: {self.processed_df.shape}")
        print("Data preprocessing completed successfully!")

        return self.processed_df

    @staticmethod
    def _quintile_scores(values, labels):
        """Bucket ``values`` into five rank-based groups.

        The values are ranked with ``method='first'`` before ``pd.qcut`` so
        the bin edges are always unique; heavily tied data (for example many
        customers sharing the same recency) would otherwise raise
        "Bin edges must be unique".  Ties are broken by row order.

        Args:
            values: Numeric Series with one entry per customer.
            labels: Five labels, assigned from the lowest ranks upwards.

        Returns:
            Ordered categorical Series aligned with ``values``.
        """
        if len(values) < 2:
            # Quantiles of a single observation are degenerate; give a lone
            # customer the neutral middle score instead of raising.
            neutral = [labels[len(labels) // 2]] * len(values)
            return pd.Series(
                pd.Categorical(neutral, categories=labels, ordered=True),
                index=values.index,
            )
        return pd.qcut(values.rank(method='first'), 5, labels=labels)

    def perform_analysis(self):
        """Print and return revenue, product, customer, time and country stats.

        Returns:
            A dict with the keys ``revenue_stats`` (dict of scalars),
            ``top_products``, ``customer_stats``, ``monthly_revenue`` and
            ``country_analysis`` (DataFrames), or None when there is no
            processed data to analyse.
        """
        print("\n=== DATA ANALYSIS ===")

        if self.processed_df is None:
            print("Please preprocess data first!")
            return None
        if self.processed_df.empty:
            # Averages and quantile scores are undefined without any rows.
            print("No records available for analysis!")
            return None

        df = self.processed_df

        # Analysis 1: Revenue Analysis
        print("\n1. REVENUE ANALYSIS")
        total_revenue = df['TotalAmount'].sum()
        total_orders = df['Invoice'].nunique()
        avg_order_value = total_revenue / total_orders
        unique_customers = df['Customer ID'].nunique()
        unique_products = df['StockCode'].nunique()

        print(f"Total Revenue: £{total_revenue:,.2f}")
        print(f"Total Orders: {total_orders:,}")
        print(f"Average Order Value: £{avg_order_value:.2f}")
        print(f"Unique Customers: {unique_customers:,}")
        print(f"Unique Products: {unique_products:,}")

        # Analysis 2: Top Products
        print("\n2. TOP PRODUCTS BY REVENUE")
        top_products = df.groupby(['StockCode', 'Description']).agg({
            'TotalAmount': 'sum',
            'Quantity': 'sum'
        }).reset_index().sort_values('TotalAmount', ascending=False).head(10)
        print(top_products)

        # Analysis 3: Customer Analysis
        print("\n3. CUSTOMER ANALYSIS")
        # An "order" is a distinct invoice, matching total_orders above; a
        # plain row count would count line items instead.
        customer_stats = df.groupby('Customer ID').agg(
            Total_Spent=('TotalAmount', 'sum'),
            Order_Count=('Invoice', 'nunique'),
            First_Purchase=('InvoiceDate', 'min'),
            Last_Purchase=('InvoiceDate', 'max'),
        ).reset_index().rename(columns={'Customer ID': 'Customer_ID'})
        # Inserted at position 3 to keep the established column order.
        customer_stats.insert(
            3, 'Avg_Order_Value',
            customer_stats['Total_Spent'] / customer_stats['Order_Count'])

        # Customer segmentation: days between each customer's last purchase
        # and the most recent purchase in the whole dataset.
        customer_stats['Recency'] = (customer_stats['Last_Purchase'].max() -
                                     customer_stats['Last_Purchase']).dt.days

        # RFM Analysis.  Recency labels are reversed because a smaller
        # recency (a more recent purchase) deserves the higher score.
        customer_stats['R_Score'] = self._quintile_scores(
            customer_stats['Recency'], [5, 4, 3, 2, 1])
        customer_stats['F_Score'] = self._quintile_scores(
            customer_stats['Order_Count'], [1, 2, 3, 4, 5])
        customer_stats['M_Score'] = self._quintile_scores(
            customer_stats['Total_Spent'], [1, 2, 3, 4, 5])

        customer_stats['RFM_Score'] = (customer_stats['R_Score'].astype(str) +
                                       customer_stats['F_Score'].astype(str) +
                                       customer_stats['M_Score'].astype(str))

        print("Customer Segmentation Summary:")
        print(f"High Value Customers (RFM 555): {len(customer_stats[customer_stats['RFM_Score'] == '555'])}")
        print(f"Average Customer Lifetime Value: £{customer_stats['Total_Spent'].mean():.2f}")

        # Analysis 4: Time-based Analysis
        print("\n4. TIME-BASED ANALYSIS")
        monthly_revenue = df.groupby(['Year', 'Month'])['TotalAmount'].sum().reset_index()
        print("Monthly Revenue Trends:")
        print(monthly_revenue.head(10))

        # Analysis 5: Geographic Analysis
        print("\n5. GEOGRAPHIC ANALYSIS")
        country_analysis = df.groupby('Country').agg({
            'TotalAmount': 'sum',
            'Customer ID': 'nunique',
            'Invoice': 'nunique'
        }).reset_index().sort_values('TotalAmount', ascending=False)

        print("Top 10 Countries by Revenue:")
        print(country_analysis.head(10))

        return {
            'revenue_stats': {
                'total_revenue': total_revenue,
                'total_orders': total_orders,
                'avg_order_value': avg_order_value,
                'unique_customers': unique_customers,
                'unique_products': unique_products
            },
            'top_products': top_products,
            'customer_stats': customer_stats,
            'monthly_revenue': monthly_revenue,
            'country_analysis': country_analysis
        }

    def export_processed_data(self, filename='data/processed_ecommerce_data.csv'):
        """Write the processed data to CSV for BI tools.

        Args:
            filename: Destination path; missing parent directories are
                created so a fresh checkout does not fail on the write.
        """
        if self.processed_df is None:
            print("No processed data to export!")
            return

        target = Path(filename)
        target.parent.mkdir(parents=True, exist_ok=True)
        self.processed_df.to_csv(target, index=False)
        print(f"Processed data exported to {filename}")

    def generate_summary_report(self, filename='report/analysis_summary_report.txt'):
        """Print a summary report and save it as a text file.

        The analysis is re-run through ``perform_analysis`` to obtain the
        figures, so its output is printed as well.

        Args:
            filename: Destination of the report; missing parent directories
                are created.
        """
        print("\n=== SUMMARY REPORT ===")

        if self.processed_df is None:
            print("Please preprocess data first!")
            return

        analysis_results = self.perform_analysis()
        if analysis_results is None:
            # perform_analysis has already explained why nothing was produced.
            return

        report = f"""
E-COMMERCE DATA ANALYSIS SUMMARY REPORT
======================================

DATASET OVERVIEW:
- Total Records: {len(self.processed_df):,}
- Date Range: {self.processed_df['InvoiceDate'].min()} to {self.processed_df['InvoiceDate'].max()}
- Countries Covered: {self.processed_df['Country'].nunique()}

BUSINESS METRICS:
- Total Revenue: £{analysis_results['revenue_stats']['total_revenue']:,.2f}
- Total Orders: {analysis_results['revenue_stats']['total_orders']:,}
- Average Order Value: £{analysis_results['revenue_stats']['avg_order_value']:.2f}
- Unique Customers: {analysis_results['revenue_stats']['unique_customers']:,}
- Unique Products: {analysis_results['revenue_stats']['unique_products']:,}
        """

        print(report)

        # Save report to file.  UTF-8 is requested explicitly because the
        # report contains "£", which not every platform default can encode.
        report_path = Path(filename)
        report_path.parent.mkdir(parents=True, exist_ok=True)
        with open(report_path, 'w', encoding='utf-8') as f:
            f.write(report)

        print(f"\nSummary report saved as '{report_path.name}' at directory '{report_path.parent}'")


# Main execution
def main():
    """Drive the whole pipeline: load, explore, clean, analyse, export, report."""
    divider = "=" * 50
    print("BIT2053 - E-commerce Data Analysis Project")
    print(divider)

    analyzer = EcommerceAnalyzer('data/raw_online_retail_II.xlsx')

    # Without data nothing else can run; load_data prints its own error.
    if not analyzer.load_data():
        return

    # Steps 2-6, in the order they must run.
    pipeline = (
        analyzer.explore_data,             # Step 2: explore data
        analyzer.preprocess_data,          # Step 3: preprocess data
        analyzer.perform_analysis,         # Step 4: perform analysis
        analyzer.export_processed_data,    # Step 5: export for BI tools
        analyzer.generate_summary_report,  # Step 6: summary report
    )
    for step in pipeline:
        step()

    closing_lines = (
        "\n" + divider,
        "Analysis completed successfully!",
        "Files generated:",
        "- processed_ecommerce_data.csv (for BI tools)",
        "- analysis_summary_report.txt (summary report)",
    )
    for line in closing_lines:
        print(line)


# Run the pipeline only when this file is executed directly, not on import.
if __name__ == "__main__":
    main()

BIT2053 - E-commerce Data Analysis Project
Loading dataset...
Dataset loaded successfully: (1048575, 8)

=== DATA EXPLORATION ===
Dataset shape: (1048575, 8)

Column names: ['Invoice', 'StockCode', 'Description', 'Quantity', 'InvoiceDate', 'Price', 'Customer ID', 'Country']

Data types:
Invoice                object
StockCode              object
Description            object
Quantity                int64
InvoiceDate    datetime64[ns]
Price                 float64
Customer ID           float64
Country                object
dtype: object

Missing values:
Invoice             0
StockCode           0
Description      4372
Quantity            0
InvoiceDate         0
Price               0
Customer ID    236682
Country             0
dtype: int64

First 5 rows:
  Invoice StockCode                          Description  Quantity  \
0  489434     85048  15CM CHRISTMAS GLASS BALL 20 LIGHTS        12   
1  489434    79323P                   PINK CHERRY LIGHTS        12   
2  489434    79323W        