In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime

In [None]:
import pandas as pd
import os
from pathlib import Path

def memory_safe_merge(folder_path, merge_key='CustomerID', output_file='merged.parquet'):
    """
    Merge every Parquet file in ``folder_path`` into one Parquet file.

    Files are processed one at a time in sorted name order. The first file is
    the base; each following file is outer-merged onto the running result on
    ``merge_key``. Columns that exist on both sides keep the value from the
    running result (the right-hand duplicates are dropped). A file that lacks
    ``merge_key`` is appended row-wise instead of merged.

    ``pandas.read_parquet`` has no ``chunksize`` option, so each input file is
    loaded whole; only one input file plus the running result are held in
    memory at any time.

    Parameters
    ----------
    folder_path : str or Path
        Folder searched (non-recursively) for ``*.parquet`` files.
    merge_key : str
        Column used as the join key.
    output_file : str or Path
        Destination of the merged Parquet file. If it lives inside
        ``folder_path`` it is never treated as an input.

    Returns
    -------
    The ``output_file`` argument, unchanged.

    Raises
    ------
    FileNotFoundError
        If ``folder_path`` contains no input Parquet files.
    """
    folder_path = Path(folder_path)
    output_path = Path(output_file).resolve()

    # Sorted for a deterministic base file / merge order; a previous output
    # sitting in the same folder must not be merged into itself on re-runs.
    parquet_files = sorted(
        candidate
        for candidate in folder_path.glob('*.parquet')
        if candidate.resolve() != output_path
    )

    if not parquet_files:
        raise FileNotFoundError(f"No Parquet files found in {folder_path}")

    first_file = parquet_files[0]
    print(f"Processing {first_file.name} as base...")
    merged = pd.read_parquet(first_file)

    for file in parquet_files[1:]:
        print(f"\nMerging {file.name}...")
        file_df = pd.read_parquet(file)

        # Without the key on both sides a join is impossible: append instead.
        if merge_key not in file_df.columns or merge_key not in merged.columns:
            print(f"Warning: {merge_key} not found - concatenating instead")
            merged = pd.concat([merged, file_df], ignore_index=True)
        else:
            merged = pd.merge(
                merged,
                file_df,
                on=merge_key,
                how='outer',
                suffixes=('', '_DROP')
            )
            # Overlapping non-key columns from the new file carry the _DROP
            # suffix; keep only the running result's version.
            merged = merged.loc[:, ~merged.columns.str.endswith('_DROP')]

        # Release the input frame before the next file is loaded.
        del file_df

    merged.to_parquet(output_file)

    print("\nMerge complete!")
    return output_file

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path

def execute_olap_queries(merged_file):
    """
    Run three OLAP-style queries on the merged Parquet file.

    The file is read once (``pandas.read_parquet`` has no ``chunksize``
    option) and must contain ``Country``, ``InvoiceDate`` and ``TotalSales``
    columns; ``Category`` is optional.

    Queries and side effects:
      1. ROLL-UP: ``TotalSales`` summed by Country and Quarter; line chart of
         the five largest countries saved as ``quarterly_sales.png`` and the
         table as ``rollup_results.csv``.
      2. DRILL-DOWN: monthly ``TotalSales`` for the country with the largest
         total; bar chart saved as ``monthly_sales_<country>.png`` and the
         table as ``drilldown_results.csv``.
      3. SLICE: ``TotalSales`` by Country for rows whose ``Category`` is
         ``'Electronics'`` (top 10); bar chart saved as
         ``electronics_sales.png`` and the table as ``electronics_sales.csv``.
         Skipped when there are no such rows.

    Returns
    -------
    dict
        ``'rollup'`` and ``'drilldown'`` map to DataFrames; ``'slice'`` maps
        to a DataFrame or ``None`` when no electronics rows exist.

    Raises
    ------
    ValueError
        If the roll-up produces no rows, so no top country can be chosen.
    """
    results = {}

    sales = pd.read_parquet(merged_file)
    invoice_dates = pd.to_datetime(sales['InvoiceDate'])

    # 1. ROLL-UP: Sales by Country and Quarter
    print("Executing ROLL-UP: Sales by Country/Quarter...")
    rollup_final = (
        sales.assign(Quarter=invoice_dates.dt.quarter)
        .groupby(['Country', 'Quarter'])['TotalSales']
        .sum()
        .reset_index()
    )
    results['rollup'] = rollup_final

    if rollup_final.empty:
        # idxmax below would fail with an opaque message on empty data.
        raise ValueError(f"No sales rows with a valid Country/InvoiceDate in {merged_file}")

    country_totals = rollup_final.groupby('Country')['TotalSales'].sum()

    # Visualization
    fig, ax = plt.subplots(figsize=(12, 6))
    for country in country_totals.nlargest(5).index:
        country_data = rollup_final[rollup_final['Country'] == country]
        ax.plot(country_data['Quarter'], country_data['TotalSales'], label=country, marker='o')
    ax.set_title('Quarterly Sales by Top Countries')
    ax.set_xlabel('Quarter')
    ax.set_ylabel('Total Sales')
    ax.legend()
    fig.savefig('quarterly_sales.png', dpi=150, bbox_inches='tight')
    plt.close(fig)

    # 2. DRILL-DOWN: Monthly sales for top country
    top_country = country_totals.idxmax()
    print(f"\nExecuting DRILL-DOWN for {top_country}...")

    in_top_country = sales['Country'] == top_country
    top_country_dates = invoice_dates[in_top_country]
    monthly_sales = sales.loc[in_top_country, ['TotalSales']].assign(
        Month=top_country_dates.dt.month,
        MonthName=top_country_dates.dt.month_name(),
    )
    # Grouping on Month first keeps the rows in calendar order.
    drilldown_final = (
        monthly_sales.groupby(['Month', 'MonthName'])['TotalSales'].sum().reset_index()
    )
    results['drilldown'] = drilldown_final

    # Visualization
    fig, ax = plt.subplots(figsize=(10, 5))
    ax.bar(drilldown_final['MonthName'], drilldown_final['TotalSales'])
    ax.set_title(f'Monthly Sales for {top_country}')
    ax.tick_params(axis='x', rotation=45)
    fig.savefig(f'monthly_sales_{top_country}.png', dpi=150, bbox_inches='tight')
    plt.close(fig)

    # 3. SLICE: Electronics sales by country
    print("\nExecuting SLICE: Electronics sales by country...")
    if 'Category' in sales.columns:
        electronics_sales = sales[sales['Category'] == 'Electronics']
    else:
        electronics_sales = sales.iloc[0:0]

    if len(electronics_sales) > 0:
        slice_final = (
            electronics_sales.groupby('Country')['TotalSales'].sum().nlargest(10).reset_index()
        )
        results['slice'] = slice_final

        # Visualization
        fig, ax = plt.subplots(figsize=(10, 5))
        ax.bar(slice_final['Country'], slice_final['TotalSales'])
        ax.set_title('Top Countries for Electronics Sales')
        ax.tick_params(axis='x', rotation=45)
        fig.savefig('electronics_sales.png', dpi=150, bbox_inches='tight')
        plt.close(fig)
    else:
        print("No electronics data found")
        results['slice'] = None

    # Save results to CSV
    rollup_final.to_csv('rollup_results.csv', index=False)
    drilldown_final.to_csv('drilldown_results.csv', index=False)
    if results['slice'] is not None:
        results['slice'].to_csv('electronics_sales.csv', index=False)

    return results

# Run the OLAP queries against the merged file and report what was produced.
merged_file = 'merged.parquet'  # From previous step
try:
    olap_results = execute_olap_queries(merged_file)

    # Fixed report of the artefacts the query function writes to disk.
    report_lines = (
        "\nOLAP Analysis Complete!",
        "Generated Files:",
        "- quarterly_sales.png",
        "- monthly_sales_[country].png",
        "- electronics_sales.png (if data exists)",
        "- CSV files with raw results",
    )
    for report_line in report_lines:
        print(report_line)

    # Summary: the five countries with the largest total across all quarters.
    print("\nTop Countries by Quarterly Sales:")
    sales_per_country = olap_results['rollup'].groupby('Country')['TotalSales'].sum()
    print(sales_per_country.nlargest(5))

except Exception as err:
    # Any failure is reported as a single line rather than a traceback.
    print(f"\nAnalysis failed: {err}")

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import os
from pathlib import Path
from datetime import datetime

def analyze_merged_data(merged_df):
    """
    Perform OLAP analysis (roll-up, drill-down, slice) on the merged dataframe.

    ``merged_df`` is modified in place: ``InvoiceDate`` is converted to
    datetime (unparseable values become NaT), the ``Year``, ``Quarter``,
    ``Month`` and ``MonthName`` columns are added, and ``Quantity``,
    ``UnitPrice`` and ``TotalSales`` are coerced to numeric when present.

    Each query runs in its own ``try`` block so one failure does not stop the
    others. Charts are saved as ``quarterly_sales.png``,
    ``monthly_sales_<country>.png`` and ``electronics_sales.png``. Tables that
    were produced are written to ``olap_results.xlsx``, falling back to CSV
    files if the Excel export fails.

    Parameters
    ----------
    merged_df : pandas.DataFrame
        Must contain ``InvoiceDate``; the queries use ``Country`` and
        ``TotalSales``; ``Category`` is optional.

    Returns
    -------
    dict
        Keys ``'rollup'``, ``'drilldown'`` and ``'slice'``; a value is
        ``None`` when the corresponding query failed or was skipped.
    """
    # Data Quality Checks
    print("\n=== Data Quality Check ===")
    print(f"Total rows: {len(merged_df)}")
    print("Null values per column:")
    print(merged_df.isnull().sum().sort_values(ascending=False))

    # Convert and validate InvoiceDate
    merged_df['InvoiceDate'] = pd.to_datetime(merged_df['InvoiceDate'], errors='coerce')
    date_nulls = merged_df['InvoiceDate'].isnull().sum()
    if date_nulls > 0:
        print(f"\nWarning: {date_nulls} rows have invalid dates")

    # Create time dimensions
    merged_df['Year'] = merged_df['InvoiceDate'].dt.year
    merged_df['Quarter'] = merged_df['InvoiceDate'].dt.quarter
    merged_df['Month'] = merged_df['InvoiceDate'].dt.month
    merged_df['MonthName'] = merged_df['InvoiceDate'].dt.month_name()

    # Ensure numeric fields
    for col in ['Quantity', 'UnitPrice', 'TotalSales']:
        if col in merged_df.columns:
            merged_df[col] = pd.to_numeric(merged_df[col], errors='coerce')

    has_category = 'Category' in merged_df.columns

    # Pre-set so the save step can tell which queries actually produced output.
    rollup = None
    drilldown = None
    electronics_sales = None

    # OLAP Query 1: ROLL-UP by Country and Quarter
    try:
        rollup = merged_df.groupby(['Country', 'Year', 'Quarter'])['TotalSales'].sum().reset_index()
        # Year/Quarter are floats when any date was NaT; groupby already
        # dropped those rows, so the cast is safe and avoids "2011.0Q4.0".
        rollup[['Year', 'Quarter']] = rollup[['Year', 'Quarter']].astype(int)

        country_totals = rollup.groupby('Country')['TotalSales'].sum()
        print("\nROLL-UP Results (Top 10 Countries):")
        print(country_totals.nlargest(10))

        # Visualization: five largest countries by sales on one shared,
        # chronologically ordered period axis.
        top_countries = country_totals.nlargest(5).index
        trend = (
            rollup[rollup['Country'].isin(top_countries)]
            .pivot(index=['Year', 'Quarter'], columns='Country', values='TotalSales')
            .sort_index()
        )
        period_labels = [f"{year}Q{quarter}" for year, quarter in trend.index]

        fig, ax = plt.subplots(figsize=(14, 7))
        for country in top_countries:
            ax.plot(period_labels, trend[country], label=country, marker='o')
        ax.set_title('Quarterly Sales by Country')
        ax.set_ylabel('Total Sales')
        ax.set_xlabel('Quarter')
        ax.legend()
        ax.grid(True)
        fig.tight_layout()
        fig.savefig('quarterly_sales.png', dpi=300)
        plt.show()
        plt.close(fig)
    except Exception as e:
        print(f"\nRoll-up failed: {str(e)}")

    # OLAP Query 2: DRILL-DOWN for top country
    try:
        top_country = merged_df.groupby('Country')['TotalSales'].sum().idxmax()
        drilldown = merged_df[merged_df['Country'] == top_country].groupby(
            ['Year', 'Month', 'MonthName'])['TotalSales'].sum().reset_index()
        drilldown[['Year', 'Month']] = drilldown[['Year', 'Month']].astype(int)

        print(f"\nDRILL-DOWN for {top_country}:")
        print(drilldown)

        # Visualization: label bars with month AND year so the same month in
        # different years is not drawn on top of itself.
        month_labels = drilldown['MonthName'].str[:3] + ' ' + drilldown['Year'].astype(str)

        fig, ax = plt.subplots(figsize=(12, 6))
        ax.bar(month_labels, drilldown['TotalSales'])
        ax.set_title(f'Monthly Sales for {top_country}')
        ax.set_ylabel('Total Sales')
        ax.set_xlabel('Month')
        ax.tick_params(axis='x', rotation=45)
        fig.tight_layout()
        fig.savefig(f'monthly_sales_{top_country}.png', dpi=300)
        plt.show()
        plt.close(fig)
    except Exception as e:
        print(f"\nDrill-down failed: {str(e)}")

    # OLAP Query 3: SLICE by Category
    if has_category:
        try:
            electronics_sales = merged_df[merged_df['Category'] == 'Electronics'].groupby(
                'Country')['TotalSales'].sum().nlargest(10)
            print("\nSLICE for Electronics:")
            print(electronics_sales)

            if electronics_sales.empty:
                print("No electronics data found - skipping chart")
            else:
                # Visualization
                fig, ax = plt.subplots(figsize=(12, 6))
                electronics_sales.plot(kind='bar', ax=ax)
                ax.set_title('Top Countries for Electronics Sales')
                ax.set_ylabel('Total Sales')
                ax.tick_params(axis='x', rotation=45)
                fig.tight_layout()
                fig.savefig('electronics_sales.png', dpi=300)
                plt.show()
                plt.close(fig)
        except Exception as e:
            print(f"\nSlice failed: {str(e)}")
    else:
        print("\n'Category' column not available for slicing")

    results = {'rollup': rollup, 'drilldown': drilldown, 'slice': electronics_sales}

    # Save results: only the tables that were actually produced.
    tables = {}
    if rollup is not None:
        tables['Rollup'] = rollup
    if drilldown is not None:
        tables['Drilldown'] = drilldown
    if has_category:
        tables['Electronics'] = merged_df[merged_df['Category'] == 'Electronics']

    if not tables:
        print("\nNo results to save")
        return results

    try:
        with pd.ExcelWriter('olap_results.xlsx') as writer:
            for sheet_name, table in tables.items():
                table.to_excel(writer, sheet_name=sheet_name, index=False)
        print("\nResults saved to olap_results.xlsx")
    except Exception as e:
        print(f"\nCould not save to Excel: {str(e)}")
        print("Saving to CSV instead...")
        csv_names = {
            'Rollup': 'rollup_results.csv',
            'Drilldown': 'drilldown_results.csv',
            'Electronics': 'electronics_sales.csv',
        }
        for sheet_name, table in tables.items():
            table.to_csv(csv_names[sheet_name], index=False)

    return results

# Main execution: merge the Parquet files, load the result, run the analysis.
if __name__ == "__main__":
    # NOTE(review): machine-specific absolute path; adjust before running elsewhere.
    folder_path = r"C:\Users\Snit Kahsay\Desktop\DSA-2040_Practical_Exam_SnitTeshome552\Task_2_ETL_Process_Implementation\Task_2_ETL_Process_Implementation"
    
    try:
        print("Starting data merge...")
        # memory_safe_merge (defined earlier in this notebook) writes the merged
        # Parquet file and returns its path, so the frame is loaded from disk.
        merged_path = memory_safe_merge(folder_path, merge_key='CustomerID')
        merged_df = pd.read_parquet(merged_path)
        
        print("\nStarting OLAP analysis...")
        analyze_merged_data(merged_df)
        
        print("\nAnalysis complete! Check generated files:")
        print("- quarterly_sales.png")
        print("- monthly_sales_[country].png")
        print("- electronics_sales.png (if category exists)")
        print("- olap_results.xlsx (or CSV files)")
        
    except Exception as e:
        # Report the failure with troubleshooting hints instead of a traceback.
        print(f"\nFatal error: {str(e)}")
        print("Check that:")
        print("1. Files exist in the specified folder")
        print("2. 'CustomerID' column exists in all files")
        print("3. You have sufficient memory")

In [None]:
# Sanity checks on `full_df`.
# NOTE(review): `full_df` is not created in any cell visible here - presumably
# it is the merged sales frame loaded elsewhere; confirm before Run All.

# 1. Check basic DataFrame info
print("\n=== DataFrame Info ===")
print(f"Total rows: {len(full_df)}")
print(f"Columns: {full_df.columns.tolist()}")
print(f"Null counts:\n{full_df[['Country', 'InvoiceDate', 'Quantity']].isnull().sum()}")

# 2. Verify datetime conversion
print("\n=== Date Conversion Check ===")
print(f"Date range: {full_df['InvoiceDate'].min()} to {full_df['InvoiceDate'].max()}")
# 'Quarter' is only added by the time-dimension cell further down, so guard
# against running this cell first on a fresh kernel.
if 'Quarter' in full_df.columns:
    print(f"Quarter sample:\n{full_df[['InvoiceDate', 'Quarter']].head()}")
else:
    print("Quarter column not created yet - run the time-dimension cell first")

# 3. Check category assignment
print("\n=== Category Check ===")
# 'Category' may likewise be created only by the following cell.
if 'Category' in full_df.columns:
    print("Category distribution:")
    print(full_df['Category'].value_counts(dropna=False))
else:
    print("Category column not created yet - run the category cell first")

In [None]:
# Parse InvoiceDate once, store it back, and derive the time dimensions from it
invoice_dates = pd.to_datetime(full_df['InvoiceDate'])
full_df['InvoiceDate'] = invoice_dates
full_df['Quarter'] = invoice_dates.dt.quarter
full_df['Month'] = invoice_dates.dt.month
full_df['MonthName'] = invoice_dates.dt.month_name()


def _stock_code_category(stock_code):
    """Map a stock code to a synthetic category from its first character."""
    # Simple categorization - in real scenario use proper mapping
    prefix_to_category = {'5': 'Electronics', '2': 'Clothing'}
    return prefix_to_category.get(str(stock_code)[:1], 'Other')


# Only synthesise a product category when the data does not already carry one
if 'Category' not in full_df.columns:
    full_df['Category'] = full_df['StockCode'].map(_stock_code_category)

# OLAP Query 1: ROLL-UP - Quantity summed per Country and quarter
quantity_by_country_quarter = full_df.groupby(['Country', 'Quarter'])['Quantity'].sum()
rollup_result = quantity_by_country_quarter.reset_index()
print("ROLL-UP Result (Sales by Country & Quarter):")
print(rollup_result.head(10))

In [None]:
# OLAP Query 2: DRILL-DOWN - Quantity per month for a single country
specific_country = 'United Kingdom'  # Change as needed

# Restrict to the chosen country, total Quantity per month, order by month number
country_rows = full_df[full_df['Country'] == specific_country]
monthly_quantity = country_rows.groupby(['MonthName', 'Month'])['Quantity'].sum()
drilldown_result = monthly_quantity.reset_index().sort_values('Month')

print(f"\nDRILL-DOWN Result (Sales for {specific_country} by Month):")
print(drilldown_result)


In [None]:
# OLAP Query 3: SLICE - Total quantity for the Electronics category by country
slice_result = full_df[full_df['Category'] == 'Electronics'].groupby(
    'Country')['Quantity'].sum().reset_index().sort_values('Quantity', ascending=False)
print("\nSLICE Result (Electronics Sales by Country):")
print(slice_result.head(10))

# Visualization for one query (Roll-up example)
# Pick the five countries with the largest total quantity; taking the first
# five unique names would just select them alphabetically.
top_countries = rollup_result.groupby('Country')['Quantity'].sum().nlargest(5).index

fig, ax = plt.subplots(figsize=(12, 6))
for country in top_countries:
    country_data = rollup_result[rollup_result['Country'] == country].sort_values('Quarter')
    ax.plot(country_data['Quarter'], country_data['Quantity'],
            label=country, marker='o')

ax.set_title('Quarterly Sales Trends by Country')
ax.set_xlabel('Quarter')
ax.set_ylabel('Total Quantity Sold')
ax.set_xticks([1, 2, 3, 4])
ax.legend()
ax.grid(True)
fig.tight_layout()
fig.savefig('quarterly_sales_trends.png')
plt.show()
# Free the figure so repeated runs of this cell do not accumulate open figures.
plt.close(fig)