# Exploratory Data Analysis (EDA)

This notebook performs exploratory data analysis on the e-commerce customer data stored in Snowflake.

## Objectives
1. Connect to Snowflake and retrieve data
2. Analyze customer demographics and behavior
3. Explore transaction patterns
4. Investigate churn patterns
5. Visualize key insights

In [None]:
# Import libraries
import os
import sys

# Make the project packages (src/, config) importable. The location defaults to
# the original deployment path, but can be overridden per machine through the
# SNOWFLAKE_DS_PROJECT_ROOT environment variable instead of editing this cell.
PROJECT_ROOT = os.environ.get('SNOWFLAKE_DS_PROJECT_ROOT', '/home/ubuntu/snowflake_ds_project')
if PROJECT_ROOT not in sys.path:  # re-running the cell must not stack duplicate entries
    sys.path.append(PROJECT_ROOT)

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

from src.snowflake_connector import SnowflakeConnector
from src.feature_engineering import FeatureEngineer
from config import config

# Set visualization style: seaborn grid theme and a wider default matplotlib figure
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)

# Display settings: show every column, cap printed rows at 100
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)

print("Libraries imported successfully!")

## 1. Connect to Snowflake

In [None]:
# Validate configuration first, so this check runs before any connection attempt
config.validate()

# Create connector from the parameters the project config supplies, then open the connection
connector = SnowflakeConnector(config.get_connection_params())
connector.connect()

# Create feature engineer; it is handed the connector (presumably to run its
# queries through it — confirm in src/feature_engineering)
engineer = FeatureEngineer(connector)

print("Connected to Snowflake successfully!")

## 2. Load Data from Snowflake

In [None]:
# Pull the customer 360 view (request capped at 10,000 via `limit`) and report
# how many rows arrived, the frame's shape, and the available columns.
customer_df = engineer.get_customer_360_view(limit=10000)

print(
    f"Loaded {len(customer_df)} customer records",
    f"\nDataset shape: {customer_df.shape}",
    f"\nColumns: {list(customer_df.columns)}",
    sep="\n",
)

In [None]:
# Display first few rows (head() shows 5 by default) to eyeball the column values
customer_df.head()

In [None]:
# Data info: prints each column's dtype and non-null count, plus memory usage
customer_df.info()

In [None]:
# Statistical summary (count, mean, std, min, quartiles, max); with pandas'
# defaults only numeric columns are summarised when the frame has mixed dtypes
customer_df.describe()

## 3. Customer Demographics Analysis

In [None]:
# Age distribution: histogram with a mean marker (left) and a box plot (right).
# Missing ages are dropped before plotting; mean() skips them on its own.
age_values = customer_df['AGE'].dropna()
mean_age = customer_df['AGE'].mean()

fig, (hist_ax, box_ax) = plt.subplots(1, 2, figsize=(15, 5))

# Left panel: histogram plus dashed line at the mean
hist_ax.hist(age_values, bins=30, edgecolor='black', alpha=0.7)
hist_ax.axvline(mean_age, color='red', linestyle='--', label=f'Mean: {mean_age:.1f}')
hist_ax.set(xlabel='Age', ylabel='Frequency', title='Age Distribution')
hist_ax.legend()

# Right panel: box plot of the same values
box_ax.boxplot(age_values)
box_ax.set(ylabel='Age', title='Age Box Plot')

plt.tight_layout()
plt.show()

In [None]:
# Gender distribution as a donut chart (value_counts() leaves out missing values)
gender_counts = customer_df['GENDER'].value_counts()

fig = px.pie(
    names=gender_counts.index,
    values=gender_counts.values,
    hole=0.4,
    title='Gender Distribution',
)
fig.show()

In [None]:
# Customers per country, most common first (value_counts() sorts descending)
country_counts = customer_df['COUNTRY'].value_counts()

country_axis_labels = {'x': 'Country', 'y': 'Number of Customers'}
fig = px.bar(
    x=country_counts.index,
    y=country_counts.values,
    title='Customer Distribution by Country',
    labels=country_axis_labels,
)
fig.show()

In [None]:
# Customers per membership tier, one bar colour per tier.
# Tiers missing from the colour map fall back to plotly's default palette.
tier_counts = customer_df['MEMBERSHIP_TIER'].value_counts()

tier_palette = {'Gold': 'gold', 'Silver': 'silver', 'Bronze': '#CD7F32'}
tier_axis_labels = {'x': 'Membership Tier', 'y': 'Number of Customers'}

fig = px.bar(
    x=tier_counts.index,
    y=tier_counts.values,
    color=tier_counts.index,
    color_discrete_map=tier_palette,
    title='Membership Tier Distribution',
    labels=tier_axis_labels,
)
fig.show()

## 4. Transaction Analysis

In [None]:
# Get transaction summary from the feature engineer and display it; the cells below
# read its PRODUCT_CATEGORY, TOTAL_REVENUE, TRANSACTION_COUNT and AVG_TRANSACTION_VALUE columns
transaction_summary = engineer.get_transaction_summary()
transaction_summary

In [None]:
# Total revenue per product category, one bar per row of the transaction summary
revenue_axis_labels = {
    'TOTAL_REVENUE': 'Total Revenue ($)',
    'PRODUCT_CATEGORY': 'Product Category',
}

fig = px.bar(
    transaction_summary,
    x='PRODUCT_CATEGORY',
    y='TOTAL_REVENUE',
    labels=revenue_axis_labels,
    title='Total Revenue by Product Category',
)
fig.show()

In [None]:
# Bubble chart: transaction volume (x) against average ticket size (y) per category.
# Bubble area encodes total revenue and colour encodes the category.
scatter_axis_labels = {
    'TRANSACTION_COUNT': 'Number of Transactions',
    'AVG_TRANSACTION_VALUE': 'Average Transaction Value ($)',
}

fig = px.scatter(
    transaction_summary,
    x='TRANSACTION_COUNT',
    y='AVG_TRANSACTION_VALUE',
    color='PRODUCT_CATEGORY',
    size='TOTAL_REVENUE',
    hover_data=['PRODUCT_CATEGORY'],
    labels=scatter_axis_labels,
    title='Transaction Count vs Average Value by Category',
)
fig.show()

In [None]:
# Customer spending distribution: one histogram per spend metric, each with a
# dashed red line at the median. Both panels share the same recipe, so they are
# drawn in a single loop driven by (column, x-axis label, title) specs.
spend_panels = [
    ('TOTAL_SPEND', 'Total Spend ($)', 'Total Spend Distribution'),
    ('AVG_TRANSACTION_VALUE', 'Average Transaction Value ($)',
     'Average Transaction Value Distribution'),
]

fig, axes = plt.subplots(1, 2, figsize=(15, 5))

for panel_ax, (column, x_label, panel_title) in zip(axes, spend_panels):
    median_value = customer_df[column].median()

    panel_ax.hist(customer_df[column].dropna(), bins=50, edgecolor='black', alpha=0.7)
    panel_ax.set_xlabel(x_label)
    panel_ax.set_ylabel('Frequency')
    panel_ax.set_title(panel_title)
    panel_ax.axvline(median_value, color='red', linestyle='--',
                     label=f'Median: ${median_value:.2f}')
    panel_ax.legend()

plt.tight_layout()
plt.show()

## 5. Customer Engagement Analysis

In [None]:
# Histogram of page views per customer (up to 50 bins)
fig = px.histogram(
    customer_df,
    x='PAGE_VIEWS',
    nbins=50,
    labels={'PAGE_VIEWS': 'Page Views'},
    title='Page Views Distribution',
)
fig.show()

In [None]:
# Histogram of emails opened per customer (up to 30 bins)
fig = px.histogram(
    customer_df,
    x='EMAIL_OPENED',
    nbins=30,
    labels={'EMAIL_OPENED': 'Emails Opened'},
    title='Email Opened Distribution',
)
fig.show()

In [None]:
# Number of customers at each support-ticket count, ordered by ticket count
support_counts = customer_df['SUPPORT_TICKETS'].value_counts().sort_index()

support_axis_labels = {'x': 'Number of Support Tickets', 'y': 'Number of Customers'}
fig = px.bar(
    x=support_counts.index,
    y=support_counts.values,
    title='Support Tickets Distribution',
    labels=support_axis_labels,
)
fig.show()

## 6. Churn Analysis

In [None]:
# Define churn: a customer is churned when their last purchase was MORE than
# CHURN_THRESHOLD_DAYS days ago (strictly greater, so exactly 90 days is still active).
CHURN_THRESHOLD_DAYS = 90

# NOTE(review): a missing DAYS_SINCE_LAST_PURCHASE compares as False here, so such
# customers (e.g. ones who never purchased) are counted as active — confirm that
# this is the intended treatment.
customer_df['IS_CHURNED'] = (
    customer_df['DAYS_SINCE_LAST_PURCHASE'] > CHURN_THRESHOLD_DAYS
).astype(int)

churn_rate = customer_df['IS_CHURNED'].mean()
print(f"Overall Churn Rate: {churn_rate:.2%}")

# value_counts() only lists classes that actually occur; reindex to both labels so
# that an all-active or all-churned sample reports 0 instead of raising KeyError.
churn_counts = customer_df['IS_CHURNED'].value_counts().reindex([0, 1], fill_value=0)
print(f"\nActive Customers: {churn_counts[0]:,}")
print(f"Churned Customers: {churn_counts[1]:,}")

In [None]:
# Churn by membership tier: churned count, group size and churn rate per tier,
# ordered from the highest churn rate to the lowest.
churn_by_tier = (
    customer_df
    .groupby('MEMBERSHIP_TIER')['IS_CHURNED']
    .agg(Churned='sum', Total='count', Churn_Rate='mean')
    .sort_values('Churn_Rate', ascending=False)
)

# No x is given, so plotly uses the frame's index (the tier) for the x axis
fig = px.bar(
    churn_by_tier,
    y='Churn_Rate',
    labels={'Churn_Rate': 'Churn Rate', 'MEMBERSHIP_TIER': 'Membership Tier'},
    title='Churn Rate by Membership Tier',
)
fig.update_yaxes(tickformat='.1%')
fig.show()

In [None]:
# Churn by country: churned count, group size and churn rate per country,
# ordered from the highest churn rate to the lowest.
churn_by_country = (
    customer_df
    .groupby('COUNTRY')['IS_CHURNED']
    .agg(Churned='sum', Total='count', Churn_Rate='mean')
    .sort_values('Churn_Rate', ascending=False)
)

# No x is given, so plotly uses the frame's index (the country) for the x axis
fig = px.bar(
    churn_by_country,
    y='Churn_Rate',
    labels={'Churn_Rate': 'Churn Rate', 'COUNTRY': 'Country'},
    title='Churn Rate by Country',
)
fig.update_yaxes(tickformat='.1%')
fig.show()

In [None]:
# Compare churned vs active customers: one panel per metric, each holding a pair of
# box plots (active on the left, churned on the right).
comparison_metrics = ['TOTAL_SPEND', 'TOTAL_TRANSACTIONS', 'AVG_TRANSACTION_VALUE',
                     'PAGE_VIEWS', 'EMAIL_OPENED', 'SUPPORT_TICKETS']

# Split the frame once instead of re-filtering it twice for every metric
active_customers = customer_df[customer_df['IS_CHURNED'] == 0]
churned_customers = customer_df[customer_df['IS_CHURNED'] == 1]

fig, axes = plt.subplots(2, 3, figsize=(18, 10))
axes = axes.flatten()

for ax, metric in zip(axes, comparison_metrics):
    ax.boxplot([active_customers[metric].dropna(),
                churned_customers[metric].dropna()])
    # boxplot(labels=...) is deprecated since Matplotlib 3.9 (renamed to tick_labels);
    # setting the tick labels on the axis works on both older and newer versions.
    ax.set_xticklabels(['Active', 'Churned'])
    ax.set_title(metric)
    ax.set_ylabel('Value')

plt.tight_layout()
plt.show()

In [None]:
# Correlation analysis: pairwise correlations between the numeric features and the
# churn flag, drawn as an annotated heatmap centred on zero.
numeric_cols = [
    'AGE', 'TOTAL_TRANSACTIONS', 'TOTAL_SPEND', 'AVG_TRANSACTION_VALUE',
    'DAYS_SINCE_LAST_PURCHASE', 'PAGE_VIEWS', 'SUPPORT_TICKETS',
    'EMAIL_OPENED', 'IS_CHURNED',
]

correlation_matrix = customer_df[numeric_cols].corr()

fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(
    correlation_matrix,
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    center=0,
    square=True,
    linewidths=1,
    ax=ax,
)
ax.set_title('Feature Correlation Matrix')
plt.tight_layout()
plt.show()

## 7. Key Insights Summary

In [None]:
# Plain-text recap of the headline numbers computed in the cells above.
# Requires `customer_df`, `churn_rate` and `churn_counts` from the earlier cells.

# to_dict() yields plain Python ints; dict(value_counts()) keeps NumPy scalars,
# which print as `np.int64(...)` under NumPy 2.
gender_breakdown = customer_df['GENDER'].value_counts().to_dict()

# .get(label, 0) keeps the report working when one churn class has no customers
# (value_counts() omits classes that do not occur, so plain indexing would raise).
churned_total = churn_counts.get(1, 0)
active_total = churn_counts.get(0, 0)

print("=" * 60)
print("KEY INSIGHTS FROM EXPLORATORY DATA ANALYSIS")
print("=" * 60)

print("\n1. CUSTOMER BASE")
print(f"   - Total Customers: {len(customer_df):,}")
print(f"   - Average Age: {customer_df['AGE'].mean():.1f} years")
print(f"   - Gender Distribution: {gender_breakdown}")

print("\n2. TRANSACTION BEHAVIOR")
print(f"   - Average Total Spend: ${customer_df['TOTAL_SPEND'].mean():.2f}")
print(f"   - Median Total Spend: ${customer_df['TOTAL_SPEND'].median():.2f}")
print(f"   - Average Transactions per Customer: {customer_df['TOTAL_TRANSACTIONS'].mean():.1f}")

print("\n3. CHURN ANALYSIS")
print(f"   - Overall Churn Rate: {churn_rate:.2%}")
print(f"   - Churned Customers: {churned_total:,}")
print(f"   - Active Customers: {active_total:,}")

print("\n4. ENGAGEMENT METRICS")
print(f"   - Average Page Views: {customer_df['PAGE_VIEWS'].mean():.1f}")
print(f"   - Average Emails Opened: {customer_df['EMAIL_OPENED'].mean():.1f}")
print(f"   - Average Support Tickets: {customer_df['SUPPORT_TICKETS'].mean():.2f}")

print("\n" + "=" * 60)

## 8. Cleanup

In [None]:
# Disconnect from Snowflake once the analysis is finished. Run this cell last:
# the data-loading cells above go through `engineer`, which was built on this connector.
connector.disconnect()
print("Disconnected from Snowflake")