In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# --- Load -------------------------------------------------------------
# Read the churn dataset from CSV; the path is relative to the directory
# the notebook is executed from.
data_path = '../data/ecommerce_churn_data.csv'
df = pd.read_csv(data_path)

# Show the first rows of the frame (default `head()` size).
preview_rows = df.head()
display(preview_rows)

# --- Structure --------------------------------------------------------
# Shape first, then the per-column report produced by `DataFrame.info()`.
shape = df.shape
print("Dataset Shape:", shape)
print("Column Info:")
df.info()

# --- Missing values ---------------------------------------------------
# Count of null entries per column.
missing_per_column = df.isnull().sum()
print("\nMissing Values:")
print(missing_per_column)

# --- Descriptive statistics -------------------------------------------
summary_statistics = df.describe()
print("\nSummary Statistics:")
print(summary_statistics)

# Churn distribution
# Bar chart of how many rows fall into each value of the 'Churn' column,
# drawn on an explicitly created 6x4 figure/axes pair.
# NOTE(review): recent seaborn releases emit a deprecation warning when
# `palette` is passed without `hue` -- confirm against the installed version.
churn_fig, churn_ax = plt.subplots(figsize=(6, 4))
sns.countplot(data=df, x='Churn', palette='viridis', ax=churn_ax)
churn_ax.set_title('Churn Distribution')
plt.show()

# Correlation heatmap
# Correlate only numeric/boolean columns. Since pandas 2.0,
# `DataFrame.corr()` no longer discards non-numeric columns by default and
# raises when the frame contains string/categorical data; selecting the
# columns explicitly works on every pandas version.
numeric_columns_df = df.select_dtypes(include=['number', 'bool'])
correlation_matrix = numeric_columns_df.corr()

# Annotated heatmap of the pairwise correlations (two-decimal labels).
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, fmt='.2f', cmap='coolwarm', cbar=True)
plt.title('Feature Correlation Heatmap')
plt.show()

# Distribution of numerical features
# Columns stored as int64 or float64 are treated as the numerical features;
# the resulting Index is reused by later steps of the analysis.
numerical_features = df.select_dtypes(include=['int64', 'float64']).columns

# One figure per numerical column: a 30-bin histogram with a KDE overlay.
for column_name in numerical_features:
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.histplot(df[column_name], kde=True, bins=30, color='blue', ax=ax)
    ax.set_title(f'Distribution of {column_name}')
    plt.show()

# Box plots for numerical features grouped by churn
# One figure per numerical column, with the column's values split by the
# value of 'Churn' on the x-axis.
for feature in numerical_features:
    # When 'Churn' is itself stored as a number it appears in
    # `numerical_features`; plotting it against itself yields a degenerate
    # chart (each box collapses to its own group value), so skip it.
    if feature == 'Churn':
        continue
    plt.figure(figsize=(6, 4))
    # NOTE(review): recent seaborn releases warn when `palette` is passed
    # without `hue` -- confirm against the installed version.
    sns.boxplot(data=df, x='Churn', y=feature, palette='Set2')
    plt.title(f'{feature} by Churn')
    plt.show()

# Summary insights
# Static reminders of what each part of the analysis above is meant to show;
# printed one per line under an "Insights:" heading.
insight_lines = (
    "1. Churn distribution shows class balance or imbalance.",
    "2. Correlation matrix helps identify potential relationships between features and target.",
    "3. Numerical feature distributions reveal skewness or outliers.",
)
print("\nInsights:")
for insight in insight_lines:
    print(insight)