In [None]:
# Null count per column; only columns that actually contain nulls are displayed.
missing_values = df.isna().sum()
print("Missing Values:")
display(missing_values.loc[missing_values.gt(0)])

# Count the rows flagged by DataFrame.duplicated().
duplicate_flags = df.duplicated()
duplicates = duplicate_flags.sum()
print(f"\nNumber of duplicate rows: {duplicates}")

# Check for outliers using IQR method
def detect_outliers(df, column, factor=1.5):
    """Count the values of one column that fall outside the IQR fences.

    The fences are ``Q1 - factor * IQR`` and ``Q3 + factor * IQR``, where Q1 and
    Q3 are the 25th and 75th percentiles of ``df[column]`` and ``IQR = Q3 - Q1``.

    Args:
        df: DataFrame holding the data.
        column: Label of the numeric column to inspect.
        factor: Fence width as a multiple of the IQR. Defaults to 1.5.

    Returns:
        int: Number of values strictly below the lower fence or strictly above
        the upper fence. Missing values never compare as outside a fence, so
        they are not counted.
    """
    values = df[column]
    q1, q3 = values.quantile(0.25), values.quantile(0.75)
    spread = factor * (q3 - q1)
    # Count on the boolean mask directly; the filtered frame itself is never needed.
    is_outlier = (values < q1 - spread) | (values > q3 + spread)
    return int(is_outlier.sum())

print("\nOutlier Analysis:")
# Report only the numerical columns for which detect_outliers finds something.
for col in numerical_features:
    outlier_count = detect_outliers(df, col)
    if outlier_count <= 0:
        continue
    print(f"{col}: {outlier_count} outliers")

## 5. Data Quality Check

In [None]:
# Histogram of page_values, colored by revenue, with a box plot in the margin.
page_values_hist_options = dict(
    x="page_values",
    color="revenue",
    marginal="box",
    title="Distribution of Page Values by Revenue",
)
fig = px.histogram(df, **page_values_hist_options)
fig.show()

# Row counts for every (month, revenue) pair, pivoted so that each revenue
# value becomes its own column and each month its own row.
monthly_revenue = (
    df.groupby(['month', 'revenue'])['revenue']
    .count()
    .unstack()
)
fig = px.bar(
    monthly_revenue,
    barmode='group',
    labels={'value': 'Count', 'month': 'Month'},
    title="Monthly Purchase Patterns",
)
fig.show()

# Cross-tabulate visitor type against revenue, then chart the True column
# with one pie slice per visitor type.
visitor_revenue = pd.crosstab(index=df['visitor_type'], columns=df['revenue'])
fig = px.pie(
    visitor_revenue,
    names=visitor_revenue.index,
    values=visitor_revenue[True],
    title="Revenue Distribution by Visitor Type",
)
fig.show()

## 4. Visualizations

In [None]:
# Pairwise correlations between the numerical columns.
correlation_matrix = df[numerical_features].corr()

# Annotated heatmap with the colormap centered on zero.
plt.figure(figsize=(12, 8))
heatmap_ax = sns.heatmap(
    correlation_matrix,
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    center=0,
)
heatmap_ax.set_title('Correlation Matrix of Numerical Features')
plt.tight_layout()
plt.show()

## 3. Correlation Analysis

In [None]:
# Columns stored as float64/int64, followed by their descriptive statistics.
numerical_features = df.select_dtypes(include=['float64', 'int64']).columns
print("Numerical Features Summary:")
numerical_summary = df[numerical_features].describe()
display(numerical_summary)

# Columns stored as object/bool, followed by the frequency of each distinct value.
categorical_features = df.select_dtypes(include=['object', 'bool']).columns
print("\nCategorical Features Summary:")
for col, column_values in df[categorical_features].items():
    print(f"\n{col} value counts:")
    display(column_values.value_counts())

## 2. Feature Analysis

In [None]:
# Read the raw CSV into a DataFrame.
raw_csv_path = '../data/raw/online_shoppers_intention.csv'
df = pd.read_csv(raw_csv_path)

# Overview: dimensions, column names, the leading rows, then the info() report.
print("Dataset Shape:", df.shape)
print("\nColumns:", df.columns.to_list())
print("\nSample of data:")
display(df.head(5))
print("\nData Info:")
df.info()

## 1. Data Loading and Overview

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import plotly.express as px
import plotly.graph_objects as go

# Set style
# Matplotlib 3.6 renamed its bundled seaborn style sheets to "seaborn-v0_8*" and
# later releases dropped the old names, so prefer the new name and fall back to
# the legacy one only on older installs.
seaborn_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(seaborn_style)
sns.set_palette("husl")

# 🔍 Exploratory Data Analysis - Online Shopper Intention

This notebook performs exploratory data analysis on the online shopper intention dataset to understand:
1. Data structure and basic statistics
2. Feature distributions and relationships
3. Customer behavior patterns
4. Potential predictors of purchase intention