# Data Exploration

This notebook explores the customer data to understand its characteristics and prepare for analysis.

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import sys

# Make the project root importable so that `from src.<module> import ...` resolves.
# os.path.abspath('') is the current working directory (the notebook folder, given
# the '../data' and '../visualizations' paths used below), so its parent is the
# project root that contains the `src` package. The previous code joined '../src'
# onto that parent, which pointed outside the project altogether.
PROJECT_ROOT = os.path.dirname(os.path.abspath(''))
if PROJECT_ROOT not in sys.path:  # keep re-runs of this cell from piling up duplicates
    sys.path.append(PROJECT_ROOT)

# Import modules
from src.data_preprocessing import DataPreprocessor

# Pandas display settings: allow up to 50 columns and a 1000-character wide repr
for option_name, option_value in (('display.max_columns', 50), ('display.width', 1000)):
    pd.set_option(option_name, option_value)

# Plot appearance: apply the style sheet first, then the seaborn colour palette
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette('viridis')

## 1. Load Data

In [None]:
# Relative path of the raw customer file
raw_data_path = '../data/raw/customer_data.csv'

# Build the project preprocessor and read the raw file through it
preprocessor = DataPreprocessor()
df = preprocessor.load_data(raw_data_path)

# Preview the first rows as the cell's displayed result
df.head()

## 2. Data Overview

In [None]:
# Report the shape, then one headed section per column-level summary
print(f"Dataset shape: {df.shape}")
for heading, summary in (
    ("\nData types:", df.dtypes),
    ("\nMissing values:", df.isnull().sum()),
):
    print(heading)
    print(summary)

# The descriptive statistics table is the cell's displayed result
print("\nSummary statistics:")
df.describe()

## 3. Target Variable Analysis

In [None]:
# Analyze churn distribution: absolute counts and percentage share per class
churn_counts = df['Churn'].value_counts()
churn_percentage = df['Churn'].value_counts(normalize=True) * 100

print("Churn Distribution:")
print(churn_counts)
print("\nChurn Percentage:")
print(churn_percentage)

# Visualize churn distribution
fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(x='Churn', data=df, ax=ax)
ax.set_title('Churn Distribution')
ax.set_xlabel('Churn (0=No, 1=Yes)')
ax.set_ylabel('Count')

# Add percentage labels above each bar.
# The denominator is the number of non-missing Churn values so the labels agree
# with `churn_percentage` printed above (value_counts ignores NaN).
total = churn_counts.sum()
for p in ax.patches:
    height = p.get_height()
    # Offset the label in points rather than data units, so the gap above the
    # bar looks the same whatever the size of the dataset.
    ax.annotate(f'{height/total*100:.1f}%',
                xy=(p.get_x() + p.get_width()/2., height),
                xytext=(0, 3), textcoords='offset points',
                ha='center', va='bottom')

fig.tight_layout()
# savefig does not create missing directories; make sure the target exists
os.makedirs('../visualizations', exist_ok=True)
fig.savefig('../visualizations/churn_distribution.png', dpi=300)
plt.show()

## 4. Feature Analysis

In [None]:
# Analyze numerical features, leaving out the target and the identifier column.
# Filtering (instead of list.remove) avoids a ValueError when either column is
# missing from the numeric selection, e.g. an ID loaded as text.
numerical_features = [
    col for col in df.select_dtypes(include=[np.number]).columns
    if col not in ('Churn', 'CustomerID')
]

print(f"Numerical features: {numerical_features}")

# Create histograms for numerical features
if numerical_features:
    # Size the grid from the number of features; a fixed 3x3 grid raises as soon
    # as there is a tenth feature.
    n_cols = 3
    n_rows = -(-len(numerical_features) // n_cols)  # ceiling division
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 4 * n_rows), squeeze=False)
    flat_axes = axes.ravel()

    for ax, feature in zip(flat_axes, numerical_features):
        sns.histplot(df[feature], kde=True, ax=ax)
        ax.set_title(f'Distribution of {feature}')

    # Hide grid cells that have no feature to show
    for ax in flat_axes[len(numerical_features):]:
        ax.set_visible(False)

    # Lay out once, after every subplot has been drawn
    fig.tight_layout()
    # savefig does not create missing directories; make sure the target exists
    os.makedirs('../visualizations', exist_ok=True)
    fig.savefig('../visualizations/feature_distributions.png', dpi=300)
    plt.show()

In [None]:
# Analyze categorical features (columns stored with object dtype)
categorical_features = df.select_dtypes(include=['object']).columns.tolist()

print(f"Categorical features: {categorical_features}")

# Create count plots for categorical features
if categorical_features:
    # Size the grid from the number of features; a fixed 2x2 grid raises as soon
    # as there is a fifth categorical column.
    n_cols = 2
    n_rows = -(-len(categorical_features) // n_cols)  # ceiling division
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 5 * n_rows), squeeze=False)
    flat_axes = axes.ravel()

    for ax, feature in zip(flat_axes, categorical_features):
        sns.countplot(y=feature, data=df, ax=ax)
        ax.set_title(f'Count of {feature}')

    # Hide grid cells that have no feature to show
    for ax in flat_axes[len(categorical_features):]:
        ax.set_visible(False)

    # Lay out once, after every subplot has been drawn
    fig.tight_layout()
    # savefig does not create missing directories; make sure the target exists
    os.makedirs('../visualizations', exist_ok=True)
    fig.savefig('../visualizations/categorical_features.png', dpi=300)
    plt.show()

## 5. Correlation Analysis

In [None]:
# Calculate correlation matrix on the numeric columns only.
# Since pandas 2.0, DataFrame.corr() no longer drops non-numeric columns
# silently and raises on object columns such as the categorical features
# listed earlier. Selecting numeric dtypes first works on old and new pandas.
correlation_matrix = df.select_dtypes(include=[np.number]).corr()

# Plot correlation heatmap
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f',
            linewidths=0.5, ax=ax)
ax.set_title('Correlation Matrix')
fig.tight_layout()
# savefig does not create missing directories; make sure the target exists
os.makedirs('../visualizations', exist_ok=True)
fig.savefig('../visualizations/correlation_matrix.png', dpi=300)
plt.show()

# Show correlations with churn, strongest positive first
churn_correlations = correlation_matrix['Churn'].sort_values(ascending=False)
print("Correlations with Churn:")
print(churn_correlations)

## 6. Churn by Features

In [None]:
# Analyze churn by numerical features: one box plot per feature, split by churn.
# Only the first six features are shown, matching the 3x2 grid.
# `numerical_features` comes from the feature-analysis cell above.
fig, axes = plt.subplots(3, 2, figsize=(15, 12))
flat_axes = axes.ravel()
plotted_features = numerical_features[:6]

for ax, feature in zip(flat_axes, plotted_features):
    sns.boxplot(x='Churn', y=feature, data=df, ax=ax)
    ax.set_title(f'{feature} by Churn')

# Hide grid cells that have no feature to show
for ax in flat_axes[len(plotted_features):]:
    ax.set_visible(False)

# Lay out once, after every subplot has been drawn
fig.tight_layout()
# savefig does not create missing directories; make sure the target exists
os.makedirs('../visualizations', exist_ok=True)
fig.savefig('../visualizations/churn_by_features.png', dpi=300)
plt.show()

## 7. Data Preprocessing

In [None]:
# Destination file for the processed table
processed_data_path = '../data/processed/processed_data.csv'

# Run the preprocessor stages in order (clean -> encode categoricals -> scale),
# keeping each intermediate frame under its own name
cleaned_df = preprocessor.clean_data(df)
encoded_df = preprocessor.encode_categorical(cleaned_df)
scaled_df = preprocessor.scale_features(encoded_df)

# Hand the final frame to the preprocessor for saving
preprocessor.save_processed_data(scaled_df, processed_data_path)

# Status report, then preview the result as the cell's displayed output
print(
    "Data preprocessing completed.",
    f"Processed data shape: {scaled_df.shape}",
    "\nFirst few rows of processed data:",
    sep="\n",
)
scaled_df.head()

## 8. Summary

This notebook explored the customer data and performed initial preprocessing. Key findings:

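1. The dataset contains 10,000 customers with various features (confirm against the dataset shape printed in Section 2)
2. Churn rate is approximately 20% (see the churn percentages printed in Section 3)
3. Some features show correlation with churn (see the ranked correlations printed in Section 5)
4. Data has been cleaned, encoded, and scaled for modeling, and saved to `data/processed/processed_data.csv`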

Next steps:
- Customer segmentation using clustering
- Churn prediction using machine learning models
- Business insights and recommendations