# Customer Churn Prediction - Data Preparation

This notebook implements the data cleaning and preprocessing pipeline for the customer churn prediction project.

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import sys

# Set up logging: INFO-and-above records go to stdout as bare messages, so
# log output appears inline in the cell output.
import logging
import sys  # NOTE(review): duplicate of the `import sys` above; redundant but harmless
# Disabled date format, kept for reference. NOTE(review): the second field is
# `%y` (two-digit year) where a month (`%m`) was presumably intended.
# date_strftime_format = "%Y-%m-%y %H:%M:%S"
logging.basicConfig(stream=sys.stdout, level=logging.INFO, format="%(message)s")

# Add the parent directory (not `src` itself) to the import path so that the
# `src` package below can be resolved. The path is relative, so this relies on
# the kernel's working directory being the notebook's own directory.
sys.path.append('../')
from src.data.data_processing import load_data, save_processed_data

# Set up plotting: render figures inline, use a white-grid theme and a
# default figure size of 12x8 inches.
%matplotlib inline
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 8)

## 1. Load the Data

In [None]:
# Define file paths (relative, so resolved against the kernel's working directory)
RAW_DATA_PATH = '../data/raw/churn.csv'  # input file read by load_data below
PROCESSED_DATA_PATH = '../data/processed/churn_processed.csv'  # output: one-hot encoded frame
CLEANED_DATA_PATH = '../data/cleaned/churn_cleaned.csv'  # output: cleaned frame, before encoding

# Load the data
# load_data is a project helper from src.data.data_processing; it is used
# here as if it returns a pandas DataFrame (.shape / .head()) - TODO confirm.
df = load_data(RAW_DATA_PATH)
print(f"Dataset shape: {df.shape}")
df.head()

## 2. Explore Data Structure

In [None]:
# Check data types and missing values: info() prints each column's dtype and
# non-null count, so columns with gaps show fewer non-null entries than rows.
df.info()

In [None]:
# Check summary statistics; include='all' summarises non-numeric columns
# (count / unique / top / freq) alongside the numeric ones.
df.describe(include='all')

In [None]:
# Count the null entries in every column and show the per-column totals
missing_values = df.isna().sum()
print("Missing values per column:", missing_values, sep="\n")

In [None]:
# Class balance of the target column: absolute counts first, then shares in percent
target = df['Exited']

print("Target distribution:")
print(target.value_counts())

print("\nPercentage:")
print(target.value_counts(normalize=True).mul(100))

In [None]:
# List the object-dtype (categorical) columns, then show how often each
# distinct value occurs in every one of them
categorical_cols = df.select_dtypes(include=['object']).columns
print("Categorical columns:", categorical_cols, sep="\n")

for col, values in df[categorical_cols].items():
    print(f"\n{col} value counts:")
    print(values.value_counts())

## 3. Data Cleaning

In [None]:
# Drop the row-number, customer-id and surname columns; the result is a new
# frame, so the raw `df` stays untouched
columns_to_drop = ['RowNumber', 'CustomerId', 'Surname']
df_cleaned = df.drop(columns=columns_to_drop)
print(f"Cleaned dataset shape: {df_cleaned.shape}")
df_cleaned.head()

In [None]:
# Flag rows that repeat an earlier row in every column, report how many there
# are, and keep only the first occurrence of each
duplicate_mask = df_cleaned.duplicated()
duplicate_count = duplicate_mask.sum()
print(f"Number of duplicate rows: {duplicate_count}")

if duplicate_count > 0:
    df_cleaned = df_cleaned[~duplicate_mask]
    print(f"Shape after removing duplicates: {df_cleaned.shape}")

In [None]:
# Check for outliers in numerical columns
# Candidates are the int64/float64 columns, excluding the target `Exited`.
numerical_cols = df_cleaned.select_dtypes(include=['int64', 'float64']).columns
numerical_cols = [col for col in numerical_cols if col != 'Exited']

# Size the subplot grid from the number of columns instead of assuming a fixed
# 2x4 layout: plt.subplot raises ValueError for an index beyond rows * cols,
# which would happen as soon as a ninth numeric column is present.
BOXPLOT_GRID_COLS = 4
boxplot_grid_rows = max(1, -(-len(numerical_cols) // BOXPLOT_GRID_COLS))  # ceiling division

# 5 inches of height per row keeps the original 15x10 figure for two rows.
plt.figure(figsize=(15, 5 * boxplot_grid_rows))
for i, col in enumerate(numerical_cols):
    plt.subplot(boxplot_grid_rows, BOXPLOT_GRID_COLS, i+1)
    sns.boxplot(x=df_cleaned[col])
    plt.title(col)
plt.tight_layout()

In [None]:
# Handle outliers if necessary (using IQR method)
def handle_outliers(df, column):
    """Cap the values of ``column`` at the 1.5 * IQR fences.

    Values below ``Q1 - 1.5 * IQR`` are raised to that bound and values above
    ``Q3 + 1.5 * IQR`` are lowered to it; no rows are removed. The number of
    values outside the fences is printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``column``. It is never modified.
    column : str
        Name of the numeric column to cap.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with ``column`` capped. The column is left as-is when
        nothing lies outside the fences or when the IQR is zero.
    """
    values = df[column]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr

    outliers = ((values < lower_bound) | (values > upper_bound)).sum()
    print(f"{column}: {outliers} outliers detected")

    # Work on a copy so the caller's frame is not changed behind its back.
    result = df.copy()

    if outliers > 0:
        if iqr == 0:
            # Both fences collapse onto Q1 == Q3, so capping would overwrite
            # every differing value and turn the column into a constant
            # (e.g. a heavily skewed 0/1 flag). Leave the column alone.
            print(f"{column}: IQR is zero, skipping outlier capping")
        else:
            # Cap outliers instead of removing them
            result[column] = values.clip(lower=lower_bound, upper=upper_bound)

    return result

# Apply outlier handling to numerical columns, one column at a time; each call's
# return value becomes the input of the next.
# NOTE(review): numerical_cols is every int64/float64 column except 'Exited',
# so discrete counts and 0/1 flags stored as integers (likely NumOfProducts,
# HasCrCard, IsActiveMember) are capped too and can end up with fractional
# bounds - confirm this is intended or restrict the list to continuous features.
for col in numerical_cols:
    df_cleaned = handle_outliers(df_cleaned, col)

## 4. Data Preprocessing

In [None]:
# One-hot encode categorical variables.
# drop_first=True drops the first category of each variable to avoid the dummy
# variable trap.
# dtype=int makes the indicator columns 0/1 integers on every pandas version
# (pandas >= 2.0 emits booleans by default), so the saved CSV matches the
# "1=Yes, 0=No" encoding the data dictionary documents for these columns.
df_processed = pd.get_dummies(df_cleaned, columns=['Geography', 'Gender'], drop_first=True, dtype=int)
print(f"Processed dataset shape: {df_processed.shape}")
df_processed.head()

In [None]:
# Cast the 0/1 flag columns to plain integers in a single assignment
binary_cols = ['HasCrCard', 'IsActiveMember']
df_processed[binary_cols] = df_processed[binary_cols].astype(int)

In [None]:
# Pairwise correlations of all processed columns, drawn as an annotated heatmap
fig, ax = plt.subplots(figsize=(12, 10))
correlation_matrix = df_processed.corr()
sns.heatmap(correlation_matrix, ax=ax, cmap='coolwarm', annot=True, fmt='.2f')
ax.set_title('Correlation Matrix')
fig.tight_layout()

## 5. Save Processed Data

In [None]:
# Save the cleaned data: identifier columns dropped, duplicates removed and
# outliers capped, with Geography/Gender still in their original form.
# NOTE(review): save_processed_data is a project helper; whether it creates the
# target directory is not visible here - verify before a first run.
save_processed_data(df_cleaned, CLEANED_DATA_PATH)
print(f"Cleaned data saved to {CLEANED_DATA_PATH}")

# Save the processed data: the cleaned frame with Geography/Gender one-hot encoded.
save_processed_data(df_processed, PROCESSED_DATA_PATH)
print(f"Processed data saved to {PROCESSED_DATA_PATH}")

## 6. Data Dictionary

In [None]:
# Data dictionary: one human-readable description per feature. It covers the
# original columns as well as the indicator columns created by one-hot encoding.
data_dictionary = dict(
    CreditScore="Credit score of the customer",
    Geography="Customer's location (France, Spain, Germany)",
    Gender="Customer's gender (Male, Female)",
    Age="Customer's age in years",
    Tenure="Number of years the customer has been a client of the bank",
    Balance="Account balance",
    NumOfProducts="Number of bank products the customer uses",
    HasCrCard="Whether the customer has a credit card (1=Yes, 0=No)",
    IsActiveMember="Whether the customer is an active member (1=Yes, 0=No)",
    EstimatedSalary="Estimated salary of the customer",
    Exited="Whether the customer has churned (1=Yes, 0=No)",
    Geography_Germany="Whether the customer is from Germany (1=Yes, 0=No)",
    Geography_Spain="Whether the customer is from Spain (1=Yes, 0=No)",
    Gender_Male="Whether the customer is male (1=Yes, 0=No)",
)

# Show the dictionary as a two-column table
pd.DataFrame({
    'Feature': list(data_dictionary.keys()),
    'Description': list(data_dictionary.values()),
})

In [None]:
# Write the data dictionary to the docs folder as a two-column CSV, creating
# the folder first if it does not exist yet
output_path = '../docs/data_dictionary.csv'
output_dir, _ = os.path.split(output_path)
os.makedirs(output_dir, exist_ok=True)

data_dict_df = pd.DataFrame.from_records(
    list(data_dictionary.items()),
    columns=['Feature', 'Description'],
)
data_dict_df.to_csv(output_path, index=False)
print("Data dictionary saved to ../docs/data_dictionary.csv")

## 7. Summary

In this notebook, we have:
1. Loaded the bank customer churn dataset
2. Explored the data structure and identified key characteristics
3. Cleaned the data by removing identifier columns, dropping duplicate rows, and capping outliers with the IQR rule
4. Preprocessed the data by one-hot encoding the categorical variables
5. Saved the cleaned and processed datasets
6. Created and saved a data dictionary

The dataset is now ready for exploratory data analysis and feature engineering.