In [None]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

%matplotlib inline

In [None]:
# Read the raw customer CSV. The path is relative, so it is resolved against
# the kernel's current working directory.
customer_data_path = '../../data/raw/customer_data.csv'
df = pd.read_csv(filepath_or_buffer=customer_data_path)

# Show the first five rows as this cell's output.
df.head(n=5)

In [None]:
# Print a concise structural summary of the raw frame `df`.
df.info()

In [None]:
# Tally the null entries in every column of the raw frame and report them.
missing_values = df.isna().sum()
print('Missing values in each column:\n', missing_values)

In [None]:
# Build the working copy: keep only rows that have no missing value in any
# column. The raw frame `df` is left untouched.
df_clean = df.dropna(axis=0, how='any')

In [None]:
# Descriptive statistics of the raw frame `df` (not the cleaned `df_clean`),
# displayed as this cell's output.
df.describe()

In [None]:
# A notebook cell only renders its *last* bare expression, so a bare
# `df.dtypes` on the first line would be evaluated and silently discarded.
# Show it explicitly with IPython's `display`, which is available as a
# builtin inside a notebook kernel.
display(df.dtypes)

# Per-column memory footprint in bytes; deep=True also measures the contents
# of object columns rather than just their pointers.
df.memory_usage(deep=True)

In [None]:
# Look for duplicates in `df_clean`, the frame that is actually deduplicated
# below, so the reported count matches the number of rows removed. Counting
# on the raw `df` could include rows that the missing-value drop has already
# discarded.
# Note: `df_clean` is reassigned here, so re-running this cell on its own
# reports 0 duplicates.
duplicate_rows = df_clean[df_clean.duplicated()]
print(f'Number of duplicate rows: {len(duplicate_rows)}')
df_clean = df_clean.drop_duplicates()

In [None]:
# Object-dtype columns are treated as the categorical features; later cells
# reuse `categorical_columns`.
categorical_columns = df_clean.select_dtypes(include=['object']).columns

# Report how many distinct values each categorical column holds.
for col in categorical_columns:
    distinct_count = df_clean[col].nunique()
    print(f'Unique values in {col}: {distinct_count}')

In [None]:
# One bar chart per categorical column showing the frequency of each category.
for col in categorical_columns:
    fig, ax = plt.subplots(figsize=(10, 5))
    sns.countplot(x=col, data=df_clean, palette='Set1', ax=ax)
    ax.set_title(f'Distribution of {col}')
    # Tilt the category labels so long names do not overlap.
    plt.xticks(rotation=45)
    plt.show()

In [None]:
# Numeric-dtype columns are treated as the numerical features; later cells
# reuse `numerical_columns`.
numerical_columns = df_clean.select_dtypes(include=[np.number]).columns

# Grid of histograms, one panel per numerical column.
numeric_features = df_clean[numerical_columns]
numeric_features.hist(bins=20, figsize=(12, 10), color='dodgerblue', edgecolor='black')
plt.suptitle('Distribution of Numerical Features', fontsize=16)
plt.show()

In [None]:
# Pairwise correlations between the numerical columns.
correlation_matrix = df_clean[numerical_columns].corr()

# Annotated heatmap of the correlation matrix.
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', linewidths=0.5, ax=ax)
ax.set_title('Correlation Heatmap')
plt.show()

In [None]:
# One horizontal boxplot per numerical column to expose spread and outliers.
for col in numerical_columns:
    fig, ax = plt.subplots(figsize=(10, 5))
    sns.boxplot(x=col, data=df_clean, palette='Set2', ax=ax)
    ax.set_title(f'Boxplot for {col}')
    plt.show()

In [None]:
# Class balance of the target; skipped entirely when the dataset has no
# `churn` column.
if 'churn' in df_clean.columns:
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.countplot(data=df_clean, x='churn', palette='Set3', ax=ax)
    ax.set_title('Churn Distribution')
    plt.show()

In [None]:
# Distribution of each numerical feature split by churn status.
if 'churn' in df_clean.columns:
    for col in numerical_columns:
        # When `churn` itself is numeric it is part of `numerical_columns`;
        # plotting it against itself yields a meaningless figure, so skip it.
        if col == 'churn':
            continue
        plt.figure(figsize=(8, 6))
        sns.boxplot(x='churn', y=col, data=df_clean, palette='Set1')
        plt.title(f'Relationship between {col} and Churn')
        plt.show()

In [None]:
# Category frequencies of each categorical feature, split by churn status.
if 'churn' in df_clean.columns:
    for col in categorical_columns:
        # When `churn` is stored as text it is part of `categorical_columns`;
        # a churn-vs-churn chart carries no information, so skip it.
        if col == 'churn':
            continue
        plt.figure(figsize=(10, 6))
        sns.countplot(x=col, hue='churn', data=df_clean, palette='coolwarm')
        plt.title(f'{col} vs Churn')
        # Tilt the category labels so long names do not overlap.
        plt.xticks(rotation=45)
        plt.show()

In [None]:
# Tenure analysis; skipped when the dataset has no `tenure` column.
if 'tenure' in df_clean.columns:
    # Overall distribution of tenure with a KDE overlay.
    plt.figure(figsize=(10, 6))
    sns.histplot(df_clean['tenure'], kde=True, bins=30, color='darkblue')
    plt.title('Customer Tenure Distribution')
    plt.show()

    # The comparison against churn needs the `churn` column as well. Guard it
    # the same way the other churn cells do, so a dataset without `churn`
    # does not raise here.
    if 'churn' in df_clean.columns:
        plt.figure(figsize=(10, 6))
        sns.boxplot(x='churn', y='tenure', data=df_clean, palette='cool')
        plt.title('Tenure vs Churn')
        plt.show()

In [None]:
# Total-charges analysis; skipped when the dataset has no `total_charges` column.
if 'total_charges' in df_clean.columns:
    # Overall distribution of total charges with a KDE overlay.
    plt.figure(figsize=(10, 6))
    sns.histplot(df_clean['total_charges'], kde=True, bins=30, color='green')
    plt.title('Total Charges Distribution')
    plt.show()

    # The comparison against churn needs the `churn` column as well. Guard it
    # the same way the other churn cells do, so a dataset without `churn`
    # does not raise here.
    if 'churn' in df_clean.columns:
        plt.figure(figsize=(10, 6))
        sns.boxplot(x='churn', y='total_charges', data=df_clean, palette='coolwarm')
        plt.title('Total Charges vs Churn')
        plt.show()

In [None]:
# Monthly-charges analysis; skipped when the dataset has no `monthly_charges` column.
if 'monthly_charges' in df_clean.columns:
    # Overall distribution of monthly charges with a KDE overlay.
    plt.figure(figsize=(10, 6))
    sns.histplot(df_clean['monthly_charges'], kde=True, bins=30, color='purple')
    plt.title('Monthly Charges Distribution')
    plt.show()

    # The comparison against churn needs the `churn` column as well. Guard it
    # the same way the other churn cells do, so a dataset without `churn`
    # does not raise here.
    if 'churn' in df_clean.columns:
        plt.figure(figsize=(10, 6))
        sns.boxplot(x='churn', y='monthly_charges', data=df_clean, palette='Set3')
        plt.title('Monthly Charges vs Churn')
        plt.show()

In [None]:
# Service-related columns to compare against churn; any that are absent from
# the dataset are skipped.
service_columns = ['phone_service', 'internet_service', 'tech_support', 'online_backup']

# Every chart below uses `churn` as the hue, so require that column up front
# (as the other churn cells do) instead of raising when it is missing.
if 'churn' in df_clean.columns:
    for col in service_columns:
        if col in df_clean.columns:
            plt.figure(figsize=(10, 6))
            sns.countplot(x=col, hue='churn', data=df_clean, palette='Set2')
            plt.title(f'{col.capitalize()} vs Churn')
            plt.show()

In [None]:
# Contract type split by churn status. Both columns are required: the chart
# uses `churn` as the hue, so a dataset without it is skipped rather than
# raising.
if 'contract_type' in df_clean.columns and 'churn' in df_clean.columns:
    plt.figure(figsize=(10, 6))
    sns.countplot(x='contract_type', hue='churn', data=df_clean, palette='coolwarm')
    plt.title('Contract Type vs Churn')
    plt.show()

In [None]:
# Persist the cleaned frame for downstream notebooks.
cleaned_data_path = '../../data/processed/cleaned_customer_data.csv'

# `to_csv` does not create missing directories, so make sure the output folder
# exists first (a no-op when it is already there).
Path(cleaned_data_path).parent.mkdir(parents=True, exist_ok=True)

# index=False keeps the row index out of the file.
df_clean.to_csv(cleaned_data_path, index=False)
print('EDA Completed and Cleaned Data Saved.')