# Data Cleaning and Exploratory Data Analysis (Churn dataset)

Author: Swasthik

Dataset: churn-bigml-80.csv

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from src.helpers import save_fig

# Load the 80% split of the churn data into the notebook-wide frame `df`,
# report its (rows, columns) shape, and preview the first rows as the
# cell output.
df = pd.read_csv(filepath_or_buffer='data/churn-bigml-80.csv')
print('Loaded dataset with shape:', df.shape)
df.head(n=5)

In [None]:
# Structural overview: column names, non-null counts and dtypes.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() -- doing so emits a stray "None" line.
df.info()

# Per-column count of missing values, to decide what needs imputing.
print('\nMissing values per column:\n', df.isna().sum())

In [None]:
# Remove exact duplicate rows and report how many were dropped.
before = df.shape[0]
df = df.drop_duplicates()
after = df.shape[0]
print(f'Dropped {before-after} duplicate rows')

# Impute missing values column by column:
#   * numeric columns     -> column median
#   * every other column  -> most frequent value ('' if the column is all-NaN)
# select_dtypes(include='number') recognises every integer/float width rather
# than only float64/int64, so e.g. int32 or float32 columns are no longer
# routed to the mode branch. Boolean columns are not selected by 'number' and
# therefore keep using the mode, as before.
numeric_cols = set(df.select_dtypes(include='number').columns)
for col in df.columns:
    if col in numeric_cols:
        df[col] = df[col].fillna(df[col].median())
    else:
        # Compute the mode once; it is empty when the column has no
        # non-missing values at all.
        modes = df[col].mode()
        df[col] = df[col].fillna(modes.iloc[0] if not modes.empty else '')

# Total number of cells still missing after imputation (expected to be 0
# unless a numeric column was entirely NaN, in which case its median is NaN).
print('Missing values after imputation:\n', df.isna().sum().sum())

In [None]:
# Descriptive statistics for every column -- include='all' covers the
# non-numeric columns as well as the numeric ones. The table is written to
# disk and then shown as the cell output.
summary = df.describe(include='all')
summary.to_csv(path_or_buf='data/churn_summary_stats.csv')
summary

In [None]:
# --- Target distribution -------------------------------------------------
# Locate the churn column. An exact 'churn' match is preferred (original
# behaviour); otherwise fall back to a case-insensitive match so the plot is
# not silently skipped when the header is capitalised.
# NOTE(review): this dataset is commonly distributed with the header spelled
# 'Churn' -- confirm against df.columns.
if 'churn' in df.columns:
    churn_col = 'churn'
else:
    churn_col = next(
        (c for c in df.columns if str(c).strip().lower() == 'churn'),
        None,
    )

if churn_col is not None:
    fig = plt.figure()
    # Bar chart of how many rows fall into each churn value.
    df[churn_col].value_counts().plot(kind='bar')
    plt.title('Churn value counts')
    plt.xlabel('Churn')
    plt.ylabel('Count')
    save_fig(fig, 'figures/churn_value_counts.png')

# --- First numeric feature -----------------------------------------------
# 'number' selects every integer/float width, not just float64/int64, so a
# frame whose numeric columns use another width still gets a histogram.
num_cols = df.select_dtypes(include='number').columns.tolist()
if num_cols:
    first_numeric = num_cols[0]
    fig2 = plt.figure()
    # Series.hist() draws onto the current figure, i.e. fig2 created above.
    df[first_numeric].hist()
    plt.title(f'Histogram of {first_numeric}')
    save_fig(fig2, f'figures/hist_{first_numeric}.png')