# EDA Notebook — BCG X Task 2

This notebook starts from the **provided starter template** and extends it with additional EDA steps focused on understanding drivers of churn (price sensitivity, consumption, margins, and customer service channels). It expects `client_data.csv` and `price_data.csv` in the same folder as the notebook. If those are not found, the loading cell falls back to the uploaded copies (`client_data (1).csv` / `price_data (1).csv`, either next to the notebook or under `/mnt/data/`) and raises `FileNotFoundError` when none of the candidates exist.

In [None]:
# Import packages
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
%matplotlib inline

# Set plot style: apply seaborn's default theme to all subsequent matplotlib figures.
# color_codes=True asks seaborn to remap matplotlib's shorthand colour codes
# ("b", "g", "r", ...) to the colours of the active seaborn palette.
sns.set(color_codes=True)


In [None]:
# Locate the two input CSVs. Each list is searched in order and the first
# path that exists on disk wins (plain names, "(1)" upload copies, /mnt/data).
from pathlib import Path

candidates = [
    "./client_data.csv",
    "./client_data (1).csv",
    "/mnt/data/client_data (1).csv",
    "/mnt/data/client_data.csv",
]
p_candidates = [
    "./price_data.csv",
    "./price_data (1).csv",
    "/mnt/data/price_data (1).csv",
    "/mnt/data/price_data.csv",
]

# next(..., None) yields the first existing path, or None when nothing matches
client_path = next((path for path in candidates if Path(path).exists()), None)
price_path = next((path for path in p_candidates if Path(path).exists()), None)

# Fail early with the full list of locations that were tried
if None in (client_path, price_path):
    raise FileNotFoundError(f"Could not find client_data.csv or price_data.csv. Checked: {candidates} and {p_candidates}")

print("Using:", client_path, price_path)
client_df = pd.read_csv(client_path)
price_df = pd.read_csv(price_path)

# Preview the first three rows of each table
display(client_df.head(3))
display(price_df.head(3))


In [None]:
# Schema overview (dtypes, non-null counts) for both tables
print("Client data info:")
client_df.info()
print("\nPrice data info:")
price_df.info()

# Summary statistics of the numeric client columns, one row per column
print("\nNumerical description for client_df:")
numeric_summary = client_df.describe(include=[np.number]).T
display(numeric_summary)

# Value counts for at most the first ten non-numeric columns;
# each listing is capped at the ten most frequent values and includes NaN.
print("\nCategorical / object counts (sample):")
non_numeric_cols = client_df.select_dtypes(include=['object','bool','category']).columns
for col in non_numeric_cols[:10]:
    top_values = client_df[col].value_counts(dropna=False).head(10)
    print(f"--- {col} ---")
    print(top_values)
    print()


In [None]:
# Parse date columns, report binary (0/1) columns, and set categorical dtypes
date_cols = [c for c in client_df.columns if 'date' in c.lower()]
for c in date_cols:
    try:
        # errors='coerce' turns unparseable entries into NaT instead of raising
        client_df[c] = pd.to_datetime(client_df[c], errors='coerce')
    except Exception as e:
        # Best effort: keep the column as-is and report why it could not be parsed
        print("Could not convert", c, e)

# Convert price date
if 'price_date' in price_df.columns:
    price_df['price_date'] = pd.to_datetime(price_df['price_date'], errors='coerce')

# Identify boolean-like columns: at most two distinct values, all of them 0 or 1
# once NaNs are dropped. Nothing is converted here (churn in particular stays
# numeric); the list is printed so the detection result is actually visible.
bool_like = [c for c in client_df.columns if client_df[c].dropna().isin([0,1]).all() and client_df[c].nunique()<=2]
print("Boolean-like columns:", bool_like)

# keep churn as numeric but show distribution
print("Churn value counts:")
print(client_df['churn'].value_counts(dropna=False))

# Ensure categorical dtype for some columns (skipping any that are absent)
cat_cols = ['channel_sales','origin_up','has_gas']
for c in cat_cols:
    if c in client_df.columns:
        client_df[c] = client_df[c].astype('category')

print("\nAfter conversions:")
client_df.info()


In [None]:
# Columns that contain missing values, largest null count first
null_counts = client_df.isnull().sum()
missing = null_counts.sort_values(ascending=False)
display(missing.loc[missing > 0])

# The 20 columns with the fewest distinct values (ascending cardinality)
uniques = client_df.nunique().sort_values()
display(uniques.iloc[:20])


In [None]:
# Helper plotting functions from starter notebook

def annotate_stacked_bars(ax, pad=0.99, colour="white", textsize=13):
    """Label every bar segment on `ax` with its height rounded to one decimal.

    Parameters
    ----------
    ax : object exposing `patches` and `annotate` (a matplotlib Axes in this notebook)
    pad : float
        Scale factor applied to the computed x and y label coordinates.
    colour : str
        Text colour of the labels.
    textsize : int
        Font size of the labels.

    Segments whose rounded height prints as '0.0' are left unlabelled.
    """
    for bar in ax.patches:
        height = bar.get_height()
        label = str(round(height, 1))
        if label == '0.0':
            continue
        # Label position: horizontal centre of the bar scaled by pad, then nudged
        # left by 0.05; vertical centre of the segment scaled by pad.
        x_pos = (bar.get_x() + bar.get_width() / 2) * pad - 0.05
        y_pos = (bar.get_y() + height / 2) * pad
        ax.annotate(label, (x_pos, y_pos), color=colour, size=textsize)

def plot_stacked_bars(dataframe, title_, size_=(18, 10), rot_=0, legend_="upper right"):
    """Show `dataframe` as an annotated stacked bar chart.

    pandas draws one bar per row of `dataframe` and one stacked segment per
    column. The legend labels are hard-coded to "Retention" and "Churn", so the
    columns are expected to be the retained / churned shares, in that order.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Table to plot.
    title_ : str
        Figure title.
    size_ : tuple
        Figure size passed to pandas as `figsize`.
    rot_ : int
        Rotation of the x tick labels.
    legend_ : str
        Legend location.
    """
    plot_options = dict(kind="bar", stacked=True, figsize=size_, rot=rot_, title=title_)
    ax = dataframe.plot(**plot_options)
    # Write the segment heights inside the bars
    annotate_stacked_bars(ax, textsize=14)
    ax.legend(["Retention", "Churn"], loc=legend_)
    ax.set_ylabel("Company base (%)")
    plt.show()

def plot_distribution(dataframe, column, ax, bins_=50):
    """Draw a stacked histogram of `column` on `ax`, split by churn status.

    Rows with churn == 0 form the "Retention" series and rows with churn == 1
    the "Churn" series.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain a "churn" column and `column`.
    column : str
        Name of the column whose distribution is plotted.
    ax : matplotlib Axes
        Axes to draw on.
    bins_ : int
        Number of histogram bins.
    """
    is_retained = dataframe["churn"] == 0
    is_churned = dataframe["churn"] == 1
    split = pd.DataFrame({
        "Retention": dataframe.loc[is_retained, column],
        "Churn": dataframe.loc[is_churned, column],
    })
    split[["Retention", "Churn"]].plot(kind='hist', bins=bins_, ax=ax, stacked=True)
    ax.set_xlabel(column)
    # Plain tick labels on the x axis (no scientific notation)
    ax.ticklabel_format(style='plain', axis='x')

# Churn overview: percentage of the customer base per churn value
churn = client_df[['id', 'churn']].rename(columns={'id': 'Companies'})
# Count customers per churn value, then express the counts as a share of the total
churn_total = churn.groupby(churn['churn']).count()
churn_percentage = churn_total.div(churn_total.sum()) * 100
# Transposed so the single "Companies" row becomes one bar with a segment per churn value
plot_stacked_bars(
    churn_percentage.transpose(),
    "Churning status",
    (6, 4),
    legend_="lower right",
)


In [None]:
# Consumption histograms (retention vs churn) for whichever of
# cons_12m, cons_last_month and cons_gas_12m are present in client_df
cons_cols = [name for name in ['cons_12m','cons_last_month','cons_gas_12m'] if name in client_df.columns]
if cons_cols:
    n_panels = len(cons_cols)
    # squeeze=False always returns a 2-D array of Axes, so a single panel needs
    # no special-casing; ravel() flattens it to one Axes per consumption column.
    fig, axs = plt.subplots(nrows=n_panels, figsize=(10, 4 * n_panels), squeeze=False)
    axs = axs.ravel()
    for ax, col in zip(axs, cons_cols):
        plot_distribution(client_df, col, ax, bins_=80)
        ax.set_title(f"Distribution of {col} (Retention vs Churn)")
    plt.tight_layout()


In [None]:
# Plot the price_off_peak_var time series for a small random sample of customers.
# (No merge with client_df is performed; the 'id' check only confirms that the
# two tables share a key before price data is explored per customer.)
if 'id' in client_df.columns and 'id' in price_df.columns:
    unique_ids = price_df['id'].drop_duplicates()
    # Cap the sample size: Series.sample raises when asked for more rows than exist
    n_sample = min(4, len(unique_ids))
    sample_ids = unique_ids.sample(n_sample, random_state=42).tolist()
    df_sample = price_df[price_df['id'].isin(sample_ids)].copy()
    # Both columns are required for the plot; check once instead of per customer
    if sample_ids and {'price_date', 'price_off_peak_var'}.issubset(df_sample.columns):
        fig, ax = plt.subplots()
        for pid in sample_ids:
            # One line per customer, ordered chronologically
            tmp = df_sample[df_sample['id']==pid].sort_values('price_date')
            ax.plot(tmp['price_date'], tmp['price_off_peak_var'], marker='o', label=str(pid))
        ax.set_xlabel('Price date')
        ax.set_ylabel('price_off_peak_var')
        ax.set_title('Sample customers: price_off_peak_var time series')
        ax.legend()
        # Rotate and align the date tick labels so they do not overlap
        fig.autofmt_xdate()
        plt.show()
    else:
        print("Skipping price plot: no customers sampled, or 'price_date' / 'price_off_peak_var' is missing.")
else:
    print("Cannot merge — missing 'id' in one of the datasets.")


In [None]:
# Churn rate per level of a few categorical columns (those present in client_df)
cat_cols = [c for c in ['channel_sales','has_gas','origin_up'] if c in client_df.columns]
for c in cat_cols:
    # Rows = category level, columns = churn value. observed=True restricts the
    # result to levels that actually occur when the column has categorical dtype.
    temp = client_df.groupby([c,'churn'], observed=True).size().unstack(fill_value=0)
    # Row-normalise so the shares within each category level sum to 100 %
    temp_pct = temp.div(temp.sum(axis=1), axis=0)*100
    display(temp_pct)
    # Pass the table as-is: plot_stacked_bars draws one bar per row and one segment
    # per column, and labels the segments "Retention"/"Churn". Transposing it would
    # stack categories inside per-churn bars and mislabel them in the legend.
    plot_stacked_bars(temp_pct, f"Churn by {c}", (6,4), legend_='lower right')


In [None]:
# Correlation of every numeric feature with churn (point biserial via corr with churn);
# the 20 highest values are shown, largest first.
num = client_df.select_dtypes(include=[np.number]).drop(columns=['churn'], errors='ignore')
if 'churn' in client_df.columns:
    churn_flag = client_df['churn']
    corr_with_churn = num.apply(lambda feature: feature.corr(churn_flag))
    top_corr = corr_with_churn.sort_values(ascending=False).head(20)
    display(top_corr)

# Boxplot example: net_margin (if it exists) split by churn
if 'net_margin' in client_df.columns:
    fig, ax = plt.subplots(figsize=(8,6))
    client_df.boxplot(column='net_margin', by='churn', ax=ax)
    ax.set_title("Net margin by churn")
    # Blank out the figure-level super-title so only the axes title remains
    fig.suptitle('')
    ax.set_xlabel("Churn")
    ax.set_ylabel("Net margin")
    plt.show()


## Next steps

- Feature engineering (tenure, avg monthly consumption, price change exposure)
- Statistical tests to validate price sensitivity hypothesis (t-tests, regression coefficients)
- Build predictive models (logistic regression, tree-based) and measure feature importance

---

**File saved:** `EDA_BCGX_Task2.ipynb`