# Module 03


In [1]:
from pathlib import Path

import altair as alt
import numpy as np
import polars as pl
import polars.selectors as cs
from sklearn.model_selection import train_test_split

from ml_zoomcamp.utils import clean_column_names, load_data

# Turn off Altair's max-rows guard so charts can be built from the full dataset.
alt.data_transformers.disable_max_rows()

# Project root is taken to be the parent of the current working directory
# (assumes the notebook is launched from a sub-folder of the project — confirm).
ROOT_DIR = Path.cwd().parent
DATA_DIR = ROOT_DIR / "data"

## 1. Data Preparation


In [2]:
# Source CSV for the Telco customer-churn dataset.
csv_uri = "https://raw.githubusercontent.com/alexeygrigorev/mlbookcamp-code/master/chapter-03-churn-prediction/WA_Fn-UseC_-Telco-Customer-Churn.csv"

# Load -> normalise column names -> normalise every string column's values.
# NOTE(review): load_data / clean_column_names are project helpers; presumably
# load_data uses DATA_DIR as a local download location — confirm in ml_zoomcamp.utils.
df = clean_column_names(load_data(csv_uri, DATA_DIR)).with_columns(
    cs.string()
    # 1. lowercase all text
    .str.to_lowercase()
    # 2. strip every character that is not a word character, whitespace or hyphen
    .str.replace_all(r"[^\w\s-]", "")
    # 3. collapse each run of whitespace, and each run of hyphens, into one underscore
    .str.replace_all(r"\s+|-+", "_")
)

In [3]:
# Show each column's name and dtype after loading and cleaning.
df.schema

Schema([('customerid', String),
        ('gender', String),
        ('seniorcitizen', Int64),
        ('partner', String),
        ('dependents', String),
        ('tenure', Int64),
        ('phoneservice', String),
        ('multiplelines', String),
        ('internetservice', String),
        ('onlinesecurity', String),
        ('onlinebackup', String),
        ('deviceprotection', String),
        ('techsupport', String),
        ('streamingtv', String),
        ('streamingmovies', String),
        ('contract', String),
        ('paperlessbilling', String),
        ('paymentmethod', String),
        ('monthlycharges', Float64),
        ('totalcharges', Float64),
        ('churn', String)])

In [4]:
# Compact preview: one line per column with its dtype and the first few values.
df.glimpse()

Rows: 7043
Columns: 21
$ customerid       <str> '7590_vhveg', '5575_gnvde', '3668_qpybk', '7795_cfocw', '9237_hqitu', '9305_cdskc', '1452_kiovk', '6713_okomc', '7892_pookp', '6388_tabgu'
$ gender           <str> 'female', 'male', 'male', 'male', 'female', 'female', 'male', 'female', 'female', 'male'
$ seniorcitizen    <i64> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
$ partner          <str> 'yes', 'no', 'no', 'no', 'no', 'no', 'no', 'no', 'yes', 'no'
$ dependents       <str> 'no', 'no', 'no', 'no', 'no', 'no', 'yes', 'no', 'no', 'yes'
$ tenure           <i64> 1, 34, 2, 45, 2, 8, 22, 10, 28, 62
$ phoneservice     <str> 'no', 'yes', 'yes', 'no', 'yes', 'yes', 'yes', 'no', 'yes', 'yes'
$ multiplelines    <str> 'no_phone_service', 'no', 'no', 'no_phone_service', 'no', 'yes', 'yes', 'no_phone_service', 'yes', 'no'
$ internetservice  <str> 'dsl', 'dsl', 'dsl', 'dsl', 'fiber_optic', 'fiber_optic', 'fiber_optic', 'dsl', 'fiber_optic', 'dsl'
$ onlinesecurity   <str> 'no', 'yes', 'yes', 'yes', 'no', 'no', 'no

In [5]:
# Summary statistics (count, null_count, mean, std, min, quartiles, max) per column.
df.describe()

statistic,customerid,gender,seniorcitizen,partner,dependents,tenure,phoneservice,multiplelines,internetservice,onlinesecurity,onlinebackup,deviceprotection,techsupport,streamingtv,streamingmovies,contract,paperlessbilling,paymentmethod,monthlycharges,totalcharges,churn
str,str,str,f64,str,str,f64,str,str,str,str,str,str,str,str,str,str,str,str,f64,f64,str
"""count""","""7043""","""7043""",7043.0,"""7043""","""7043""",7043.0,"""7043""","""7043""","""7043""","""7043""","""7043""","""7043""","""7043""","""7043""","""7043""","""7043""","""7043""","""7043""",7043.0,7032.0,"""7043"""
"""null_count""","""0""","""0""",0.0,"""0""","""0""",0.0,"""0""","""0""","""0""","""0""","""0""","""0""","""0""","""0""","""0""","""0""","""0""","""0""",0.0,11.0,"""0"""
"""mean""",,,0.162147,,,32.371149,,,,,,,,,,,,,64.761692,2283.300441,
"""std""",,,0.368612,,,24.559481,,,,,,,,,,,,,30.090047,2266.771362,
"""min""","""0002_orfbo""","""female""",0.0,"""no""","""no""",0.0,"""no""","""no""","""dsl""","""no""","""no""","""no""","""no""","""no""","""no""","""month_to_month""","""no""","""bank_transfer_automatic""",18.25,18.8,"""no"""
"""25%""",,,0.0,,,9.0,,,,,,,,,,,,,35.5,401.5,
"""50%""",,,0.0,,,29.0,,,,,,,,,,,,,70.35,1397.65,
"""75%""",,,0.0,,,55.0,,,,,,,,,,,,,89.85,3794.5,
"""max""","""9995_hotoh""","""male""",1.0,"""yes""","""yes""",72.0,"""yes""","""yes""","""no""","""yes""","""yes""","""yes""","""yes""","""yes""","""yes""","""two_year""","""yes""","""mailed_check""",118.75,8684.8,"""yes"""


In [6]:
# Per-column null counts reshaped to one row per column, keeping only the
# columns that actually contain nulls, largest count first.
(
    df.null_count()
    .transpose(include_header=True, column_names=["null_count"])
    .filter(pl.col("null_count") > 0)
    .sort("null_count", descending=True)
)

column,null_count
str,u32
"""totalcharges""",11


In [7]:
# List the customers whose totalcharges value is missing.
(
    df.filter(pl.col("totalcharges").is_null())
    .select("customerid", "totalcharges")
)

customerid,totalcharges
str,f64
"""4472_lvygi""",
"""3115_czmzd""",
"""5709_lvoeq""",
"""4367_nuyao""",
"""1371_dwpaz""",
…,…
"""3213_vvolg""",
"""2520_sgtta""",
"""2923_arzlg""",
"""4075_wkniu""",


In [8]:
# Replace missing totalcharges with 0 so the column no longer contains nulls.
df = df.with_columns(
    pl.col("totalcharges").fill_null(value=0),
)

In [9]:
# Encode the target in place: "yes" -> 1, anything else -> 0 (stored as Int8).
# The expression keeps its root name, so the existing "churn" column is overwritten.
df = df.with_columns(
    pl.col("churn").eq("yes").cast(pl.Int8),
)

## 2. Setting Up Validation Framework


In [10]:
# Two-stage split with a fixed seed for reproducibility:
#   1. hold out 20% of all rows as the test set;
#   2. take 25% of the remaining 80% (i.e. 20% of all rows) as validation.
# The result is a 60/20/20 train/validation/test partition.
df_full_train, df_test = train_test_split(
    df,
    test_size=0.2,
    random_state=1,
)
df_train, df_val = train_test_split(
    df_full_train,
    test_size=0.25,
    random_state=1,
)

In [11]:
# Row counts of the train / validation / test partitions.
tuple(map(len, (df_train, df_val, df_test)))

(4225, 1409, 1409)

In [12]:
# Pull the target column out of each partition as a NumPy array.
y_train, y_val, y_test = (
    split["churn"].to_numpy() for split in (df_train, df_val, df_test)
)

In [13]:
# Remove the target from the feature frames so it cannot be used as a feature.
# df_full_train is deliberately left untouched here; it still carries "churn".
df_train, df_val, df_test = (
    split.drop("churn") for split in (df_train, df_val, df_test)
)

## 3. Exploratory Data Analysis


In [14]:
# Confirm that the full training set has no remaining nulls: the result lists
# only columns with at least one null, so an empty table means none are left.
(
    df_full_train.null_count()
    .transpose(include_header=True, column_names=["null_count"])
    .filter(pl.col("null_count") > 0)
)

column,null_count
str,u32


In [15]:
# Class balance of the target, as proportions, most frequent class first.
df_full_train.get_column("churn").value_counts(sort=True, normalize=True)

churn,proportion
i8,f64
0,0.730032
1,0.269968


In [16]:
# churn is 0/1, so its mean is the share of churned customers in the full training set.
global_churn_rate = df_full_train.get_column("churn").mean()

# Display rounded to two decimals; the stored value keeps full precision.
round(global_churn_rate, 2)

0.27

In [17]:
# Names of the numeric feature columns (tenure is Int64, the two charge columns Float64).
numerical = ["tenure", "monthlycharges", "totalcharges"]

In [18]:
# Names of the feature columns treated as categorical.
# customerid (an identifier) and churn (the target) are intentionally not listed.
categorical = [
    "gender",
    # Stored as Int64 0/1 in the frame, but handled as a category here.
    "seniorcitizen",
    "partner",
    "dependents",
    "phoneservice",
    "multiplelines",
    "internetservice",
    "onlinesecurity",
    "onlinebackup",
    "deviceprotection",
    "techsupport",
    "streamingtv",
    "streamingmovies",
    "contract",
    "paperlessbilling",
    "paymentmethod",
]

In [19]:
# Number of distinct values in each categorical column of the full training set.
df_full_train.select(
    pl.col(*categorical).n_unique(),
)

gender,seniorcitizen,partner,dependents,phoneservice,multiplelines,internetservice,onlinesecurity,onlinebackup,deviceprotection,techsupport,streamingtv,streamingmovies,contract,paperlessbilling,paymentmethod
u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32
2,2,2,2,2,3,3,3,3,3,3,3,3,3,2,4
