# Module 03


In [1]:
from pathlib import Path

import altair as alt
import numpy as np
import pandas as pd
import polars as pl
import polars.selectors as cs

from ml_zoomcamp.utils import clean_column_names, load_data

# Turn off Altair's max-rows check for the data passed to charts.
alt.data_transformers.disable_max_rows()

# ROOT_DIR is the parent of the current working directory;
# DATA_DIR is the "data" folder directly beneath it.
ROOT_DIR = Path.cwd().parent
DATA_DIR = ROOT_DIR / "data"

## 1. Data Preparation


In [2]:
# Source CSV for this module's dataset.
csv_uri = "https://raw.githubusercontent.com/alexeygrigorev/mlbookcamp-code/master/chapter-03-churn-prediction/WA_Fn-UseC_-Telco-Customer-Churn.csv"

# Load the data via the project helper (given the URI and DATA_DIR), then
# pass the result straight through the column-name cleaning helper.
df = clean_column_names(load_data(csv_uri, DATA_DIR))

In [3]:
# Show each column's name and dtype after the column names were cleaned.
df.schema

Schema([('customerid', String),
        ('gender', String),
        ('seniorcitizen', Int64),
        ('partner', String),
        ('dependents', String),
        ('tenure', Int64),
        ('phoneservice', String),
        ('multiplelines', String),
        ('internetservice', String),
        ('onlinesecurity', String),
        ('onlinebackup', String),
        ('deviceprotection', String),
        ('techsupport', String),
        ('streamingtv', String),
        ('streamingmovies', String),
        ('contract', String),
        ('paperlessbilling', String),
        ('paymentmethod', String),
        ('monthlycharges', Float64),
        ('totalcharges', Float64),
        ('churn', String)])

In [4]:
# Compact preview: row/column counts plus the first few values of every column.
df.glimpse()

Rows: 7043
Columns: 21
$ customerid       <str> '7590-VHVEG', '5575-GNVDE', '3668-QPYBK', '7795-CFOCW', '9237-HQITU', '9305-CDSKC', '1452-KIOVK', '6713-OKOMC', '7892-POOKP', '6388-TABGU'
$ gender           <str> 'Female', 'Male', 'Male', 'Male', 'Female', 'Female', 'Male', 'Female', 'Female', 'Male'
$ seniorcitizen    <i64> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
$ partner          <str> 'Yes', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'Yes', 'No'
$ dependents       <str> 'No', 'No', 'No', 'No', 'No', 'No', 'Yes', 'No', 'No', 'Yes'
$ tenure           <i64> 1, 34, 2, 45, 2, 8, 22, 10, 28, 62
$ phoneservice     <str> 'No', 'Yes', 'Yes', 'No', 'Yes', 'Yes', 'Yes', 'No', 'Yes', 'Yes'
$ multiplelines    <str> 'No phone service', 'No', 'No', 'No phone service', 'No', 'Yes', 'Yes', 'No phone service', 'Yes', 'No'
$ internetservice  <str> 'DSL', 'DSL', 'DSL', 'DSL', 'Fiber optic', 'Fiber optic', 'Fiber optic', 'DSL', 'Fiber optic', 'DSL'
$ onlinesecurity   <str> 'No', 'Yes', 'Yes', 'Yes', 'No', 'No', 'No

In [6]:
# Per-column summary statistics (count, null_count, mean, std, min, quartiles, max).
df.describe()

statistic,customerid,gender,seniorcitizen,partner,dependents,tenure,phoneservice,multiplelines,internetservice,onlinesecurity,onlinebackup,deviceprotection,techsupport,streamingtv,streamingmovies,contract,paperlessbilling,paymentmethod,monthlycharges,totalcharges,churn
str,str,str,f64,str,str,f64,str,str,str,str,str,str,str,str,str,str,str,str,f64,f64,str
"""count""","""7043""","""7043""",7043.0,"""7043""","""7043""",7043.0,"""7043""","""7043""","""7043""","""7043""","""7043""","""7043""","""7043""","""7043""","""7043""","""7043""","""7043""","""7043""",7043.0,7032.0,"""7043"""
"""null_count""","""0""","""0""",0.0,"""0""","""0""",0.0,"""0""","""0""","""0""","""0""","""0""","""0""","""0""","""0""","""0""","""0""","""0""","""0""",0.0,11.0,"""0"""
"""mean""",,,0.162147,,,32.371149,,,,,,,,,,,,,64.761692,2283.300441,
"""std""",,,0.368612,,,24.559481,,,,,,,,,,,,,30.090047,2266.771362,
"""min""","""0002-ORFBO""","""Female""",0.0,"""No""","""No""",0.0,"""No""","""No""","""DSL""","""No""","""No""","""No""","""No""","""No""","""No""","""Month-to-month""","""No""","""Bank transfer (automatic)""",18.25,18.8,"""No"""
"""25%""",,,0.0,,,9.0,,,,,,,,,,,,,35.5,401.5,
"""50%""",,,0.0,,,29.0,,,,,,,,,,,,,70.35,1397.65,
"""75%""",,,0.0,,,55.0,,,,,,,,,,,,,89.85,3794.5,
"""max""","""9995-HOTOH""","""Male""",1.0,"""Yes""","""Yes""",72.0,"""Yes""","""Yes""","""No""","""Yes""","""Yes""","""Yes""","""Yes""","""Yes""","""Yes""","""Two year""","""Yes""","""Mailed check""",118.75,8684.8,"""Yes"""


In [14]:
# List only the columns that contain nulls, highest null count first.
(
    df.null_count()
    # One row per original column: its name in "column", its count in "null_count".
    .transpose(include_header=True, column_names=["null_count"])
    .filter(pl.col("null_count") > 0)
    .sort("null_count", descending=True)
)

column,null_count
str,u32
"""totalcharges""",11


In [17]:
# Replace the missing `totalcharges` values with 0 so the column has no nulls.
df = df.with_columns(
    pl.col("totalcharges").fill_null(value=0),
)