# Module 03


In [1]:
from pathlib import Path

import altair as alt
import numpy as np
import polars as pl
import polars.selectors as cs
from sklearn.metrics import mutual_info_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer

from ml_zoomcamp.utils import clean_column_names, load_data

# Remove Altair's row limit so charts can be built from frames of any size.
alt.data_transformers.disable_max_rows()

# ROOT_DIR is the parent of the current working directory; data files live in <ROOT_DIR>/data.
ROOT_DIR = Path.cwd().parent
DATA_DIR = ROOT_DIR / "data"

## 1. Data Preparation


In [2]:
# Telco customer-churn CSV used throughout this module.
csv_uri = "https://raw.githubusercontent.com/alexeygrigorev/mlbookcamp-code/master/chapter-03-churn-prediction/WA_Fn-UseC_-Telco-Customer-Churn.csv"

# `load_data` / `clean_column_names` are project helpers from ml_zoomcamp.utils
# (presumably fetch/cache into DATA_DIR and normalise header names -- confirm there).
df = clean_column_names(load_data(csv_uri, DATA_DIR))

# Normalise the *values* of every string column:
#   1. lowercase,
#   2. drop any character that is not a word character, whitespace or a hyphen,
#   3. replace each run of whitespace or each run of hyphens with one underscore.
df = df.with_columns(
    cs.string()
    .str.to_lowercase()
    .str.replace_all(r"[^\w\s-]", "")
    .str.replace_all(r"\s+|-+", "_")
)

In [None]:
df.schema

Schema([('customerid', String),
        ('gender', String),
        ('seniorcitizen', Int64),
        ('partner', String),
        ('dependents', String),
        ('tenure', Int64),
        ('phoneservice', String),
        ('multiplelines', String),
        ('internetservice', String),
        ('onlinesecurity', String),
        ('onlinebackup', String),
        ('deviceprotection', String),
        ('techsupport', String),
        ('streamingtv', String),
        ('streamingmovies', String),
        ('contract', String),
        ('paperlessbilling', String),
        ('paymentmethod', String),
        ('monthlycharges', Float64),
        ('totalcharges', Float64),
        ('churn', String)])

In [4]:
df.glimpse()

Rows: 7043
Columns: 21
$ customerid       <str> '7590_vhveg', '5575_gnvde', '3668_qpybk', '7795_cfocw', '9237_hqitu', '9305_cdskc', '1452_kiovk', '6713_okomc', '7892_pookp', '6388_tabgu'
$ gender           <str> 'female', 'male', 'male', 'male', 'female', 'female', 'male', 'female', 'female', 'male'
$ seniorcitizen    <i64> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
$ partner          <str> 'yes', 'no', 'no', 'no', 'no', 'no', 'no', 'no', 'yes', 'no'
$ dependents       <str> 'no', 'no', 'no', 'no', 'no', 'no', 'yes', 'no', 'no', 'yes'
$ tenure           <i64> 1, 34, 2, 45, 2, 8, 22, 10, 28, 62
$ phoneservice     <str> 'no', 'yes', 'yes', 'no', 'yes', 'yes', 'yes', 'no', 'yes', 'yes'
$ multiplelines    <str> 'no_phone_service', 'no', 'no', 'no_phone_service', 'no', 'yes', 'yes', 'no_phone_service', 'yes', 'no'
$ internetservice  <str> 'dsl', 'dsl', 'dsl', 'dsl', 'fiber_optic', 'fiber_optic', 'fiber_optic', 'dsl', 'fiber_optic', 'dsl'
$ onlinesecurity   <str> 'no', 'yes', 'yes', 'yes', 'no', 'no', 'no

In [5]:
df.describe()

statistic,customerid,gender,seniorcitizen,partner,dependents,tenure,phoneservice,multiplelines,internetservice,onlinesecurity,onlinebackup,deviceprotection,techsupport,streamingtv,streamingmovies,contract,paperlessbilling,paymentmethod,monthlycharges,totalcharges,churn
str,str,str,f64,str,str,f64,str,str,str,str,str,str,str,str,str,str,str,str,f64,f64,str
"""count""","""7043""","""7043""",7043.0,"""7043""","""7043""",7043.0,"""7043""","""7043""","""7043""","""7043""","""7043""","""7043""","""7043""","""7043""","""7043""","""7043""","""7043""","""7043""",7043.0,7032.0,"""7043"""
"""null_count""","""0""","""0""",0.0,"""0""","""0""",0.0,"""0""","""0""","""0""","""0""","""0""","""0""","""0""","""0""","""0""","""0""","""0""","""0""",0.0,11.0,"""0"""
"""mean""",,,0.162147,,,32.371149,,,,,,,,,,,,,64.761692,2283.300441,
"""std""",,,0.368612,,,24.559481,,,,,,,,,,,,,30.090047,2266.771362,
"""min""","""0002_orfbo""","""female""",0.0,"""no""","""no""",0.0,"""no""","""no""","""dsl""","""no""","""no""","""no""","""no""","""no""","""no""","""month_to_month""","""no""","""bank_transfer_automatic""",18.25,18.8,"""no"""
"""25%""",,,0.0,,,9.0,,,,,,,,,,,,,35.5,401.5,
"""50%""",,,0.0,,,29.0,,,,,,,,,,,,,70.35,1397.65,
"""75%""",,,0.0,,,55.0,,,,,,,,,,,,,89.85,3794.5,
"""max""","""9995_hotoh""","""male""",1.0,"""yes""","""yes""",72.0,"""yes""","""yes""","""no""","""yes""","""yes""","""yes""","""yes""","""yes""","""yes""","""two_year""","""yes""","""mailed_check""",118.75,8684.8,"""yes"""


In [6]:
df.null_count().transpose(include_header=True, column_names=["null_count"]).filter(
    pl.col("null_count") > 0
).sort(pl.col("null_count"), descending=True)

column,null_count
str,u32
"""totalcharges""",11


In [7]:
df.filter(pl.col("totalcharges").is_null()).select(pl.col("customerid", "totalcharges"))

customerid,totalcharges
str,f64
"""4472_lvygi""",
"""3115_czmzd""",
"""5709_lvoeq""",
"""4367_nuyao""",
"""1371_dwpaz""",
…,…
"""3213_vvolg""",
"""2520_sgtta""",
"""2923_arzlg""",
"""4075_wkniu""",


In [8]:
# `totalcharges` is the only column with missing values (11 rows); replace them with 0.
totalcharges_filled = pl.col("totalcharges").fill_null(0)
df = df.with_columns(totalcharges_filled)

In [9]:
# Encode the target as 0/1: 1 where churn == "yes", 0 otherwise.
# NOTE: not idempotent -- once `churn` is Int8, re-running this cell compares
# integers against the string "yes". Run it once per fresh `df`.
is_churned = pl.col("churn") == "yes"
df = df.with_columns(is_churned.cast(pl.Int8))

## 2. Setting Up Validation Framework


In [10]:
# 60/20/20 split: hold out 20% for test, then take 25% of the remaining 80%
# (= 20% of the whole) for validation. The same seed is used for both splits.
split_seed = 1
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=split_seed)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=split_seed)

In [11]:
len(df_train), len(df_val), len(df_test)

(4225, 1409, 1409)

In [12]:
# Pull the target column out of each split as a NumPy array.
y_train, y_val, y_test = (
    split["churn"].to_numpy() for split in (df_train, df_val, df_test)
)

In [13]:
# Remove the target from the three feature frames so it cannot be used as an input.
# `df_full_train` deliberately keeps `churn`; the analysis cells below read it from there.
df_train, df_val, df_test = (
    split.drop("churn") for split in (df_train, df_val, df_test)
)

## 3. Exploratory Data Analysis


In [14]:
df_full_train.null_count().transpose(
    include_header=True, column_names=["null_count"]
).filter(pl.col("null_count") > 0)

column,null_count
str,u32


In [15]:
df_full_train["churn"].value_counts(sort=True, normalize=True)

churn,proportion
i8,f64
0,0.730032
1,0.269968


In [16]:
global_churn_rate = df_full_train["churn"].mean()
round(global_churn_rate, 2)

0.27

In [17]:
numerical = ["tenure", "monthlycharges", "totalcharges"]

In [18]:
categorical = [
    "gender",
    "seniorcitizen",
    "partner",
    "dependents",
    "phoneservice",
    "multiplelines",
    "internetservice",
    "onlinesecurity",
    "onlinebackup",
    "deviceprotection",
    "techsupport",
    "streamingtv",
    "streamingmovies",
    "contract",
    "paperlessbilling",
    "paymentmethod",
]

In [19]:
df_full_train.select(pl.col(categorical).n_unique())

gender,seniorcitizen,partner,dependents,phoneservice,multiplelines,internetservice,onlinesecurity,onlinebackup,deviceprotection,techsupport,streamingtv,streamingmovies,contract,paperlessbilling,paymentmethod
u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32
2,2,2,2,2,3,3,3,3,3,3,3,3,3,2,4


## 4. Feature importance: Churn rate and Risk ratio


#### Churn rate


In [20]:
df_full_train.head()

customerid,gender,seniorcitizen,partner,dependents,tenure,phoneservice,multiplelines,internetservice,onlinesecurity,onlinebackup,deviceprotection,techsupport,streamingtv,streamingmovies,contract,paperlessbilling,paymentmethod,monthlycharges,totalcharges,churn
str,str,i64,str,str,i64,str,str,str,str,str,str,str,str,str,str,str,str,f64,f64,i8
"""5442_pptjy""","""male""",0,"""yes""","""yes""",12,"""yes""","""no""","""no""","""no_internet_service""","""no_internet_service""","""no_internet_service""","""no_internet_service""","""no_internet_service""","""no_internet_service""","""two_year""","""no""","""mailed_check""",19.7,258.35,0
"""6261_rcvns""","""female""",0,"""no""","""no""",42,"""yes""","""no""","""dsl""","""yes""","""yes""","""yes""","""yes""","""no""","""yes""","""one_year""","""no""","""credit_card_automatic""",73.9,3160.55,1
"""2176_osjuv""","""male""",0,"""yes""","""no""",71,"""yes""","""yes""","""dsl""","""yes""","""yes""","""no""","""yes""","""no""","""no""","""two_year""","""no""","""bank_transfer_automatic""",65.15,4681.75,0
"""6161_erdgd""","""male""",0,"""yes""","""yes""",71,"""yes""","""yes""","""dsl""","""yes""","""no""","""yes""","""yes""","""yes""","""yes""","""one_year""","""no""","""electronic_check""",85.45,6300.85,0
"""2364_ufrom""","""male""",0,"""no""","""no""",30,"""yes""","""no""","""dsl""","""yes""","""yes""","""no""","""yes""","""yes""","""no""","""one_year""","""no""","""electronic_check""",70.4,2044.75,0


In [21]:
churn_female = df_full_train.filter(pl.col("gender") == "female")["churn"].mean()
churn_female

0.27682403433476394

In [22]:
churn_male = df_full_train.filter(pl.col("gender") == "male")["churn"].mean()
churn_male

0.2632135306553911

In [None]:
global_churn_rate

0.26996805111821087

In [24]:
df_full_train["partner"].value_counts()

partner,count
str,u32
"""no""",2932
"""yes""",2702


In [None]:
churn_partner = df_full_train.filter(pl.col("partner") == "yes")["churn"].mean()
churn_partner

0.20503330866025166

In [26]:
global_churn_rate - churn_partner

0.06493474245795922

In [27]:
churn_no_partner = df_full_train.filter(pl.col("partner") == "no")["churn"].mean()
churn_no_partner

0.3298090040927694

In [28]:
global_churn_rate - churn_no_partner

-0.05984095297455855

`partner` may have a greater importance than `gender` in affecting churn

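1. Difference (group - global)

   - difference < 0 -> less likely to churn
   - difference > 0 -> more likely to churn
   - Note: the standalone `partner` cells above compute `global_churn_rate - group`, so their signs are the reverse of this convention.

2. Risk ratio (group / global)
   - risk < 1 -> less likely to churn
   - risk > 1 -> more likely to churn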


In [29]:
churn_no_partner / global_churn_rate

1.2216593879412643

In [30]:
churn_partner / global_churn_rate

0.7594724924338315

In [None]:
# Churn statistics per gender, compared with the global churn rate:
#   mean  - churn rate inside the group
#   count - number of non-null churn values in the group
#   diff  - group rate minus global rate
#   risk  - group rate divided by global rate
# `group_by` does not guarantee row order, so sort by the key to keep the
# displayed table identical from run to run.
churn_mean = pl.col("churn").mean()
df_full_train.group_by(pl.col("gender")).agg(
    churn_mean.alias("mean"),
    pl.col("churn").count().alias("count"),
    (churn_mean - global_churn_rate).alias("diff"),
    (churn_mean / global_churn_rate).alias("risk"),
).sort("gender")

gender,mean,count,diff,risk
str,f64,u32,f64,f64
"""female""",0.276824,2796,0.006856,1.025396
"""male""",0.263214,2838,-0.006755,0.97498


In [None]:
# For every categorical feature, print the per-value churn rate, group size,
# difference from the global rate (group - global) and risk ratio (group / global).
for c in categorical:
    print(c)
    df_group = (
        df_full_train.group_by(pl.col(c))
        .agg(
            pl.col("churn").mean().alias("mean"),
            pl.col("churn").count().alias("count"),
            (pl.col("churn").mean() - global_churn_rate).alias("diff"),
            (pl.col("churn").mean() / global_churn_rate).alias("risk"),
        )
        # group_by does not guarantee row order; sort by the key so the
        # printed tables are reproducible across runs.
        .sort(c)
    )
    print(df_group)

gender
shape: (2, 5)
┌────────┬──────────┬───────┬───────────┬──────────┐
│ gender ┆ mean     ┆ count ┆ diff      ┆ risk     │
│ ---    ┆ ---      ┆ ---   ┆ ---       ┆ ---      │
│ str    ┆ f64      ┆ u32   ┆ f64       ┆ f64      │
╞════════╪══════════╪═══════╪═══════════╪══════════╡
│ male   ┆ 0.263214 ┆ 2838  ┆ -0.006755 ┆ 0.97498  │
│ female ┆ 0.276824 ┆ 2796  ┆ 0.006856  ┆ 1.025396 │
└────────┴──────────┴───────┴───────────┴──────────┘
seniorcitizen
shape: (2, 5)
┌───────────────┬──────────┬───────┬───────────┬──────────┐
│ seniorcitizen ┆ mean     ┆ count ┆ diff      ┆ risk     │
│ ---           ┆ ---      ┆ ---   ┆ ---       ┆ ---      │
│ i64           ┆ f64      ┆ u32   ┆ f64       ┆ f64      │
╞═══════════════╪══════════╪═══════╪═══════════╪══════════╡
│ 0             ┆ 0.24227  ┆ 4722  ┆ -0.027698 ┆ 0.897403 │
│ 1             ┆ 0.413377 ┆ 912   ┆ 0.143409  ┆ 1.531208 │
└───────────────┴──────────┴───────┴───────────┴──────────┘
partner
shape: (2, 5)
┌─────────┬──────────┬───

## 5. Feature importance: Mutual Information

- https://en.wikipedia.org/wiki/Mutual_information


In [None]:
mutual_info_score(df_full_train["churn"], df_full_train["contract"])

np.float64(0.0983203874041556)

In [34]:
mutual_info_score(df_full_train["contract"], df_full_train["churn"])

np.float64(0.0983203874041556)

In [None]:
mutual_info_score(df_full_train["gender"], df_full_train["churn"])

np.float64(0.0001174846211139946)

In [36]:
def mutual_info_churn_scores(series):
    """Return the mutual information between ``series`` and ``df_full_train["churn"]``.

    Reads the module-level ``df_full_train`` frame at call time.

    Args:
        series: Label values to score against churn; assumed to be aligned
            row-for-row with ``df_full_train`` (TODO confirm for new callers).

    Returns:
        The value returned by ``sklearn.metrics.mutual_info_score``.
    """
    churn = df_full_train["churn"]
    return mutual_info_score(series, churn)

In [37]:
# Mutual information between each categorical feature and churn,
# reshaped to one row per feature and ordered from highest to lowest score.
mi_by_feature = df_full_train.select(
    pl.col(categorical).map_batches(mutual_info_churn_scores, return_dtype=pl.Float64)
)
mi_by_feature.transpose(include_header=True, column_names=["score"]).sort(
    "score", descending=True
)

column,score
str,f64
"""contract""",0.09832
"""onlinesecurity""",0.063085
"""techsupport""",0.061032
"""internetservice""",0.055868
"""onlinebackup""",0.046923
…,…
"""partner""",0.009968
"""seniorcitizen""",0.00941
"""multiplelines""",0.000857
"""phoneservice""",0.000229


## 6. Feature importance: Correlation

- https://en.wikipedia.org/wiki/Pearson_correlation_coefficient


In [None]:
df_full_train["tenure"].max()

72

In [None]:
df_full_train.select(
    pl.corr("tenure", "churn"),
)

tenure
f64
-0.351885


In [None]:
df_full_train.select([pl.corr(n, "churn") for n in numerical])

tenure,monthlycharges,totalcharges
f64,f64,f64
-0.351885,0.196805,-0.196353


In [None]:
df_full_train.filter(pl.col("tenure") <= 2)["churn"].mean()

0.5953420669577875

In [None]:
df_full_train.filter((pl.col("tenure") > 2) & (pl.col("tenure") <= 12))["churn"].mean()

0.3994413407821229

In [43]:
df_full_train.filter(pl.col("tenure") > 12)["churn"].mean()

0.17634908339788277

In [None]:
df_full_train.filter(pl.col("monthlycharges") <= 20)["churn"].mean()

0.08795411089866156

In [45]:
df_full_train.filter(
    (pl.col("monthlycharges") > 20) & (pl.col("monthlycharges") <= 50)
)["churn"].mean()

0.18340943683409436

In [None]:
df_full_train.filter(pl.col("monthlycharges") > 50)["churn"].mean()

0.32499341585462205

In [47]:
df_full_train.select(pl.col(numerical)).corr()

tenure,monthlycharges,totalcharges
f64,f64,f64
1.0,0.251072,0.828268
0.251072,1.0,0.650913
0.828268,0.650913,1.0


## 7. One-hot encoding


In [48]:
train_dicts = df_train.select(pl.col(categorical + numerical)).to_dicts()

In [49]:
dv = DictVectorizer(sparse=False)

In [50]:
X_train = dv.fit_transform(train_dicts)

In [51]:
dv.feature_names_[:10]

['contract=month_to_month',
 'contract=one_year',
 'contract=two_year',
 'dependents=no',
 'dependents=yes',
 'deviceprotection=no',
 'deviceprotection=no_internet_service',
 'deviceprotection=yes',
 'gender=female',
 'gender=male']

In [52]:
val_dicts = df_val.select(pl.col(categorical + numerical)).to_dicts()

In [53]:
X_val = dv.transform(val_dicts)

## 8. Logistic regression


In [54]:
def sigmoid(z):
    """Logistic sigmoid ``1 / (1 + exp(-z))``, computed without overflow.

    Args:
        z: A scalar, NumPy array, or array-like (e.g. list) of real numbers.

    Returns:
        A NumPy scalar for scalar input, otherwise an array of the same
        shape as ``z``, with every value in the interval [0, 1].
    """
    z = np.asarray(z)
    # exp(-|z|) is always <= 1, so it can never overflow; the naive exp(-z)
    # overflows (with a RuntimeWarning) for large negative z.
    e = np.exp(-np.abs(z))
    # For z >= 0: 1 / (1 + exp(-z)).  For z < 0: exp(z) / (1 + exp(z)).
    result = np.where(z >= 0, 1.0 / (1.0 + e), e / (1.0 + e))
    # Indexing with () turns a 0-d array back into a scalar and leaves
    # higher-dimensional arrays unchanged.
    return result[()]

In [55]:
z = np.linspace(-5, 5, 51)

In [56]:
sigmoid(z)

array([0.00669285, 0.00816257, 0.0099518 , 0.01212843, 0.01477403,
       0.01798621, 0.02188127, 0.02659699, 0.03229546, 0.03916572,
       0.04742587, 0.05732418, 0.06913842, 0.0831727 , 0.09975049,
       0.11920292, 0.14185106, 0.16798161, 0.19781611, 0.23147522,
       0.26894142, 0.31002552, 0.35434369, 0.40131234, 0.450166  ,
       0.5       , 0.549834  , 0.59868766, 0.64565631, 0.68997448,
       0.73105858, 0.76852478, 0.80218389, 0.83201839, 0.85814894,
       0.88079708, 0.90024951, 0.9168273 , 0.93086158, 0.94267582,
       0.95257413, 0.96083428, 0.96770454, 0.97340301, 0.97811873,
       0.98201379, 0.98522597, 0.98787157, 0.9900482 , 0.99183743,
       0.99330715])

In [57]:
alt.Chart(pl.DataFrame({"value": z, "sigmoid": sigmoid(z)})).mark_line().encode(
    alt.X("value"), alt.Y("sigmoid")
)

In [58]:
def linear_regression(xi, w0, w):
    """Return the linear score ``w0 + sum_j xi[j] * w[j]``.

    Args:
        xi: Feature values, indexable by position; must have at least
            ``len(w)`` entries.
        w0: Bias (intercept) term.
        w: Weights, one per feature.

    Returns:
        The bias plus the weighted sum of the first ``len(w)`` features.
    """
    prediction = w0

    # Accumulate left to right, one feature at a time.
    for j, weight in enumerate(w):
        prediction = prediction + xi[j] * weight

    return prediction

In [59]:
def logistic_regression(xi, w0, w):
    """Return the logistic-regression output for one observation.

    Computes the linear score ``w0 + sum_j xi[j] * w[j]`` with
    ``linear_regression`` (defined in the preceding cell) and passes it
    through ``sigmoid``.

    Args:
        xi: Feature values, indexable by position; must have at least
            ``len(w)`` entries.
        w0: Bias (intercept) term.
        w: Weights, one per feature.

    Returns:
        ``sigmoid`` applied to the linear score.
    """
    # Reuse the shared linear score instead of repeating its loop here.
    score = linear_regression(xi, w0, w)
    return sigmoid(score)