# Module 05 A


In [1]:
import pickle
from pathlib import Path

import numpy as np
import polars as pl
import polars.selectors as cs
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    roc_auc_score,
)
from sklearn.model_selection import KFold, train_test_split

from ml_zoomcamp.utils import clean_column_names, load_data

# All project paths are resolved relative to the parent of the current
# working directory.
ROOT_DIR = Path.cwd().parent
DATA_DIR = ROOT_DIR / "data"
MODEL_DIR = ROOT_DIR / "model"

## 1. Data Preparation


In [2]:
# Location of the Telco customer-churn CSV; `load_data` receives it together
# with the local data directory.
csv_uri = "https://raw.githubusercontent.com/alexeygrigorev/mlbookcamp-code/master/chapter-03-churn-prediction/WA_Fn-UseC_-Telco-Customer-Churn.csv"
df = clean_column_names(load_data(csv_uri, DATA_DIR))

# Normalise every string column: lowercase, drop characters that are not
# word characters, whitespace or hyphens, then replace each run of whitespace
# (or each run of hyphens) with one underscore.
string_cleanup = (
    cs.string()
    .str.to_lowercase()
    .str.replace_all(r"[^\w\s-]", "")
    .str.replace_all(r"\s+|-+", "_")
)
df = df.with_columns(string_cleanup)

In [3]:
# Missing total charges are replaced with 0; the churn label is re-encoded as
# Int8, 1 where the value equals "yes" and 0 for other values.
total_charges_filled = pl.col("totalcharges").fill_null(0)
churn_flag = (pl.col("churn") == "yes").cast(pl.Int8)

df = df.with_columns(total_charges_filled, churn_flag)

## 2. Setting Up Validation Framework


In [4]:
# Two-stage split: 20% of all rows are held out for testing, then 25% of the
# remaining rows are set aside for validation; the rest is used for training.
split_seed = 1
df_full_train, df_test = train_test_split(
    df, test_size=0.2, random_state=split_seed
)
df_train, df_val = train_test_split(
    df_full_train, test_size=0.25, random_state=split_seed
)

In [5]:
# Row counts of the train / validation / test splits, in that order.
tuple(len(split) for split in (df_train, df_val, df_test))

(4225, 1409, 1409)

In [6]:
# Extract the target column of each split as a NumPy array.
y_train, y_val, y_test = (
    split["churn"].to_numpy() for split in (df_train, df_val, df_test)
)

In [7]:
# The labels are already stored separately, so remove the target column from
# each split's frame.
df_train, df_val, df_test = (
    split.drop("churn") for split in (df_train, df_val, df_test)
)

## 3. Exploratory Data Analysis


In [8]:
# Numeric feature columns; concatenated with `categorical` when the feature
# dicts are built for training and prediction.
numerical: list[str] = [
    "tenure",
    "monthlycharges",
    "totalcharges",
]

In [9]:
# Categorical feature columns; concatenated with `numerical` when the feature
# dicts are built for training and prediction. Order is kept as originally
# listed.
categorical: list[str] = [
    "gender", "seniorcitizen", "partner", "dependents",
    "phoneservice", "multiplelines", "internetservice",
    "onlinesecurity", "onlinebackup", "deviceprotection", "techsupport",
    "streamingtv", "streamingmovies",
    "contract", "paperlessbilling", "paymentmethod",
]

## 4. Cross-Validation


In [10]:
def train(df, y_train, C=1.0):
    """Fit a DictVectorizer and a logistic regression on ``df``.

    Only the columns named in the module-level ``categorical`` and
    ``numerical`` lists are used as features; any other column in ``df``
    is ignored.

    Args:
        df: Polars DataFrame containing at least the feature columns.
        y_train: Target values, one per row of ``df``.
        C: Value forwarded to ``LogisticRegression(C=...)``.

    Returns:
        Tuple ``(dv, model)`` with the fitted ``DictVectorizer`` and the
        fitted ``LogisticRegression``.
    """
    feature_columns = categorical + numerical
    records = df.select(pl.col(feature_columns)).to_dicts()

    # Dense output so the matrix can be passed straight to the classifier.
    vectorizer = DictVectorizer(sparse=False)
    features = vectorizer.fit_transform(records)

    classifier = LogisticRegression(C=C, max_iter=10000)
    classifier.fit(features, y_train)

    return vectorizer, classifier

In [11]:
def predict(df, dv, model):
    """Return the predicted probability of the positive class for each row.

    Args:
        df: Polars DataFrame containing the columns named in the module-level
            ``categorical`` and ``numerical`` lists.
        dv: Already-fitted ``DictVectorizer`` used to encode the rows.
        model: Fitted classifier exposing ``predict_proba``.

    Returns:
        1-D array holding the second column of ``predict_proba``.
    """
    feature_columns = categorical + numerical
    records = df.select(pl.col(feature_columns)).to_dicts()

    # transform (not fit_transform): reuse the encoding learned at train time.
    features = dv.transform(records)
    probabilities = model.predict_proba(features)

    return probabilities[:, 1]

In [12]:
# Value passed as `C` to the logistic regression, and the number of
# cross-validation folds.
C, n_splits = 1.0, 5

In [13]:
# K-fold cross-validation over the full training set.
kfold = KFold(n_splits=n_splits, shuffle=True, random_state=1)

scores = []

# Fold-local names are used on purpose: rebinding df_train / df_val /
# y_train / y_val inside this loop would silently replace the hold-out split
# created earlier in the notebook with the rows of the last fold.
for train_idx, val_idx in kfold.split(df_full_train):
    df_fold_train = df_full_train[train_idx]
    df_fold_val = df_full_train[val_idx]

    y_fold_train = df_fold_train["churn"].to_numpy()
    y_fold_val = df_fold_val["churn"].to_numpy()

    # train/predict select only the feature columns, so the "churn" column
    # still present in the fold frames is not used as an input.
    dv, model = train(df_fold_train, y_fold_train, C)
    y_pred = predict(df_fold_val, dv, model)

    auc_score = roc_auc_score(y_fold_val, y_pred)
    scores.append(auc_score)

# Mean and standard deviation of the per-fold AUC.
print(f"C={C} {np.mean(scores):.3f} +- {np.std(scores):.3f}")

C=1.0 0.842 +- 0.007


In [14]:
# Per-fold ROC AUC values collected by the cross-validation loop.
scores

[np.float64(0.8443689114615631),
 np.float64(0.8448943020791948),
 np.float64(0.8335460565924142),
 np.float64(0.8347808882778025),
 np.float64(0.8518363088701327)]

In [15]:
# Refit on the full training set using the notebook-level C (the same value
# that was cross-validated and that is embedded in the saved file name)
# rather than a hard-coded 1.0, then score the test split, which has not been
# used for fitting or cross-validation.
y_full_train = df_full_train["churn"].to_numpy()
dv, model = train(df_full_train, y_full_train, C=C)
y_pred = predict(df_test, dv, model)

auc_score = roc_auc_score(y_test, y_pred)
auc_score

np.float64(0.8584005005037537)

## 5. Save the model


In [16]:
# The file name embeds the current value of C.
output_file = MODEL_DIR / f"model_C={C}.bin"
output_file.name

'model_C=1.0.bin'

In [17]:
# Create the target directory if needed so the write also works when the
# model folder does not exist yet.
output_file.parent.mkdir(parents=True, exist_ok=True)

# The context manager closes the file even if pickling raises part-way
# through, which a manual open()/close() pair does not guarantee.
with output_file.open("wb") as f_out:
    pickle.dump((dv, model), f_out)

In [18]:
# Store the vectorizer and the model together as one pickled tuple.
with open(output_file, mode="wb") as model_file:
    pickle.dump((dv, model), model_file)