# US Churn &mdash; Baseline Model

## Setup

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from IPython.display import display, Markdown
# Notebook-wide display defaults: seaborn's "darkgrid" theme for plots,
# and no limit on how many DataFrame columns pandas prints.
sns.set_style(style="darkgrid")
pd.set_option("display.max_columns", None)

import sys, os, yaml

# Name of the dataset this notebook works on.
DATASET = "US_Churn"

# Running under Google Colab is detected by its module being loaded.
COLAB = 'google.colab' in sys.modules

# Base directory for data files: the Google Drive dataset folder on Colab,
# otherwise the current working directory.
ROOT = f"/content/gdrive/MyDrive/datasets/{DATASET.replace(' ','_')}/" if COLAB else "./"

DEBUG = False
SEED = 1612  # shared random seed

In [None]:
# Colab only: mount Google Drive, unless the mount point already exists.
if COLAB:
    from google.colab import drive

    gdrive_mount = "/content/gdrive"
    if not os.path.isdir(gdrive_mount):
        drive.mount(gdrive_mount)

## Imports

In [None]:
from sklearn.model_selection import train_test_split

from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix, classification_report

## Datasets

In [None]:
# Load the churn dataset from <ROOT>/data/churn.feather.
# ROOT already ends with "/", so f"{ROOT}/data/..." yielded a doubled
# separator; os.path.join builds the path without it.
data_path = os.path.join(ROOT, "data", "churn.feather")
df = pd.read_feather(data_path)
print(df.shape)
df.head()

In [None]:
# "State" has not been examined properly in the EDA yet, so it is left out.
# errors="ignore" makes this a no-op when the column is already absent
# (for example when the cell is run a second time).
df.drop(columns=["State"], errors="ignore", inplace=True)

## Pre-process Data

In [None]:
# 60/40 train/test split, stratified on the target column and seeded so the
# same rows land in each part on every run.
df_train, df_test = train_test_split(
    df,
    train_size=0.60,
    stratify=df["Churn"],
    random_state=SEED,
)
df_train.shape, df_test.shape

In [None]:
target = "Churn"

# Split the predictor columns by dtype, excluding the target by exact name.
# (`c not in target` is a substring test against the string "Churn", so it
# would also discard any column whose name is contained in that string.)
# NOTE(review): "int"/"float" may not match every numeric width (e.g. int32,
# uint8) on all platforms -- confirm against df.dtypes.
cat_features = [c for c in df.select_dtypes("category").columns if c != target]
num_features = [c for c in df.select_dtypes(["int", "float"]).columns if c != target]
features = cat_features + num_features

print(f"Target: {target}")

print(f"Categorical Features: {cat_features}")
print(f"Numerical Features: {num_features}")

In [None]:
# Learn the scaling statistics from the training rows only, then apply the
# same fitted scaler to both splits.
# NOTE(review): `features` includes the categorical columns; this presumes
# they hold numeric values the scaler can process -- confirm.
ss = StandardScaler().fit(df_train[features])

X_train = ss.transform(df_train[features])
X_test = ss.transform(df_test[features])

y_train = df_train[target].values
y_test = df_test[target].values

### Dummy model - predict no churn

 * Since 85% do not churn, a model that always predicts target=0 will have an accuracy of 85%.

In [None]:
# Relative frequency of each target class in the full dataset.
df["Churn"].value_counts(normalize=True)

In [None]:
# Baseline "model": predict 0 (no churn) for every training row.
y_dummy_pred = np.zeros_like(y_train)

# Accuracy alone is a poor metric on this imbalanced dataset, so show the
# confusion matrix and the per-class report as well.
# Because one class is never predicted, its precision is undefined;
# zero_division=0 (the documented numeric value, rather than the bool False)
# reports it as 0 without a warning.
print(confusion_matrix(y_train, y_dummy_pred))
print(classification_report(y_train, y_dummy_pred, zero_division=0))

In [None]:
# First real model: logistic regression, constructed with no arguments
# (i.e. the library's default hyperparameters).
model = LogisticRegression()

In [None]:
# Fit and score on the very same training rows. This overestimates how good
# the model is, because it has already seen every example it is scored on.
y_pred = model.fit(X_train, y_train).predict(X_train)

print(confusion_matrix(y_train, y_pred))
print(classification_report(y_train, y_pred))

## Model Selection

In [None]:
# Candidate classifiers to compare, keyed by the label printed in the results.
# The decision trees get random_state=SEED so repeated runs of the notebook
# produce the same trees (and therefore the same scores); the other
# estimators are left at their defaults.
models = {
    "NB": GaussianNB(),
    "LR": LogisticRegression(),
    "DT": DecisionTreeClassifier(random_state=SEED),
    "DT(max_depth=3)": DecisionTreeClassifier(max_depth=3, random_state=SEED),
    "KNN": KNeighborsClassifier(),
    "SVC": SVC(),
}

In [None]:
# 10-fold cross-validation of every candidate on the training split; prints
# the mean and standard deviation of the fold scores for each one.
# NOTE: the loop variable is called `model`, so after this cell `model`
# refers to the last entry of `models`, not to the estimator fitted earlier.
# NOTE(review): no `scoring` is passed, so the estimator's default score is
# used (labelled "accuracy" below) -- consider a metric better suited to the
# class imbalance.
for name, model in models.items():
    scores = cross_val_score(estimator=model, X=X_train, y=y_train, cv=10)
    print(f"{name:20s} accuracy = {scores.mean():.3%} ± {scores.std():.3%}")