# US Churn &mdash; Feature Engineering


## Setup

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from IPython.display import display, Markdown
# Use seaborn's "darkgrid" theme for all plots in this notebook.
sns.set_style("darkgrid")
# Setting max_columns to None removes the cap on displayed DataFrame columns.
pd.set_option('display.max_columns', None)  

import sys, os, yaml

# Name of the dataset; also used to build the Google Drive folder name below.
DATASET = "US_Churn"

# True when the google.colab module is already loaded in this interpreter.
COLAB = 'google.colab' in sys.modules

# Root folder for data and outputs: the dataset folder on the mounted Google
# Drive when running on Colab, otherwise the current working directory.
ROOT = (
    f"/content/gdrive/MyDrive/datasets/{DATASET.replace(' ','_')}/"
    if COLAB
    else "./"
)

# Debug switch (not referenced by the cells visible in this notebook section).
DEBUG = False
# Seed passed as random_state so the train/test splits are reproducible.
SEED = 1612

In [None]:
# On Colab, mount Google Drive unless the mount directory already exists.
if COLAB:
    from google.colab import drive

    already_mounted = os.path.isdir("/content/gdrive")
    if not already_mounted:
        drive.mount("/content/gdrive")

## Imports

In [None]:
from sklearn.model_selection import train_test_split

from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix, classification_report

## Datasets

In [None]:
# Load the churn dataset from the pickle file under ROOT, then print its shape
# and preview the first rows.
# NOTE(review): unpickling can execute arbitrary code, so only load pickle
# files that come from a trusted source.
df = pd.read_pickle(f"{ROOT}/data/churn.pkl")
print(df.shape)
df.head()

In [None]:
# Drop features that have not yet been properly considered in the EDA.
# errors="ignore" skips columns that are already absent, so this cell can be
# re-run safely.
df.drop(columns=["State"], errors="ignore", inplace=True)

## Pre-process Data

In [None]:
# Split df into train and test parts, stratified on the Churn column, with 60%
# of the rows going to the training set. random_state=SEED makes the split
# reproducible.
df_train, df_test = train_test_split(df, stratify=df.Churn, train_size=0.60, random_state=SEED)
# Show the shape of each split.
df_train.shape, df_test.shape

In [None]:
# Name of the column to predict.
target = "Churn"

# Split the predictor columns by dtype. The target is excluded with `!=`:
# `c not in target` would be a substring test against the string "Churn" and
# would silently drop any column whose name is a substring of it.
cat_features = [c for c in df.select_dtypes("category").columns if c != target]
num_features = [c for c in df.select_dtypes(["int","float"]).columns if c != target]
features = cat_features + num_features

print(f"Target: {target}")

print(f"Categorical Features: {cat_features}")
print(f"Numerical Features: {num_features}")
print(f"Number of Features: {len(features)}")

## Feature Engineering

Feature engineering and selection decisions are made using `df_train` ONLY, but the resulting operations should be applied to both `df_train` and `df_test`. The simplest way to do this (without repeating code) is to:

 * Perform queries on `df_train` to select new possible features.
 * Apply operations to `df` and repeat the train/test split USING THE SAME SEED.
 * You can use the target values in `df_train` but cannot use the target values in `df_test`.

### New Feature - Cranky Triskaidekaphobias

Would a feature indicating customers who complain and avoid the number 13 be useful in predicting Churn?

This is a silly feature, but it illustrates the general procedure.

In [None]:
# Explore the candidate feature on the training split only: rows with
# CustServ_Calls > 0 where none of Day_Calls, Eve_Calls, Night_Calls and
# Intl_Calls equals 13.
my_query = df_train.query('CustServ_Calls>0 & Day_Calls!=13 & Eve_Calls!=13 & Night_Calls!=13 & Intl_Calls!=13')
# Preview the first matching rows.
my_query.head()

In [None]:
# Build the boolean feature on the full DataFrame (so both train and test get
# it), using the same condition as the exploratory query on df_train.
# Column-wise comparisons combined with `&` replace the row-by-row
# `df.apply(..., axis=1)`: same boolean result, computed in vectorised form.
df["Cranky_Triskaidekaphobias"] = (
    (df.CustServ_Calls > 0)
    & (df.Day_Calls != 13)
    & (df.Eve_Calls != 13)
    & (df.Night_Calls != 13)
    & (df.Intl_Calls != 13)
)

In [None]:
# Check that the generated feature has sufficient variability by counting how
# many rows take each of its values.
df.Cranky_Triskaidekaphobias.value_counts()

### New Feature - ????

### New Feature - ????

### New Feature - ????

### New Feature - ????

### New Feature - ????

### Pre-Processing Data (2)

In [None]:
# Repeat the train/test split now that df carries the engineered features.
# Use the same arguments as the first split (train_size=0.60, same SEED) so
# the rows land in the same partitions; train_size and test_size are rounded
# differently, so mixing the two forms is not guaranteed to reproduce the
# split.
df_train, df_test = train_test_split(df, stratify=df[target], train_size=0.60, random_state=SEED)

# Every column except the target is now a candidate feature.
features = [c for c in df.columns if c != target]
print(f"Number of features {len(features)}\n{features}")

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise the features. The scaler is fitted on the training split only
# and the fitted scaler is then applied to both splits, so the test data does
# not influence the scaling.
scaler = StandardScaler()
scaler.fit(df_train[features].values)

X_train = scaler.transform(df_train[features].values)
X_test = scaler.transform(df_test[features].values)

# Target values for each split.
y_train = df_train["Churn"].values
y_test = df_test["Churn"].values

## Model Selection

In [None]:
# Candidate classifiers, keyed by the short name used in printed reports and
# in the feature-selection cell.
# The decision trees get random_state=SEED: tree construction involves random
# choices, so without a fixed seed their CV scores (and any RFECV run based
# on them) can change from one execution to the next.
models = {
    "LR": LogisticRegression(),
    "DT": DecisionTreeClassifier(random_state=SEED),
    "DT(max_depth=3)": DecisionTreeClassifier(max_depth=3, random_state=SEED),
    "KNN": KNeighborsClassifier(),
    "SVC": SVC(),
}

def train_models():
    """Cross-validate every model in ``models`` and report the best one.

    Reads the module-level ``models``, ``X_train`` and ``y_train``. Each model
    is scored with 10-fold cross-validation on accuracy; the per-fold scores,
    their mean and their standard deviation are printed. Finally the name and
    mean accuracy of the model with the highest mean are printed.

    Returns nothing; all output goes to stdout.
    """
    # Best model seen so far; a model only takes over with a strictly higher mean.
    best_name, best_mean = None, 0

    for label, estimator in models.items():
        fold_scores = cross_val_score(estimator, X_train, y_train, scoring='accuracy', cv=10)
        avg, spread = fold_scores.mean(), fold_scores.std()

        print("\nModel %s" % label)
        print("\tCV scores: " + ", ".join(f"{s:.2f}" for s in fold_scores))
        print(f"\tmean={avg:.2%} std={spread:.4f}")

        if avg > best_mean:
            best_name, best_mean = label, avg

    print(f"\nBest Performing Model {best_name} with (mean CV of) accuracy = {best_mean:.2%}")
    

In [None]:
# Baseline: cross-validate every model on the current X_train / y_train.
train_models()

## Feature Selection

Next we are going to try to reduce the number of features using [RFECV](https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.RFECV.html), which performs recursive feature elimination with cross-validation.

RFECV needs a classifier that returns information about feature importance either through a `coef_` attribute or through a `feature_importances_` attribute. 
Of the models defined above, only the logistic regression (`LR`) and decision tree (`DT`) models provide this information.


In [None]:
# Report how many features are available before feature selection.
print("Number of features starting:", len(features))

In [None]:
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import RFECV

name = 'LR'  # use LR, DT
model = models[name]

# Recursive feature elimination, removing one feature per step and scoring
# each feature count by accuracy with 2-fold stratified cross-validation.
rfecv = RFECV(estimator=model, step=1, cv=StratifiedKFold(2), scoring="accuracy")
rfecv.fit(X_train, y_train)
print("Optimal number of features : %d" % rfecv.n_features_)

# Plot number of features VS. cross-validation scores
results = rfecv.cv_results_
# One x position per evaluated feature count, starting at 1.
n_selected = range(1, len(results['mean_test_score']) + 1)

plt.figure()
plt.title(f"{name} with RFECV")
plt.xlabel("Number of features selected")
plt.ylabel("Cross validation score")
# One curve per CV fold, then the mean across folds.
for curve in ('split0', 'split1', 'mean'):
    plt.plot(n_selected, results[f'{curve}_test_score'], label=curve)
plt.legend()
plt.show()

In [None]:
# Sanity check: the RFECV support mask and the feature list should have the
# same length, otherwise the mask cannot be used to index the feature names.
len(rfecv.support_), len(features)

In [None]:
# list of features selected
# rfecv.support_ is a boolean mask with one entry per column RFECV was fitted
# on, so it can only index `features` when both have the same length. A
# mismatch (in either direction) means X_train was rebuilt with a different
# feature list after/before the RFECV fit; stop with a clear message instead
# of failing on the mask indexing below.
if len(rfecv.support_) != len(features):
    raise RuntimeError(
        "It looks like you have run cells out of order. "
        "Run cells from Pre-Processing Data (2)"
    )

features_selected = np.array(features)[rfecv.support_].tolist()
features_selected

### Training - After Feature Selection

In [None]:
# Preview one row restricted to the selected features plus the target.
display(df[features_selected + [target]].head(1))

# normalise
# Re-fit the scaler on the selected training columns only, then apply it to
# both splits. This overwrites X_train / X_test with the reduced feature set.
scaler.fit(df_train[features_selected].values)
X_train = scaler.transform(df_train[features_selected].values)
X_test = scaler.transform(df_test[features_selected].values)

y_train = df_train["Churn"].values
y_test = df_test["Churn"].values

# Cross-validate all models again on the reduced feature set.
train_models()

## Submit

 * Creates a zip file of the project's notebooks, ready to upload.

In [None]:
import glob
from zipfile import ZipFile

# Destination of the zip archive, under the project ROOT.
archive = f'{ROOT}/US_Churn_-_Feature_Engineering.zip'

# Collect this project's notebooks: from the Colab Notebooks folder on Google
# Drive when running on Colab, otherwise from the current directory.
# glob.glob already returns a list, so no extra list() is needed.
if COLAB:
    files_to_upload = glob.glob("/content/gdrive/MyDrive/Colab Notebooks/US_Churn*.ipynb")
else:
    files_to_upload = glob.glob("US_Churn*.ipynb")
print(f"Creating zip archive with notebooks {files_to_upload} ...")

# The handle is named `zip_file`, not `zip`: in a notebook a top-level `zip`
# would shadow the builtin for every cell run afterwards.
with ZipFile(archive, 'w') as zip_file:
    for notebook_path in files_to_upload:
        # Store each notebook under its bare file name (strip the Drive folder).
        basename = notebook_path.split("/Colab Notebooks/")[-1]
        zip_file.write(notebook_path, basename)