In [None]:
import numpy as np
import pandas as pd
from metadata import data_type
from src.pipe_store import (
    data_loader,
    date_parser,
    clean_string_strip,
    set_data_types,
    sort_values_per_client,
    datetime2int,
    one_hot_encoder,
    label_encoder,
    summerize_client_behaviour,
)

%load_ext autoreload
%autoreload 2

In [None]:
import os

# Location of the raw churn CSV. Set the CHURN_DATA_PATH environment variable to
# run the notebook on another machine; the default is the original local path.
data_path = os.environ.get(
    'CHURN_DATA_PATH',
    '/Users/Danial/Downloads/assesment_file2_churn.csv',
)

# Load the file (MONTH_PERIOD parsed as a date) and push it through the project
# pipeline helpers from src.pipe_store, one step per .pipe call.
df = (
    data_loader(data_path, parse_dates=['MONTH_PERIOD'], date_parser=date_parser)
    .pipe(set_data_types, data_type)  # optimize space & df size
    .pipe(clean_string_strip, 'AGE_CLASS', 'HOMEBANK_COLOUR', 'LOYALITY')
    .pipe(sort_values_per_client, 'MONTH_PERIOD')
)

# Problem Formulation

Let's see how much information exists in the data set.


In this assessment case a customer is considered to have churned in a given month if either CHURNED_IND or COMMERCIALLY_CHURNED are set to value 1.

### Exploratory Data Analysis (EDA)

In [None]:
from src.feature_selection import total_variable_variances, variable_variances_per_client

# Run the project helper on the full frame; judging by its name it returns the
# variance of each variable per client (confirm in src.feature_selection).
var_matrix = df.pipe(variable_variances_per_client)
var_matrix

In [None]:
# Average var_matrix with .mean() and rank the result in descending order
# (these are per-column means if var_matrix is a DataFrame — TODO confirm).
var_matrix.mean().sort_values(ascending=False)

In [None]:
# total_variable_variances(df, top=6, include_label=False)

'Record_Count' & 'TARGET' should be discarded from the data set

## Some remarks over the data

* Why summarize the data:
    * The dynamics of client behaviour are rather slow (as the per-client variances show). Thus we can account for the final value of each variable and the most recent changes (past 6 months) in the variable. This captures the latest state change in the variables close to the potential churn.

    * Summarizing also corrects for potential inconsistencies in subscription start time, which is important to take into account.
    
* To prevent potential bias in the dataset, we have removed the clients that either rejoin or re-churn.

* Column 'CHURNED_IND' is summarized into churn_event, and the time at which the churn occurs into churn_time.

* If client is not churned during the 24 months churn_time is assumed to be an arbitrary date, e.g. 2013-01-01.

In [None]:
# Column holding the churn indicator in the raw data.
churn_col = 'CHURNED_IND'
# Time horizon (even number) used to find the dominant recent past state per column.
horizon = 6

# 'TARGET' and 'Record_Count' are discarded before summarising (see the EDA remarks).
df_dropped = df.drop(columns=['TARGET', 'Record_Count'])
df_sum = df_dropped.pipe(summerize_client_behaviour, churn_col=churn_col, horizon=horizon)
df_sum

In [None]:
# Absolute number of clients per churn_event value.
df_sum['churn_event'].value_counts()

In [None]:
# Same counts expressed as fractions of all clients (class balance).
df_sum['churn_event'].value_counts(normalize=True)

# Missing Values
Variables with missing values are: ACCOUNTMODEL, AGE_CLASS, HOMEBANK_COLOUR, LOYALITY

### Type of missingness:

In practice, domain experts or data managers can better argue on the type of the missingness. 

Missing Completely at Random (MCAR)-> random sampling from variable distribution

Missing at Random (MAR)-> random sampling from variable distribution or predict missing class via Logistic Regression

Missing not at Random (MNAR): Should not be imputed

### Adopted Strategy

Consider all the missingness as MNAR and proceed with feature selection. Depending on the importance of the variables with missingness (or of the missing categories), we can come back and adopt another strategy.

In [None]:
from src.eda import top_columns_with_missingness

# Missingness report for the raw monthly records.
df.pipe(top_columns_with_missingness)

In [None]:
# Same report for the per-client summary frame.
df_sum.pipe(top_columns_with_missingness)

In [None]:
# Treat missing values as their own category (MNAR strategy described above).
# The placeholder must be lowercase 'unknown': the Cox cell further down selects
# the dummy columns 'HOMEBANK_COLOUR_unknown' and 'LOYALITY_unknown', and df_red
# is built with .fillna('unknown'). Filling with 'Unknown' here would make that
# later fill a no-op and produce differently named dummy columns on a fresh run.
df_sum = df_sum.fillna('unknown')

In [None]:
# df_sum.info()

The AGE_CLASS column has more missing values than reported, because the "Leeftijd_onbekend" label (Dutch for "age unknown") also denotes a missing value.

# Feature Selection

From previous section we know to drop: 'Record_Count', 'TARGET'

### Univariate Correlation

In [None]:
# Categorical columns that are expanded by one_hot_encoder.
cat_cols = [
    'CREDIT_CLASS',
    'DEBIT_CLASS',
    'INVESTED_CAPITAL_CLASS',
    'SAVINGS_CAPITAL_CLASS',
    'MIN_FEED_CLASS',
    'REVENUES_CLASS',
    'PAYMENT_ACTIVITIES_CODE',
    'CLIENTGROUP',
    'ACCOUNTMODEL',
    'AGE_CLASS',
    'HOMEBANK_COLOUR',
    'LOYALITY',
]

# Encode a deep copy so that df_sum itself is left untouched.
# Alternative considered: label_encoder on 'CLIENTGROUP', 'ACCOUNTMODEL',
# 'AGE_CLASS', 'HOMEBANK_COLOUR', 'LOYALITY' instead of one-hot encoding.
df_corr = df_sum.copy(deep=True).pipe(one_hot_encoder, *cat_cols, dtype='int8')

In [None]:
from sklearn.feature_selection import SelectKBest, chi2, f_classif

# Label, duration, identifier and the numeric score are kept out of the feature matrix.
col_drop = ['churn_event', 'churn_time', 'id', 'CROSS_SELL_SCORE']
X = df_corr.drop(columns=col_drop)
y = df_corr['churn_event'].astype('float16')

# Keep the 10 features with the highest chi-squared statistic against the label.
k_best = SelectKBest(chi2, k=10)
k_best.fit(X, y)
selected_cols = list(k_best.get_feature_names_out())
selected_cols

In [None]:
from src.feature_selection import plot_corr_cat

# Run the project correlation helper on the selected features only, with its
# figure switched off; the return value is kept for the graph plot.
corrs = plot_corr_cat(X.loc[:, selected_cols], show_figure=False)

In [None]:
from src.eda import plot_graph

# Draw the graph for the first ten entries of corrs.
plot_graph(corrs[:10])

## Multicollinearity

In [None]:
from src.feature_selection import plot_corr_cat
# plot_corr_cat(X[['CREDIT_CLASS', 'CREDIT_CLASS', 'AGE_CLASS']])

In [None]:
# import matplotlib.pyplot as plt
# from seaborn import  heatmap
# corr_mat = df_corr.corr()
# plt.figure(figsize=(12, 8))
# heatmap(corr_mat, cmap='RdYlGn', annot=True)

# Feature Engineering

## Time-to-Event Feature Selection

For this kind of analysis we need to convert the churn_time column into float values representing months.

In [None]:
# Target dtypes for the survival columns.
dtype = {'churn_time': 'float16', 'churn_event': 'float16'}

# Step 1: turn the churn_time timestamps into numbers via the project helper
# (the surrounding text says these are months — confirm in src.pipe_store).
df_red = df_sum.pipe(datetime2int, 'churn_time')
# Step 2: cast the duration / event columns.
df_red = df_red.pipe(set_data_types, dtype)
# Step 3: any value still missing becomes the 'unknown' category.
df_red = df_red.fillna('unknown')

# Seasonality [pattern in churn time]

In [None]:
# Number of churn events per churn_time value, in chronological order.
churned = df_red['churn_event'] == 1
churn_time_counts = df_red.loc[churned, 'churn_time'].value_counts().sort_index()
churn_time_counts.plot.bar()


## Plot Univariate Time-to-Event feature analysis [Kaplan-Meier]:

In [None]:
import matplotlib.pyplot as plt
from lifelines import KaplanMeierFitter
from src.eda import plot_km_churn_risk

# One panel per column: the indicator on the left, its *_CHANGED variant on the right.
# Other pairs explored earlier: ('INSURANCE_LIFE_IND', 'INSURANCE_LIFE_IND'),
# ('MORTGAGE_IND', 'PACKAGE_IND'), ('INVESTMENTS_IND', 'LENDING_IND'),
# and the *_CHANGED columns PAYMENT_IND_CHANGED, SAVING_IND_CHANGED, CROSS_SELL_SCORE_CHANGED.
fig, ax = plt.subplots(1, 2, figsize=(14, 4))
for panel, column in zip(ax, ('SAVING_IND', 'SAVING_IND_CHANGED')):
    plot_km_churn_risk(df_red, column, ax=panel, estimator=KaplanMeierFitter, at_risk=True)

In [None]:
# CROSS_SELL_SCORE (left) next to its *_CHANGED counterpart (right).
# Pair previously inspected in this cell: 'HOMEBANK_COLOUR' / 'LOYALITY'.
fig, ax = plt.subplots(1, 2, figsize=(14, 4))
left_ax, right_ax = ax
km_options = {'estimator': KaplanMeierFitter, 'at_risk': True}
plot_km_churn_risk(df_red, 'CROSS_SELL_SCORE', ax=left_ax, **km_options)
plot_km_churn_risk(df_red, 'CROSS_SELL_SCORE_CHANGED', ax=right_ax, **km_options)

In [None]:
# LOYALITY (left) next to LOYALITY_CHANGED (right).
# Candidate columns for this kind of plot: 'CREDIT_CLASS', 'DEBIT_CLASS',
# 'INVESTED_CAPITAL_CLASS', 'SAVINGS_CAPITAL_CLASS', 'MIN_FEED_CLASS',
# 'REVENUES_CLASS', 'PAYMENT_ACTIVITIES_CODE', 'CROSS_SELL_SCORE', 'CLIENTGROUP',
# 'ACCOUNTMODEL', 'AGE_CLASS', 'HOMEBANK_COLOUR', 'LOYALITY'.
fig, ax = plt.subplots(1, 2, figsize=(14, 4))
left_ax, right_ax = ax
km_options = {'estimator': KaplanMeierFitter, 'at_risk': True}
plot_km_churn_risk(df_red, 'LOYALITY', ax=left_ax, **km_options)
plot_km_churn_risk(df_red, 'LOYALITY_CHANGED', ax=right_ax, **km_options)

In [None]:
# CREDIT_CLASS (left) next to CREDIT_CLASS_CHANGED (right).
fig, ax = plt.subplots(1, 2, figsize=(14, 4))
left_ax, right_ax = ax
km_options = {'estimator': KaplanMeierFitter, 'at_risk': True}
plot_km_churn_risk(df_red, 'CREDIT_CLASS', ax=left_ax, **km_options)
plot_km_churn_risk(df_red, 'CREDIT_CLASS_CHANGED', ax=right_ax, **km_options)

In [None]:
# INVESTED_CAPITAL_CLASS (left) next to SAVINGS_CAPITAL_CLASS (right).
fig, ax = plt.subplots(1, 2, figsize=(14, 4))
left_ax, right_ax = ax
km_options = {'estimator': KaplanMeierFitter, 'at_risk': True}
plot_km_churn_risk(df_red, 'INVESTED_CAPITAL_CLASS', ax=left_ax, **km_options)
plot_km_churn_risk(df_red, 'SAVINGS_CAPITAL_CLASS', ax=right_ax, **km_options)

In [None]:
# MIN_FEED_CLASS (left) next to REVENUES_CLASS (right).
fig, ax = plt.subplots(1, 2, figsize=(14, 4))
left_ax, right_ax = ax
km_options = {'estimator': KaplanMeierFitter, 'at_risk': True}
plot_km_churn_risk(df_red, 'MIN_FEED_CLASS', ax=left_ax, **km_options)
plot_km_churn_risk(df_red, 'REVENUES_CLASS', ax=right_ax, **km_options)

In [None]:
# PAYMENT_ACTIVITIES_CODE (left) next to CROSS_SELL_SCORE (right).
# Still to inspect: 'CLIENTGROUP', 'ACCOUNTMODEL'.
fig, ax = plt.subplots(1, 2, figsize=(14, 4))
left_ax, right_ax = ax
km_options = {'estimator': KaplanMeierFitter, 'at_risk': True}
plot_km_churn_risk(df_red, 'PAYMENT_ACTIVITIES_CODE', ax=left_ax, **km_options)
plot_km_churn_risk(df_red, 'CROSS_SELL_SCORE', ax=right_ax, **km_options)

In [None]:
# Single, larger panel for CLIENTGROUP.
fig, ax = plt.subplots(figsize=(8, 5))
km_options = {'estimator': KaplanMeierFitter, 'at_risk': True}
plot_km_churn_risk(df_red, 'CLIENTGROUP', ax=ax, **km_options)

In [None]:
# 0101: 13%, 0307:23 %,  0105: 42%


In [None]:
# Distinct AGE_CLASS labels present after the missing-value fill.
df_red['AGE_CLASS'].unique()

# Cox Proportional Hazards Model

In [None]:
# Categorical columns to one-hot encode; 'CROSS_SELL_SCORE' is deliberately left
# out of this list and used as a plain column below.
cat_cols = [
    'CREDIT_CLASS', 'DEBIT_CLASS', 'INVESTED_CAPITAL_CLASS', 'SAVINGS_CAPITAL_CLASS',
    'MIN_FEED_CLASS', 'REVENUES_CLASS', 'PAYMENT_ACTIVITIES_CODE',
    'CLIENTGROUP', 'ACCOUNTMODEL', 'AGE_CLASS', 'HOMEBANK_COLOUR', 'LOYALITY',
]

# Covariates kept for the Cox model. Commented-out entries are dummy levels that
# were tried and left out.
cox_features = [
    'PAYMENT_IND',
    'SAVING_IND',
    # 'CREDIT_CLASS_0',
    'CREDIT_CLASS_1',
    'CREDIT_CLASS_2',
    # 'DEBIT_CLASS_0',
    # 'DEBIT_CLASS_1',
    # 'DEBIT_CLASS_2',
    'SAVINGS_CAPITAL_CLASS_0',
    'SAVINGS_CAPITAL_CLASS_2',
    'INVESTED_CAPITAL_CLASS_0',
    'MIN_FEED_CLASS_0',
    'MIN_FEED_CLASS_1',
    'REVENUES_CLASS_3',
    'PAYMENT_ACTIVITIES_CODE_0',
    # 'CLIENTGROUP_0105',
    'CLIENTGROUP_0307',
    'CLIENTGROUP_0101',
    'CROSS_SELL_SCORE',  # 'CROSS_SELL_SCORE_0',
    'AGE_CLASS_Leeftijd_onbekend',
    'HOMEBANK_COLOUR_unknown',
    'HOMEBANK_COLOUR_Rood',
    'LOYALITY_unknown',
    'LOYALITY_Rood',
]

df_encoded = df_red.pipe(one_hot_encoder, *cat_cols)

# The earlier cells refer to the churn indicator as 'churn_event', whereas the
# Cox and classifier cells expect a column called 'event'. Accept whichever name
# the summary frame provides and always expose it as 'event', so the downstream
# cells work on a fresh run either way.
event_source = 'event' if 'event' in df_encoded.columns else 'churn_event'
df_cox = (
    df_encoded[[event_source, 'churn_time', *cox_features]]
    .rename(columns={event_source: 'event'})
)

In [None]:
# from seaborn import heatmap 
# plt.figure(figsize=(10, 7))
# heatmap(df_cox.corr(), cmap='RdYlGn')

In [None]:
# Show which columns ended up in the Cox design frame.
df_cox.columns

In [None]:
from lifelines import CoxPHFitter

# Fit the Cox model with churn_time as the duration and event as the indicator,
# then print the fitted summary.
cox_roles = {'duration_col': 'churn_time', 'event_col': 'event'}
cph = CoxPHFitter()
cph.fit(df_cox, **cox_roles)
cph.print_summary()

In [None]:
# Display the frame that was passed to the Cox model.
df_cox

In [None]:
# NOTE(review): this cell repeats the previous display of df_cox.
df_cox

# Classifier

In [None]:
from sklearn.model_selection import cross_validate, KFold
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from src.pipe_store import sklearn_adapter

# Single seed shared by every stochastic step so that results are reproducible.
random_state = 42

X, y = sklearn_adapter(df_cox, label='event')
# churn_time is the survival duration and is not used as a classifier feature.
# A non-inplace drop keeps the cell idempotent on re-run.
X = X.drop(columns='churn_time')

# Candidate models; the forest is seeded because it is randomised.
regs = [
    LogisticRegression(),
    RandomForestClassifier(random_state=random_state),
    GaussianNB(),
]

# Only the numeric score is standardised; every other column passes through unchanged.
num_cols = ['CROSS_SELL_SCORE']
column_transformer_scaler = ColumnTransformer([
    ('Scaler', StandardScaler(), num_cols),
], remainder='passthrough')

# Built once, outside the loop: with a fixed seed every model is scored on the
# same five shuffled folds, which makes the comparison fair and repeatable.
kfs = KFold(n_splits=5, shuffle=True, random_state=random_state)
# For the list of all metrics visit: https://scikit-learn.org/stable/modules/model_evaluation.html
metrics = ['recall', 'precision', 'roc_auc', 'accuracy', 'f1']

results = {}
for reg in regs:
    pipeline = Pipeline([
        ('scaler', column_transformer_scaler),
        ('Model', reg),
    ], verbose=False)

    # cross_validate (rather than cross_val_score) because several metrics are needed.
    scores = cross_validate(pipeline, X, y, cv=kfs, scoring=metrics)
    reg_name = type(reg).__name__
    # Mean over the folds of every reported entry (fit/score times and test metrics).
    results[reg_name] = {key: round(np.mean(val), 3) for key, val in scores.items()}

# One row per model, one column per cross-validation entry.
pd.DataFrame(results).T

In [None]:
from sklearn.model_selection import train_test_split

X, y = sklearn_adapter(df_cox, label='event')
# churn_time is the survival duration and is not used as a classifier feature.
X = X.drop(columns='churn_time')

# Seeded hold-out split (25 % test) so that re-running the cell yields the same
# partition; random_state is the seed defined in the cross-validation cell.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, shuffle=True, random_state=random_state,
)

clf = GaussianNB()  # alternative: LogisticRegression()
pipeline = Pipeline([
    ('scaler', column_transformer_scaler),
    ('Model', clf),
], verbose=False)

pipeline.fit(X_train, y_train)

# Model Evaluation

In [None]:
from src.model_evaluation import (
    plot_roc_curve,
    plot_confusion_matrix,
    plot_precision_recall_curve,
    print_scores,
)

from sklearn.model_selection import train_test_split

X, y = sklearn_adapter(df_cox, label='event')
# churn_time is the survival duration and is not used as a classifier feature.
X = X.drop(columns='churn_time')

# Seeded hold-out split (25 % test) so the train and test figures are
# reproducible; random_state is the seed defined in the cross-validation cell.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, shuffle=True, random_state=random_state,
)

clf = GaussianNB()  # alternative: LogisticRegression()
pipeline = Pipeline([
    ('scaler', column_transformer_scaler),
    ('Model', clf),
], verbose=False)

pipeline.fit(X_train, y_train)

# Training-set diagnostics. The scores are printed for the training data too, so
# they match the plots in this cell; the next cell reports the test-set scores.
plot_confusion_matrix(pipeline, X_train, y_train)
fig, ax = plt.subplots(1, 2, figsize=(11, 4))
plot_precision_recall_curve(pipeline, X_train, y_train, ax[0])
plot_roc_curve(pipeline, X_train, y_train, ax[1])
print_scores(pipeline, X_train, y_train)

In [None]:
# Held-out diagnostics: confusion matrix and scores first, then ROC (left panel)
# and precision-recall (right panel).
plot_confusion_matrix(pipeline, X_test, y_test)
print_scores(pipeline, X_test, y_test)
fig, ax = plt.subplots(1, 2, figsize=(10, 4))
roc_ax, pr_ax = ax
plot_roc_curve(pipeline, X_test, y_test, roc_ax)
plot_precision_recall_curve(pipeline, X_test, y_test, pr_ax)


In [None]:
# from src.model_evaluation import plot_calibration
# plot_calibration(pipeline, X_test, y_test, n_bins=5, strategy='uniform')

In [None]:

# forest_importances = pd.Series(rcf.feature_importances_, index=X.columns)

# fig, ax = plt.subplots()
# forest_importances.plot.bar( ax=ax)
# ax.set_title("Feature importances using MDI")
# ax.set_ylabel("Mean decrease in impurity")
# fig.tight_layout()

In [None]:
# Display the feature matrix X as last assigned by the classifier cells.
X