# Retraining Notebook

This notebook is used to retrain the production model. Retraining may be needed for either of the following two reasons:
- Account for drift in the data by retraining the model on the latest data
- Account for any possible changes in the structure of the data (additional columns and changes to column types)  

In [1]:
# Import dependencies
import numpy as np
import pandas as pd
import scipy.stats
import shap
import matplotlib.pyplot as plt
from econml.dml import LinearDML
import sklearn
from sklearn.base import BaseEstimator, clone, TransformerMixin, ClassifierMixin
from sklearn.pipeline import Pipeline
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV, cross_val_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, MaxAbsScaler
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, make_scorer, log_loss
import joblib
from dateutil.relativedelta import relativedelta

### Define all required classes and functions

In [3]:
class CombinedModel(BaseEstimator, ClassifierMixin):
    """Estimator that chains two preprocessing stages and a model over two raw datasets.

    ``pipeline_1`` is called with the tuple ``(X1, X2, id_cols)``; its output
    is passed to ``pipeline_2`` together with ``feature_types``. When fitting,
    the output of the first stage is restricted to the trailing ``n_months``
    window (via ``most_recent_data``) before the second stage runs.

    Parameters
    ----------
    pipeline_1 : object exposing ``transform((X1, X2, id_cols))``
    pipeline_2 : object exposing ``transform(data, feature_types)``
    feature_types : forwarded unchanged to ``pipeline_2.transform``
    id_cols : forwarded unchanged to ``pipeline_1.transform``
    model : estimator exposing ``fit``, ``predict`` and ``predict_proba``
    n_months : size of the training window passed to ``most_recent_data``
    target_name : name of the target column in the transformed data
    """

    def __init__(self, pipeline_1, pipeline_2, feature_types, id_cols, model, n_months, target_name):
        self.pipeline_1 = pipeline_1
        self.pipeline_2 = pipeline_2
        self.feature_types = feature_types
        self.id_cols = id_cols
        self.model = model
        self.n_months = n_months
        self.target_name = target_name
        # Holds the output of the latest _transform_data call.
        self.tf_data = None

    def fit(self, X1, X2):
        """Fit the wrapped model on the transformed training window; return ``self``."""
        training_frame = self._transform_data(X1, X2)
        features = training_frame.drop([self.target_name], axis=1)
        labels = training_frame[self.target_name]
        self.model.fit(features, labels)
        return self

    def _transform_data(self, X1, X2, prediction=False):
        """Run both preprocessing stages and remember the result in ``tf_data``.

        With ``prediction=False`` the first-stage output is limited to the
        most recent ``n_months`` and the target column is kept. With
        ``prediction=True`` no window is applied and the target column is
        dropped when it is present.
        """
        stage_one = self.pipeline_1.transform((X1, X2, self.id_cols))

        # Only training is restricted to the recent window.
        stage_two_input = stage_one if prediction else most_recent_data(stage_one, self.n_months)

        processed = self.pipeline_2.transform(stage_two_input, self.feature_types)

        # The model must not see the target at prediction time.
        if prediction and self.target_name in processed.columns:
            processed = processed.drop([self.target_name], axis=1)

        self.tf_data = processed
        return processed

    def predict(self, X1, X2):
        """Return the wrapped model's predictions for the transformed inputs."""
        features = self._transform_data(X1, X2, prediction=True)
        return self.model.predict(features)

    def predict_proba(self, X1, X2):
        """Return the wrapped model's ``predict_proba`` output for the transformed inputs."""
        features = self._transform_data(X1, X2, prediction=True)
        return self.model.predict_proba(features)


In [5]:
class MergeDatasets(BaseEstimator, TransformerMixin):
    """Transformer that joins two DataFrames and removes the id columns.

    Parameters
    ----------
    how : join type forwarded to the pandas merge (default ``'left'``)
    on : join key(s) forwarded to the pandas merge (default ``None``)
    """

    def __init__(self, how='left', on=None):
        self.how = how
        self.on = on

    def fit(self, X, y=None):
        """Nothing to learn; return ``self``."""
        return self

    def transform(self, X):
        """Merge the two datasets packed in ``X`` and drop the id columns.

        ``X`` is unpacked as ``(dataset1, dataset2, id_cols)``. The
        ``MONTH_KEY`` column is removed from ``dataset2`` before the merge,
        and ``id_cols`` are removed from the merged result.
        """
        left, right, id_cols = X
        right_without_month = right.drop(columns=['MONTH_KEY'])
        joined = left.merge(right_without_month, how=self.how, on=self.on)
        return joined.drop(columns=id_cols)

# Custom transformer to sort by date
class SortByDate(BaseEstimator, TransformerMixin):
    """Transformer that parses a date column and orders the rows by it.

    Parameters
    ----------
    date_column : name of the column to parse and sort on (default ``'date'``)
    """

    def __init__(self, date_column='date'):
        self.date_column = date_column

    def fit(self, X, y=None):
        """Nothing to learn; return ``self``."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` sorted by ``date_column`` with a fresh index.

        The date column is converted with ``pd.to_datetime`` first; the input
        frame itself is left untouched.
        """
        column = self.date_column
        frame = X.copy()
        frame[column] = pd.to_datetime(frame[column])
        ordered = frame.sort_values(by=column)
        return ordered.reset_index(drop=True)

In [7]:
# Custom transformer to preprocess the date column
class DatePreprocessor(BaseEstimator, TransformerMixin):
    """Transformer that replaces datetime columns with year and month features."""

    def __init__(self):
        self.value = 0

    def fit(self, X, y=None):
        """Nothing to learn; return ``self``."""
        return self

    def transform(self, X, feature_types):
        """Return a copy of ``X`` with each datetime column expanded and removed.

        The columns listed under ``feature_types["datetime_features"]`` are
        parsed with ``pd.to_datetime``. For each such column ``c`` the columns
        ``year_c`` and ``month_c`` are added, and ``c`` itself is dropped.
        """
        frame = X.copy()
        date_columns = feature_types["datetime_features"]
        frame[date_columns] = frame[date_columns].apply(pd.to_datetime)

        # Add the derived features in the same order as the listed columns.
        for name in date_columns:
            parsed = frame[name]
            frame['year_' + name] = parsed.dt.year
            frame['month_' + name] = parsed.dt.month

        return frame.drop(columns=date_columns)

In [20]:
def most_recent_data(df_, n_months, date_column='MONTH_KEY'):
    """Return the rows of ``df_`` that fall in the trailing ``n_months`` window.

    The window ends at the latest date found in ``date_column`` and starts
    ``n_months`` calendar months earlier; the start itself is excluded, so
    with monthly keys exactly ``n_months`` distinct months are kept.

    Parameters
    ----------
    df_ : pandas.DataFrame
        Input data. It is not modified; a copy is filtered and returned.
    n_months : int
        Number of calendar months to look back from the latest date.
    date_column : str, default 'MONTH_KEY'
        Column holding the dates. It is converted with ``pd.to_datetime``
        in the returned frame.

    Returns
    -------
    pandas.DataFrame
        The filtered copy. Empty when ``df_`` has no rows or no valid dates.
    """
    recent = df_.copy()
    recent[date_column] = pd.to_datetime(recent[date_column])

    latest = recent[date_column].max()
    # No rows or only missing dates: there is no window to anchor on.
    if pd.isna(latest):
        return recent.iloc[0:0]

    # Calendar-aware month subtraction (handles month-end dates).
    window_start = latest - pd.DateOffset(months=n_months)
    return recent[recent[date_column] > window_start]

### Load model

In [9]:
# INSTRUCTIONS ------------------------------
# Specify correct location of saved model
# -------------------------------------------
# NOTE: joblib.load unpickles the file, which can execute arbitrary code.
# Only load model files that come from a trusted source.
# NOTE(review): the custom classes defined earlier in this notebook presumably
# have to be defined before this cell runs so the saved object can be rebuilt - confirm.
loaded_model = joblib.load("ml_depl_object.pkl")

In [13]:
# Check the id columns stored on the loaded model
loaded_model.id_cols

['POL_NUMBER']

In [10]:
# Check the feature types stored on the loaded model
loaded_model.feature_types

{'datetime_features': ['MONTH_KEY'],
 'categorical_features': ['GENDER', 'HOME_LANGUAGE'],
 'text_features': ['GENERAL_FEEDBACK'],
 'numeric_features': ['HOW_LIKELY_ARE_YOU_TO_RECOMMEND_THE_PRODUCT',
  'AGE',
  'CLAIM_COUNT',
  'DOCUMENT_PAGES_FILLED',
  'EXCESS_AMOUNT_CHOSEN',
  'NEWSLETTER_EMAIL_COUNT',
  'WEBSITE_VISITS',
  'ECONOMY_HEALTH_INDICATOR']}

In [None]:
# INSTRUCTIONS -------------------------------------------------------
# Change feature types if the data has new or different columns

# loaded_model.id_cols = []
# loaded_model.feature_types = {}
# -------------------------------------------------------

### Retrain model on latest data

In [14]:
# Load the customer survey and lapse data from CSV files

# INSTRUCTIONS: ---------
# Replace the paths below so they point to the correct CSV files
# -----------------------

path_customer_survey = './data/customer_survey.csv'
path_lapse_data = './data/lapse.csv'

customer_survey_data = pd.read_csv(path_customer_survey)
lapse_data = pd.read_csv(path_lapse_data)

In [21]:
# Retrain the model on the latest data
loaded_model.fit(customer_survey_data,lapse_data)

In [24]:
# Validate if the model performs well on the training data
# NOTE: this is an in-sample score on the data just used for fitting, so it
# does not measure how well the model generalises.
# NOTE(review): the labels are taken in lapse_data's row order while the
# predictions come from the transformed data; confirm both have the same
# rows in the same order, otherwise the score is not meaningful.
log_loss(lapse_data['LAPSE_IN_12M'], loaded_model.predict_proba(customer_survey_data, lapse_data))

0.0955628408575046

### Additional Steps

In [None]:
# INSTRUCTIONS ---------
# Add any additional steps here such as:
# Investigating lapse drivers
# etc.
# ----------------------