## Overview


Predict CTR of an Email Campaign

## Libraries

In [76]:
import pandas as pd
import numpy as np
import scipy as sp
import calendar
from datetime import date,datetime,timedelta
import time
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_selection import RFE
from sklearn.metrics import r2_score, mean_squared_error,get_scorer_names
from rdt import HyperTransformer
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor, VotingRegressor, HistGradientBoostingRegressor
from sklearn.model_selection import RepeatedKFold

from matplotlib import pyplot as plt
import traceback
import eli5
from eli5.sklearn import PermutationImportance
from eli5.xgboost import explain_prediction_xgboost
import logging
from importlib import reload
import logging
import plotly.graph_objects as go
import plotly.express as px
import os
import json
from sklearn.linear_model import LinearRegression, HuberRegressor, QuantileRegressor
import plotly.io as pio
import tensorflow_data_validation as tfdv
from sklearn.preprocessing import StandardScaler, PowerTransformer, QuantileTransformer, Normalizer, RobustScaler
from scipy.stats import zscore
from fitter import Fitter, get_common_distributions, get_distributions
# remove static rendering to allow interactive mode
#pio.renderers.default = "notebook_connected"
pio.renderers.default = "svg"
import warnings
warnings.filterwarnings("ignore")
%matplotlib inline

In [77]:
# setup random state for experimentation
# NOTE(review): this single stateful RandomState instance is handed as
# `random_state` to train_test_split, RepeatedKFold and every model created
# below. Each consumer draws from the same stream, so results should only be
# expected to reproduce when the cells are run top-to-bottom from a fresh kernel.
rng = np.random.RandomState(52)

In [78]:
# set up logging
reload(logging)
logging.basicConfig(format='%(asctime)s %(levelname)s:%(message)s', level=logging.INFO, datefmt='%I:%M:%S')

## Data

### Load data

In [80]:
df = pd.read_csv('train_F3fUq2S.csv', index_col=0)
submission_df = pd.read_csv('test_Bk2wfZ3.csv', index_col=0)

In [81]:
logging.info(f'Full dataset shape : {df.shape}')
logging.info(f'submission dataset shape : {submission_df.shape}')

04:28:03 INFO:Full dataset shape : (1888, 21)
04:28:03 INFO:submission dataset shape : (762, 20)


In [82]:
df.head()

Unnamed: 0_level_0,sender,subject_len,body_len,mean_paragraph_len,day_of_week,is_weekend,times_of_day,category,product,no_of_CTA,...,is_image,is_personalised,is_quote,is_timer,is_emoticons,is_discount,is_price,is_urgency,target_audience,click_rate
campaign_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,3,76,10439,39,5,1,Noon,6,26,3,...,0,0,0,0,0,0,0,0,14,0.103079
2,3,54,2570,256,5,1,Morning,2,11,0,...,0,0,0,0,0,0,0,0,10,0.7
3,3,59,12801,16,5,1,Noon,2,11,3,...,1,0,1,0,0,0,0,0,16,0.002769
4,3,74,11037,30,4,0,Evening,15,9,4,...,0,0,0,0,0,0,0,0,10,0.010868
5,3,80,10011,27,5,1,Noon,6,26,3,...,0,0,1,0,0,0,0,0,14,0.142826


### Distribution comparison between train and test

In [157]:
# Generate training dataset statistics
train_stats = tfdv.generate_statistics_from_dataframe(df)

# Generate test dataset statistics
test_stats = tfdv.generate_statistics_from_dataframe(submission_df)

# Compare training with evaluation
tfdv.visualize_statistics(
    lhs_statistics=train_stats, 
    rhs_statistics=test_stats, 
    lhs_name='TRAIN_DATASET', 
    rhs_name='TEST_DATASET'
)

##### Column with large number of 0's

* is_timer - 100% . Remove the column
* is_price - 99.3%
* is_discount - 96.03%
* is_personalised - 94.03%

### Schema

In [84]:
# Infer schema from the computed statistics.
schema = tfdv.infer_schema(statistics=train_stats)

# Display the inferred schema
tfdv.display_schema(schema)

# Check evaluation data for errors by validating the evaluation dataset statistics using the reference schema
anomalies =  tfdv.validate_statistics(statistics=test_stats, schema=schema)

# Visualize anomalies
tfdv.display_anomalies(anomalies)

Unnamed: 0_level_0,Type,Presence,Valency,Domain
Feature name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
'sender',INT,required,,-
'subject_len',INT,required,,-
'body_len',INT,required,,-
'mean_paragraph_len',INT,required,,-
'day_of_week',INT,required,,-
'is_weekend',INT,required,,-
'times_of_day',STRING,required,,'times_of_day'
'category',INT,required,,-
'product',INT,required,,-
'no_of_CTA',INT,required,,-


Unnamed: 0_level_0,Values
Domain,Unnamed: 1_level_1
'times_of_day',"'Evening', 'Morning', 'Noon'"


Unnamed: 0_level_0,Anomaly short description,Anomaly long description
Feature name,Unnamed: 1_level_1,Unnamed: 2_level_1
'click_rate',Column dropped,Column is completely missing


### Train, validation and test strategy

Since the dataset is small with 1888 datapoints, 

* Use hold out only for testing on unseen data. 
* No holdout dataset for validation, instead cross validation will be used.
* Early stopping cannot be used, since there is no validation dataset
* 80% train and 20% test

In [85]:
# Split the dataset with shuffle
train_df, test_df = train_test_split(df,
                                     test_size=0.2,
                                     shuffle=True,
                                     random_state=rng)
                                     

### Data cleaning

In [86]:
train_df_eda = train_df.copy()

In [87]:
train_df_eda.columns

Index(['sender', 'subject_len', 'body_len', 'mean_paragraph_len',
       'day_of_week', 'is_weekend', 'times_of_day', 'category', 'product',
       'no_of_CTA', 'mean_CTA_len', 'is_image', 'is_personalised', 'is_quote',
       'is_timer', 'is_emoticons', 'is_discount', 'is_price', 'is_urgency',
       'target_audience', 'click_rate'],
      dtype='object')

##### Column with large number of 0's

* is_timer - 100% . Remove the column
* is_price - 99.3%
* is_discount - 96.03%
* is_personalised - 94.03%

In [88]:
# is_timer is 0 for every row, so it carries no information
train_df_eda.drop(columns=['is_timer'],inplace=True)

# price column contains 99.3% 0's with few other price values. Drop the column
train_df_eda.drop(columns=['is_price'],inplace=True)

In [89]:
train_df_eda.nunique()

sender                  12
subject_len            159
body_len              1290
mean_paragraph_len     117
day_of_week              7
is_weekend               2
times_of_day             3
category                16
product                 41
no_of_CTA               36
mean_CTA_len            75
is_image                 6
is_personalised          2
is_quote                 7
is_emoticons             6
is_discount              2
is_urgency               2
target_audience         17
click_rate            1455
dtype: int64

##### The boolean columns below have more than 2 distinct values

* is_image - 6 (likely number of images per email)
* is_quote - 7 (likely number of quotes per email)
* is_emoticons - 6 (likely number of emoticons per email)



In [90]:
print('image value counts')
print(train_df_eda.is_image.value_counts())
print('\nqoute value counts')
print(train_df_eda.is_quote.value_counts())
print('\nemoticons value counts')
print(train_df_eda.is_emoticons.value_counts())

image value counts
0    573
1    558
2    347
3     28
4      2
6      2
Name: is_image, dtype: int64

qoute value counts
0    691
1    521
2    180
3     81
4     21
5     11
6      5
Name: is_quote, dtype: int64

emoticons value counts
0    1300
1     132
2      47
3      27
4       3
6       1
Name: is_emoticons, dtype: int64


Blindly converting the above columns to boolean would lose information. Instead, keep two columns for each of them: the first holds the original count and the second is a boolean flag indicating whether that count is greater than zero.

* num_images, is_image
* num_qoutes, is_quote
* num_emoticons, is_emoticons

In [91]:
train_df_eda['num_images'] = train_df_eda['is_image']
train_df_eda['is_image'] = train_df_eda['num_images'] > 0

train_df_eda['num_qoutes'] = train_df_eda['is_quote']
train_df_eda['is_quote'] = train_df_eda['num_qoutes'] > 0

train_df_eda['num_emoticons'] = train_df_eda['is_emoticons']
train_df_eda['is_emoticons'] = train_df_eda['num_emoticons'] > 0

In [92]:
train_df_eda.columns

Index(['sender', 'subject_len', 'body_len', 'mean_paragraph_len',
       'day_of_week', 'is_weekend', 'times_of_day', 'category', 'product',
       'no_of_CTA', 'mean_CTA_len', 'is_image', 'is_personalised', 'is_quote',
       'is_emoticons', 'is_discount', 'is_urgency', 'target_audience',
       'click_rate', 'num_images', 'num_qoutes', 'num_emoticons'],
      dtype='object')

### Feature engineering

Feature engineering here mainly covers two types of features:

* aggregates: group by key categorical columns and compute descriptive statistics (mean, median, etc.) of a numerical column
    * Ex: `df.groupby(['category', 'product'])['body_len'].mean()` (average body_len per category and product)
* N-way combination: combining numerical features with other numerical features
    * Multiplication of features (f1*f2)
    * ratio features (f1/f2)
    * combine features as polynomial features

#### Key features identified using Recursive Feature Elimination

 
 * categorical features:
    * category
    * product
    * target_audience

* numerical features
    * body_len
    * no_of_CTA
    * mean_paragraph_len
    * mean_CTA_len
    * subject_len 

Calculate descriptive statistics of the numerical features by grouping them by the categorical features.

#### features not added since model is already overfitting

In [93]:
def get_grouped_syntax(grouping_feature_list:list[str])->str:
    """Build the ``groupby(...)`` label used to prefix aggregate column names.

    Embedding the grouping features in the column name keeps aggregates that
    were computed over different groupings from colliding.

    Args:
        grouping_feature_list (list[str]): names of the grouping features

    Returns:
        str: the feature names joined by commas and wrapped as ``groupby(a,b)``
    """
    joined_features = ','.join(grouping_feature_list)
    return 'groupby({})'.format(joined_features)

def get_grouped_aggregates_dataframe(grouping_feature_list:list[str], aggregate_functions:list, num_feature:str, orig_df: pd.DataFrame)->pd.DataFrame:
    """Aggregate one numerical feature within groups of categorical features.

    Conceptually this is
    ``orig_df.groupby(grouping_feature_list)[num_feature].agg(aggregate_functions)``
    with the resulting columns flattened and renamed to
    ``groupby(<features>)_<num_feature>_<aggregate>``.

    Args:
        grouping_feature_list (list[str]): categorical features to group by
        aggregate_functions (list): aggregations to compute, e.g. ['min', 'std', 'max']
        num_feature (str): numerical feature being aggregated
        orig_df (pd.DataFrame): frame that holds all of the columns above

    Returns:
        pd.DataFrame: one row per group; the grouping features are regular
        columns, followed by one column per aggregate function

    Notes:
        overview of the available aggregate functions:
        https://www.shanelynn.ie/summarising-aggregation-and-grouping-data-in-python-pandas/
        https://pandas.pydata.org/docs/user_guide/groupby.html
    """
    # prefix that records which grouping produced the aggregate
    column_prefix = get_grouped_syntax(grouping_feature_list)

    aggregated = orig_df.groupby(grouping_feature_list).agg({num_feature: aggregate_functions})

    # the aggregated columns are (feature, aggregate) pairs; flatten them into plain names
    aggregated.columns = [
        f'{column_prefix}_{num_feature}_{aggregate_name}'
        for _, aggregate_name in aggregated.columns
    ]
    return aggregated.reset_index()
def aggregate_and_merge(orig_df: pd.DataFrame, grouped_feature_list: list[str], aggregate_functions:list, num_feature:str) -> pd.DataFrame:
    """Compute grouped aggregates of a numerical feature and attach them to every row.

    Args:
        orig_df (pd.DataFrame): original dataframe
        grouped_feature_list (list[str]): features used for grouping
        aggregate_functions (list): aggregations to compute, e.g. ['mean', 'std']
        num_feature (str): numerical feature being aggregated

    Returns:
        pd.DataFrame: ``orig_df`` with the aggregate columns appended. Row order
        and index (values and name) are those of ``orig_df``; rows whose grouping
        key is missing keep their original values and get NaN aggregates.
    """
    grouped_df = get_grouped_aggregates_dataframe(grouped_feature_list, aggregate_functions, num_feature, orig_df)

    # A left join keeps every original row in its original order. grouped_df has
    # exactly one row per key, so the join cannot duplicate rows either.
    merged = orig_df.merge(grouped_df, on=grouped_feature_list, how='left')

    # merge() discards the left index; restore it without assuming a particular
    # index name (the previous version only worked for an index named 'campaign_id')
    merged.index = orig_df.index
    return merged

### Data preprocessing

In [94]:
def clean_Data(df):
    """Apply the data-cleaning steps from the EDA section, in sequence.

    Steps:
        1. drop ``is_timer`` (0 for every row)
        2. for ``is_image`` / ``is_quote`` / ``is_emoticons``: keep the original
           count in ``num_images`` / ``num_qoutes`` / ``num_emoticons`` and turn
           the ``is_*`` column into a real boolean flag (count > 0)
        3. add ``NO_of_paragrahs``, the approximate number of paragraphs

    ``is_price`` is not dropped here (unlike in the EDA cell); it is filtered
    out downstream because it is not part of ``selected_features``.

    Args:
        df (pd.DataFrame): raw campaign data; it is not modified

    Returns:
        pd.DataFrame: a new, cleaned dataframe
    """
    # drop is_timer column containing 100% 0's; drop() without inplace returns a
    # new frame, so none of the assignments below touch the caller's dataframe
    df = df.drop(columns=['is_timer'])

    count_columns = (
        ('is_image', 'num_images'),
        ('is_quote', 'num_qoutes'),
        ('is_emoticons', 'num_emoticons'),
    )
    for flag_col, count_col in count_columns:
        df[count_col] = df[flag_col]
        df[flag_col] = df[count_col] > 0

    # number of paragraphs ~= total body length / mean paragraph length
    # (the previous version had the ratio inverted). A zero mean paragraph
    # length yields NaN instead of inf.
    df['NO_of_paragrahs'] = df['body_len'] / df['mean_paragraph_len'].replace(0, np.nan)

    return df

In [95]:
def add_features(df):
    """Feature-engineering hook; currently returns ``df`` unchanged.

    The grouped-aggregate feature below is left disabled: per the notebook's
    notes, extra features were not added because the model already overfits.
    """
    #df = aggregate_and_merge(df, ['category','product'] , ['mean','std'], 'body_len')
    return df

In [96]:
def split_features_target(df, target, drop_features = None, target_available =True):
    """Split a dataframe into the feature matrix ``X`` and the target ``y``.

    Args:
        df (pd.DataFrame): prepared dataframe
        target (str): name of the target column
        drop_features (iterable of str, optional): extra columns to exclude
            from ``X``. Defaults to no extra columns.
        target_available (bool): False for data without a target column
            (e.g. the submission set); ``y`` is then None and ``target`` is
            not dropped.

    Returns:
        tuple: ``(X, y)`` where ``X`` is ``df`` without the dropped columns and
        ``y`` is ``df[target]`` or None
    """
    # copy into a fresh list: avoids a shared mutable default and accepts any iterable
    drop_cols = list(drop_features) if drop_features is not None else []

    if target_available:
        y = df[target]
        drop_cols.append(target)
    else:
        y = None

    X = df.drop(columns=drop_cols)
    return X,y

In [97]:
target = 'click_rate'
drop_features = []

In [98]:
# work on copy of train_df
train_df_cp = train_df.copy()

In [99]:
train_df_cp = clean_Data(train_df_cp)
train_df_cp = add_features(train_df_cp)
X_train,y_train = split_features_target(train_df_cp, target, drop_features)

### Data transformation

Reversible Data Transforms(https://docs.sdv.dev/rdt/) is used for categorical encoding and data transformation

In [100]:
# initialize data transformer
ht = HyperTransformer()

In [101]:
# identify datatypes of given dataframe
ht.detect_initial_config(data=X_train)

Detecting a new config from the data ... SUCCESS
Setting the new config ... SUCCESS
Config:
{
    "sdtypes": {
        "sender": "numerical",
        "subject_len": "numerical",
        "body_len": "numerical",
        "mean_paragraph_len": "numerical",
        "day_of_week": "numerical",
        "is_weekend": "numerical",
        "times_of_day": "categorical",
        "category": "numerical",
        "product": "numerical",
        "no_of_CTA": "numerical",
        "mean_CTA_len": "numerical",
        "is_image": "boolean",
        "is_personalised": "numerical",
        "is_quote": "boolean",
        "is_emoticons": "boolean",
        "is_discount": "numerical",
        "is_price": "numerical",
        "is_urgency": "numerical",
        "target_audience": "numerical",
        "num_images": "numerical",
        "num_qoutes": "numerical",
        "num_emoticons": "numerical",
        "NO_of_paragrahs": "numerical"
    },
    "transformers": {
        "sender": FloatFormatter(missing_va

In [102]:
# Update data types if required
ht.update_sdtypes(column_name_to_sdtype={
  'sender': 'categorical',
  'day_of_week': 'categorical',
  'is_weekend' : 'boolean',
  'category' : 'categorical',
  'product' : 'categorical',
  'is_personalised': 'boolean',
  'is_discount': 'boolean',
  'is_urgency': 'boolean',
  'target_audience': 'categorical',
})

Info: The transformers for these columns may change based on the new sdtype.
Use 'get_config()' to verify the transformers.


In [103]:
# verify updates
ht.get_config()

{
    "sdtypes": {
        "sender": "categorical",
        "subject_len": "numerical",
        "body_len": "numerical",
        "mean_paragraph_len": "numerical",
        "day_of_week": "categorical",
        "is_weekend": "boolean",
        "times_of_day": "categorical",
        "category": "categorical",
        "product": "categorical",
        "no_of_CTA": "numerical",
        "mean_CTA_len": "numerical",
        "is_image": "boolean",
        "is_personalised": "boolean",
        "is_quote": "boolean",
        "is_emoticons": "boolean",
        "is_discount": "boolean",
        "is_price": "numerical",
        "is_urgency": "boolean",
        "target_audience": "categorical",
        "num_images": "numerical",
        "num_qoutes": "numerical",
        "num_emoticons": "numerical",
        "NO_of_paragrahs": "numerical"
    },
    "transformers": {
        "sender": FrequencyEncoder(),
        "subject_len": FloatFormatter(missing_value_replacement='mean'),
        "body_len": Fl

In [104]:
# fit train data to transformer
ht.fit(X_train)

# transform the train data
X_train_transformed = ht.transform(X_train)

In [105]:
selected_features = ['sender.value', 'subject_len.value', 'body_len.value',
       'mean_paragraph_len.value', 'day_of_week.value', 'is_weekend.value',
       'times_of_day.value', 'category.value', 'product.value',
       'no_of_CTA.value', 'mean_CTA_len.value', 'is_personalised.value',
       'is_quote.value', 'target_audience.value', 'num_qoutes.value']

In [106]:
# selected features  only
X_train_transformed = X_train_transformed[selected_features]

### Target transformation


In [107]:
import scipy.stats as stats
import pylab

In [108]:
def diagnostic_plots(df):
    """Plot a histogram and a normal Q-Q plot side by side for one variable.

    Args:
        df: the values to inspect. It must provide ``.hist()`` and be accepted
            by ``stats.probplot`` -- presumably a pandas Series (TODO confirm
            against callers; the parameter name suggests a dataframe).
    """
    
    plt.figure(figsize=(15,6))
    # left panel: histogram, drawn on the currently active subplot
    plt.subplot(1, 2, 1)
    df.hist()

    # right panel: Q-Q plot against the normal distribution
    plt.subplot(1, 2, 2)
    stats.probplot(df, dist="norm", plot=pylab)

    plt.show()

In [109]:
pt = PowerTransformer(method='box-cox')

In [110]:
pt.fit(pd.DataFrame(y_train + 1))

In [111]:
def power_transform(target_df):
    """Apply the fitted Box-Cox ``pt`` transformer to a target.

    The values are shifted by +1 before transforming, matching how ``pt`` was
    fitted (on ``y_train + 1``). The input is not modified.

    Returns:
        pd.Series: transformed values with a fresh default index
    """
    shifted = pd.DataFrame(target_df.copy()) + 1
    transformed = pt.transform(shifted)
    return pd.Series(transformed.squeeze())

In [112]:
def inverse_power_transform(inverse_target_df):
    """Undo ``power_transform``: invert the fitted ``pt`` transformer, then remove the +1 shift.

    The input is not modified.

    Returns:
        pd.Series: values on the original target scale, with a fresh default index
    """
    as_frame = pd.DataFrame(inverse_target_df.copy())
    restored = pt.inverse_transform(as_frame) - 1
    return pd.Series(restored.squeeze())

In [113]:
def square_transform(target_df):
    """Square-root transform of the target (despite the name): ``target_df ** 0.5``.

    A new object is returned; the input is left untouched.
    """
    return target_df.copy() ** 0.5

In [114]:
def inverse_square_transform(inverse_target_df):
    """Inverse of ``square_transform``: squares the values (``x ** 2``).

    A new object is returned; the input is left untouched.
    """
    return inverse_target_df.copy() ** 2

In [115]:
def log_transform(target_df):
    """Log-transform the target as ``log1p(y + 1)``, i.e. ``log(y + 2)``.

    Note the shift is applied twice (once explicitly, once by ``log1p``);
    ``inverse_log_transform`` undoes exactly this. The input is not modified.
    """
    shifted = target_df.copy() + 1
    return np.log1p(shifted)

In [116]:
def inverse_log_transform(inverse_target_df):
    """Inverse of ``log_transform``: ``expm1(x) - 1``, i.e. ``exp(x) - 2``.

    The input is not modified.
    """
    exponentiated = np.expm1(inverse_target_df.copy())
    return exponentiated - 1

### Modelling

In [117]:
def dispatch_ml_algorithm(algorithm_type: str, type_of_target: str):
    """Return the estimator class (not an instance) for an algorithm and target type.

    The backing library is imported lazily, so only the library of the
    requested algorithm has to be installed.

    Args:
        algorithm_type (str): one of 'lgbm', 'cat', 'rf' or 'xgb'
        type_of_target (str): 'continuous' selects the regressor; any other
            value selects the classifier

    Returns:
        type: the regressor or classifier class of the chosen library

    Raises:
        ValueError: if ``algorithm_type`` is not supported
    """
    supported_algorithms = ('lgbm', 'cat', 'rf', 'xgb')
    # validate explicitly: an assert would be stripped under `python -O` and
    # would not tell the caller which values are accepted
    if algorithm_type not in supported_algorithms:
        raise ValueError(
            f"Unsupported algorithm_type {algorithm_type!r}; "
            f"expected one of {supported_algorithms}"
        )

    is_regression = type_of_target == 'continuous'

    if algorithm_type == 'lgbm':
        from lightgbm import LGBMClassifier, LGBMRegressor
        return LGBMRegressor if is_regression else LGBMClassifier
    elif algorithm_type == 'cat':
        from catboost import CatBoostClassifier, CatBoostRegressor
        return CatBoostRegressor if is_regression else CatBoostClassifier
    elif algorithm_type == 'rf':
        from  sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
        return RandomForestRegressor if is_regression else RandomForestClassifier
    else:
        from xgboost import XGBClassifier, XGBRegressor
        return XGBRegressor if is_regression else XGBClassifier

def get_regressor_instance(algorithm_type: str, type_of_target: str, model_params:dict=None):
    """Instantiate the estimator chosen by ``dispatch_ml_algorithm``.

    Args:
        algorithm_type (str): algorithm key understood by ``dispatch_ml_algorithm``
        type_of_target (str): target type understood by ``dispatch_ml_algorithm``
        model_params (dict, optional): keyword arguments for the estimator
            constructor; when None the estimator is built with its defaults

    Returns:
        an unfitted estimator instance
    """
    estimator_cls = dispatch_ml_algorithm(algorithm_type, type_of_target)
    if model_params is None:
        return estimator_cls()
    return estimator_cls(**model_params)




### Cross validation

In [118]:
rkf = RepeatedKFold(n_splits=4,random_state=rng)

In [119]:
def get_cross_validation_score(model, X, y, cv =5, cv_func = None, scoring ='r2', inverse_scale_func = None):
    """Return the mean cross-validated score of ``model`` on ``(X, y)``.

    Args:
        model: estimator passed to ``cross_val_score``
        X: feature matrix
        y: target
        cv: cross-validation setting used when ``cv_func`` is None
        cv_func: cross-validation splitter; takes precedence over ``cv`` when given
        scoring (str): scorer name passed to ``cross_val_score``
        inverse_scale_func: accepted but currently unused

    Returns:
        the mean of the per-fold scores
    """
    splitter = cv_func if cv_func is not None else cv
    fold_scores = cross_val_score(model, X, y, cv=splitter, scoring=scoring)
    return fold_scores.mean()

In [120]:
# Cross-validate XGBoost with the repeated k-fold splitter `rkf`. The target
# passed here is log_transform(y_train), and the value stored is the mean
# score across folds (a single number, despite the plural name).
xgb_cross_val_scores = get_cross_validation_score(get_regressor_instance('xgb', 'continuous',{'random_state':rng}), X_train_transformed, log_transform(y_train), cv=rkf)
print(f'xgb cross validated scores: {xgb_cross_val_scores}')

xgb cross validated scores: 0.4379773778020632


In [121]:
# Cross-validate the random forest with the repeated k-fold splitter `rkf`. The
# target passed here is log_transform(y_train), and the value stored is the
# mean score across folds (a single number, despite the plural name).
rf_cross_val_scores = get_cross_validation_score(get_regressor_instance('rf', 'continuous', {'random_state':rng}), X_train_transformed,log_transform(y_train), cv=rkf)
print(f'random forest cross validated scores: {rf_cross_val_scores}')

random forest cross validated scores: 0.4667617428964574


### Build model

#### Prepare test data without data leakage

In [122]:
def prepare_data_for_model(df, target_available = True):
    """Run a raw dataframe through the full preparation pipeline.

    Pipeline: ``clean_Data`` -> ``add_features`` -> ``split_features_target``
    -> ``ht.transform``. It relies on the notebook-level ``target``,
    ``drop_features`` and the already fitted ``ht``; ``ht`` is only applied
    here, never refitted.

    Args:
        df (pd.DataFrame): raw data with the same columns as the training data
        target_available (bool): False when ``df`` has no target column

    Returns:
        tuple: ``(X_transformed, y)``; ``y`` is None when ``target_available`` is False
    """
    # cleaning followed by feature engineering
    engineered = add_features(clean_Data(df))
    # separate the features from the target
    X, y = split_features_target(engineered, target, drop_features, target_available)
    # encode the features with the fitted transformer
    return ht.transform(X), y

In [123]:
test_df.columns

Index(['sender', 'subject_len', 'body_len', 'mean_paragraph_len',
       'day_of_week', 'is_weekend', 'times_of_day', 'category', 'product',
       'no_of_CTA', 'mean_CTA_len', 'is_image', 'is_personalised', 'is_quote',
       'is_timer', 'is_emoticons', 'is_discount', 'is_price', 'is_urgency',
       'target_audience', 'click_rate'],
      dtype='object')

In [124]:
test_df.shape

(378, 21)

In [125]:
test_df_cp = test_df.copy()

In [126]:
X_test_transformed,y_test = prepare_data_for_model(test_df_cp)

In [127]:
X_test_transformed = X_test_transformed[selected_features]

In [128]:
# xgboost model
xgb_model = get_regressor_instance('xgb', 'continuous', {'random_state':rng})

In [129]:
# random forest model
rf_model = get_regressor_instance('rf', 'continuous', {'random_state':rng})

In [130]:
# NOTE(review): the model is fitted on the raw y_train here, whereas the
# cross-validation cells scored models on log_transform(y_train); the CV scores
# therefore describe a different target scale than this fit.
xgb_model.fit(X_train_transformed, y_train)

In [131]:
# NOTE(review): the model is fitted on the raw y_train here, whereas the
# cross-validation cells scored models on log_transform(y_train); the CV scores
# therefore describe a different target scale than this fit.
rf_model.fit(X_train_transformed, y_train)

### feature selection using recursive feature elimination

In [132]:
# Recursive feature elimination with an XGBoost regressor.
# NOTE(review): X_train_transformed has already been restricted to the 15
# entries of `selected_features`, so asking for 15 features keeps all of them.
xgb_sel_ = RFE(get_regressor_instance('xgb', 'continuous', {'random_state':rng}), n_features_to_select=15)
xgb_sel_.fit(X_train_transformed, y_train)

# read the surviving feature names from the frame the selector was fitted on
# (previously taken from X_test_transformed, which only happens to share the columns)
selected_feat = X_train_transformed.columns[xgb_sel_.get_support()]
print(selected_feat)

Index(['sender.value', 'subject_len.value', 'body_len.value',
       'mean_paragraph_len.value', 'day_of_week.value', 'is_weekend.value',
       'times_of_day.value', 'category.value', 'product.value',
       'no_of_CTA.value', 'mean_CTA_len.value', 'is_personalised.value',
       'is_quote.value', 'target_audience.value', 'num_qoutes.value'],
      dtype='object')


### feature importance

In [133]:
def display_permutation_importance(model, X_test, y_test):
    """Compute and render permutation importances of a fitted model.

    Args:
        model: fitted estimator
        X_test (pd.DataFrame): features to permute; its column names label the output
        y_test: target values matching ``X_test``

    Returns:
        the eli5 weights display for the fitted ``PermutationImportance``
    """
    importance = PermutationImportance(model, random_state=1)
    importance.fit(X_test, y_test)
    feature_names = X_test.columns.tolist()
    return eli5.show_weights(importance, feature_names=feature_names)

In [134]:
display_permutation_importance(xgb_model, X_test_transformed, y_test)

Weight,Feature
0.8346  ± 0.2885,body_len.value
0.3091  ± 0.0946,mean_paragraph_len.value
0.2576  ± 0.1088,no_of_CTA.value
0.1786  ± 0.2834,product.value
0.1031  ± 0.0253,mean_CTA_len.value
0.0871  ± 0.0303,subject_len.value
0.0398  ± 0.0364,target_audience.value
0.0339  ± 0.0259,times_of_day.value
0.0321  ± 0.0349,is_quote.value
0.0251  ± 0.0139,category.value


In [135]:
display_permutation_importance(rf_model, X_test_transformed, y_test)

Weight,Feature
0.4882  ± 0.0959,body_len.value
0.2226  ± 0.0634,mean_paragraph_len.value
0.1928  ± 0.1697,no_of_CTA.value
0.0982  ± 0.0103,mean_CTA_len.value
0.0394  ± 0.0906,product.value
0.0254  ± 0.0330,category.value
0.0233  ± 0.0296,subject_len.value
0.0194  ± 0.0305,target_audience.value
0.0129  ± 0.0134,times_of_day.value
0.0116  ± 0.0178,is_quote.value


### Feature sliced performance analysis

In [136]:
def rewrite_name(df_type, error_type):
    """Build a score key such as ``train_rmse`` from a dataset label and a metric name."""
    return "{}_{}".format(df_type, error_type)

def get_scores(y_actual, y_predicted,df_type='train'):
    """Compute RMSE, MSE and R^2 for a set of predictions.

    Args:
        y_actual: true target values
        y_predicted: predicted target values
        df_type (str): dataset label used as key prefix, e.g. 'train' or 'test'

    Returns:
        dict: ``{'<df_type>_rmse': ..., '<df_type>_mse': ..., '<df_type>_r2_score': ...}``
    """
    # The `squared` keyword of mean_squared_error was deprecated and later
    # removed from scikit-learn, so compute the MSE once and take its square
    # root. For a single-output target this equals the old squared=False result.
    mse = mean_squared_error(y_actual, y_predicted)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_actual, y_predicted)
    scores = {rewrite_name(df_type, 'rmse'):rmse, rewrite_name(df_type, 'mse'):mse , rewrite_name(df_type, 'r2_score'):r2}
    return scores

def dataset_results(X,y, model, df_type):
    """Predict on ``X`` with ``model`` and score the predictions against ``y``.

    Returns:
        dict: the score dictionary produced by ``get_scores``, keyed with ``df_type``
    """
    predictions = model.predict(X)
    return get_scores(y, pd.Series(predictions), df_type)
    

def display_prediction_scores(X_train, y_train, X_test, y_test, model):
    """Print the score dictionaries of ``model`` for the train split, then the test split."""
    splits = (
        (X_train, y_train, 'train'),
        (X_test, y_test, 'test'),
    )
    for features, labels, split_name in splits:
        print(dataset_results(features, labels, model, split_name))
    

In [137]:
def get_feature_sliced_model_performance_per_dataset(features, model, X,y,  df_type, index_col):
    """Score ``model`` on one dataset, optionally sliced by feature values.

    Args:
        features (list): columns of ``X`` to slice by; an empty list scores the
            whole dataset at once
        model: fitted estimator exposing ``predict``
        X (pd.DataFrame): transformed features -- assumed to be indexed by
            ``index_col`` (TODO confirm against callers)
        y: target, looked up by index label for each slice
        df_type (str): dataset label used as score-key prefix (e.g. 'train')
        index_col (str): name of the index column re-established on ``X``
            before predicting

    Returns:
        With ``features``: the result of ``groupby(...).apply`` with one
        ``dataset_results`` dict per slice (callers treat it as a Series).
        Without: a one-row DataFrame (index 0) holding the overall scores.
    """
    if features:
        # one score dict per distinct combination of the slicing features;
        # y is aligned to each slice through the slice's index labels
        return X.groupby(features).apply(lambda x:dataset_results(x.reset_index().set_index(index_col),y.loc[x.index], model, df_type))
    else:
        # no slicing: score the whole dataset and wrap the dict as a single row
        return pd.DataFrame(dataset_results(X.reset_index().set_index(index_col),y, model, df_type),index=[0])

In [138]:
def feature_sliced_performance_analysis_per_dataset(features, model, X,y, time_scale_str, index_col):
    """Tabulate model scores for a single dataset, optionally sliced by features.

    Args:
        features (list): transformed feature columns to slice by; empty for overall scores
        model: fitted estimator
        X (pd.DataFrame): transformed features
        y: target
        time_scale_str (str): label forwarded as ``df_type``, i.e. the prefix
            of the score column names
        index_col (str): name of the index column of ``X``

    Returns:
        pd.DataFrame: with ``features``, one row per slice -- the slicing
        columns first (mapped back to their original values through ``ht``),
        then the score columns; without, the one-row overall score frame.
    """
    
    train_results = get_feature_sliced_model_performance_per_dataset(features, model, X,y, time_scale_str , index_col)
    results = None
    if features:
        # the per-slice dicts are held in a column named 'train' regardless of time_scale_str
        combined_results = train_results.to_frame(name = 'train').reset_index()
        # expand each score dict into one column per metric
        results = pd.json_normalize(combined_results['train'])
        results[features] = combined_results[features]
     
        # move the slicing columns to the front
        results = results[ features + [ col for col in results.columns if col not in features ] ]
        # map encoded feature values back to their original representation
        results[features] = ht.reverse_transform_subset(results[features])
    else:
        results = train_results
    return results

In [139]:
def feature_sliced_performance_analysis_train_test_serve(features, model, X,y, X_test,y_test , index_col):
    """Compare train and test scores of ``model``, optionally sliced by features.

    Args:
        features (list): transformed feature columns to slice by; empty for overall scores
        model: fitted estimator
        X, y: transformed training features and target
        X_test, y_test: transformed test features and target
        index_col (str): name of the index column of ``X`` / ``X_test``

    Returns:
        pd.DataFrame: the slicing columns (mapped back to original values
        through ``ht``), the ``train_*`` and ``test_*`` score columns, and
        ``overfit_train_test`` = ``train_r2_score - test_r2_score``.
    """
    
    train_results = get_feature_sliced_model_performance_per_dataset(features, model, X,y, 'train' , index_col)
    test_results = get_feature_sliced_model_performance_per_dataset(features,model, X_test,y_test, 'test' , index_col)
    results = None
    if features:
        # align train and test score dicts on the slice keys, then expand each dict into columns
        combined_results = train_results.to_frame(name = 'train').join(test_results.to_frame(name='test')).reset_index()
        results = pd.json_normalize(combined_results['train']).join(pd.json_normalize(combined_results['test']))
        results[features] = combined_results[features]
    else:
        # both inputs are one-row frames; join them side by side
        results = train_results.join(test_results)
        
    # move the slicing columns to the front
    results = results[ features + [ col for col in results.columns if col not in features ] ]
    # NOTE(review): this also runs when `features` is empty, unlike the
    # per-dataset variant, which only reverse-transforms inside `if features`
    results[features] = ht.reverse_transform_subset(results[features])
    # gap between train and test R^2; larger values mean stronger overfitting
    results['overfit_train_test'] = results['train_r2_score'] - results['test_r2_score']
    return results

#### Overall results

In [140]:
model_results = feature_sliced_performance_analysis_train_test_serve([], rf_model, X_train_transformed,y_train, X_test_transformed, y_test, index_col='campaign_id')
model_results

Unnamed: 0,train_rmse,train_mse,train_r2_score,test_rmse,test_mse,test_r2_score,overfit_train_test
0,0.023286,0.000542,0.92782,0.057097,0.00326,0.395578,0.532242


In [141]:
model_results = feature_sliced_performance_analysis_train_test_serve([], xgb_model, X_train_transformed,y_train, X_test_transformed, y_test, index_col='campaign_id')
model_results

Unnamed: 0,train_rmse,train_mse,train_r2_score,test_rmse,test_mse,test_r2_score,overfit_train_test
0,0.006636,4.4e-05,0.994138,0.052247,0.00273,0.493897,0.500241


#### Model performance by time of day

In [142]:
train_df.columns

Index(['sender', 'subject_len', 'body_len', 'mean_paragraph_len',
       'day_of_week', 'is_weekend', 'times_of_day', 'category', 'product',
       'no_of_CTA', 'mean_CTA_len', 'is_image', 'is_personalised', 'is_quote',
       'is_timer', 'is_emoticons', 'is_discount', 'is_price', 'is_urgency',
       'target_audience', 'click_rate'],
      dtype='object')

In [143]:
model_results = feature_sliced_performance_analysis_train_test_serve(['times_of_day.value'], xgb_model, X_train_transformed,y_train, X_test_transformed, y_test, index_col='campaign_id')

In [144]:
model_results.sort_values(by='test_r2_score',ascending=False)

Unnamed: 0,times_of_day.value,train_rmse,train_mse,train_r2_score,test_rmse,test_mse,test_r2_score,overfit_train_test
2,Morning,0.004784,2.3e-05,0.998179,0.074067,0.005486,0.72695,0.271228
1,Noon,0.007409,5.5e-05,0.995453,0.055913,0.003126,0.428339,0.567115
0,Evening,0.006506,4.2e-05,0.991937,0.048683,0.00237,0.256328,0.735609


### Final model trained on the full dataset

In [145]:
df_cp = df.copy()

In [146]:
# Shuffle the full dataset before preparing it. The shuffle is seeded with the
# notebook's `rng` so that it is reproducible like the other random steps.
# NOTE(review): `ht` is only applied here; it remains fitted on the training split.
X_full_transformed,y_full = prepare_data_for_model(df_cp.sample(frac=1.0, random_state=rng))

In [147]:
X_full_transformed = X_full_transformed[selected_features]

In [148]:
xgb_full_model = get_regressor_instance('xgb', 'continuous', {'random_state':rng})

In [149]:
xgb_full_model.fit(X_full_transformed, y_full)

### Predictions

In [151]:
submission_df_cp = submission_df.copy()

In [152]:
# prepare submission_df for predictions
X_submit_transformed,_ = prepare_data_for_model(submission_df_cp, target_available=False)

In [153]:
X_submit_transformed = X_submit_transformed[selected_features]

In [154]:
predictions = xgb_full_model.predict(X_submit_transformed)

#### Submission

In [155]:
# Storing to Submission DataFrame


submission = pd.DataFrame({
                  "campaign_id": X_submit_transformed.index,
                  "click_rate": predictions})
submission.set_index('campaign_id',inplace=True)
submission.head()

Unnamed: 0_level_0,click_rate
campaign_id,Unnamed: 1_level_1
1889,0.057427
1890,0.69894
1891,0.185138
1892,0.187892
1893,0.165329


In [156]:
submission.to_csv('xgb_submission.csv')