In [2]:
from utils.all import *

# Dataframe creation

Preliminary code to download, clean, and split the demo dataframe.

In [3]:
# Source of the demo data. `download_data` is not defined in this notebook —
# presumably it comes from the `utils.all` star import; TODO confirm.
# NOTE(review): project and table names are hard-coded here.
PROJECT = "uw-data-warehouse-prod"
TABLE = "partner_position_master_data_v5"
raw_data = download_data(PROJECT, table=TABLE)

Downloading:   0%|          | 0/190711 [00:00<?, ?rows/s]

In [4]:
# Three groups of column names: dates, continuous, categorical.
# `dates_conts_cats_vars` is not defined in this notebook — presumably patched
# onto DataFrame by `utils.all`; TODO confirm how it classifies columns.
dates, conts, cats = raw_data.dates_conts_cats_vars()

In [5]:
# Columns removed from the modelling frame (passed to `drop_cols` when `prepped_data` is built).
to_drop = ['fold', 'is_live', 'is_live_plus_3m', 'partner_position_id']

In [6]:
# Columns handed to `cast_types(extra_columns=...)` in the preprocessing cell.
to_numeric_features = [
    'avg_monthly_amount_paid_last_1y',
    'avg_monthly_ri_earned_last_1y',
    'avg_monthly_ldb_paid_last_1y',
    'avg_monthly_supporting_bonus_paid_last_1y_pre_oct20',
    'avg_monthly_supporting_bonus_paid_last_1y_post_oct20',
]

In [7]:
# Feature names grouped as "actionable".
# NOTE(review): neither list is referenced again in the visible part of this
# notebook — confirm where (or whether) they are consumed.
actionables = ['training_completed_count_last_1y', 'supported_sign_ups_in_first_45_days',
              'remote_jtc_completion_pcnt_total', 'remote_jtc_completion_pcnt_last_1y',
              'remote_jtc_completion_pcnt_last_3m', 'remote_jtc_completion_pcnt_last_1m',
              'f2f_jtc_completion_pcnt_total', 'f2f_jtc_completion_pcnt_last_1y', 'days_logged_into_portal_last_3m',
              'jtc_applications_last_3m', 'started_at_month', 'pcnt_customers_gathered_cancelled_last_3m']

# Second group: `*_unique_days_viewed_last_{6m,3m,1m}` counters per content area,
# followed by the learning-plan counters.
new_actionables = [
    'team_building_unique_days_viewed_last_6m',
    'team_building_unique_days_viewed_last_3m',
    'team_building_unique_days_viewed_last_1m',
    'customers_unique_days_viewed_last_6m',
    'customers_unique_days_viewed_last_3m',
    'customers_unique_days_viewed_last_1m',
    'training_unique_days_viewed_last_6m',
    'training_unique_days_viewed_last_3m',
    'training_unique_days_viewed_last_1m',
    'prospects_unique_days_viewed_last_6m',
    'prospects_unique_days_viewed_last_3m',
    'prospects_unique_days_viewed_last_1m',
    'incentives_unique_days_viewed_last_6m',
    'incentives_unique_days_viewed_last_3m',
    'incentives_unique_days_viewed_last_1m',
    'articles_unique_days_viewed_last_6m',
    'articles_unique_days_viewed_last_3m',
    'articles_unique_days_viewed_last_1m',
    'uw_engagement_unique_days_viewed_last_6m',
    'uw_engagement_unique_days_viewed_last_3m',
    'uw_engagement_unique_days_viewed_last_1m',
    'learning_plans_started_last_1y',
    'learning_plans_started_last_3m',
    'learning_plans_started_last_1m',
    'learning_plans_completed_last_1y',
    'learning_plans_completed_last_3m',
    'learning_plans_completed_last_1m',
    'days_since_last_learning_plan_interaction',    
]

In [8]:
# Presumably the columns to treat as ordinal (going by the name) — TODO confirm;
# `ords` is not referenced again in the visible part of this notebook.
ords = ['title', 'status', 'times_paid_commission_last_6m', 
        'times_paid_commission_last_1y', 'times_clawback_paid_last_3m']

In [9]:
# Candidate target columns. Each one is binarised (`> 0`) in the preprocessing
# cell; all but `target` are dropped when `prepped_data` is built.
targets = ['pcnt_customers_double_gold_total_plus_3m',
           'pcnt_customers_homeowners_total_plus_3m',
           'customers_gathered_total_plus_3m_delta',
           'partners_recruited_total_plus_3m_delta']


# The single target actually modelled in this notebook.
target = 'customers_gathered_total_plus_3m_delta'

In [10]:
# Cohort mask: both liveness flags set and 274 <= elapsed_days_since_joined < 679.
cohort_filter = (
    raw_data.is_live
    & raw_data.is_live_plus_3m
    & (raw_data.elapsed_days_since_joined >= 274)
    & (raw_data.elapsed_days_since_joined < 679)
)
# Activity mask: the two `*_last_1y` counters sum to a positive number.
activity_filter = (
    (raw_data.customers_gathered_last_1y + raw_data.partners_recruited_last_1y) > 0
)

In [11]:
# Validation flag: rows whose `fold` equals 3 (per the original note, about 20% of the partners).
# This adds a column to `raw_data` itself.
raw_data['is_valid'] = raw_data.fold == 3
# Keep rows passing both masks, renumber the index from 0, and copy so later
# edits do not write back into `raw_data`.
cohort_data = raw_data.loc[cohort_filter & activity_filter].reset_index(drop=True).copy()

def split_df(df, test_random_state=None):
    """Split `df` on its `is_valid` column.

    Rows with `is_valid == 0` become the training frame and rows with
    `is_valid == 1` the validation frame; `is_valid` is removed from both and
    their indexes are reset.

    If `test_random_state` is given, the validation rows are shuffled with that
    seed and cut in two: the first `len(valid) // 2` rows stay as validation,
    the remainder becomes a test frame.

    Returns `(train, valid)` or `(train, valid, test)`.
    """
    def _subset(flag):
        # `drop_cols` is a project helper, not a stock pandas method.
        picked = df[df.is_valid == flag]
        return picked.drop_cols('is_valid', inplace=False).reset_index(drop=True)

    train = _subset(0)
    valid = _subset(1)
    if test_random_state is None:
        return train, valid

    half = len(valid) // 2
    shuffled = valid.sample(frac=1., random_state=test_random_state)
    valid = shuffled.iloc[:half].reset_index(drop=True)
    test = shuffled.iloc[half:].reset_index(drop=True)
    return train, valid, test

In [12]:
import re

@patch
def add_datepart(dataframe: pd.DataFrame, date_fields, prefix=None, drop=True, time=False):
    """Return a copy of `dataframe` with date-part columns added for each column in `date_fields`.

    For every date column this adds `<prefix>Week`, `<prefix>Year`, `<prefix>Month`,
    `<prefix>Day`, `<prefix>Dayofweek`, `<prefix>Dayofyear`, the six
    `Is_{month,quarter,year}_{end,start}` flags and `<prefix>Elapsed` (the value
    cast to int64 and floor-divided by 10**9). With `time=True`, `Hour`,
    `Minute` and `Second` are added as well.

    Parameters
    ----------
    date_fields : iterable of column names, or a single column name.
    prefix : prefix for the new columns. When None, each field uses its own
        name with a trailing `date`/`Date` stripped. An explicit prefix is
        applied to every field, so with several fields later ones overwrite
        earlier ones.
    drop : drop the source date column once its parts are extracted.
    time : also extract hour, minute and second.
    """
    df = dataframe.copy()
    # Accept a bare column name; iterating a str would walk its characters.
    if isinstance(date_fields, str): date_fields = [date_fields]
    # The attribute list does not depend on the field, so build it once.
    attr = ['Year', 'Month', 'Day', 'Dayofweek', 'Dayofyear', 'Is_month_end', 'Is_month_start',
            'Is_quarter_end', 'Is_quarter_start', 'Is_year_end', 'Is_year_start']
    if time: attr = attr + ['Hour', 'Minute', 'Second']
    for field_name in date_fields:
        field = df[field_name]
        # Bug fix: derive the prefix per field. Rebinding `prefix` itself made
        # every field after the first reuse the first field's prefix and
        # overwrite its columns.
        field_prefix = re.sub('[Dd]ate$', '', field_name) if prefix is None else prefix
        # `.dt.week` no longer exists; the ISO week comes from isocalendar().
        df[field_prefix + 'Week'] = field.dt.isocalendar().week
        for n in attr: df[field_prefix + n] = getattr(field.dt, n.lower())
        df[field_prefix + 'Elapsed'] = field.astype(np.int64) // 10 ** 9
        if drop: df.drop(field_name, axis=1, inplace=True)
    return df

In [13]:
# Done on the whole cohort, before splitting into train/valid/test:
# expand the date columns into date-part features, then cast column types
# (`cast_types` is a project helper; `inplace=False` is passed explicitly).
preprocessed_data = cohort_data.add_datepart(dates).cast_types(extra_columns=to_numeric_features, inplace=False)
# Binarise every candidate target: True when the value is strictly positive.
for t in targets: preprocessed_data[t] = preprocessed_data[t] > 0

# Recompute the continuous/categorical column lists on the preprocessed frame;
# the date group is discarded.
_, conts, cats = preprocessed_data.dates_conts_cats_vars()

In [14]:
@patch
def inpute_continuous_variables(df: pd.DataFrame, cont_vars):
    """Fill missing values in `cont_vars` with medians computed on non-validation rows.

    Medians are taken over rows where `df.is_valid` is False, so validation rows
    do not influence the imputed value. Columns of `df` are overwritten in place.
    For integer-dtype columns the median is cast to int before filling (which
    truncates a fractional median); otherwise it is cast to float.

    Returns the Series of medians (indexed by column name) so the same values
    can be reused elsewhere.
    """
    inputed_values = df.loc[~df.is_valid, cont_vars].median()
    for c in cont_vars:
        fill_value = inputed_values[c].astype(int if is_integer_dtype(df[c].dtype) else float)
        # Assign back instead of `df[c].fillna(..., inplace=True)`: that form is
        # chained assignment on a column selection, which warns in pandas 2.x
        # and does not update `df` under Copy-on-Write.
        df[c] = df[c].fillna(fill_value)
    return inputed_values

@patch
def numericalise_categorical_variables(df: pd.DataFrame, cat_vars):
    """Replace non-boolean columns in `cat_vars` with integer category codes.

    The category levels of each column are taken from rows where `df.is_valid`
    is False and sorted; the full column is then encoded against those levels,
    so a value absent from them gets code -1. Boolean columns are left
    untouched. `df` is modified in place.

    Returns a dict mapping column name -> sorted list of levels.
    """
    train_levels = df.loc[~df.is_valid, cat_vars].copy().astype('category')
    cat_dict = {}
    for name in cat_vars:
        if df[name].dtype == bool:
            continue
        cat_dict[name] = sorted(train_levels[name].cat.categories)
    for name, levels in cat_dict.items():
        df[name] = pd.Categorical(df[name], categories=levels, ordered=True)
        df[name] = df[name].cat.codes
    return cat_dict
        


In [15]:
# Both calls modify `preprocessed_data` in place and return what was learnt
# from the non-validation rows: the medians used for filling, and the
# category levels per encoded column.
inputed_vals = preprocessed_data.inpute_continuous_variables(conts)
categories_dict = preprocessed_data.numericalise_categorical_variables(cats)

In [17]:
# Drop the bookkeeping columns and every candidate target except `target`.
prepped_data = preprocessed_data.drop_cols(to_drop + [t for t in targets if t != target])
# Three-way variant (train/valid/test), kept for reference:
# train, valid, test = split_df(prepped_data, test_random_state=92)
# With `test_random_state=None`, `split_df` returns only train and valid.
train, valid  = split_df(prepped_data, test_random_state=None)

In [18]:
def rebalance(df, target, ratio=3, random_state=None):
    """Down-sample `df` to at most `ratio` negative rows per positive row, then shuffle.

    Assumes `target` is boolean or 0/1, so that its sum is the number of
    positives. Rows are sorted by `target` (positives last) and the last
    `n_pos * (ratio + 1)` rows are kept: every positive plus up to
    `n_pos * ratio` negatives. The negatives kept are whichever end up last
    after sorting, not a random draw.

    Parameters
    ----------
    df : DataFrame containing the `target` column.
    target : name of the boolean / 0-1 target column.
    ratio : number of negatives to keep per positive.
    random_state : seed forwarded to `DataFrame.sample` for the final shuffle.
        The default None keeps the original unseeded behaviour.

    Returns a new shuffled DataFrame with a fresh 0..n-1 index.
    """
    # `tail` needs a plain integer; the sum may come back as a numpy or float scalar.
    n_pos = int(df[target].sum())
    result = df.sort_values(target).tail(n_pos * (ratio + 1)).copy()
    return result.sample(frac=1., random_state=random_state).reset_index(drop=True)

def split_target(df, target):
    """Return `(features, labels)`: `df` without the `target` column, and the `target` column itself."""
    features = df.drop(target, axis=1)
    labels = df[target]
    return features, labels

In [19]:
# Only the training frame is rebalanced (default ratio: 3 negatives per positive).
balanced_train = rebalance(train, target)

In [104]:
# Features and labels: training uses the rebalanced frame, validation the unmodified one.
X_train, y_train = split_target(balanced_train, target)
X_valid, y_valid = split_target(valid, target) 

## Preprocessing: normalising variables.

This is not really needed for tree-based models, but for the sake of demonstration, this is how to do it.

In [105]:
def normalise_column(df, col, mn=None, std=None):
    """Standardise `df[col]` in place as `(value - mn) / std`.

    When `mn` or `std` is None it is computed from `df[col]` itself. Pass the
    training statistics explicitly to apply the same scaling to another frame,
    e.g. the validation set.

    Returns the `(mn, std)` pair that was used.
    """
    if mn is None:
        mn = df[col].mean()
    if std is None:
        std = df[col].std()

    centred = df[col] - mn
    df[col] = centred / std

    return mn, std


def normalise_continuous_variables(train, valid, continuous_cols):
    """Standardise the continuous columns of `train` and `valid` in place.

    Only names from `continuous_cols` that are present in `train.columns` are
    processed. Each column's mean and standard deviation are computed on
    `train` and then reused to scale the same column of `valid`.
    Returns None.
    """
    for name in continuous_cols:
        if name not in train.columns:
            continue
        train_mean, train_std = normalise_column(train, name)
        normalise_column(valid, name, train_mean, train_std)
                    
    

In [106]:
X_train['pcnt_customers_double_gold_total'].mean(), X_train['pcnt_customers_double_gold_total'].std()

(0.6256006286492616, 0.3249209964944291)

In [107]:
# Modifies X_train and X_valid in place; X_valid is scaled with the statistics computed on X_train.
normalise_continuous_variables(X_train, X_valid, conts)

In [108]:
X_train['pcnt_customers_double_gold_total'].mean(), X_train['pcnt_customers_double_gold_total'].std()

(-2.9541320370310924e-17, 0.999999999999991)

## Feature selection: volatility control 

A quick and safe way to reduce the number of features: drop zero-variance columns.

In [129]:
len(X_train.columns)

153

In [130]:
def control_volatility_selection(df, var_threshold=0.):
    """Return the names of the columns whose variance is strictly above `var_threshold`.

    Parameters
    ----------
    df : DataFrame of features.
    var_threshold : columns with variance <= this value are left out. A column
        whose variance is NaN (e.g. a single row) is left out too.

    Returns a list of column names in their original order. `df` is not modified.
    """
    variances = df.var()
    # Select from the variance Series' own index, not `df.columns`: the mask
    # then stays aligned even if `var()` returns fewer entries than columns.
    return variances.index[variances > var_threshold].tolist()

In [131]:
# Keep only features with non-zero variance on the training set.
# `feats` is reassigned by later selection cells, so run them in order.
feats = control_volatility_selection(X_train)
len(feats)

146

## Feature selection: Boruta

In [132]:
!pip install -q boruta 

In [133]:
def get_boruta_selector(classifier=True):
    """Build an unfitted `BorutaPy` selector around a shallow random forest.

    Parameters
    ----------
    classifier : if True, wrap a `RandomForestClassifier` with
        `class_weight='balanced'`; otherwise wrap a `RandomForestRegressor`.

    Both forests use `n_jobs=-1` and `max_depth=5`. The selector is created
    with `n_estimators='auto'`, `verbose=2` and `random_state=1`.
    """
    from boruta import BorutaPy
    from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor

    args = dict(n_jobs=-1, max_depth=5)
    if classifier:
        # Bug fix: `class_weight` exists only on the classifier. Passing it to
        # RandomForestRegressor raised TypeError, so `classifier=False` never worked.
        rf = RandomForestClassifier(class_weight='balanced', **args)
    else:
        rf = RandomForestRegressor(**args)

    return BorutaPy(rf, n_estimators='auto', verbose=2, random_state=1)



def boruta_selection(X, y, classifier=True, strict=False):
    """Fit a Boruta selector on `(X, y)` and return the names of the retained columns.

    With `strict=True` only the features in `support_` are returned; otherwise
    those in `support_weak_` are included as well. `X` and `y` are passed to
    the selector as numpy arrays (`.values`).
    """
    boruta = get_boruta_selector(classifier)
    boruta.fit(X.values, y.values)
    if strict:
        keep = boruta.support_
    else:
        keep = boruta.support_ | boruta.support_weak_
    column_names = pd.Series(X.columns)
    return column_names.loc[keep].tolist()

In [134]:
# Run Boruta on the non-zero-variance features only; the default `strict=False`
# also keeps the weakly supported features.
boruta_feats = boruta_selection(X_train.loc[:, feats], y_train)

Iteration: 	1 / 100
Confirmed: 	0
Tentative: 	146
Rejected: 	0
Iteration: 	2 / 100
Confirmed: 	0
Tentative: 	146
Rejected: 	0
Iteration: 	3 / 100
Confirmed: 	0
Tentative: 	146
Rejected: 	0
Iteration: 	4 / 100
Confirmed: 	0
Tentative: 	146
Rejected: 	0
Iteration: 	5 / 100
Confirmed: 	0
Tentative: 	146
Rejected: 	0
Iteration: 	6 / 100
Confirmed: 	0
Tentative: 	146
Rejected: 	0
Iteration: 	7 / 100
Confirmed: 	0
Tentative: 	146
Rejected: 	0
Iteration: 	8 / 100
Confirmed: 	45
Tentative: 	24
Rejected: 	77
Iteration: 	9 / 100
Confirmed: 	45
Tentative: 	24
Rejected: 	77
Iteration: 	10 / 100
Confirmed: 	45
Tentative: 	24
Rejected: 	77
Iteration: 	11 / 100
Confirmed: 	45
Tentative: 	24
Rejected: 	77
Iteration: 	12 / 100
Confirmed: 	45
Tentative: 	19
Rejected: 	82
Iteration: 	13 / 100
Confirmed: 	45
Tentative: 	19
Rejected: 	82
Iteration: 	14 / 100
Confirmed: 	45
Tentative: 	19
Rejected: 	82
Iteration: 	15 / 100
Confirmed: 	45
Tentative: 	19
Rejected: 	82
Iteration: 	16 / 100
Confirmed: 	45
Tenta

In [135]:
# From here on `feats` is the Boruta selection; `boruta_feats` keeps its own
# name because the MRMR section reuses it.
feats = boruta_feats
len(feats)

46

## Categorical features selection: $\chi ^ 2$

In [136]:
def chi_square_selection(X, y, cats, alpha=0.05):
    """Chi-squared test of each categorical feature against `y`.

    Warning! Only for classification tasks.

    Parameters
    ----------
    X : DataFrame of features.
    y : class labels aligned with `X`.
    cats : candidate categorical column names; those missing from `X` are ignored.
    alpha : a feature is flagged `selected` when its p-value is below `alpha`.

    Returns a DataFrame indexed by feature name with columns `chi_stats`,
    `p_val` and `selected`, sorted by ascending p-value. If none of `cats` is
    in `X`, an empty frame with those columns is returned.

    NOTE(review): columns are re-encoded with `.cat.codes`, which yields -1 for
    missing values, and `chi2` rejects negative inputs — confirm there are no
    NaNs in the categorical columns.
    """
    import pandas as pd
    feats = [c for c in cats if c in X.columns]
    if not feats:
        # `chi2` raises on a zero-feature array; return an empty, correctly
        # shaped result instead.
        return pd.DataFrame({'chi_stats': pd.Series(dtype=float),
                             'p_val': pd.Series(dtype=float),
                             'selected': pd.Series(dtype=bool)})
    # Imported lazily so the empty case does not need scikit-learn.
    from sklearn.feature_selection import chi2
    _df = X.loc[:, feats].copy()
    # Encode each variable as integer category codes.
    for col in _df.columns: _df[col] = _df[col].astype('category').cat.codes
    # Distinct names here: the original reused `c` for both the loop variable
    # and the chi2 statistics.
    chi_stats, p_vals = chi2(_df, y)
    df = pd.DataFrame({'chi_stats': chi_stats, 'p_val': p_vals}, index=feats)
    df['selected'] = df['p_val'] < alpha
    return df.sort_values('p_val').copy()

In [137]:
chi_square_selection(X_train.loc[:, feats], y_train, cats)

Unnamed: 0,chi_stats,p_val,selected
is_activated,275.874259,5.952373e-62,True
days_to_get_activated_null_reason,96.060306,1.1143680000000001e-22,True


## Feature selection and multicollinearity reduction: VIF

Once again, this is not really needed for tree-based models, but it is useful otherwise.

In [138]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

def calc_vif(X):
    """Compute the variance inflation factor (VIF) of every column of `X`.

    Returns a DataFrame with columns `variables` (column name) and `VIF`,
    sorted by ascending VIF with a fresh index.

    NOTE(review): no intercept column is added before calling
    `variance_inflation_factor` — confirm this is intended for columns that
    are not centred.
    """
    # Convert to a float array once: the conversion does not depend on the
    # column index, and the original repeated it for every column.
    values = X.astype(float).values

    vif = pd.DataFrame()
    vif["variables"] = X.columns
    vif["VIF"] = [variance_inflation_factor(values, i) for i in range(values.shape[1])]

    return vif.sort_values("VIF").reset_index(drop=True)

def vif_selection(df, vif_threshold=10.):
    """Return the columns of `df` whose VIF is at most `vif_threshold`.

    Names come back in ascending-VIF order (the order produced by `calc_vif`).
    Columns with an infinite or NaN VIF fail the comparison and are left out.
    """
    vif_table = calc_vif(df)
    within_limit = vif_table.VIF <= vif_threshold
    return vif_table.loc[within_limit, 'variables'].tolist()

In [139]:
# Overwrites `feats` with the VIF-filtered subset. Re-running this cell alone
# filters the already-filtered list again; re-run the `feats = boruta_feats`
# cell first to start over.
feats = vif_selection(X_train.loc[:, feats].copy())

  vif = 1. / (1. - r_squared_i)


In [140]:
len(feats)

15

## Feature selection: pairwise correlation

In [141]:
def pairwise_corr_selection(df, threshold=.9):
    """Return column names after dropping one column from each highly correlated pair.

    A column is dropped when its absolute correlation with any column that
    comes later in `df` exceeds `threshold`, so of two correlated columns the
    later one survives. Only the lower triangle of the correlation matrix is
    inspected; the diagonal and upper triangle are forced to pass.
    """
    abs_corr = df.corr().abs()
    # True on and above the diagonal: those cells never disqualify a column.
    upper_or_diag = np.triu(np.ones_like(abs_corr)).astype(bool)
    acceptable = (abs_corr <= threshold) | upper_or_diag
    survives = acceptable.all()
    return acceptable.columns[survives].tolist()


In [142]:
# A deliberately low correlation threshold (0.7), for demo purposes.
# The result goes into `corr_feats`; `feats` itself is not updated here.

corr_feats = pairwise_corr_selection(X_train.loc[:, feats], .7)

In [143]:
len(corr_feats)

8

In [144]:
feats

['jtc_completion_pcnt_total',
 'learning_plans_completed_last_1y',
 'learning_plans_started_last_3m',
 'pcnt_customers_double_gold_last_3m',
 'jtc_completion_pcnt_last_3m',
 'jtc_completion_pcnt_last_1m',
 'group_customer_count',
 'pcnt_customers_gathered_live_last_1m',
 'avg_monthly_ri_earned_last_1y',
 'training_unique_days_viewed_last_3m',
 'avg_monthly_amount_paid_last_1y',
 'training_unique_days_viewed_last_6m',
 'uw_engagement_unique_days_viewed_last_1m',
 'pcnt_customers_gathered_live_last_3m',
 'customers_gathered_last_1m']

## Feature selection with MRMR

This is an implementation of Uber's MRMR (Maximum Relevance, Minimum Redundancy) algorithm (see e.g. https://towardsdatascience.com/mrmr-explained-exactly-how-you-wished-someone-explained-to-you-9cf4ed27458b). The algorithm balances feature importance against correlation with the features already selected, so it can probably be used instead of most of the techniques shown so far. I have had decent success applying Boruta first and then MRMR on datasets with a large number of features.

In [145]:
len(boruta_feats)

46

In [146]:
from sklearn.feature_selection import f_regression
from sklearn.ensemble import RandomForestClassifier

## Number of features to select
K = 15


# Work on a float32 copy of the Boruta-selected training features.
X = X_train.loc[:, boruta_feats].copy().astype('float32')
y = y_train.copy()

# Relevance term. The original version uses linear-regression F-tests to establish column importances:
# F = pd.Series(f_regression(X, y)[0], index = X.columns)

# This implementation uses random forest importances, which make fewer assumptions about the shape of the data.
# NOTE(review): no random_state is passed, so importances (and the selection) can differ between runs.
rf = RandomForestClassifier(n_jobs=-1, class_weight='balanced', max_depth=5)
rf.fit(X, y)
F = pd.Series(rf.feature_importances_, index=X.columns)

# Redundancy term: feature-by-feature matrix initialised to a small floor (1e-5).
# Columns are filled in below, one per selected feature, with absolute Spearman correlations.
corr = pd.DataFrame(.00001, index = X.columns, columns = X.columns)


# `selected` grows by one feature per iteration, `not_selected` shrinks accordingly;
# `scores` records the winning score of each round.
selected = []
not_selected = X.columns.to_list()
scores = []

# repeat K times (or once per feature if there are fewer than K)
for i in range(min(K, len(X.columns))):
  
    # compute (absolute) correlations between the last selected feature and all the (currently) excluded features;
    # values are clipped from below at 1e-5 so the division below never hits zero
    if i > 0:
        last_selected = selected[-1]
        corr.loc[not_selected, last_selected] = X[not_selected].corrwith(X[last_selected], 
                                                                         method='spearman').abs().clip(.00001)
        
    # compute FCQ score for all the (currently) excluded features (this is Formula 2):
    # importance divided by the mean correlation with the features selected so far.
    # On the first pass `selected` is empty, the row mean is NaN, and fillna supplies the 1e-5 floor.
    score = F.loc[not_selected] / corr.loc[not_selected, selected].mean(axis = 1).fillna(.00001)
    
    # find best feature, add it to selected and remove it from not_selected
    best = score.index[score.argmax()]
    scores.append(score.max())
    selected.append(best)
    not_selected.remove(best)

In [148]:
selected

['jtc_applications_last_1m',
 'days_logged_into_portal_last_3m',
 'last_activity_Dayofyear',
 'days_since_last_activity',
 'days_logged_into_portal_last_1m',
 'jtc_applications_last_3m',
 'last_activity_Elapsed',
 'customers_unique_days_viewed_last_3m',
 'articles_unique_days_viewed_last_1m',
 'last_activity_Week',
 'team_building_unique_days_viewed_last_6m',
 'jtc_completion_pcnt_total',
 'articles_unique_days_viewed_last_3m',
 'uw_engagement_unique_days_viewed_last_3m',
 'articles_unique_days_viewed_last_6m']