In [30]:
import pandas as pd
import numpy as np
from datetime import datetime

In [42]:
# Read the comma-separated `wide_info_others` export into a dataframe.
wide_info_others_path = '/home/mam_jupyter/jupyter_dir/artefact/leads_scoring_model/wide_info_others.csv'
wide_info_others = pd.read_csv(wide_info_others_path, sep=',')

#### Lift Matrix

In [32]:
# transform float data
def round_value(data, col):
    '''
    Round a numeric column, picking the precision from the column maximum, given:
    data (dataframe): input dataframe; its `col` column is overwritten in place
    col (string): column name needed to be processed

    If the column maximum is <= 1 the values are rounded to one decimal,
    otherwise to zero decimals. Missing values stay NaN.
    Returns the rounded column (data[col] after the update).
    '''
    # An empty or all-missing column has a NaN maximum; `NaN <= 1` is False,
    # so such a column takes the zero-decimal branch (there is nothing to round).
    ndigits = 1 if data[col].max() <= 1 else 0

    def _round_or_nan(value):
        # pd.isna recognises NaN, None, NaT and pd.NA. The previous identity
        # check (`value is not np.nan`) only matched the np.nan singleton itself.
        return np.nan if pd.isna(value) else round(value, ndigits)

    data[col] = data[col].map(_round_or_nan)
    return data[col]

# Columns passed through round_value below.
round_list = [
    'd_avg_leads_date',
    'd_leads_dtbt_coincide',
    'd_avg_visit_date',
    'd_avg_fircard_firvisit_diff',
    'd_deal_fail_ppt',
    'd_avg_fir_sec_visit_diff',
    'd_avg_firleads_firvisit_diff',
    'd_leads_dtbt_ppt',
]

for round_col in round_list:
    wide_info_others[round_col] = round_value(wide_info_others, round_col)

In [33]:
# transform datetime data
# Columns that are parsed as datetimes and expanded into weekday / month columns.
datetime_list = [
    'd_fir_leads_time',
    'd_fir_card_time',
    'd_fir_visit_time',
    'd_fir_trail',
    'd_last_reservation_time',
    'd_fir_dealfail_d',
    'd_last_dealfail_d',
    'd_fir_activity_time',
    'c_last_activity_time',
    'c_last_sis_time',
    'c_register_time',
]

def datatime_to_weekday_and_month(data, col):
    '''
    Derive ISO weekday and month columns from a datetime column given:
    data (dataframe): input dataframe; `col` is converted to datetime in place and
                      the columns `<col>_weekday` and `<col>_month` are added to it
    col (string): column name needed to be processed

    Returns the two derived columns as a dataframe.
    '''
    weekday_name, month_name = col + '_weekday', col + '_month'

    # Parse first so that both derived columns are computed from real timestamps.
    data[col] = pd.to_datetime(data[col])
    parsed = data[col]

    # isoweekday(): Monday == 1 ... Sunday == 7
    data[weekday_name] = [stamp.isoweekday() for stamp in parsed]
    data[month_name] = parsed.dt.month

    return data[[weekday_name, month_name]]

# Attach the weekday and month columns for every datetime column.
for dt_col in datetime_list:
    derived_names = [dt_col + '_weekday', dt_col + '_month']
    wide_info_others[derived_names] = datatime_to_weekday_and_month(wide_info_others, dt_col)

In [24]:
def create_list_actions(data, action_col, min_count=100):
    '''
    List the values of a column that occur at least `min_count` times given:
    data (dataframe): input dataframe
    action_col (string): column name needed to be processed
    min_count (int): minimum number of occurrences for a value to be kept
                     (default 100, the threshold previously hard-coded)

    Returns a list of values ordered from most to least frequent. Missing
    values are never returned because value_counts drops them by default.
    '''
    counts = data[action_col].value_counts()
    # The threshold is inclusive: a value seen exactly `min_count` times is kept.
    return counts[counts >= min_count].index.tolist()

In [23]:
# One '<col>_weekday' and one '<col>_month' name per datetime column.
weekday_cols = [name + '_weekday' for name in datetime_list]
month_cols = [name + '_month' for name in datetime_list]

# Explicitly listed columns: the 'd_'-prefixed ones first, then the 'c_'-prefixed ones.
other_cols = [
    'd_fir_sec_leads_diff',
    'd_leads_count',
    'd_leads_dtbt_count',
    'd_leads_dtbt_level_1',
    'd_leads_dtbt_level_2',
    'd_cust_type',
    'd_card_ttl',
    'd_fir_sec_visit_diff',
    'd_visit_dtbt_count',
    'd_visit_ttl',
    'd_avg_visit_dtbt_count',
    'd_followup_ttl',
    'd_trail_book_tll',
    'd_trail_attend_ttl',
    'd_leads_car_model_count',
    'd_leads_car_model_type',
    'd_is_deposit_order',
    'd_dealf_succ_firvisit_diff',
    'd_dealf_succ_lastvisit_diff',
    'd_activity_ttl',
    'd_has_deliver_history',
    'd_trail_count_d30',
    'd_trail_count_d90',
    'd_visit_count_d15',
    'd_visit_count_d30',
    'd_visit_count_d90',
    'd_followup_d7',
    'd_followup_d15',
    'd_followup_d30',
    'd_followup_d60',
    'd_followup_d90',
    'd_activity_count_d15',
    'd_activity_count_d30',
    'd_activity_count_d60',
    'd_activity_count_d90',
    'd_firlead_dealf_diff',
    'd_lastlead_dealf_diff',
    'd_last_dfail_dealf_diff',
    'd_firfollow_dealf_diff',
    'd_lastfollow_dealf_diff',
    'd_dealf_lastvisit_diff',
    'd_dealf_firvisit_diff',
    'd_lasttrail_dealf_diff',
    'd_firleads_firvisit_diff',
    'd_fircard_firvisit_diff',
    'd_fir_dealfail_deal_diff',
    'd_last_activity_dealf_diff',
    'd_fir_activity_dealf_diff',
    'd_fir_order_leads_diff',
    'd_fir_order_visit_diff',
    'd_fir_order_trail_diff',
    'c_sex',
    'c_age',
    'c_city',
    'c_province',
    'c_MG350（海外销售）',
    'c_MGeHS',
    'c_MGGT',
    'c_MG锐腾',
    'c_MG550（海外销售）',
    'c_MGeMGHS',
    'c_MGHS',
    'c_MG360（海外销售）',
    'c_MGRX5（海外销售）',
    'c_MGeMG6',
    'c_MG7',
    'c_MG750（海外销售）',
    'c_MG5',
    'c_MGZS纯电动',
    'c_MGZS',
    'c_MG3',
    'c_MG新MG5（海外销售）',
    'c_MGTF',
    'c_MG6',
    'c_last_reach_platform_MG服务号',
    'c_last_reach_platform_MGAPP',
    'c_last_reach_platform_MG官网',
    'c_vertical_media',
    'c_offcial_online',
    'c_leads_source_nums',
    'c_city_level',
]

# Full list of feature columns, in the same order as before:
# rounded columns, weekday columns, month columns, then the explicit list.
action_col = round_list + weekday_cols + month_cols + other_cols

In [35]:
# Map every feature column to the list of its values returned by create_list_actions
# (the values that occur at least 100 times in that column).
action_dic = {
    feature: create_list_actions(wide_info_others, feature)
    for feature in action_col
}

In [38]:
def compute_lift_matrix(df, action_col, conv_col, action_values=None, id_col="mobile"):
    '''
    Functions that compute a lift matrix given:
    df (dataframe): input dataframe
    action_col (list of string): list of column name of the dataframe containing the actions
    conv_col (string): column name of the dataframe containing the purchase result
    action_values (dict, optional): maps each action column name to the list of
        values to report for it. When omitted, the module-level `action_dic`
        is used, which is what this function always did before.
    id_col (string): column whose distinct values are counted as the
        population (default "mobile", the column previously hard-coded)

    Returns a dataframe indexed by "<action>_<value>" (index name "action") with
    one lift column per conversion outcome (0 and 1) and a
    'Total action population' column.

    For each outcome c:  lift = P(conv == c | action == value) / P(conv == c),
    where every probability is a ratio of distinct `id_col` counts. A lift that
    would require a division by zero is reported as the string "NA".
    '''
    list_conversion = [0, 1]

    if action_values is None:
        # Backward-compatible fallback on the notebook-level dictionary.
        action_values = action_dic

    def count_ids(frame):
        # Population size = number of distinct identifiers in the frame.
        return frame[id_col].unique().shape[0]

    # P(conv == c) and the conversion masks do not depend on the action,
    # so they are computed once instead of once per action value.
    total_population = count_ids(df)
    conv_masks = {}
    prob_conv = {}
    for conversion_item in list_conversion:
        conv_mask = df[conv_col] == conversion_item
        conv_masks[conversion_item] = conv_mask
        if total_population:
            prob_conv[conversion_item] = count_ids(df[conv_mask]) / float(total_population)
        else:
            # Empty population: every lift below is reported as "NA".
            prob_conv[conversion_item] = 0.0

    lift_matrix_to_df = []  # list of rows from which we create the lift matrix
    action_index = []  # future indexes of the lift matrix

    # We iterate over the different action columns we want to see in the lift matrix
    for action in action_col:
        for action_instance in action_values[action]:
            action_mask = df[action] == action_instance
            action_population = count_ids(df[action_mask])

            lift_row = []  # lifts for outcome 0 and 1, then the population
            for conversion_item in list_conversion:
                if action_population == 0 or prob_conv[conversion_item] == 0:
                    # Either ratio would divide by zero.
                    lift_row.append("NA")
                    continue
                both_mask = conv_masks[conversion_item] & action_mask
                prob_cond = count_ids(df[both_mask]) / float(action_population)
                lift_row.append(prob_cond / prob_conv[conversion_item])

            lift_row.append(action_population)  # add total population
            lift_matrix_to_df.append(lift_row)
            action_index.append(str(action) + "_" + str(action_instance))

    # create the lift matrix
    lift_matrix = pd.DataFrame(lift_matrix_to_df, columns=list_conversion + ['Total action population'])
    lift_matrix["action"] = action_index
    lift_matrix = lift_matrix.set_index("action", drop=True)

    return lift_matrix

In [39]:
# Compute the lift of every retained feature value against 'd_deal_flag',
# then write the resulting matrix to CSV.
lift_matrix_others = compute_lift_matrix(wide_info_others, action_col, 'd_deal_flag')
lift_matrix_others.to_csv('lift_matrix_others_3.csv')