In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime

def date_parser(s):
    """
    Convert a timestamp written in the log file's layout into a ``datetime``.

    Everything from the first '.' onwards (the fractional seconds) is dropped
    before parsing, e.g. '2015/10/16 11:25:59.000' -> 2015-10-16 11:25:59.
    """
    seconds_part, _, _ = s.partition('.')
    return datetime.datetime.strptime(seconds_part, '%Y/%m/%d %H:%M:%S')

# Load the raw event log, then convert the two timestamp columns in a vectorized way.
# `read_csv(date_parser=...)` is deprecated since pandas 2.0 and parses value by value;
# reading the columns as text and calling `pd.to_datetime` works on old and new pandas.
log = pd.read_csv('data/log.csv')
for timestamp_column in ('startTime', 'completeTime'):
    # Same rule as date_parser: drop everything from the first '.' (fractional seconds).
    without_ms = log[timestamp_column].str.split('.', n=1).str[0]
    log[timestamp_column] = pd.to_datetime(without_ms, format='%Y/%m/%d %H:%M:%S')

In [17]:
# Drop every row that happens after a user's first question, to avoid data leakage.
def eliminate_leakage(df):
    """
    Return a copy of `df` without the rows recorded after a user's first question.

    For each user ('case'), the cut-off is the earliest 'startTime' among the rows
    whose 'event' contains 'Question'. Rows starting strictly later than that
    cut-off are removed; the question row itself and any row sharing its
    timestamp are kept. Users who never asked a question keep all their rows.

    NOTE(review): assumes `df` has the same 'case', 'event' and 'startTime'
    columns as the notebook's log, and that keeping the question row is the
    intended cut-off -- confirm before relying on it.

    The input frame is not modified.
    """
    is_question = df['event'].str.contains('Question', na=False)
    first_question_time = df.loc[is_question].groupby('case')['startTime'].min()

    # Align every row with its own user's cut-off; users without a question get NaT.
    cutoff = df['case'].map(first_question_time)
    keep = cutoff.isna() | (df['startTime'] <= cutoff)
    return df.loc[keep].copy()

In [None]:
# Feature Engineering.
# Extract meaningful features for each user to be used by a predictive model like XGBoost.

def asked_question(group):
    """
    Flag whether a user's records contain at least one question event.

    `group` holds the log rows of a single user. Returns 1 when any value of
    its 'event' column contains the text 'Question', otherwise 0.
    """
    question_hits = group['event'].str.contains('Question').sum()
    return 1 if question_hits > 0 else 0


def customer(customer_id):
    '''
    Return the rows of the global log that belong to one customer,
    i.e. the rows whose 'case' value equals customer_id.
    '''
    belongs_to_customer = log['case'] == customer_id
    return log.loc[belongs_to_customer]

def gender(customer_id):
    """
    Encode the customer's gender as a binary feature.

    Looks at the 'gender' value on the customer's first log row and returns
    1 when it equals 'M', 0 for any other value.
    """
    first_gender = customer(customer_id)['gender'].values[0]
    return 1 if first_gender == 'M' else 0

def age(customer_id):
    """
    Turn the customer's age bracket into a single representative age.

    Reads 'agecategory' from the customer's first log row and looks it up in a
    fixed bracket -> age table. A bracket missing from the table raises KeyError.
    """
    bracket = customer(customer_id)['agecategory'].values[0]
    representative_age = {'18-29': 23, '30-39': 35, '40-49': 45, '50-65': 57}
    return representative_age[bracket]

def num_sessions(customer_id):
    """
    Count the distinct 'sessionid' values found in the given customer's log rows.

    This is the total over the customer's whole history, not a per-day figure.
    """
    session_ids = customer(customer_id)['sessionid']
    distinct_ids = session_ids.unique()
    return len(distinct_ids)

def _build_session_table(events):
    """Collapse the event log to one row per (case, sessionid) with a 'duration' in minutes."""
    # 'first'/'last' take the values in row order.
    # NOTE(review): this assumes the log is chronologically ordered within a session -- confirm.
    sessions = events.groupby(['case', 'sessionid'], as_index=False).agg(
        {'startTime': 'first', 'completeTime': 'last'}
    )
    elapsed = sessions['completeTime'] - sessions['startTime']
    sessions['duration'] = elapsed.dt.total_seconds() / 60
    return sessions


def session_duration(customer_id, mode='mean'):
    """
    Return a summary statistic of the session durations (in minutes) of one customer.

    Parameters
    ----------
    customer_id : value compared against the log's 'case' column.
    mode : one of 'mean', 'median', 'max' or 'min' (default 'mean').

    Returns
    -------
    The requested statistic over the customer's sessions.

    Raises
    ------
    ValueError
        If `mode` is not one of the supported statistics.

    Notes
    -----
    The per-session table is expensive to compute, so it is built once from the
    global `log` on the first call and cached on the function object as
    `session_duration.sessions`. Delete that attribute to force a rebuild
    after `log` changes.
    """
    supported_modes = ('mean', 'median', 'max', 'min')
    if mode not in supported_modes:
        raise ValueError('You enter mode = {}. Only mean, median, max and min are supported'.format(mode))

    # Explicit cache lookup instead of catching AttributeError: a typo in the
    # attribute name can then no longer trigger an endless rebuild.
    sessions = getattr(session_duration, 'sessions', None)
    if sessions is None:
        sessions = _build_session_table(log)
        session_duration.sessions = sessions

    durations = sessions.loc[sessions['case'] == customer_id, 'duration']
    # `mode` was validated above, so it names a Series reduction method.
    return getattr(durations, mode)()
        
# Build the training table: one row per customer, the label first, then the features.
target = log.groupby('case').apply(asked_question)

train = pd.DataFrame({
    'customer_id': target.index,
    'asked_question': target.values,
})

customer_ids = train['customer_id']
train['gender'] = customer_ids.apply(gender)
train['age'] = customer_ids.apply(age)
train['num_sessions'] = customer_ids.apply(num_sessions)

# The session-duration features below stay disabled: these calls are quite slow
# and the cause still has to be found.

#train['mean_session_duration'] = train['customer_id'].apply(session_duration, args=('mean',))
#train['median_session_duration'] = train['customer_id'].apply(session_duration, args=('median',))
#train['max_session_duration'] = train['customer_id'].apply(session_duration, args=('max',))
#train['min_session_duration'] = train['customer_id'].apply(session_duration, args=('min',))

train.head()