# Import packages and load the original log of UWV

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
import random

def date_parser(s):
    """
    Convert a timestamp string from the log file into a ``datetime.datetime``.

    The log writes timestamps such as '2015/10/16 11:25:59.000'. Everything from the
    first '.' onwards (the fractional seconds) is discarded before parsing, so the
    result has whole-second resolution.
    """
    whole_seconds, _, _ = s.partition('.')
    return datetime.datetime.strptime(whole_seconds, '%Y/%m/%d %H:%M:%S')

# Load the raw event log; the 'startTime' and 'completeTime' columns are converted
# with date_parser (defined above) while the CSV is read.
# NOTE(review): `log` is rebound further down by the pd.concat of happy_log and
# sad_log_1Q, so this frame appears to serve only the preview below — confirm.
log = pd.read_csv('data/log.csv', parse_dates=['startTime', 'completeTime'], date_parser=date_parser)

# Events without an event type are pretty useless so we might as well drop them
log.dropna(axis=0, subset=['event'], inplace=True)
# Last expression of the cell: shows the first rows of the cleaned log.
log.head()

In [None]:
# Load the two separate session logs. Both files are decoded as ISO-8859-1.
# No parse_dates is requested here, so the timestamp columns keep whatever dtype
# read_csv infers for them.
sad_log_1Q=pd.read_csv('data/sad_log_1Q.csv', encoding= "ISO-8859-1")
happy_log=pd.read_csv('data/happy_log.csv', encoding= "ISO-8859-1")

In [None]:
# Attach the target label to each source log (0 for happy_log rows, 1 for sad_log_1Q rows),
# then stack the two logs row-wise into the single `log` frame used from here on.
happy_log['asked_question'] = 0
sad_log_1Q['asked_question'] = 1
log = pd.concat((happy_log, sad_log_1Q), axis=0)

# Create session summary
## Aggregate log by sessions

In [None]:
# Make sure the list of activities per session will be ordered by time
# NOTE(review): the happy/sad logs were read without parse_dates, so this presumably
# sorts 'startTime' as text — fine for zero-padded Y/m/d strings, but verify.
log.sort_values(['sessionid', 'startTime'], ascending=[True, True], inplace=True)
# NOTE(review): every startTime is overwritten with the row's completeTime *after* the
# sort, so the per-session 'startTime' aggregated below is really the completeTime of
# the first event — confirm this is intended.
log['startTime']=log['completeTime']

# Define aggregations when looking at each session:
#   event          -> list of the session's events, in the sorted row order
#   startTime      -> value from the first row of the session
#   completeTime   -> value from the last row of the session
#   gender, agecategory, asked_question -> value from the first row of the session
aggregations = {'event': lambda x: list(x), 
                'startTime': 'first', 
                'completeTime': 'last', 
                'gender': 'first',
                'agecategory': 'first',
                'asked_question': 'first'
               }
# One row per sessionid; as_index=False keeps 'sessionid' as a regular column.
sessions = log.groupby('sessionid', as_index=False).agg(aggregations)

## Remove recurrences of questions

In [None]:
def eliminate_leakage(event_list, including=True):
    """
    Truncate a trace at its first 'Question' event.

    With ``including=True`` the returned slice ends with that 'Question' event; with
    ``including=False`` it stops just before it. A trace that contains no 'Question'
    event is handed back as-is (the very same object, not a copy).
    """
    try:
        first_question = event_list.index('Question')
    except ValueError:
        # No question in this trace: nothing to cut.
        return event_list

    cut = first_question + 1 if including else first_question
    return event_list[:cut]
    
#sessions['event'] = sessions['event'].apply(eliminate_leakage)

# Feature Engineering

## Age category, Max loops, hour, gender 

In [None]:
from collections import Counter

# If the session ends with a question, return a 1, else a 0
def asked_question(event_list):
    """
    Return 1 when the last event of the trace is 'Question', otherwise 0.

    Only the final event is inspected, so a 'Question' earlier in the trace does not
    count. An empty trace has no final event and therefore yields 0 instead of
    raising IndexError.
    """
    if len(event_list) == 0:
        return 0
    return int(event_list[-1] == 'Question')

# Age bracket label -> one representative age per bracket
def age(age_cat):
    """
    Translate an age-bracket label such as '30-39' into a single integer age.

    A label that is not one of the four known brackets raises KeyError.
    """
    representative_age = {
        '18-29': 23,
        '30-39': 35,
        '40-49': 45,
        '50-65': 57,
    }
    return representative_age[age_cat]

# This function returns:
# 1) The most visited page within the session (None if no page was visited more than once)
# 2) The number of times that page was visited
# 3) The mean gap (in list positions) between consecutive visits to that page, plus one
def max_loops(event_list):
    """
    Summarise how strongly a session loops back to a single page.

    Returns a tuple ``(most_visited_page, times_visited, avg_steps)``:

    * ``most_visited_page`` - the event with the highest count (ties go to the event
      that appears first), or None when every event occurs exactly once.
    * ``times_visited`` - the count of that event (1 when nothing repeats).
    * ``avg_steps`` - mean difference between the positions of consecutive occurrences
      of the most visited page, plus one; 0 when there is no repeated page.

    An empty trace returns ``(None, 0, 0)`` instead of raising ValueError from ``max()``.
    """
    if len(event_list) == 0:
        return None, 0, 0

    event_counter = Counter(event_list)
    most_visited_page = max(event_counter, key=event_counter.get)
    times_visited = event_counter[most_visited_page]
    if times_visited == 1:
        # Nothing repeats. Return directly rather than scanning the trace for None,
        # which would misfire when the trace itself contains a None event.
        return None, times_visited, 0

    inds = [index for index, value in enumerate(event_list) if value == most_visited_page]
    if len(inds) < 2:
        # Can happen when the event never compares equal to itself (e.g. NaN);
        # there are no gaps to average, so avoid np.mean of an empty array.
        avg_steps = 0
    else:
        avg_steps = np.mean(np.diff(inds)) + 1
    return most_visited_page, times_visited, avg_steps

# Hour-of-day feature taken straight from the timestamp
def hour(timestamp):
    """
    Return the ``hour`` attribute of ``timestamp``.
    """
    hour_of_day = timestamp.hour
    return hour_of_day

# Gender of a customer, encoded as a 0/1 flag
def gender(gender):
    """
    Encode a gender label as an integer: 1 when it equals 'M', 0 for anything else.

    A direct DataFrame expression would work too; a small helper is used so that every
    feature is derived in the same style.
    """
    is_male = gender == 'M'
    return int(is_male)

    
# Create target variable - Did this session end up with a question?
#sessions['asked_question'] = sessions['event'].apply(asked_question)
#sessions['event'] = sessions['event'].apply(eliminate_leakage,including=False)

# Gender from character to int
sessions['gender'] = sessions['gender'].apply(gender)

# Age from category to int and rename column
sessions['agecategory'] = sessions['agecategory'].apply(age)
sessions.rename(columns={'agecategory': 'age'}, inplace=True)

# Hour of day when the session took place.
sessions['startTime']=sessions['startTime'].apply(pd.to_datetime)
sessions['completeTime']=sessions['completeTime'].apply(pd.to_datetime)
sessions['hour'] = sessions['startTime'].apply(hour)

# Session duration in seconds. total_seconds() is used instead of the `.seconds`
# attribute: `.seconds` is only the seconds *component* of the timedelta (0..86399),
# so it silently drops whole days and gives misleading values for negative durations.
timediff=sessions['completeTime']-sessions['startTime']
sessions['timediff'] = timediff.dt.total_seconds()

# Max number of page reoccurrences within the session and the most visited page.
# If each page was visited once then the most visited page will be None. The star syntax
# (zip(*...)) is interesting: it transposes the per-row tuples returned by max_loops so
# that one apply call can fill several columns. This could be useful for the TODO step
# mentioned below.
sessions['most_visited_page'], sessions['max_loops'], sessions['avg_steps'] = zip(*sessions['event'].apply(max_loops))

############################################# TODO ########################################################
### We could use the 'most_visited_page' column to create smart dummy variables, for example:           ###
### Is the home page the most visited one? The same check applies to other interesting pages.           ###
############################################# TODO ########################################################

## Presence of trigger events by Markov Chain model 

In [None]:
# Events treated as triggers. For each of them a 0/1 column named after the event
# records whether the event occurs anywhere in the session's event list.
trigger_events = [
    'Visit page mijn_werkmap',
    'Visit page home',
    'Visit page taken',
    'Visit page vacatures_bij_mijn_cv',
    'Visit page mijn_berichten',
    'Visit page werkmap',
    'Visit page mijn_documenten',
    'Visit page mijn_sollicitaties',
    'Visit page mijn_cv',
    'Visit page mijn_tips',
    'Visit page inschrijven',
    'Visit page foutopgetreden.html',
]

for event in trigger_events:
    # Bind the current event as a default argument so the lambda does not depend on
    # the loop variable's later values.
    sessions[event] = sessions['event'].apply(lambda visited, target=event: int(target in visited))

# Machine Learning to predict questions

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_recall_fscore_support, confusion_matrix, classification_report
from sklearn.svm import SVC
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# Target: the asked_question label of each session.
y = sessions['asked_question']
# Include more columns in the future!
# Drop the target, the identifier, the raw timestamps and the non-numeric columns.
X = sessions.drop(['asked_question', 'sessionid', 'startTime', 'completeTime', 'most_visited_page', 'event'], axis=1)
features=X.columns

# Split BEFORE scaling and fit the scaler on the training rows only. Fitting it on the
# full data set would let the test rows influence the mean/variance used for training
# (data leakage). With the same random_state the row assignment is unchanged.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
# Keep X as a scaled array, as before, now using the training statistics.
X = scaler.transform(X)

In [None]:
# Tree depths to try for the random forest.
param_grid = {'max_depth': [2, 3, 4, 5, 6, 7, 8]}
#param_grid = {'C':[1,10]}

# 3-fold cross-validated grid search scored by ROC AUC.
clf=RandomForestClassifier()
grid = GridSearchCV(clf, param_grid, cv=3,scoring='roc_auc',n_jobs=3, verbose=3)
model=grid.fit(X_train, y_train)

# `grid_scores_` no longer exists in scikit-learn; the fitted search exposes the best
# parameter combination and its mean cross-validated score directly.
best_parameters = model.best_params_
score = model.best_score_
print('Normalized AUC:', score)
# NOTE: `param_name` stays bound after this loop and is read by the next cell.
for param_name in sorted(best_parameters.keys()):
    print("%s: %r" % (param_name, best_parameters[param_name]))

In [None]:
# Refit a forest with the parameters chosen by the grid search. Unpacking the whole
# dict (instead of indexing it with the leftover loop variable `param_name`) passes
# each tuned parameter under its own name, so this stays correct if the grid grows
# beyond 'max_depth'.
model = RandomForestClassifier(random_state=42, **best_parameters)
model.fit(X_train, y_train)

In [None]:
# Use at own risk
import matplotlib.pyplot as plt

def f_importances(coef, names):
    """
    Plot feature importances as a horizontal bar chart, save it and display it.

    ``coef`` holds one importance value per entry of ``names``. The pairs are sorted by
    importance (ascending, ties broken by name) so the largest bar ends up at the top.
    The chart is written to a file named 'Important features' and then shown.
    """
    imp, names = zip(*sorted(zip(coef, names)))
    positions = range(len(names))
    plt.barh(positions, imp, align='center')
    plt.yticks(positions, names)
    plt.tight_layout()
    # Save before show(): once show() returns the figure may already have been
    # released, in which case savefig would write an empty canvas.
    plt.savefig('Important features')
    plt.show()
    
# Plot the fitted model's feature importances against the feature column names.
f_importances(model.feature_importances_, features)


In [None]:
# Predict labels for the held-out test rows.
predictions=model.predict(X_test)

In [None]:
# Confusion matrix of true vs. predicted test labels (shown as the cell's output).
confusion_matrix(y_test, predictions)

In [None]:
# Flatten the confusion matrix and unpack its four cells; this unpacking only works
# for a 2x2 (binary) matrix. The names follow scikit-learn's binary layout
# (true negatives, false positives, false negatives, true positives).
tn, fp, fn, tp = confusion_matrix(y_test, predictions).ravel()
# Last expression of the cell: shows the four counts.
tn, fp, fn, tp

In [None]:
# Print scikit-learn's text classification report for the test predictions.
print(classification_report(y_test, predictions))

In [None]:
from sklearn.metrics import accuracy_score
# Accuracy of the test predictions (shown as the cell's output).
accuracy_score(y_test, predictions)