In [None]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from matplotlib import rc
from matplotlib.ticker import StrMethodFormatter, FuncFormatter
from datetime import timedelta
import re 

In [None]:
# Load the raw action log: `message` is read with dtype str and
# `createdDateTime` is parsed into datetime values.
user_actions = pd.read_csv(
    'user_actions.csv',
    parse_dates=['createdDateTime'],
    dtype={'message': str},
)
user_actions

# DATA PREPARATION FOR ANALYSIS

## Start preparation

In [None]:
def parse_msg(msg):
    """Return the text found inside every ``[...]`` pair of ``msg``.

    Matching is non-greedy, so each bracket pair produces its own entry, in
    order of appearance; the brackets themselves are not part of the result.
    """
    bracketed = re.compile(r'\[(.*?)\]')
    return [hit.group(1) for hit in bracketed.finditer(msg)]

In [None]:
def process_table(base, type_msg):
    """Select the log rows of one message type and split their message text.

    Parameters
    ----------
    base : pandas.DataFrame
        Frame providing at least the ``actionId``, ``createdDateTime`` and
        ``message`` columns.
    type_msg : str
        Prefix a message must start with for its row to be kept.

    Returns
    -------
    pandas.DataFrame
        A new frame with the columns ``actionId``, ``createdDateTime``,
        ``action`` (second bracketed group of the message) and ``msg``
        (third bracketed group). ``base`` itself is not modified.

    Raises
    ------
    IndexError
        If a selected message contains fewer than three bracketed groups.
    """
    # `na=False` makes rows with a missing message count as "no match"
    # instead of failing on a non-string value.
    keep = base['message'].str.startswith(type_msg, na=False)
    # Select by mask and copy, so the new columns are added to an independent frame.
    new_table = base.loc[keep, ['actionId', 'createdDateTime', 'message']].copy()
    # Parse every message once and reuse the result for both derived columns;
    # plain lists also work when no row matched.
    parsed = [parse_msg(message) for message in new_table['message']]
    new_table['action'] = [groups[1] for groups in parsed]
    new_table['msg'] = [groups[2] for groups in parsed]
    return new_table.drop(columns='message')

In [None]:
# Rows whose message begins with the [START] marker
table_of_start_action = process_table(base=user_actions, type_msg='[START]')
table_of_start_action

In [None]:
# Rows whose message begins with the [END] marker
table_of_end_action = process_table(base=user_actions, type_msg='[END]')
table_of_end_action

In [None]:
# Pair the START rows with the END rows on (actionId, action). The outer join
# keeps rows that have no partner; columns present on both sides receive the
# `_start` / `_end` suffixes.
processed_data = (
    table_of_start_action
    .set_index(['actionId', 'action'])
    .join(
        table_of_end_action.set_index(['actionId', 'action']),
        how='outer',
        rsuffix='_end',
        lsuffix='_start',
    )
    .reset_index()
)

## Retrieving Data That Cannot Be Processed

In [None]:
# Rows with at least one missing value
processed_data.loc[processed_data.isna().any(axis='columns')]

In [None]:
# Per-column count of non-missing values among the rows that have a missing value
processed_data.loc[processed_data.isna().any(axis='columns')].count()

In [None]:
# Remove, in place, every row that has at least one missing value
processed_data.dropna(axis='index', how='any', inplace=True)

## Continued preparation

In [None]:
def parse_io_result(msg):
    """Return every run of consecutive digits in ``msg`` as a list of strings."""
    return [digits.group() for digits in re.finditer(r'\d+', msg)]

In [None]:
# Time each action took: END timestamp minus START timestamp. Subtracting the
# columns directly is vectorised, so there is no Python-level call per row and
# the cell also works when the frame has no rows.
processed_data['duration'] = (
    processed_data['createdDateTime_end'] - processed_data['createdDateTime_start']
)
# The END timestamp is removed now that the duration is stored
processed_data.drop(columns='createdDateTime_end', inplace=True)

In [None]:
def _input_amount(row):
    """Return the first number of the START message, or '-' for SEARCH rows."""
    if row['action'] == 'SEARCH':
        return '-'
    return parse_io_result(row['msg_start'])[0]


processed_data['input_data'] = processed_data.apply(_input_amount, axis=1)
# The raw START message is removed once its number has been extracted
del processed_data['msg_start']

In [None]:
def _end_counts(action, msg_end):
    """Return the (success, warning, failed) strings for one row.

    SEARCH rows get '-' placeholders; for every other action the first three
    numbers of the END message are used, in that order. An IndexError is
    raised when such a message holds fewer than three numbers.
    """
    if action == 'SEARCH':
        return ('-', '-', '-')
    numbers = parse_io_result(msg_end)  # parsed once instead of once per column
    return (numbers[0], numbers[1], numbers[2])


end_counts = [
    _end_counts(action, msg_end)
    for action, msg_end in zip(processed_data['action'], processed_data['msg_end'])
]
# Columns are added in the order success, warning, failed
for position, column in enumerate(('success', 'warning', 'failed')):
    processed_data[column] = [counts[position] for counts in end_counts]
# The raw END message is removed once its numbers have been extracted
processed_data.drop(columns='msg_end', inplace=True)

In [None]:
# Display the prepared table
processed_data

# TASK

## Percentage of operations (pie chart)

In [None]:
# Number of rows per kind of action
actions = processed_data.groupby(by='action').size()
actions

In [None]:
# Pie chart of each action's share, labelled in whole percent
pie_options = dict(labels=actions.index, autopct='%.0f%%', shadow=True, radius=0.5)
fig, ax = plt.subplots()
ax.pie(actions, **pie_options)
ax.axis('equal')  # equal axis scaling
plt.show()

## Total number of operations per week

In [None]:
# Number of operations per ISO week of the year. `Series.dt.week` was
# deprecated in pandas 1.1 and removed in 2.0; `dt.isocalendar().week` is its
# replacement. The cast gives a plain int64 key, and the resulting index is
# named `week`.
# NOTE(review): grouping by week number alone merges the same week of
# different years - confirm the log covers a single year.
week_of_year = processed_data['createdDateTime_start'].dt.isocalendar().week.astype('int64')
actions_by_weeks = processed_data.groupby(week_of_year).size()
actions_by_weeks

In [None]:
# Bar chart of the weekly totals
ax = actions_by_weeks.plot.bar(x='createdDateTime_start', legend=False, figsize=(20, 5))
ax.set_xlabel("weeks")
ax.set_ylabel("num_actions")

## Top 10 longest search operations, with the time spent

In [None]:
# Remove every non-SEARCH row, then keep the ten rows with the longest duration
drop = processed_data.index[processed_data['action'] != 'SEARCH']
srch_act = (
    processed_data
    .drop(index=drop)
    .loc[:, ['actionId', 'action', 'duration']]
    .sort_values(by='duration', ascending=False)
    .head(10)
)
srch_act

## A histogram of input data for each operation

In [None]:
# Non-SEARCH rows in chronological order, with the four count columns
# converted from text to integers.
drop = processed_data.index[processed_data['action'] == 'SEARCH']
inp_for_act = (
    processed_data
    .sort_values(by='createdDateTime_start')
    .drop(index=drop)
    .loc[:, ['action', 'createdDateTime_start', 'duration',
             'input_data', 'success', 'warning', 'failed']]
    .astype({'input_data': 'int', 'success': 'int', 'warning': 'int', 'failed': 'int'})
)
inp_for_act

In [None]:
# One bar chart per action in `inp_for_act`: amount of input data over time.
action_names = inp_for_act.action.unique()
colors = ['tab:red', 'tab:blue', 'tab:green', 'tab:pink', 'tab:olive']

# The grid is sized from the data instead of a fixed five rows, so no action
# is silently left out and no panel stays empty. `squeeze=False` keeps `axes`
# a 2-D array even for a single row; a height of 4 per row reproduces the
# original 20 when there are five actions.
n_rows = max(len(action_names), 1)
_, axes = plt.subplots(n_rows, 1, figsize=(20, 4 * n_rows), dpi=100,
                       sharex=True, sharey=True, squeeze=False)

for i, (ax, action) in enumerate(zip(axes.flatten(), action_names)):
    rows = inp_for_act.loc[inp_for_act.action == action]  # selected once, used for x and height
    ax.bar(x=rows['createdDateTime_start'], height=rows['input_data'], alpha=1,
           label=str(action), color=colors[i % len(colors)])  # palette repeats past five actions
    ax.set_xticklabels([])
    ax.set_title(action)

plt.suptitle('Number of input data for each operation', y=0.93, size=16)


## Graph of the number of successful, conditionally successful (warning) and unsuccessful results

In [None]:
# One panel per action in `inp_for_act` with its success / warning / failed
# counts over time.
action_names = inp_for_act.action.unique()

# The grid is sized from the data instead of a fixed five rows, so no action
# is silently left out and no panel stays empty. `squeeze=False` keeps `axes`
# a 2-D array even for a single row; a height of 6 per row reproduces the
# original 30 when there are five actions.
n_rows = max(len(action_names), 1)
_, axes = plt.subplots(n_rows, 1, figsize=(20, 6 * n_rows), dpi=100, squeeze=False)
# NOTE(review): rc() changes matplotlib's global defaults, so it also applies
# to figures created by later cells.
rc('ytick', labelsize=16)

for ax, action in zip(axes.flatten(), action_names):
    rows = inp_for_act.loc[inp_for_act.action == action]  # selected once for all series
    started = rows['createdDateTime_start']
    # NOTE(review): the three series start from the same baseline and are drawn
    # in this order, so a taller later bar covers an earlier one - consider
    # stacking them with `bottom=` if every value must stay visible.
    ax.bar(x=started, height=rows['success'], color='g')
    ax.bar(x=started, height=rows['warning'], color='y')
    ax.bar(x=started, height=rows['failed'], color='r')
    ax.legend(('success', 'warning', 'failed'), prop={'size': 16})
    ax.set_ylabel('num_results', fontsize=16)
    ax.set_xticklabels([])
    ax.set_title(action)

plt.suptitle('Successful, conditionally successful (warning) and unsuccessful results', y=0.9, size=16)
plt.show()


## Dependence of the time spent on the amount of input data

In [None]:
# Scatter plot of time spent (x) against amount of input data (y), one colour
# per action in `inp_for_act`.
_, ax = plt.subplots(figsize=(20,15))
colors = ['tab:red', 'tab:blue', 'tab:green', 'tab:pink', 'tab:olive']

def timeTicks(x, pos):
    """Format a tick position given in seconds as timedelta text (e.g. 0:01:30).

    `pos` is the tick index passed by matplotlib; it is not used.
    """
    return str(timedelta(seconds=x))


for i, action in enumerate(inp_for_act.action.unique()):
    rows = inp_for_act.loc[inp_for_act.action == action]  # selected once for x and y
    # total_seconds() always yields plain float seconds, which is what
    # timeTicks expects; the result of astype('timedelta64[s]') differs
    # between pandas versions.
    x = rows['duration'].dt.total_seconds()
    y = rows['input_data']
    ax.scatter(x, y, alpha=0.5, label=str(action),
               color=colors[i % len(colors)])  # palette repeats past five actions

# The formatter does not depend on the action, so it is installed once
ax.xaxis.set_major_formatter(FuncFormatter(timeTicks))
ax.set_xlabel('time spent')
ax.set_ylabel('amount of input data')
ax.legend()
# Title now matches this figure; it previously repeated the input-data title
plt.suptitle('Dependence of the time spent on the amount of input data', y=0.93, size=16)
