In [190]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
from pprint import pprint
%matplotlib inline

import collections
import itertools

from sklearn.preprocessing import LabelEncoder

In [2]:
# Pandas display limits: render up to 1000 rows and 100 columns before truncating.
pd.options.display.max_rows = 1000
pd.options.display.max_columns = 100

In [35]:
# Load the raw event log; the path is relative to the notebook's working directory.
raw_log_path = '../Data/Q2_2021.csv'
df = pd.read_csv(raw_log_path)

In [36]:
# Convert the `at` column from text to datetime values using an explicit
# format with fractional seconds.
timestamp_format = '%Y-%m-%d %H:%M:%S.%f'
df['at'] = pd.to_datetime(df['at'], format=timestamp_format)

In [37]:
# Columns that are not used anywhere in the analysis below. They are removed
# with a single call instead of one in-place drop per column, so the frame is
# rebuilt once rather than eleven times.
unused_columns = [
    'Unnamed: 0',
    'roles',
    'remote_addr',
    'user_agent',
    'uri',
    'query_string',
    'request_method',
    'referer',
    'succeeded',
    'rights',
    'country_code',
]
# Kept in-place so that `df` remains the same object for the following cells.
df.drop(columns=unused_columns, inplace=True)

In [38]:
# Learn an integer code for every distinct, non-null action name (`aim`).
# Only the learned classes are needed here, so `fit` is used rather than
# `fit_transform`, whose encoded output was computed and thrown away.
label_encoder = LabelEncoder()
label_encoder.fit(df['aim'].dropna())

# code -> action name, and the reverse lookup action name -> code.
inv_mapping = dict(enumerate(label_encoder.classes_))
mapping = {aim: code for code, aim in inv_mapping.items()}

In [7]:
# Count the non-null `aim` entries logged for each account, then rank the
# accounts from most to least active.
actions_per_account = df.groupby('account')['aim'].count()
users_info = actions_per_account.sort_values(ascending=False)
users_info.head()

account
600.0    143196
402.0    128494
849.0    105988
428.0     89837
364.0     58652
Name: aim, dtype: int64

In [8]:
# All events of account 245, ordered chronologically.
user = df.loc[df['account'] == 245].sort_values('at')
user.head()

Unnamed: 0,id,aim,account,at
1649,16272683,login,245.0,2021-04-01 09:20:58.934256
1670,16272684,auth_confirm_caution,245.0,2021-04-01 09:20:59.110076
1687,16272742,open_task,245.0,2021-04-01 09:21:42.783729
1733,16272749,open_perform_action,245.0,2021-04-01 09:21:45.809733
1716,16272753,perform_action,245.0,2021-04-01 09:21:47.841290


In [9]:
# Day of the year -> number of actions the selected user performed on that day
# (value_counts orders the days from busiest to quietest).
event_day_of_year = user['at'].dt.day_of_year
user_days = event_day_of_year.value_counts()
user_days.head()

117    415
124    209
123    153
145    138
151    130
Name: at, dtype: int64

In [10]:
# Restrict the user's events to day-of-year 117.
is_day_117 = user['at'].dt.day_of_year == 117
u_day = user[is_day_117]
u_day.head()

Unnamed: 0,id,aim,account,at
652441,16919568,login,245.0,2021-04-27 09:07:38.358387
652624,16919569,auth_confirm_caution,245.0,2021-04-27 09:07:38.592844
652442,16919583,open_task,245.0,2021-04-27 09:08:01.777447
652629,16919586,download_task_file,245.0,2021-04-27 09:08:03.322072
652667,16919614,remove_inform,245.0,2021-04-27 09:08:44.844778


In [211]:
class PatternRecognition:
    """Find recurring sequences of actions in a chronologically ordered event log.

    The frame must provide an ``at`` column (datetime) and an ``aim`` column
    (action name) and is expected to be sorted by ``at``. Every pair of
    consecutive rows forms an *interval*: the action before, the whole number
    of seconds elapsed, and the action after.

    ``pattern_detection`` runs the whole pipeline:

    1. ``register``: count how often each gap length occurs.
    2. ``outlier_detection``: IQR upper fence over the counts of repeated gaps.
    3. ``optimal_interval``: mean of the gap lengths whose count exceeds the
       fence; it is used as the tolerance when chaining intervals.
    4. Chain consecutive intervals whose gaps differ by at most the tolerance
       into encoded action tuples and count them.
    5. ``common_patterns`` / ``pattern_decoder``: keep and print the longest
       pattern(s) for every occurrence count greater than one.
    """

    def __init__(self, df, aim_mapping=None):
        """Pre-compute the intervals between consecutive rows of ``df``.

        Parameters
        ----------
        df : pandas.DataFrame
            Event log with ``at`` (datetime64) and ``aim`` columns.
        aim_mapping : dict, optional
            Action name -> integer code. When omitted, a module-level
            ``mapping`` dict is used if one is defined (this keeps the codes
            consistent with a notebook-wide encoder); otherwise the codes are
            derived from the sorted distinct non-null ``aim`` values of ``df``.
        """
        # Whole seconds between each row and the next. Floor division of the
        # full timedelta is used instead of ``.dt.seconds``, which only
        # returns the within-day component and therefore reports a gap of
        # "1 day 5 s" as 5 seconds.
        gaps = df['at'].diff().iloc[1:]
        self.interval_values = (gaps // pd.Timedelta(seconds=1)).values
        self.aim_index_before = df.index[:-1]
        self.aim_before = df['aim'].values[:-1]
        self.aim_index_after = df.index[1:]
        self.aim_after = df['aim'].values[1:]
        # One tuple per interval:
        # (index before, aim before, gap in seconds, index after, aim after)
        self.interval = list(zip(self.aim_index_before, self.aim_before, self.interval_values, self.aim_index_after, self.aim_after))

        if aim_mapping is None:
            aim_mapping = globals().get('mapping')
        if aim_mapping is None:
            distinct_aims = sorted(df['aim'].dropna().unique())
            aim_mapping = {aim: code for code, aim in enumerate(distinct_aims)}
        self.mapping = dict(aim_mapping)
        self.inv_mapping = {code: aim for aim, code in self.mapping.items()}

    def outlier_detection(self):
        """Set ``self.upper_limit`` to the IQR upper fence of ``self.occurences``.

        The fence is ``Q3 + 1.5 * (Q3 - Q1)``. If ``register`` has not been run
        yet the limit is 0. If no gap is repeated the quantiles, and therefore
        the limit, are NaN, so no count compares greater than it.
        """
        if hasattr(self, 'occurences'):
            # An explicit dtype keeps an empty list from becoming an object Series.
            counts = pd.Series(self.occurences, dtype=float)
            percentile25 = counts.quantile(0.25)
            percentile75 = counts.quantile(0.75)
            iqr = percentile75 - percentile25
            self.upper_limit = percentile75 + 1.5 * iqr
            print(f"\n----- Upper limit:  {self.upper_limit} times -----\n")
        else:
            self.upper_limit = 0

    def register(self):
        """Count the gap lengths, print the table and return it.

        Sets ``self.gaps`` to ``[(seconds, times), ...]`` ordered from most to
        least frequent, and ``self.occurences`` to the counts of the gap
        lengths that occur more than once.
        """
        self.gaps = collections.Counter(self.interval_values).most_common()
        self.occurences = [times for _, times in self.gaps if times != 1]
        print("----- Gaps (second, times) -----\n")
        pprint(self.gaps)
        return self.gaps

    def optimal_interval(self):
        """Derive ``self.optimal_value`` from the dominant gap lengths.

        The value is the unweighted mean of every gap length whose count
        exceeds ``self.upper_limit``; it is ``None`` when no gap qualifies.
        """
        self.register()
        self.outlier_detection()
        dominant = [seconds for seconds, times in self.gaps if times > self.upper_limit]
        if dominant:
            self.optimal_value = sum(dominant) / len(dominant)
            print(f"----- Optimal value: {self.optimal_value} second -----\n")
        else:
            self.optimal_value = None

    def common_patterns(self):
        """Select, for every occurrence count above one, the longest pattern(s).

        Reads ``self.patterns`` (``[(encoded_tuple, times), ...]``), stores the
        selection in ``self.common_behaviour`` ordered by descending count and
        returns it. Patterns sharing a count keep their ``self.patterns`` order.
        """
        by_count = {}
        for entry in self.patterns:
            by_count.setdefault(entry[1], []).append(entry)

        self.common_behaviour = []
        for times in sorted(by_count, reverse=True):
            if times == 1:
                continue
            group = by_count[times]
            max_common = max(len(entry[0]) for entry in group)
            if max_common == 1:
                continue
            self.common_behaviour.extend(entry for entry in group if len(entry[0]) == max_common)
        return self.common_behaviour

    def pattern_decoder(self):
        """Print every pattern in ``self.common_behaviour`` with action names.

        Consecutive repeats of one action are collapsed to ``"<n>x <action>"``.
        """
        for codes, times in self.common_behaviour:
            print(f"This pattern occurred {times} times")
            print("The pattern is:\n\t| ", end="")
            steps = []
            for aim, run in itertools.groupby(self.inv_mapping[code] for code in codes):
                repeats = len(list(run))
                # Every step is appended; assigning instead of appending here
                # would discard the steps decoded before a repeated run.
                steps.append(aim if repeats == 1 else f"{repeats}x {aim}")
            print(" -> ".join(steps), "|\n")

    def pattern_detection(self):
        """Run the full pipeline and print the recurring patterns.

        Starting from each interval, the following intervals are appended for
        as long as each gap differs from the gap just before it by at most
        ``self.optimal_value``. When no tolerance could be derived
        (``optimal_value`` is ``None``) nothing is chained and only the plain
        before/after transitions are counted.

        Sets ``self.patterns`` and ``self.common_behaviour``.
        """
        self.optimal_interval()
        accumulator = collections.Counter()
        total = len(self.interval)
        # NOTE: the final interval is never used as a starting point, so the
        # last transition is only counted as the tail of a longer chain.
        for start in range(total - 1):
            first = self.interval[start]
            temp = (self.mapping[first[1]], self.mapping[first[4]])
            if self.optimal_value is not None:
                for stop in range(start + 1, total):
                    if abs(self.interval[stop][2] - self.interval[stop - 1][2]) <= self.optimal_value:
                        temp = temp + (self.mapping[self.interval[stop][4]],)
                    else:
                        break
            accumulator[temp] += 1
        self.patterns = accumulator.most_common()
        self.common_patterns()
        self.pattern_decoder()

    def histogram(self):
        """Return a plotly histogram of the gap lengths (seconds)."""
        return px.histogram(self.interval_values)

    def scatter(self):
        """Return a plotly scatter of the gap lengths plotted against themselves."""
        return px.scatter(x=self.interval_values, y=self.interval_values)

In [212]:
# Build the recogniser from the single-day slice `u_day` and run the whole
# pipeline; pattern_detection() prints the gap table, the upper limit, the
# optimal value and the decoded recurring patterns.
ptrn = PatternRecognition(u_day)
ptrn.pattern_detection()

----- Gaps (second, times) -----

[(1, 132),
 (10, 31),
 (11, 27),
 (12, 22),
 (9, 20),
 (13, 14),
 (14, 12),
 (2, 11),
 (22, 8),
 (19, 8),
 (3, 8),
 (16, 7),
 (15, 7),
 (7, 7),
 (4, 6),
 (17, 5),
 (25, 5),
 (0, 4),
 (8, 4),
 (5, 4),
 (23, 3),
 (6, 3),
 (29, 3),
 (18, 3),
 (33, 3),
 (26, 3),
 (28, 3),
 (21, 3),
 (27, 2),
 (56, 2),
 (79, 2),
 (41, 1),
 (54, 1),
 (52, 1),
 (1317, 1),
 (58, 1),
 (81, 1),
 (1152, 1),
 (673, 1),
 (383, 1),
 (140, 1),
 (123, 1),
 (3090, 1),
 (53, 1),
 (118, 1),
 (84, 1),
 (423, 1),
 (537, 1),
 (120, 1),
 (1060, 1),
 (122, 1),
 (98, 1),
 (69, 1),
 (31, 1),
 (16545, 1),
 (191, 1),
 (724, 1),
 (182, 1),
 (35, 1),
 (68, 1),
 (263, 1),
 (60, 1),
 (20, 1),
 (85, 1),
 (115, 1),
 (141, 1),
 (78, 1),
 (71, 1),
 (88, 1),
 (38, 1),
 (80, 1),
 (49, 1),
 (50, 1)]

----- Upper limit:  19.25 times -----

----- Optimal value: 8.6 second -----

This pattern occurred 134 times
The pattern is:
	| open_task -> download_task_file |

This pattern occurred 126 times
The pattern is