In [12]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.ensemble import IsolationForest
import seaborn as sns
from df_functions import read_csv_file, extract_users, format_data, split_and_reformat, reqs_per_5mins, clean_reqlogs, calculate_variance, count_unique_reqs, longestConsecutive
from vectorize import vectorizer
import math

In [None]:
# Load the request log from the CSV file
data = read_csv_file('requests.csv')

# Extract a list of all unique users in the dataframe
users = extract_users(data)

# Pick the first user in the list; the cells below use it as the example user
user1 = users[0]

In [65]:
#This function will only be used for training data
def split_user_df(dataframe, user, chunk_size=50):
    """Split one user's requests into consecutive chunks of roughly ``chunk_size`` rows.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Request log; must contain a ``userID`` column.
    user
        Value compared against the ``userID`` column to select the rows.
    chunk_size : int, default 50
        Target number of requests per chunk. The number of chunks is
        ``floor(n_requests / chunk_size)`` (at least 1), so chunks hold
        ``chunk_size`` rows or slightly more.

    Returns
    -------
    list of pandas.DataFrame
        The user's rows in their original order, split into contiguous chunks
        whose sizes differ by at most one row. Empty list if the user has no
        rows in ``dataframe``.

    Raises
    ------
    ValueError
        If ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer")

    # Filter on the frame that was passed in, not on a notebook-level global,
    # so the function works for any dataframe the caller provides.
    user_data = dataframe.loc[dataframe['userID'] == user]
    number_of_reqs = user_data.shape[0]
    if number_of_reqs == 0:
        return []

    # A user with fewer than chunk_size requests still gets one (short) chunk.
    partitions = max(1, number_of_reqs // chunk_size)

    # Distribute the rows as evenly as possible: the first `remainder` chunks
    # get one extra row. Positional slicing keeps every chunk a DataFrame.
    base_size, remainder = divmod(number_of_reqs, partitions)
    dfs = []
    start = 0
    for i in range(partitions):
        stop = start + base_size + (1 if i < remainder else 0)
        dfs.append(user_data.iloc[start:stop])
        start = stop

    return dfs

In [64]:
# Each of these dataframes represents what a batch of data fetched from Redis might look like.
# Index 4 picks the fifth chunk returned for user1.
data_chunk = split_user_df(data, user1)[4]

# Display the chunk: roughly 50 requests made by user1 (user-1003 in the output below)
data_chunk

Unnamed: 0,timestamp,userID,sessionID,expiring,URL
1869,1676373808196,user-1003,,0,/login
1888,1676373815842,user-1003,3e51c31d-92f2-4483-9c3a-e208ad2188ef,1676374108196,/searchUsers?page=2099
1902,1676373819306,user-1003,3e51c31d-92f2-4483-9c3a-e208ad2188ef,1676374108196,/searchUsers?page=2100
1903,1676373819816,user-1003,3e51c31d-92f2-4483-9c3a-e208ad2188ef,1676374108196,/chat/send/822
1904,1676373820336,user-1003,3e51c31d-92f2-4483-9c3a-e208ad2188ef,1676374108196,/chat/send/498
1905,1676373820856,user-1003,3e51c31d-92f2-4483-9c3a-e208ad2188ef,1676374108196,/chat/send/779
1929,1676373830009,user-1003,3e51c31d-92f2-4483-9c3a-e208ad2188ef,1676374108196,/searchUsers?page=2101
1948,1676373833982,user-1003,3e51c31d-92f2-4483-9c3a-e208ad2188ef,1676374108196,/searchUsers?page=2102
1950,1676373834496,user-1003,3e51c31d-92f2-4483-9c3a-e208ad2188ef,1676374108196,/chat/send/666
1951,1676373835016,user-1003,3e51c31d-92f2-4483-9c3a-e208ad2188ef,1676374108196,/chat/send/993


In [41]:
#Next we want to check the time between the requests in each chunk and calculate the mean
def calc_avg_timediff(userdata):
    """Return the mean gap between consecutive requests, in seconds.

    ``userdata`` must provide a ``timestamp`` column holding millisecond
    values; the gaps are taken between neighbouring rows in their given order.
    """
    # Millisecond timestamps of every request in the chunk
    stamps_ms = np.asarray(userdata['timestamp'].tolist())

    # Differences between neighbouring timestamps, still in milliseconds
    gaps_ms = np.diff(stamps_ms)

    # Average gap, converted from milliseconds to seconds
    return np.mean(gaps_ms) / 1000

2.1524

In [66]:
# Average time (in seconds) between consecutive requests in the dataframe shown above
avg_time_diff = calc_avg_timediff(data_chunk)
avg_time_diff

2.1524

In [61]:
"""
For the next step we create a function that splits a data frame into 5-minute intervals
and checks how many sessionIDs were used during each of these time windows.
The return value is the average over all windows of the dataframe.
"""

def avg_tokens_5mins(dataframe, window='5min'):
    """Average number of distinct session IDs used per time window.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Requests with a ``timestamp`` column (epoch milliseconds) and a
        ``sessionID`` column.
    window : str, default '5min'
        Window length as a pandas frequency string.

    Returns
    -------
    float
        Mean, over all windows between the first and the last request, of the
        number of distinct ``sessionID`` values seen in that window. Windows
        without any request count as 0. A missing ``sessionID`` (NaN/None)
        counts as one distinct value. Returns 0.0 for an empty dataframe.
    """
    if len(dataframe) == 0:
        return 0.0

    # Index the session IDs by request time so rows can be grouped by the
    # window their timestamp actually falls in (not by row position).
    request_times = pd.to_datetime(dataframe['timestamp'].to_numpy(), unit='ms')
    session_ids = pd.Series(dataframe['sessionID'].to_numpy(), index=request_times)
    session_ids = session_ids.sort_index()

    # One entry per window, including empty windows inside the covered range
    unique_tokens = [
        len(ids_in_window.unique())
        for _, ids_in_window in session_ids.resample(window)
    ]

    return float(np.mean(unique_tokens))

In [62]:
"""
Code for testing the functions above
"""
data_chunks = split_user_df(data, user1)
avg_req_frequences, avg_tokens = [], []

# Compute both per-chunk features, one chunk at a time
for chunk in data_chunks:
    avg_req_frequences.append(calc_avg_timediff(chunk))
    avg_tokens.append(avg_tokens_5mins(chunk))

In [63]:

# Show the per-chunk features, one list per line
print(avg_req_frequences, avg_tokens, sep="\n")

[2.13522, 2.43886, 2.44234, 1.85938, 2.1524, 1.8756, 2.559, 2.3568000000000002, 2.6318, 2.4751999999999996, 2.38062, 2.1144000000000003, 1.714, 2.397, 2.4162, 1.7754, 2.18942, 2.8286, 3.0711, 2.207, 2.1012, 2.1024000000000003, 2.4174, 1.6036, 1.9712, 2.3562, 2.1935, 2.20378, 1.52372, 2.7207600000000003, 1.76842, 1.75494, 2.45966, 2.18594, 1.5975599999999999, 2.675, 2.19162, 2.7896799999999997, 1.9432, 2.1195999999999997, 2.0906, 1.7181, 2.3506, 2.372469387755102, 2.560265306122449, 2.383877551020408, 2.1169795918367345, 2.5226530612244895, 3.031591836734694, 1.7766122448979593, 2.084204081632653, 2.0310816326530614, 2.7669387755102037, 3.2493265306122447, 2.1844897959183673, 1.7932653061224488, 2.1563265306122448, 2.0714285714285716, 1.8322448979591837, 2.649530612244898, 1.625122448979592, 1.6414693877551019, 2.2954489795918365, 1.701469387755102, 1.8713265306122449, 2.130612244897959, 2.6925510204081635, 2.483816326530612, 1.79065306122449, 2.4975714285714283, 1.5187755102040816, 2.3