Implementing RITA in a Jupyter notebook

In [None]:
import math
import pandas as pd
import numpy as np

Loading data

In [None]:
# Names of the log fields (CSV columns) used by the analysis.
f_timestamp = '@timestamp'
f_src_ip = 'source.ip'
f_dst_ip = 'cisco.ftd.security.dst_ip'
f_dst_host = 'url.original'
f_dst_port = 'cisco.ftd.security.dst_port'
# NOTE(review): the variable is called "sent" bytes but maps to the responder_bytes
# field - confirm this is the intended direction.
f_sent_bytes = 'cisco.ftd.security.responder_bytes'


# Columns kept from the raw data
columns_to_filter = [f_timestamp, f_src_ip, f_dst_ip, f_dst_host, f_dst_port, f_sent_bytes]
# Connections sharing all of these values are grouped together
columns_to_groupby = [f_src_ip, f_dst_ip, f_dst_host, f_dst_port]
# Columns to display after the analysis. The derived column names must match the
# ones the notebook actually creates ('ConnectionCount' and 'delta_time'),
# otherwise the final selection raises a KeyError.
columns_to_display = [
    'Score', 'tsScore', 'dsScore', 'ConnectionCount',
    f_src_ip, f_dst_ip, f_dst_host, f_dst_port, f_sent_bytes,
    'delta_time',
]

In [None]:
# Read the raw connection records from the CSV file into a DataFrame
df = pd.read_csv(filepath_or_buffer='data_test.csv')

Filtering the required columns

Preparing the data for analysis

In [None]:
# Keep only the needed columns, parse the timestamp strings into datetimes, then
# collapse all rows sharing the same group key into a single row whose remaining
# columns hold lists of the grouped values.
# NOTE(review): groupby's default settings drop rows where any group key is NaN -
# confirm that losing those connections is intended.
df = (
    df.loc[:, columns_to_filter]
    .assign(**{f_timestamp: lambda frame: pd.to_datetime(frame[f_timestamp])})
    .groupby(columns_to_groupby)
    .agg(list)
    .reset_index()
)
df.head()

Calculate connection count

In [None]:
# Add a 'ConnectionCount' column holding, for each group, the number of
# timestamps collected in its timestamp list
df['ConnectionCount'] = df[f_timestamp].map(len)
df.head()

In [None]:
# Keep only the groups with more than 10 connections (groups with 10 or fewer are
# removed). The threshold was chosen because of the small data sample used; the
# goal is to reduce the amount of data that needs to be processed.
# .copy() turns the filtered slice into an independent frame so that the column
# assignments in later cells do not trigger pandas' SettingWithCopyWarning.
df = df.loc[df['ConnectionCount'] > 10].copy()
df.head()

Sorting by timestamps

In [None]:
# Replace each group's timestamp list with the same timestamps in ascending order
df[f_timestamp] = df[f_timestamp].map(sorted)
df.head()

Calculate time deltas

In [None]:
# For each group: convert the timestamp list into a Series, take the difference
# between consecutive timestamps, drop the leading missing value, and store the
# gaps (in seconds) as a list in the 'delta_time' column.
# total_seconds() is required here: .dt.seconds only returns the seconds component
# of each timedelta (0-86399), so any gap of a day or more would be truncated.
df['delta_time'] = df[f_timestamp].apply(
    lambda stamps: pd.Series(stamps).diff().dropna().dt.total_seconds().tolist()
)
df.head()

Calculate time-series variables

In [None]:
# Quartiles (25th, 50th, 75th percentile) of the time gaps of each group
df['tsLow'] = df['delta_time'].apply(lambda gaps: np.percentile(np.array(gaps), 25))
df['tsMid'] = df['delta_time'].apply(lambda gaps: np.percentile(np.array(gaps), 50))
df['tsHigh'] = df['delta_time'].apply(lambda gaps: np.percentile(np.array(gaps), 75))

# Numerator and denominator of the Bowley skewness: (Q1 + Q3 - 2*Q2) / (Q3 - Q1)
df['tsBowleyNum'] = df['tsLow'] + df['tsHigh'] - 2 * df['tsMid']
df['tsBowleyDen'] = df['tsHigh'] - df['tsLow']

# tsSkew should equal zero if the denominator equals zero.
# Bowley skew is unreliable if Q2 = Q1 or Q2 = Q3, so it is forced to zero there too.
ts_skew_usable = (
    (df['tsBowleyDen'] != 0)
    & (df['tsMid'] != df['tsLow'])
    & (df['tsMid'] != df['tsHigh'])
)
df['tsSkew'] = (df['tsBowleyNum'] / df['tsBowleyDen']).where(ts_skew_usable, 0.0)

# Median absolute deviation about the median of the time gaps
df['tsMadm'] = df['delta_time'].apply(
    lambda gaps: np.median(np.absolute(np.array(gaps) - np.median(np.array(gaps))))
)

# Span between the first and last connection of the group, in seconds, divided by 90.
# total_seconds() is used so that spans of a day or more are not truncated to their
# seconds component.
# NOTE(review): the divisor 90 and the 30.0 cutoff below are tuning constants -
# confirm them against the RITA version being reproduced.
df['tsConnDiv'] = df[f_timestamp].apply(
    lambda stamps: (stamps[-1] - stamps[0]).total_seconds() / 90
)

# Time delta score calculation
# Connection-count score: count / tsConnDiv capped at 1.0, and 0.0 when tsConnDiv is 0
conn_ratio = df['ConnectionCount'] / df['tsConnDiv']
df['tsConnCountScore'] = conn_ratio.clip(upper=1.0).where(df['tsConnDiv'] != 0, 0.0)
df['tsSkewScore'] = 1.0 - df['tsSkew'].abs()
# MADM score: 1 - madm/30, floored at 0
df['tsMadmScore'] = (1.0 - df['tsMadm'] / 30.0).clip(lower=0.0)
# Average of the three sub-scores
df['tsScore'] = (df['tsSkewScore'] + df['tsMadmScore'] + df['tsConnCountScore']) / 3.0

df.head()

Variables for data size dispersion

In [None]:
# Quartiles (25th, 50th, 75th percentile) of the byte counts of each group
df['dsLow'] = df[f_sent_bytes].apply(lambda sizes: np.percentile(np.array(sizes), 25))
df['dsMid'] = df[f_sent_bytes].apply(lambda sizes: np.percentile(np.array(sizes), 50))
df['dsHigh'] = df[f_sent_bytes].apply(lambda sizes: np.percentile(np.array(sizes), 75))

# Numerator and denominator of the Bowley skewness: (Q1 + Q3 - 2*Q2) / (Q3 - Q1)
df['dsBowleyNum'] = df['dsLow'] + df['dsHigh'] - 2 * df['dsMid']
df['dsBowleyDen'] = df['dsHigh'] - df['dsLow']


# dsSkew should equal zero if the denominator equals zero.
# Bowley skew is unreliable if Q2 = Q1 or Q2 = Q3, so it is forced to zero there too.
ds_skew_usable = (
    (df['dsBowleyDen'] != 0)
    & (df['dsMid'] != df['dsLow'])
    & (df['dsMid'] != df['dsHigh'])
)
df['dsSkew'] = (df['dsBowleyNum'] / df['dsBowleyDen']).where(ds_skew_usable, 0.0)

# Median absolute deviation about the median of the byte counts
df['dsMadm'] = df[f_sent_bytes].apply(
    lambda sizes: np.median(np.absolute(np.array(sizes) - np.median(np.array(sizes))))
)


# Data size score calculation of sent bytes
df['dsSkewScore'] = 1.0 - df['dsSkew'].abs()
# MADM score: 1 - madm/128, floored at 0, so that low dispersion scores high and
# the value stays within [0, 1] like the time-series MADM score.
# NOTE(review): the 128.0 and 8192.0 cutoffs are tuning constants - confirm them
# against the RITA version being reproduced.
df['dsMadmScore'] = (1.0 - df['dsMadm'] / 128.0).clip(lower=0.0)
# Smallness score: 1 - median/8192, floored at 0
df['dsSmallnessScore'] = (1.0 - df['dsMid'] / 8192.0).clip(lower=0.0)
# Average of the three sub-scores
df['dsScore'] = (df['dsSkewScore'] + df['dsMadmScore'] + df['dsSmallnessScore']) / 3.0

In [None]:

# Overall score: the mean of the data-size score and the time-series score
df['Score'] = (df['dsScore'] + df['tsScore']) / 2

# Order the groups from highest to lowest score and renumber the index
df = df.sort_values(by='Score', ascending=False, ignore_index=True)
df.head(30)

In [None]:
# Show the display columns for the groups whose overall score is above 0.80
is_high_score = df['Score'].gt(0.80)
df.loc[is_high_score, columns_to_display]