In [112]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.metrics import auc
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
from sklearn.svm import OneClassSVM
import timeit
import river
import math

# Clean Data

## July Data

## December Data

In [None]:
# Base names (without the '.csv' extension) of the raw exports under new_dataset/
# that the cleaning cell below iterates over.
values = [
    'tdmp_bytes_created',
    'tdmp_bytes_total',
    'tdmp_packets_created',
    'tdmp_packets_total',
]

In [None]:
# Clean every raw export listed in `values`:
#   1. copy <name>.csv to <name>_clean.csv without the lines that start with '#';
#   2. drop the bookkeeping columns and add `hash_table`, the '_'-joined tag columns
#      (dst, dstp, proto, service, src, srcp, url) that identify one series.
# If step 2 fails, the comment-stripped file from step 1 is left in place and the
# failure is reported instead of being silently swallowed.
dropped_columns = ['Unnamed: 0','result','table','_start','_stop','_field','_measurement','host']
hash_columns = ['dst','dstp','proto','service','src','srcp','url']

for value in values:
    path = 'new_dataset/' + value
    # Stream the file line by line rather than loading it fully in memory.
    with open(path + '.csv', "r") as input_file, open(path + '_clean.csv', "w") as output_file:
        output_file.writelines(row for row in input_file if row[0] != '#')
    try:
        df = pd.read_csv(path + '_clean.csv',header=0)
        df = df.drop(dropped_columns, axis=1)
        # '_'.join raises TypeError on non-string values, as the original concatenation did.
        df['hash_table'] = df[hash_columns].apply(lambda row: '_'.join(row), axis=1)
        df.to_csv(path + '_clean.csv',index=False)
    except Exception as exc:
        # Best effort: keep going with the next file, but say why this one was skipped.
        print('Could not post-process ' + path + '_clean.csv: ' + repr(exc))
            

# Plot Data

## July Data

## December Data

In [None]:
values = ['tdmp_bytes_created_clean','tdmp_bytes_total_clean','tdmp_packets_created_clean','tdmp_packets_total_clean']

# For each cleaned file, plot the raw `_value` series and its first difference
# side by side against `_time`, save the figure under new_dataset/plots/ and show it.
for value in values:
    path = 'new_dataset/' + value
    df = (
        pd.read_csv(path + '.csv',header=0)
        .sort_values(by='_time',ascending=True)
        .reset_index(drop=True)
    )

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(40, 15))
    ax1.set_title(value)
    ax1.plot(df["_time"], df["_value"])
    ax1.tick_params(axis='x', rotation=90)
    ax2.set_title(value + " differenced")
    ax2.plot(df["_time"], df["_value"].diff())
    ax2.tick_params(axis='x', rotation=90)

    # Save through the figure handle before showing it, then release the figure
    # so large canvases do not pile up across iterations.
    fig.savefig('new_dataset/plots/' + value + '.png', dpi=fig.dpi)
    plt.show()
    plt.close(fig)
    

# Z-Score New Data

In [29]:
def isAnomaly(value,mean,std):
    """Flag a value whose z-score is above 3.

    Returns 1 when (value - mean) / std > 3, otherwise 0. A standard deviation
    equal to zero always yields 0. Only the upper tail is checked: large
    negative deviations are not flagged. NaN inputs compare false and give 0.
    """
    if float(std) == 0:
        return 0
    return 1 if (value - mean) / std > 3 else 0

In [24]:
values = ['tdmp_bytes_created_clean','tdmp_bytes_total_clean','tdmp_packets_created_clean','tdmp_packets_total_clean']

# Build the z-score baseline for every cleaned file:
#   - difference `_value` over the time-sorted rows,
#   - compute per-`hash_table` statistics of that difference,
#   - flag each row with isAnomaly() against the statistics of its own series,
#   - write the scored rows and the statistics under new_dataset/zscore/.
# NOTE(review): the difference is taken between consecutive rows of the whole
# file, not within each `hash_table` series - confirm this is intended.
for value in values:
    path = 'new_dataset/' + value
    df = (
        pd.read_csv(path + '.csv',header=0)
        .sort_values(by='_time',ascending=True)
        .reset_index(drop=True)
    )
    df['_value_diff'] = df['_value'].diff()
    df['_value_diff_squared'] = df['_value_diff'].pow(2)

    by_hash = df.groupby('hash_table')
    # Column order matters: stats_dict[key] is [avg_diff, std_diff, cnt_diff, ss_diff].
    stats = pd.DataFrame({
        'avg_diff': by_hash['_value_diff'].mean(),
        'std_diff': by_hash['_value_diff'].std(),
        'cnt_diff': by_hash['_value_diff'].size(),
        'ss_diff': by_hash['_value_diff_squared'].sum(),
    })
    stats_dict = stats.T.to_dict('list')

    df['anomaly_diff'] = df.apply(
        lambda row: isAnomaly(
            row['_value_diff'],
            stats_dict[row['hash_table']][0],
            stats_dict[row['hash_table']][1],
        ),
        axis=1,
    )
    df.to_csv('new_dataset/zscore/' + value + '.csv',index=False)
    stats.reset_index().to_csv('new_dataset/zscore/stats/' + value + '.csv',index=False)


In [25]:
# Print, for every scored file, how many rows were flagged as anomalous and how many were not.
for value in values:
    path = 'new_dataset/zscore/' + value
    scored = pd.read_csv(path + '.csv',header=0)
    n_anomalies = scored.loc[scored['anomaly_diff'] == 1].shape[0]
    n_normal = scored.loc[scored['anomaly_diff'] == 0].shape[0]
    print(value)
    print('#anomalies_diff: ' + str(n_anomalies), '#normal_diff: ' + str(n_normal))
    print()
      

tdmp_bytes_created_clean
#anomalies_diff: 1 #normal_diff: 4971

tdmp_bytes_total_clean
#anomalies_diff: 14 #normal_diff: 4958

tdmp_packets_created_clean
#anomalies_diff: 1 #normal_diff: 4971

tdmp_packets_total_clean
#anomalies_diff: 4 #normal_diff: 4968



# Set InfluxDB Connection

In [26]:
import influxdb_client
from influxdb_client.client.write_api import SYNCHRONOUS

In [27]:
import os

# InfluxDB connection settings. Every value can be overridden through an
# environment variable; the literals are kept only as fallbacks so that the
# notebook keeps running unchanged where no variable is set.
bucket = os.environ.get("INFLUXDB_V2_BUCKET", "riot")
org = os.environ.get("INFLUXDB_V2_ORG", "polimi")
# NOTE(review): this token is committed with the notebook. Provide it through
# INFLUXDB_V2_TOKEN, rotate it, and remove the fallback.
token = os.environ.get("INFLUXDB_V2_TOKEN", "d2VsY29tZQ==")
url = os.environ.get("INFLUXDB_V2_URL", "http://35.152.63.133:8086")

client = influxdb_client.InfluxDBClient(
   url=url,
   token=token,
   org=org
)

# Query handle used by the cells below.
query_api = client.query_api()

# Check for new anomalies (z-score)

In [72]:
def checkNewHash(hash_table,diff_hash):
    """Return 1 when `hash_table` is contained in `diff_hash`, otherwise 0."""
    return 1 if hash_table in diff_hash else 0

In [83]:
def updateStatsDict(stats_dict,df):
    """Add statistics for series that are not yet in `stats_dict` and mark them in `df`.

    Parameters
    ----------
    stats_dict : dict
        Maps a `hash_table` key to the list [avg_diff, std_diff, cnt_diff, ss_diff].
        It is updated in place with one entry per previously unknown key.
    df : pandas.DataFrame
        Must contain the columns `hash_table`, `_value_diff` and
        `_value_diff_squared`. A `new_hash` column is added in place: 1 for rows
        whose key was unknown before this call, 0 otherwise.

    Returns
    -------
    tuple
        The same `stats_dict` and `df` objects, after the updates above.
    """
    # A set keeps the membership tests below cheap regardless of how many keys are new.
    new_keys = set(df['hash_table'].unique()) - set(stats_dict)

    by_hash = df.groupby('hash_table')
    # Column order fixes the layout of each stats_dict entry.
    stats = pd.DataFrame({
        'avg_diff': by_hash['_value_diff'].mean(),
        'std_diff': by_hash['_value_diff'].std(),
        'cnt_diff': by_hash['_value_diff'].size(),
        'ss_diff': by_hash['_value_diff_squared'].sum(),
    })
    # Statistics of already-known series are left untouched here.
    stats = stats[stats.index.isin(new_keys)]
    stats_dict.update(stats.T.to_dict('list'))

    # Vectorized membership test instead of a Python call per row.
    df['new_hash'] = df['hash_table'].isin(new_keys).astype(int)
    return stats_dict,df

In [131]:
measurements = ['tdmp_bytes_created','tdmp_bytes_total','tdmp_packets_created','tdmp_packets_total']

# Tag columns that identify one series; joined with '_' they form the `hash_table` key.
tag_columns = ['dst', 'dstp', 'proto', 'service', 'src', 'srcp', 'url']

# Services included in the query; rendered below as a chain of `or` comparisons.
monitored_services = [
    'afbackup', 'afmbackup', 'amanda', 'amandaidx', 'amidxtape', 'amqp', 'amqps', 'asp',
    'bgpd', 'bpcd', 'bpdbm', 'bprd', 'canna', 'cfengine', 'codasrv', 'cvspserver',
    'daap', 'dcap', 'dict', 'dircproxy', 'distcc', 'distmp3', 'eklogin', 'fax',
    'fido', 'freeciv', 'ggz', 'git', 'gnunet', 'gpsd', 'gsidcap', 'gris',
    'gsiftp', 'gsigatekeeper', 'hkp', 'hostmon', 'hylafax', 'iax', 'icpv2', 'iprop',
    'ircd', 'isdnlog', 'isisd', 'isns', 'kamanda', 'knetd', 'kx', 'mmcc',
    'mdns', 'mon', 'mrtd', 'mtn', 'munin', 'nbd', 'nfs', 'ninstall',
    'noclog', 'nut', 'ospf6d', 'ospfapi', 'ospfd', 'pcrd', 'postgresql', 'puppet',
    'remctl', 'rfe', 'ripd', 'ripngd', 'sieve', 'sip', 'smsqp', 'suucp',
    'svn', 'sysrqd', 'tfido', 'vboxd', 'venus', 'vopied', 'webmin', 'wnn6',
    'x11', 'xinetd', 'xmms2', 'xpilot', 'xtell', 'zebra', 'zebrasrv', 'zope',
]
service_filter = ' or '.join('r["service"] == "' + name + '"' for name in monitored_services)


def _update_running_stats(stat, x):
    """Fold one observation `x` into `stat` = [avg, std, cnt, ss] and return the new list.

    `ss` is the running sum of squares. The count is incremented before the
    mean is updated, and the sample variance is (ss - n * avg**2) / (n - 1).
    """
    n = stat[2] + 1
    avg = stat[0] + (x - stat[0]) / n
    ss = stat[3] + math.pow(x, 2)
    if n > 1:
        # Clamp at zero: rounding can push the difference slightly negative.
        std = math.sqrt(max((ss - n * math.pow(avg, 2)) / (n - 1), 0.0))
    else:
        std = float('nan')
    return [avg, std, n, ss]


# For each measurement: load the stored per-series statistics, fetch the last
# day of data, flag anomalies with isAnomaly(), fold the non-anomalous values of
# already-known series into the running statistics, and write them back.
for m in measurements:
    path = 'new_dataset/zscore/stats/' + m + '_clean'
    stats_dict = pd.read_csv(path + '.csv',header=0).set_index('hash_table').T.to_dict('list')
    # Meant to run daily; the look-back window should become a parameter.
    q = (
        'from(bucket: "' + bucket + '")'
        ' |> range(start: -1d)'
        ' |> filter(fn: (r) => r["_measurement"] == "' + m + '")'
        ' |> filter(fn: (r) => ' + service_filter + ')'
    )
    result = query_api.query(org=org, query=q)
    results = [
        [record.values.get(tag) for tag in tag_columns] + [record.get_value()]
        for table in result
        for record in table.records
    ]
    if not results:
        # Nothing came back for this measurement: there is nothing to score or update.
        continue
    df = pd.DataFrame(results,columns=tag_columns + ['_value'])
    # NOTE(review): as in the baseline cell, the difference is taken between
    # consecutive rows of the whole result, not within each series - confirm.
    df['_value_diff'] = df['_value'].diff()
    df['_value_diff_squared'] = df['_value_diff'].pow(2)
    df['hash_table'] = df[tag_columns].apply(lambda row: '_'.join(row), axis=1)
    stats_dict,df = updateStatsDict(stats_dict,df)
    df['anomaly_diff'] = df.apply(lambda row: isAnomaly(row['_value_diff'],stats_dict[row['hash_table']][0],stats_dict[row['hash_table']][1]),axis=1)
    anomalies = df.loc[df['anomaly_diff'] == 1]
    if anomalies.shape[0] != 0:
        #call API
        print('*** Anomalies in ' + m + ' ***')
        print(anomalies.values.tolist())
    # Only non-anomalous values of series that were already known update the
    # running statistics. The first row has no difference (NaN) and is skipped
    # so that it cannot turn the stored mean and std into NaN.
    normalities = df.loc[(df['anomaly_diff'] == 0) & (df['new_hash'] == 0) & df['_value_diff'].notna()]
    for key, x in zip(normalities['hash_table'], normalities['_value_diff']):
        stats_dict[key] = _update_running_stats(stats_dict[key], x)
    pd.DataFrame.from_dict(stats_dict,orient='index').reset_index().rename(columns={'index':'hash_table', 0:'avg_diff', 1:'std_diff', 2:'cnt_diff', 3:'ss_diff'}).to_csv(path + '.csv',index=False)


*** Anomalies in tdmp_bytes_created ***
[['94.234.170.103', 'hostmon', 'tcp', 'hostmon', 'demo.riotlocal', 'iotserver-exchange', 'http://demo.riotsecure.io:7888/metrics', 1639592354.408828, 2861.8567411899567, 8190224.007094398, '94.234.170.103_hostmon_tcp_hostmon_demo.riotlocal_iotserver-exchange_http://demo.riotsecure.io:7888/metrics', 1, 1], ['94.234.170.103', 'noclog', 'tcp', 'noclog', 'demo.riotlocal', 'iotserver-exchange', 'http://demo.riotsecure.io:7888/metrics', 1639592278.627758, 64.18851065635681, 4120.164900281232, '94.234.170.103_noclog_tcp_noclog_demo.riotlocal_iotserver-exchange_http://demo.riotsecure.io:7888/metrics', 1, 1], ['94.234.170.103', 'x11', 'tcp', 'x11', 'demo.riotlocal', 'iotserver-exchange', 'http://demo.riotsecure.io:7888/metrics', 1639599192.301019, 13219.553738594055, 174756601.04757607, '94.234.170.103_x11_tcp_x11_demo.riotlocal_iotserver-exchange_http://demo.riotsecure.io:7888/metrics', 1, 1], ['94.234.170.108', 'vboxd', 'tcp', 'vboxd', 'demo.riotlocal',

*** Anomalies in tdmp_bytes_total ***
[['94.234.170.103', 'mdns', 'tcp', 'mdns', 'demo.riotlocal', 'iotserver-exchange', 'http://demo.riotsecure.io:7888/metrics', 300.0, 48.0, 2304.0, '94.234.170.103_mdns_tcp_mdns_demo.riotlocal_iotserver-exchange_http://demo.riotsecure.io:7888/metrics', 1, 1], ['94.234.170.103', 'x11', 'tcp', 'x11', 'demo.riotlocal', 'iotserver-exchange', 'http://demo.riotsecure.io:7888/metrics', 300.0, 96.0, 9216.0, '94.234.170.103_x11_tcp_x11_demo.riotlocal_iotserver-exchange_http://demo.riotsecure.io:7888/metrics', 1, 1], ['94.234.170.108', 'isdnlog', 'tcp', 'isdnlog', 'demo.riotlocal', 'iotserver-exchange', 'http://demo.riotsecure.io:7888/metrics', 252.0, 4.0, 16.0, '94.234.170.108_isdnlog_tcp_isdnlog_demo.riotlocal_iotserver-exchange_http://demo.riotsecure.io:7888/metrics', 1, 1], ['94.234.170.115', 'git', 'tcp', 'git', 'demo.riotlocal', 'iotserver-exchange', 'http://demo.riotsecure.io:7888/metrics', 300.0, 48.0, 2304.0, '94.234.170.115_git_tcp_git_demo.riotlocal

*** Anomalies in tdmp_packets_created ***
[['94.234.170.103', 'hostmon', 'tcp', 'hostmon', 'demo.riotlocal', 'iotserver-exchange', 'http://demo.riotsecure.io:7888/metrics', 1639592354.408767, 2861.856723546982, 8190223.906111066, '94.234.170.103_hostmon_tcp_hostmon_demo.riotlocal_iotserver-exchange_http://demo.riotsecure.io:7888/metrics', 1, 1], ['94.234.170.103', 'noclog', 'tcp', 'noclog', 'demo.riotlocal', 'iotserver-exchange', 'http://demo.riotsecure.io:7888/metrics', 1639592278.6277153, 64.1885154247284, 4120.165512430595, '94.234.170.103_noclog_tcp_noclog_demo.riotlocal_iotserver-exchange_http://demo.riotsecure.io:7888/metrics', 1, 1], ['94.234.170.103', 'x11', 'tcp', 'x11', 'demo.riotlocal', 'iotserver-exchange', 'http://demo.riotsecure.io:7888/metrics', 1639599192.3009744, 13219.553740501404, 174756601.09800467, '94.234.170.103_x11_tcp_x11_demo.riotlocal_iotserver-exchange_http://demo.riotsecure.io:7888/metrics', 1, 1], ['94.234.170.108', 'vboxd', 'tcp', 'vboxd', 'demo.riotlocal

*** Anomalies in tdmp_packets_total ***
[['94.234.170.103', 'mdns', 'tcp', 'mdns', 'demo.riotlocal', 'iotserver-exchange', 'http://demo.riotsecure.io:7888/metrics', 6.0, 1.0, 1.0, '94.234.170.103_mdns_tcp_mdns_demo.riotlocal_iotserver-exchange_http://demo.riotsecure.io:7888/metrics', 1, 1], ['94.234.170.103', 'x11', 'tcp', 'x11', 'demo.riotlocal', 'iotserver-exchange', 'http://demo.riotsecure.io:7888/metrics', 6.0, 2.0, 4.0, '94.234.170.103_x11_tcp_x11_demo.riotlocal_iotserver-exchange_http://demo.riotsecure.io:7888/metrics', 1, 1], ['94.234.170.115', 'git', 'tcp', 'git', 'demo.riotlocal', 'iotserver-exchange', 'http://demo.riotsecure.io:7888/metrics', 6.0, 1.0, 1.0, '94.234.170.115_git_tcp_git_demo.riotlocal_iotserver-exchange_http://demo.riotsecure.io:7888/metrics', 1, 1], ['94.234.170.123', 'xpilot', 'tcp', 'xpilot', 'demo.riotlocal', 'iotserver-exchange', 'http://demo.riotsecure.io:7888/metrics', 6.0, 2.0, 4.0, '94.234.170.123_xpilot_tcp_xpilot_demo.riotlocal_iotserver-exchange_htt

# Check for new tag values in the new data

In [None]:
tags = ['dst','dstp','service','src','srcp','url']

# For every tag, compare the values seen in InfluxDB during the last day with the
# list stored in new_dataset/lists/<tag>.csv, report the unseen ones and append
# them to that list.
for tag in tags:
    path = 'new_dataset/lists/' + tag
    # Tag values come back from InfluxDB as strings, so the stored list is read
    # as strings too; otherwise numeric-looking values would never match.
    df = pd.read_csv(path + '.csv',header=0,dtype={'value': str})
    known_values = set(df['value'])
    # Meant to run daily; the look-back window should become a parameter.
    q = (
        'import "influxdata/influxdb/schema"'
        ' schema.tagValues('
        ' bucket: "' + bucket + '",'
        ' tag: "' + tag + '",'
        ' start: -1d)'
    )
    result = query_api.query(org=org, query=q)
    seen_values = {record.get_value() for table in result for record in table.records}
    # Sorted so that the printout and the stored list are deterministic.
    diffs = sorted(seen_values - known_values, key=str)
    if len(diffs) != 0:
        #call API
        print('*** New ' + tag + ' ***')
        for diff in diffs:
            print(diff)
        # DataFrame.append no longer exists in pandas 2.x; add all new rows at once.
        df = pd.concat([df, pd.DataFrame({'value': diffs})], ignore_index=True)
    df.to_csv(path + '.csv',index=False)

# Make API request

In [None]:
import hashlib
import requests
from requests import Request, Session
import time
import random

In [None]:
def createAuth(username,method,uri):
    """Build the value of the `Authorization` header for one request.

    The header has the form
    ``oasis username="<username>", nonce="<nonce>", authority="<authority>"`` where

    * nonce     = current Unix time as 8 upper-case hex digits + 24 random hex digits
    * authority = MD5(hash1 + ':' + nonce + ':' + MD5(method + ':' + uri)),
      with every digest rendered as upper-case hex.

    Parameters
    ----------
    username : str
        Name placed in the header.
    method : str
        HTTP method of the request, e.g. 'GET'.
    uri : str
        Request path that is signed, e.g. '/global'.

    Returns
    -------
    str
        The header value.
    """
    import secrets  # local import: the nonce must not be predictable

    # NOTE(review): pre-computed credential digest committed with the notebook.
    # The commented-out code it replaced derived it from
    # MD5("user@host.com:" + username + ':' + password); it is therefore tied to
    # one account - confirm it matches the `username` callers pass in.
    hash1 = '7CC384C2A10CEA62FB2A37CFDA222C04'

    # Zero-padded so the timestamp part is always exactly 8 hex digits.
    now = '{:08X}'.format(int(time.time()))
    POOL = "ABCDEF0123456789"
    nonce = now + ''.join(secrets.choice(POOL) for _ in range(24))

    hash2 = hashlib.md5((method + ':' + uri).encode()).hexdigest().upper()
    authority = hashlib.md5((hash1 + ':' + nonce + ':' + hash2).encode()).hexdigest().upper()

    return "oasis username=\"" + username + "\", nonce=\"" + nonce + "\", authority=\"" + authority + "\""

In [None]:
# Settings for the REST call issued by the next cell.
BASE_URI = 'https://demo.riotsecure.io:6443'
username = 'service'
method = 'GET'
uri = '/global' # use '/global' to access the variables, '/auth' only for the authentication
body = {}

# Key order is kept (Authorization, Content-Type, Cache-Control) so the printed dict is unchanged.
headers = dict([
    ('Authorization', createAuth(username,method,uri)),
    ('Content-Type', "application/json"),
    ('Cache-Control', "no-cache"),
])
print(headers)

In [None]:
# Send the request prepared above and report the outcome.
# GET appends '?expand' to the path; POST sends `body` as JSON to the plain path.
if method == "GET":
    _uri = uri + "?expand"
    response = requests.get(BASE_URI + _uri, headers=headers, timeout=30)
elif method == "POST":
    # `_uri` used to be assigned only in the GET branch, so POST raised NameError.
    _uri = uri
    response = requests.post(BASE_URI + _uri, headers=headers, json=body, timeout=30)
else:
    raise ValueError("Unsupported HTTP method: " + str(method))

if response.status_code == 200:
    # In the console version, print to stderr.
    print("The request was a success!")
    # 'net_anomoly' is spelled as in the original lookup.
    net_anomaly_settings = response.json()['keys']['net_anomoly']
    print('The tolerance is ' + str(net_anomaly_settings['tolerance']))
    print('The frequency is ' + str(net_anomaly_settings['frequency']))
elif response.status_code == 404:
    print("Error 404")
elif response.status_code == 401:
    print("401: Unauthorized")
else:
    # Any other status used to pass without a trace.
    print("Unexpected status code: " + str(response.status_code))

# Process Data

In [None]:
# Define metrics to use

# The larger the score, the better.
def em(t, t_max, volume_support, s_unif, s_X, n_generated):
    """Compute the Excess-Mass curve of a scoring function and the area under it.

    Parameters
    ----------
    t : numpy array of levels at which the curve is evaluated.
    t_max : the curve is integrated up to the first index where EM_t <= t_max.
    volume_support : factor applied to the fraction of uniform scores above a threshold.
    s_unif : scores of the uniformly generated points.
    s_X : scores of the data points.
    n_generated : number of uniformly generated points.

    Returns
    -------
    (AUC, EM_t, amax) : area under EM_t[:amax], the full curve, and the cut-off
    index. When the cut-off is not reached a message is printed and amax is -1.
    """
    n_samples = s_X.shape[0]
    EM_t = np.zeros(t.shape[0])
    EM_t[0] = 1.
    # Upper envelope over every distinct score used as a threshold.
    for threshold in np.unique(s_X):
        mass_above = 1. / n_samples * (s_X > threshold).sum()
        volume_above = t * (s_unif > threshold).sum() / n_generated * volume_support
        EM_t = np.maximum(EM_t, mass_above - volume_above)
    amax = np.argmax(EM_t <= t_max) + 1
    if amax == 1:
        print ('\n failed to achieve t_max \n')
        amax = -1
    return auc(t[:amax], EM_t[:amax]), EM_t, amax


# The smaller the score, the better.
def mv(axis_alpha, volume_support, s_unif, s_X, n_generated):
    """Compute the Mass-Volume curve of a scoring function and the area under it.

    Parameters
    ----------
    axis_alpha : numpy array of increasing mass levels.
    volume_support : factor applied to the fraction of uniform scores at or above the threshold.
    s_unif : scores of the uniformly generated points.
    s_X : scores of the data points.
    n_generated : number of uniformly generated points.

    Returns
    -------
    (AUC, curve) : area under the curve over `axis_alpha`, and the curve itself.
    """
    n_samples = s_X.shape[0]
    order = s_X.argsort()
    mass = 0
    cpt = 0
    u = s_X[order[-1]]
    curve = np.zeros(axis_alpha.shape[0])
    for i, alpha in enumerate(axis_alpha):
        # Lower the threshold, one data point at a time from the highest score,
        # until the empirical mass reaches the requested level.
        while mass < alpha:
            cpt += 1
            u = s_X[order[-cpt]]
            mass = 1. / n_samples * cpt  # sum(s_X > u)
        curve[i] = float((s_unif >= u).sum()) / n_generated * volume_support
    return auc(axis_alpha, curve), curve


In [None]:
# Settings shared by the evaluation helpers below.
np.random.seed(1)          # fixed seed for NumPy's global random generator

n_generated = 100000       # number of uniform points drawn in calculate_parameters
alpha_min = 0.9            # start of the alpha axis built in calculate_parameters
alpha_max = 0.999          # end (exclusive) of that axis
t_max = 0.9                # cut-off passed to em()
ocsvm_max_train = 10000    # only referenced by commented-out code in this notebook

In [None]:
def split_df(df):
    """One-hot encode the categorical columns and split the rows 80/20.

    `_time` is parsed with pandas.to_datetime, cast to int64 and divided by 1e6.
    The columns dst, dstp, proto, src, srcp and url are replaced by their
    one-hot encoding. All remaining columns are kept as they are.

    The input frame is not modified, so the function can safely be called more
    than once on the same frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain `_time` and the six categorical columns listed above.

    Returns
    -------
    (ohencDf, train, test)
        The encoded DataFrame, and NumPy arrays with its first 80% of rows
        (train) and the remaining rows (test).
    """
    categorical = ["dst","dstp","proto","src","srcp","url"]

    # Work on a copy: converting `_time` on the caller's frame would make a
    # second call reinterpret the already-converted numbers as timestamps.
    ohencDf = df.copy()
    ohencDf['_time'] = pd.to_datetime(ohencDf['_time']).astype(np.int64) / int(1e6)

    encoder = OneHotEncoder(handle_unknown='ignore')
    # Reuse the frame's index so the join below aligns row by row even when
    # the index is not 0..n-1.
    encoding = pd.DataFrame(
        encoder.fit_transform(ohencDf[categorical]).toarray(),
        index=ohencDf.index,
    )
    ohencDf = ohencDf.drop(categorical,axis=1).join(encoding)

    n_train = int(ohencDf.shape[0] * 0.8)
    train = np.array(ohencDf.head(n_train))
    test = np.array(ohencDf.tail(ohencDf.shape[0]-n_train))
    return ohencDf,train,test

In [None]:
def calculate_parameters(df):
    """Derive the inputs of the EM/MV metrics from the data.

    Returns (t, volume_support, axis_alpha, unif):
    * volume_support - product of the per-column ranges of `df`, replaced by 1
      unless it is greater than 1;
    * t - levels from 0 to 100 / volume_support in steps of 0.01 / volume_support;
    * axis_alpha - values from `alpha_min` to `alpha_max` in steps of 0.0001;
    * unif - `n_generated` points drawn uniformly between the per-column
      minimum and maximum of `df`.
    """
    data = np.array(df)
    lim_inf = data.min(axis=0)
    lim_sup = data.max(axis=0)

    volume_support = (lim_sup - lim_inf).prod()
    if not volume_support > 1:
        volume_support = 1

    t = np.arange(0, 100 / volume_support, 0.01 / volume_support)
    axis_alpha = np.arange(alpha_min, alpha_max, 0.0001)
    unif = np.random.uniform(lim_inf, lim_sup,size=(n_generated, df.shape[1]))
    return t,volume_support,axis_alpha,unif

In [None]:
def evaluate_model(model,train,test,unif,reshape=False):
    """Fit `model` on `train` and score `test` and `unif` with decision_function.

    With `reshape=True` each score array is reshaped to a single row and that
    row is returned. Returns the pair (scores of test, scores of unif).
    """
    model.fit(train)
    scores = [model.decision_function(test), model.decision_function(unif)]
    if reshape:
        scores = [s.reshape(1, -1)[0] for s in scores]
    return scores[0],scores[1]

In [None]:
def calculate_metrics(t,volume_support,s_unif_model,s_X_model,axis_alpha):
    """Run em() and mv() for one model's scores.

    Returns (auc_em, em_curve, amax, auc_mv, mv_curve). `t_max` and
    `n_generated` are read from the notebook's configuration.
    """
    em_result = em(t, t_max,volume_support,s_unif_model,s_X_model,n_generated)
    mv_result = mv(axis_alpha,volume_support,s_unif_model, s_X_model,n_generated)
    return em_result[0],em_result[1],em_result[2],mv_result[0],mv_result[1]

In [None]:
def plot(path,title,t,amax,em_iforest,mv_iforest,auc_em_iforest,auc_mv_iforest,em_lof,mv_lof,auc_em_lof,auc_mv_lof,em_ocsvm,mv_ocsvm,auc_em_ocsvm,auc_mv_ocsvm,axis_alpha):
    """Save the Excess-Mass and Mass-Volume curves of the three models side by side.

    The left panel shows the EM curves up to index `amax`, the right panel the MV
    curves over `axis_alpha`; each legend entry carries the model's score. The
    figure is written to `path + 'EM_MV_plot.png'` and then closed, so repeated
    calls do not accumulate open figures. A closed figure is not shown
    automatically by the notebook; open the saved file to view it.
    """
    fig, (ax1, ax2) = plt.subplots(1, 2,figsize=(10, 5))
    fig.suptitle(title,fontsize=25,y=1.05)

    ax1.plot(t[:amax], em_iforest[:amax], lw=1,label='%s (em_score = %0.3e)'% ('iforest', auc_em_iforest))
    ax1.plot(t[:amax], em_lof[:amax], lw=1,label='%s (em-score = %0.3e)'% ('lof', auc_em_lof))
    ax1.plot(t[:amax], em_ocsvm[:amax], lw=1,label='%s (em-score = %0.3e)'% ('ocsvm', auc_em_ocsvm))
    ax1.set_xlabel('t',fontsize=20)
    ax1.set_ylabel('EM(t)',fontsize=20)
    ax1.set_ylim([-0.05, 1.05])
    ax1.set_title('Excess-Mass curve', fontsize=20)
    ax1.legend(loc="upper center",bbox_to_anchor=(0.5,-0.15),fancybox=True)

    ax2.plot(axis_alpha, mv_iforest, lw=1,label='%s (mv-score = %0.3e)'% ('iforest', auc_mv_iforest))
    ax2.plot(axis_alpha, mv_lof, lw=1,label='%s (mv-score = %0.3e)'% ('lof', auc_mv_lof))
    ax2.plot(axis_alpha, mv_ocsvm, lw=1,label='%s (mv-score = %0.3e)'% ('ocsvm', auc_mv_ocsvm))
    ax2.set_xlabel('alpha', fontsize=20)
    ax2.set_ylabel('MV(alpha)', fontsize=20)
    ax2.set_title('Mass-Volume Curve', fontsize=20)
    ax2.legend(loc="upper center",bbox_to_anchor=(0.5,-0.15),fancybox=True)

    fig.subplots_adjust(wspace=0.4)
    fig.tight_layout()
    # Save through the figure handle rather than pyplot's "current figure".
    fig.savefig(path + 'EM_MV_plot.png',bbox_inches='tight')
    plt.close(fig)

In [None]:
files = ['2021-04-01_2021-05-01_clean','2021-05-01_2021-06-01_clean','2021-06-01_2021-07-01_clean','2021-07-01_2021-08-01_clean']

# For every (value, service) pair: load the monthly files, fit the three detectors,
# compute their EM/MV curves and save the comparison plot next to the data.
# NOTE(review): `services` is not defined in any cell visible in this notebook,
# and `values` is whatever an earlier cell last assigned - define both explicitly
# before running this cell.
for value in values:
    for service in services:
        title = value + ' ' + service
        print(title)
        path = 'dataset/' + value + '/' + service + '/'
        frames = []
        for file in files:
            try:
                frames.append(pd.read_csv(path + file + '.csv',header=0).sort_values(by='_time',ascending=True))
            except FileNotFoundError:
                # Presumably not every service has a file for every month; skip quietly.
                continue
            except Exception as exc:
                # Still best effort, but no longer silent about unreadable files.
                print('Skipping ' + path + file + '.csv: ' + repr(exc))
                continue
        if not frames:
            # Without any data the split below would fail on the missing '_time' column.
            print('No data found for ' + title)
            continue
        # Concatenate once instead of growing the frame inside the loop.
        df = pd.concat(frames,axis=0,ignore_index=True)
        df_tot,df_train,df_test = split_df(df)
        t,volume_support,axis_alpha,unif = calculate_parameters(df_tot)
        start = timeit.default_timer()
        s_X_iforest,s_unif_iforest = evaluate_model(IsolationForest(),df_train,df_test,unif)
        s_X_lof,s_unif_lof = evaluate_model(LocalOutlierFactor(n_neighbors=20,novelty=True),df_train,df_test,unif)
        s_X_ocsvm,s_unif_ocsvm = evaluate_model(OneClassSVM(),df_train,df_test,unif,True)
        stop = timeit.default_timer()
        print('Time: ', stop - start)
        auc_em_iforest,em_iforest,amax_iforest,auc_mv_iforest,mv_iforest = calculate_metrics(t,volume_support,s_unif_iforest,s_X_iforest,axis_alpha)
        auc_em_lof,em_lof,amax_lof,auc_mv_lof,mv_lof = calculate_metrics(t,volume_support,s_unif_lof,s_X_lof,axis_alpha)
        auc_em_ocsvm,em_ocsvm,amax_ocsvm,auc_mv_ocsvm,mv_ocsvm = calculate_metrics(t,volume_support,s_unif_ocsvm,s_X_ocsvm,axis_alpha)
        # If any model failed to reach t_max (amax == -1), plot with -1 as the cut-off;
        # otherwise use the largest cut-off so every curve is shown in full.
        if amax_iforest == -1 or amax_lof == -1 or amax_ocsvm == -1:
            amax = -1
        else:
            amax = max(amax_iforest, amax_lof, amax_ocsvm)
        plot(path,title,t,amax,em_iforest,mv_iforest,auc_em_iforest,auc_mv_iforest,em_lof,mv_lof,auc_em_lof,auc_mv_lof,em_ocsvm,mv_ocsvm,auc_em_ocsvm,auc_mv_ocsvm,axis_alpha)
        