# Welcome! And good luck :)
Make sure 'IMAP-login-events.csv' and 'ground_truth.pickle' are in the same working directory as this notebook, and that all the required packages are installed and loaded.

This is a modified version of the IMAP dataset in which only login commands are kept and the cIp column was split into ip and port for your convenience.

In [15]:
# installing and loading the required packages
# !pip install dask
# !pip install ciso8601
import dask.dataframe as dd
import numpy as np
import ciso8601
import pickle

# loading in the dataset
# NOTE(review): dask normally builds the dataframe lazily, so the CSV is presumably
# only scanned when a result is requested (len / head / iterrows below) — confirm.
login_df = dd.read_csv('./IMAP-login-events.csv')

# used to avoid np.Datetime64 warning
def to_np_datetime64(timestamp):
    """Turn a timestamp string into a timezone-naive ``numpy.datetime64``.

    Only the first 26 characters of ``timestamp`` are parsed; for a value such
    as '2021-11-08 23:00:02.102000+00:00' that is everything up to and
    including the microseconds. Any timezone information still attached to the
    parsed value is dropped without shifting the clock time.

    Parameters
    ----------
    timestamp : str
        Timestamp text as found in the 'dateTime' column.

    Returns
    -------
    numpy.datetime64
        The parsed, timezone-naive instant.
    """
    naive_dt = ciso8601.parse_datetime(timestamp[:26]).replace(tzinfo=None)
    return np.datetime64(naive_dt)

## Getting familiar with the login data

In [16]:
# Report how many rows the login dataframe holds, then preview its first rows.
# NOTE(review): on a dask dataframe len() presumably triggers a full pass over the CSV — confirm.
print('Count of login instances: ', len(login_df))
login_df.head()

Count of login instances:  2278031


Unnamed: 0.1,Unnamed: 0,index,dateTime,sessionId,seqNumber,sIp,ip,port,user,duration,rqsize,rpsize,command,parameters,context,puid,server
0,5,5,2021-11-08 23:00:02.102000+00:00,000000000015B715,3,131.155.15.37:993,170.51.30.236,14490,striddle,5,22,20,login,striddle *****,"r=""3 no login failed."";msg=logonfailed:logonde...",,EXCH19MBX-OP01
1,5393195,3,2021-11-08 23:00:17.071000+00:00,00000000000B3D71,3,131.155.15.38:993,170.51.30.236,14476,striddle,3,22,20,login,striddle *****,"r=""3 no login failed."";msg=logonfailed:logonde...",,EXCH19MBX-OP02
2,5393203,11,2021-11-08 23:00:32.094000+00:00,00000000000B3D73,3,131.155.15.38:993,170.51.30.236,14478,striddle,3,22,20,login,striddle *****,"r=""3 no login failed."";msg=logonfailed:logonde...",,EXCH19MBX-OP02
3,5393207,15,2021-11-08 23:00:47.174000+00:00,00000000000B3D74,3,131.155.15.38:993,170.51.30.236,13104,striddle,3,22,20,login,striddle *****,"r=""3 no login failed."";msg=logonfailed:logonde...",,EXCH19MBX-OP02
4,5393213,21,2021-11-08 23:01:02.202000+00:00,00000000000B3D75,3,131.155.15.38:993,170.51.30.236,13106,striddle,3,22,20,login,striddle *****,"r=""3 no login failed."";msg=logonfailed:logonde...",,EXCH19MBX-OP02


## Creating a framework for data analysis
Feel free not to use it, or to create your own methods; the class below is only an example.

In [17]:
class AnomalyInfo:
    """Mutable bookkeeping record used while scanning the login events.

    Each instance stores an event counter, a start timestamp, an optional end
    timestamp, and the sets of IPs and users that were added to it.
    """

    def __init__(self, timestamp):
        """Create a record with a zero counter that starts at ``timestamp``."""
        self.counter = 0
        self.start_timestamp = timestamp
        # Defined up front so that reading it before set_end_timestamp() yields
        # None instead of raising AttributeError.
        self.end_timestamp = None
        self.ips = set()
        self.users = set()

    def increase_counter(self):
        """Increment the counter by one."""
        self.counter += 1

    def reset_counter(self, timestamp):
        """Set the counter back to zero and restart from ``timestamp``.

        The collected IPs and users are left untouched.
        """
        self.counter = 0
        self.start_timestamp = timestamp

    def get_counter(self):
        """Return the current counter value."""
        return self.counter

    def set_start_timestamp(self, start_timestamp):
        """Overwrite the start timestamp."""
        self.start_timestamp = start_timestamp

    def set_end_timestamp(self, end_timestamp):
        """Store the end timestamp."""
        self.end_timestamp = end_timestamp

    def get_timestamp_diff(self, end_timestamp):
        """Return ``end_timestamp - start_timestamp`` in seconds as a float.

        Works for ``numpy.datetime64`` values of any resolution and for plain
        ``datetime.datetime`` values.
        """
        diff = end_timestamp - self.start_timestamp
        if isinstance(diff, np.timedelta64):
            # Dividing by one second is unit-independent; .item() would return
            # a bare int for nanosecond resolution, which has no total_seconds().
            return float(diff / np.timedelta64(1, 's'))
        # Plain datetime.timedelta (from datetime.datetime operands).
        return diff.total_seconds()

    def get_ips(self):
        """Return the set of IPs added to this record."""
        return self.ips

    def add_ip(self, ip):
        """Add ``ip`` to the record (duplicates are ignored by the set)."""
        self.ips.add(ip)

    def get_users(self):
        """Return the set of users added to this record."""
        return self.users

    def add_user(self, user):
        """Add ``user`` to the record (duplicates are ignored by the set)."""
        self.users.add(user)

## Formalize your anomaly detection
Feel free to edit or add any methods in the class defined above. It is meant to provide you with a framework and make your work easier, not to restrict your creativity.

Find a way to filter the malicious IPs (you can do users as well, but the verification method will take in the IPs)

In [89]:
# State accumulated while scanning the log.
ip_dic = {}  # ip -> AnomalyInfo
user_dic = {}
detected_ip_set = set()
detected_users_set = set()
successfull_login_attempt = set()
short_ips = set()

# TODO: come up with some parameters to use in your filtering
tolerance = None
hyperparameter = None
# Substring to look for in the 'context' column; None keeps the placeholder
# check below switched off.
context_marker = None


# Loop through each row of dask dataframe
for index, row in login_df.iterrows():
    ip = row['ip']
    user = row['user']
    timestamp = to_np_datetime64(row['dateTime'])

    # Add new ip to hashmap
    if ip not in ip_dic:
        ip_dic[ip] = AnomalyInfo(timestamp)

    # TODO: Implement checks and make use of parameters to filter out the anomaly. Explore what works best :)
    # `None in row['context']` raises TypeError for both string and missing
    # (NaN) values, so only test membership with a real marker on real strings.
    context = row['context']
    if context_marker is not None and isinstance(context, str) and context_marker in context:
        pass

# print(f"detected_users_set {detected_users_set}")
print(f"detected_ip_set has size {len(detected_ip_set)}")
# print(f"malicious ips {detected_ip_set}")

detected_ip_set has size 49992


## Verify your results

In [88]:
# NOTE(security): pickle.load can execute arbitrary code while deserialising;
# only open a ground-truth file that comes from a trusted source.
with open('./ground_truth.pickle', 'rb') as truth:
    sets_dict = pickle.load(truth)
malicious_user_set = sets_dict['malicious_user_set']
malicious_ip_set = sets_dict['malicious_ip_set']


# Detected IPs that really are malicious.
intersection = malicious_ip_set.intersection(detected_ip_set)
print(f"Actual malicious IPs count: {len(malicious_ip_set)}")
print(f"Detected IPs count: {len(detected_ip_set)}")
print(f"len of intersection {len(intersection)}")
# Share of detected IPs that are truly malicious (i.e. precision, printed as
# "Accuracy"). It is undefined when nothing was detected, so report NaN rather
# than crashing with ZeroDivisionError.
detected_count = len(detected_ip_set)
ratio = len(intersection) / detected_count if detected_count else float('nan')
# Share of detected IPs that are NOT in the ground-truth set.
false_positive_rate = 1 - ratio
print(f'Accuracy: {ratio}')
print(f"False positive rate of {false_positive_rate}")

Actual malicious IPs count: 50794
Detected IPs count: 41387
len of intersection 41231
Accuracy: 0.9962307004614975
False positive rate of 0.0037692995385024686
