## Behavioral Analytics: Insider Threat Detection

### Anomaly Detection Model: IsolationForest Algorithm



In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import IsolationForest

#### Graph data: User-PC interrelationship

- Model with input parameters from graph analysis (node degree which is PC count for each user)

In [3]:
user_pc = pd.read_csv('./Data_Subset/Input_features/user_pc_gdegree.csv')

In [4]:
user_pc.head()

Unnamed: 0,user,pc_count
0,XTM0246,1
1,MHH0652,1
2,WXW0044,2
3,CAE0080,3
4,REM0274,1


In [5]:
user_pc_ct = np.array(user_pc['pc_count'])

In [6]:
user_pc_ct

array([ 1,  1,  2,  3,  1,  1, 34,  2,  3,  1,  1,  1,  2,  1,  3,  2,  3,
        2,  1,  1,  1,  3,  1,  1,  1,  1,  2,  1, 27, 33, 20,  6,  1,  1,
        2,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  2,  1,  1,
        2,  2,  1,  3,  2,  3,  1,  1,  5,  1,  1,  1,  1,  1,  1, 12,  1,
        1,  1,  1,  1,  1,  2,  1,  1,  1,  1,  1,  3,  1,  1,  1,  1,  1,
        1, 43,  1,  1,  1,  1,  1,  1,  1,  1, 35,  1,  1,  2,  1,  3,  1,
        2,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  3,  1,  3,  1,  1,  3,
        2,  1,  2,  3,  1,  1,  1,  1,  1,  1,  1,  3,  1,  1,  1,  2,  1,
        1,  3,  1,  1,  1,  3, 55,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1, 30,  1,  1,  3,  1,  1,  1,  1,  3,  1,  1,  5,  1,  2,  1,
        3,  1,  1,  1,  1,  1,  1,  1,  1,  1,  2, 40,  3,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  2,  3,  1,  1,  1,  1,  2,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  2,  2,  1,  3,  1,  2,  2,  1,  1,  1,  3,
        1,  1,  1,  1,  1

In [7]:
user_pc_ct = user_pc_ct.reshape(-1,1) # need to reshape(-1,1) for single feature, (1,-1) 
                                    #for single sample or multiple features
forest = IsolationForest()

forest.fit(user_pc_ct)

IsolationForest(bootstrap=False, contamination=0.1, max_features=1.0,
        max_samples='auto', n_estimators=100, n_jobs=1, random_state=None,
        verbose=0)

In [9]:
# Getting the anomaly score for each user

graph_a_score = forest.decision_function(user_pc_ct)
graph_a_score


array([ 0.10330184,  0.10330184,  0.01363095,  0.01311561,  0.10330184,
        0.10330184, -0.10326297,  0.01363095,  0.01311561,  0.10330184,
        0.10330184,  0.10330184,  0.01363095,  0.10330184,  0.01311561,
        0.01363095,  0.01311561,  0.01363095,  0.10330184,  0.10330184,
        0.10330184,  0.01311561,  0.10330184,  0.10330184,  0.10330184,
        0.10330184,  0.01363095,  0.10330184, -0.155851  , -0.1116786 ,
       -0.20361915, -0.17791213,  0.10330184,  0.10330184,  0.01363095,
        0.10330184,  0.10330184,  0.10330184,  0.10330184,  0.10330184,
        0.10330184,  0.10330184,  0.10330184,  0.10330184,  0.10330184,
        0.10330184,  0.10330184,  0.10330184,  0.01363095,  0.10330184,
        0.10330184,  0.01363095,  0.01363095,  0.10330184,  0.01311561,
        0.01363095,  0.01311561,  0.10330184,  0.10330184, -0.14461441,
        0.10330184,  0.10330184,  0.10330184,  0.10330184,  0.10330184,
        0.10330184, -0.2227895 ,  0.10330184,  0.10330184,  0.10

In [13]:
type(graph_a_score)

numpy.ndarray

In [14]:
graph_a_score.tolist()

[0.10330183674042703,
 0.10330183674042703,
 0.013630947752081679,
 0.013115610090399854,
 0.10330183674042703,
 0.10330183674042703,
 -0.10326296510555455,
 0.013630947752081679,
 0.013115610090399854,
 0.10330183674042703,
 0.10330183674042703,
 0.10330183674042703,
 0.013630947752081679,
 0.10330183674042703,
 0.013115610090399854,
 0.013630947752081679,
 0.013115610090399854,
 0.013630947752081679,
 0.10330183674042703,
 0.10330183674042703,
 0.10330183674042703,
 0.013115610090399854,
 0.10330183674042703,
 0.10330183674042703,
 0.10330183674042703,
 0.10330183674042703,
 0.013630947752081679,
 0.10330183674042703,
 -0.15585099785783885,
 -0.11167860358098058,
 -0.2036191534798859,
 -0.1779121294552667,
 0.10330183674042703,
 0.10330183674042703,
 0.013630947752081679,
 0.10330183674042703,
 0.10330183674042703,
 0.10330183674042703,
 0.10330183674042703,
 0.10330183674042703,
 0.10330183674042703,
 0.10330183674042703,
 0.10330183674042703,
 0.10330183674042703,
 0.10330183674042

In [16]:
# Define an empty datframe to store the result for the graph input features
graph_result = pd.DataFrame()


In [19]:
#graph_result['user'] = user_pc['user']
graph_result['ascore'] = graph_a_score

In [25]:
graph_result.loc[graph_result['ascore'] < 0] # possible outliers

Unnamed: 0,user,ascore
6,ALC0100,-0.103263
28,CQS0899,-0.155851
29,EER0383,-0.111679
30,BGZ0902,-0.203619
31,DNS0397,-0.177912
59,CGH0088,-0.144614
66,WJP0386,-0.222789
86,FED0275,-0.20459
95,WMP0272,-0.131175
142,LWB0078,-0.288367


In [26]:
# Save the graph result
graph_result.to_csv('./Data_Subset/IFResult/graph_result.csv', index = False)

-  #### Model with Logon/Logoff parameters as input features

In [27]:
# Load Logon/Logoff statistics data saved previously

user_logon_stats = pd.read_csv('./Data_Subset/Input_features/user_logon_stats.csv')

In [28]:
user_logon_stats.head()

Unnamed: 0,user,min,max,mode,mean
0,AAN0077,07:45:00,08:13:00,08:13:00,07:00:00
1,AAW0103,07:00:00,07:18:00,07:15:00,07:00:00
2,ACE0265,08:00:00,08:25:00,08:15:00,08:00:00
3,ACL0394,09:00:00,16:52:37,09:15:00,09:00:00
4,ADR0362,07:30:00,07:48:00,07:48:00,07:00:00


In [31]:
user_logon_stats['min'].dtype

dtype('O')

In [41]:
pd.to_datetime(user_logon_stats['min'][1]).hour

7

#### sklearn algorithms do not take categorical/object as input parameters, hence need to convert the time object into numerical equivalents/values

In [47]:
# Function to convert datetime 'time' to time in seconds

def dtt2timestamp(dtt):
    ts = (dtt.hour * 60 + dtt.minute) * 60 + dtt.second
    #if you want microseconds as well
    #ts += dtt.microsecond * 10**(-6)
    return ts


In [42]:
user_logon_stats1 = user_logon_stats

In [45]:
user_logon_stats1['min_dt'] = pd.to_datetime(user_logon_stats['min'])
user_logon_stats1['max_dt'] = pd.to_datetime(user_logon_stats['max'])
user_logon_stats1['mode_dt'] = pd.to_datetime(user_logon_stats['mode'])
user_logon_stats1['mean_dt'] = pd.to_datetime(user_logon_stats['mean'])

In [46]:
user_logon_stats1.head()

Unnamed: 0,user,min,max,mode,mean,min_dt,max_dt,mode_dt,mean_dt
0,AAN0077,07:45:00,08:13:00,08:13:00,07:00:00,2017-06-19 07:45:00,2017-06-19 08:13:00,2017-06-19 08:13:00,2017-06-19 07:00:00
1,AAW0103,07:00:00,07:18:00,07:15:00,07:00:00,2017-06-19 07:00:00,2017-06-19 07:18:00,2017-06-19 07:15:00,2017-06-19 07:00:00
2,ACE0265,08:00:00,08:25:00,08:15:00,08:00:00,2017-06-19 08:00:00,2017-06-19 08:25:00,2017-06-19 08:15:00,2017-06-19 08:00:00
3,ACL0394,09:00:00,16:52:37,09:15:00,09:00:00,2017-06-19 09:00:00,2017-06-19 16:52:37,2017-06-19 09:15:00,2017-06-19 09:00:00
4,ADR0362,07:30:00,07:48:00,07:48:00,07:00:00,2017-06-19 07:30:00,2017-06-19 07:48:00,2017-06-19 07:48:00,2017-06-19 07:00:00


In [50]:
# use the function to generate time in sec value

min_ts = [dtt2timestamp(dtt) for dtt in user_logon_stats1['min_dt']] 
max_ts = [dtt2timestamp(dtt) for dtt in user_logon_stats1['max_dt']]
mode_ts = [dtt2timestamp(dtt) for dtt in user_logon_stats1['mode_dt']]
mean_ts = [dtt2timestamp(dtt) for dtt in user_logon_stats1['mean_dt']]


In [53]:

user_logon_stats1['min_ts'] = min_ts
user_logon_stats1['max_ts'] = max_ts
user_logon_stats1['mode_ts'] = mode_ts
user_logon_stats1['mean_ts'] = mean_ts

In [54]:
# new df to store the tsec values
user_logon_stats_tsec = pd.DataFrame()

user_logon_stats_tsec['user'] = user_logon_stats1['user']
user_logon_stats_tsec['min_ts'] = user_logon_stats1['min_ts']
user_logon_stats_tsec['max_ts'] = user_logon_stats1['max_ts']
user_logon_stats_tsec['mode_ts'] = user_logon_stats1['mode_ts']
user_logon_stats_tsec['mean_ts'] = user_logon_stats1['mean_ts']


In [55]:
user_logon_stats_tsec.head()

Unnamed: 0,user,min_ts,max_ts,mode_ts,mean_ts
0,AAN0077,27900,29580,29580,25200
1,AAW0103,25200,26280,26100,25200
2,ACE0265,28800,30300,29700,28800
3,ACL0394,32400,60757,33300,32400
4,ADR0362,27000,28080,28080,25200


In [56]:
# save the data
user_logon_stats_tsec.to_csv('./Data_Subset/Input_features/user_logon_stats_tsec.csv', index = False)

In [58]:
user_logoff_stats = pd.read_csv('./Data_Subset/Input_features/user_logoff_stats.csv')

In [59]:
user_logoff_stats.head()

Unnamed: 0,user,min,max,mode,mean
0,AAN0077,16:47:00,17:15:00,16:56:00,16:00:00
1,AAW0103,16:12:00,16:30:00,16:14:00,16:00:00
2,ACE0265,16:05:00,16:30:00,16:06:00,16:00:00
3,ACL0394,09:13:55,17:30:00,17:15:00,16:00:00
4,ADR0362,16:42:00,17:00:00,16:45:00,16:00:00


In [60]:
user_logoff_stats1 = user_logoff_stats

In [61]:
user_logoff_stats1['min_dt'] = pd.to_datetime(user_logoff_stats['min'])
user_logoff_stats1['max_dt'] = pd.to_datetime(user_logoff_stats['max'])
user_logoff_stats1['mode_dt'] = pd.to_datetime(user_logoff_stats['mode'])
user_logoff_stats1['mean_dt'] = pd.to_datetime(user_logoff_stats['mean'])

In [62]:
user_logoff_stats1.head()

Unnamed: 0,user,min,max,mode,mean,min_dt,max_dt,mode_dt,mean_dt
0,AAN0077,16:47:00,17:15:00,16:56:00,16:00:00,2017-06-19 16:47:00,2017-06-19 17:15:00,2017-06-19 16:56:00,2017-06-19 16:00:00
1,AAW0103,16:12:00,16:30:00,16:14:00,16:00:00,2017-06-19 16:12:00,2017-06-19 16:30:00,2017-06-19 16:14:00,2017-06-19 16:00:00
2,ACE0265,16:05:00,16:30:00,16:06:00,16:00:00,2017-06-19 16:05:00,2017-06-19 16:30:00,2017-06-19 16:06:00,2017-06-19 16:00:00
3,ACL0394,09:13:55,17:30:00,17:15:00,16:00:00,2017-06-19 09:13:55,2017-06-19 17:30:00,2017-06-19 17:15:00,2017-06-19 16:00:00
4,ADR0362,16:42:00,17:00:00,16:45:00,16:00:00,2017-06-19 16:42:00,2017-06-19 17:00:00,2017-06-19 16:45:00,2017-06-19 16:00:00


In [63]:
# use the function to generate time in sec value

off_min_ts = [dtt2timestamp(dtt) for dtt in user_logoff_stats1['min_dt']] 
off_max_ts = [dtt2timestamp(dtt) for dtt in user_logoff_stats1['max_dt']]
off_mode_ts = [dtt2timestamp(dtt) for dtt in user_logoff_stats1['mode_dt']]
off_mean_ts = [dtt2timestamp(dtt) for dtt in user_logoff_stats1['mean_dt']]



In [64]:
user_logoff_stats1['min_ts'] =off_min_ts
user_logoff_stats1['max_ts'] = off_max_ts
user_logoff_stats1['mode_ts'] = off_mode_ts
user_logoff_stats1['mean_ts'] = off_mean_ts

In [65]:
user_logoff_stats1.head()

Unnamed: 0,user,min,max,mode,mean,min_dt,max_dt,mode_dt,mean_dt,min_ts,max_ts,mode_ts,mean_ts
0,AAN0077,16:47:00,17:15:00,16:56:00,16:00:00,2017-06-19 16:47:00,2017-06-19 17:15:00,2017-06-19 16:56:00,2017-06-19 16:00:00,60420,62100,60960,57600
1,AAW0103,16:12:00,16:30:00,16:14:00,16:00:00,2017-06-19 16:12:00,2017-06-19 16:30:00,2017-06-19 16:14:00,2017-06-19 16:00:00,58320,59400,58440,57600
2,ACE0265,16:05:00,16:30:00,16:06:00,16:00:00,2017-06-19 16:05:00,2017-06-19 16:30:00,2017-06-19 16:06:00,2017-06-19 16:00:00,57900,59400,57960,57600
3,ACL0394,09:13:55,17:30:00,17:15:00,16:00:00,2017-06-19 09:13:55,2017-06-19 17:30:00,2017-06-19 17:15:00,2017-06-19 16:00:00,33235,63000,62100,57600
4,ADR0362,16:42:00,17:00:00,16:45:00,16:00:00,2017-06-19 16:42:00,2017-06-19 17:00:00,2017-06-19 16:45:00,2017-06-19 16:00:00,60120,61200,60300,57600


In [66]:
# new df to store the tsec values
user_logoff_stats_tsec = pd.DataFrame()

user_logoff_stats_tsec['user'] = user_logoff_stats1['user']
user_logoff_stats_tsec['min_ts'] = user_logoff_stats1['min_ts']
user_logoff_stats_tsec['max_ts'] = user_logoff_stats1['max_ts']
user_logoff_stats_tsec['mode_ts'] = user_logoff_stats1['mode_ts']
user_logoff_stats_tsec['mean_ts'] = user_logoff_stats1['mean_ts']

In [67]:
user_logoff_stats_tsec.head()

Unnamed: 0,user,min_ts,max_ts,mode_ts,mean_ts
0,AAN0077,60420,62100,60960,57600
1,AAW0103,58320,59400,58440,57600
2,ACE0265,57900,59400,57960,57600
3,ACL0394,33235,63000,62100,57600
4,ADR0362,60120,61200,60300,57600


In [68]:
# save the data
user_logoff_stats_tsec.to_csv('./Data_Subset/Input_features/user_logoff_stats_tsec.csv', index = False)


In [69]:
# combined logon/logoff data for IForest input

ulog_on_off_stats = pd.DataFrame()

ulog_on_off_stats['user'] = user_logon_stats_tsec['user']

ulog_on_off_stats['on_min_ts'] = user_logon_stats_tsec['min_ts']
ulog_on_off_stats['on_max_ts'] = user_logon_stats_tsec['max_ts']
ulog_on_off_stats['on_mode_ts'] = user_logon_stats_tsec['mode_ts']
ulog_on_off_stats['on_mean_ts'] = user_logon_stats_tsec['mean_ts']

ulog_on_off_stats['off_min_ts'] = user_logoff_stats_tsec['min_ts']
ulog_on_off_stats['off_max_ts'] = user_logoff_stats_tsec['max_ts']
ulog_on_off_stats['off_mode_ts'] = user_logoff_stats_tsec['mode_ts']
ulog_on_off_stats['off_mean_ts'] = user_logoff_stats_tsec['mean_ts']


In [72]:
ulog_on_off_stats.head()

Unnamed: 0,user,on_min_ts,on_max_ts,on_mode_ts,on_mean_ts,off_min_ts,off_max_ts,off_mode_ts,off_mean_ts
0,AAN0077,27900,29580,29580,25200,60420,62100,60960,57600
1,AAW0103,25200,26280,26100,25200,58320,59400,58440,57600
2,ACE0265,28800,30300,29700,28800,57900,59400,57960,57600
3,ACL0394,32400,60757,33300,32400,33235,63000,62100,57600
4,ADR0362,27000,28080,28080,25200,60120,61200,60300,57600


In [73]:
# save the data
ulog_on_off_stats.to_csv('./Data_Subset/Input_features/ulog_on_off_stats.csv', index = False)

In [80]:
# fit the model

# input array
ulog_on_off_stats.columns[1:]

ulog_stats = ulog_on_off_stats.as_matrix(columns=ulog_on_off_stats.columns[1:])
ulog_stats


array([[27900, 29580, 29580, ..., 62100, 60960, 57600],
       [25200, 26280, 26100, ..., 59400, 58440, 57600],
       [28800, 30300, 29700, ..., 59400, 57960, 57600],
       ..., 
       [27000, 28020, 27900, ..., 61200, 60300, 57600],
       [29700, 52585, 30600, ..., 71100, 70200, 68400],
       [25200, 26760, 26100, ..., 59400, 57900, 57600]])

In [81]:
# fit the model contd..

ulog_stats = ulog_stats #.reshape(1,-1) # need to reshape(-1,1) for single feature, (1,-1) 
                                      #for single sample or multiple features
forest = IsolationForest()

forest.fit(ulog_stats)

IsolationForest(bootstrap=False, contamination=0.1, max_features=1.0,
        max_samples='auto', n_estimators=100, n_jobs=1, random_state=None,
        verbose=0)

In [82]:
# anomaly score

ulog_ascore = forest.decision_function(ulog_stats)
ulog_ascore

array([ 0.05824133,  0.05692065,  0.07518662,  0.01529839,  0.10056027,
        0.05087507,  0.04265962,  0.10629041,  0.05432064,  0.04140386,
        0.0430556 ,  0.03612857, -0.04771475,  0.00201915,  0.01407778,
        0.06965512, -0.0562279 ,  0.04867722, -0.03200366,  0.06277468,
        0.0188337 ,  0.10728633, -0.00220919, -0.02580492,  0.01090464,
        0.04021933,  0.05787506, -0.00338478,  0.04039823, -0.02468814,
        0.02162531, -0.11432718,  0.04234088,  0.04867722,  0.08861629,
        0.07386465,  0.05095684,  0.02598132,  0.042656  ,  0.09807391,
        0.03646571,  0.10678032,  0.05448421,  0.02057304, -0.03939323,
        0.08635676,  0.03260143,  0.06291462,  0.0724682 ,  0.00403469,
        0.063508  ,  0.03976181, -0.03314996,  0.09810085,  0.05095684,
       -0.00838449, -0.11788993,  0.02884665, -0.01879766,  0.04632841,
        0.10263663, -0.00616013,  0.0120744 ,  0.09921192,  0.07290652,
        0.00340998,  0.00175891,  0.06062245,  0.05484924, -0.01

In [83]:
ulog_ascore.shape

(232,)

In [95]:
# Save the result
user_log_result = pd.DataFrame()

user_log_result['user'] = ulog_on_off_stats['user']

user_log_result['ascore'] = ulog_ascore


In [96]:
user_log_result.head()

Unnamed: 0,user,ascore
0,AAN0077,0.058241
1,AAW0103,0.056921
2,ACE0265,0.075187
3,ACL0394,0.015298
4,ADR0362,0.10056


In [97]:
# save the result

user_log_result.to_csv('./Data_Subset/IFResult/user_log_result.csv', index = False)


- #### Model with removable media (device) and file transfer stats as input features

In [98]:
# Load the data

device_conn_stats = pd.read_csv('./Data_Subset/Input_features/device_conn_stats.csv')
device_disconn_stats = pd.read_csv('./Data_Subset/Input_features/device_disconn_stats.csv')
files_per_day_stats = pd.read_csv('./Data_Subset/Input_features/files_per_day_stats.csv')


In [99]:
device_conn_stats.head()

Unnamed: 0,user,min,max,mode,mean
0,AJQ0376,08:26:41,19:04:54,13:58:30,12:00:00
1,AJR0231,07:31:40,15:48:38,15:01:53,10:00:00
2,AOD0066,07:41:33,15:31:35,09:56:08,10:00:00
3,ARH0777,07:42:36,16:34:49,14:19:10,11:00:00
4,BCP0247,01:21:10,23:32:44,12:57:44,11:00:00


In [100]:
device_conn_stats1 = device_conn_stats

In [101]:
device_conn_stats1['min_dt'] = pd.to_datetime(device_conn_stats['min'])
device_conn_stats1['max_dt'] = pd.to_datetime(device_conn_stats['max'])
device_conn_stats1['mode_dt'] = pd.to_datetime(device_conn_stats['mode'])
device_conn_stats1['mean_dt'] = pd.to_datetime(device_conn_stats['mean'])


In [102]:
# use the function to generate time in sec value

con_min_ts = [dtt2timestamp(dtt) for dtt in device_conn_stats1['min_dt']]
con_max_ts = [dtt2timestamp(dtt) for dtt in device_conn_stats1['max_dt']]
con_mode_ts = [dtt2timestamp(dtt) for dtt in device_conn_stats1['mode_dt']]
con_mean_ts = [dtt2timestamp(dtt) for dtt in device_conn_stats1['mean_dt']]



In [103]:
device_conn_stats1.head()

Unnamed: 0,user,min,max,mode,mean,min_dt,max_dt,mode_dt,mean_dt
0,AJQ0376,08:26:41,19:04:54,13:58:30,12:00:00,2017-06-19 08:26:41,2017-06-19 19:04:54,2017-06-19 13:58:30,2017-06-19 12:00:00
1,AJR0231,07:31:40,15:48:38,15:01:53,10:00:00,2017-06-19 07:31:40,2017-06-19 15:48:38,2017-06-19 15:01:53,2017-06-19 10:00:00
2,AOD0066,07:41:33,15:31:35,09:56:08,10:00:00,2017-06-19 07:41:33,2017-06-19 15:31:35,2017-06-19 09:56:08,2017-06-19 10:00:00
3,ARH0777,07:42:36,16:34:49,14:19:10,11:00:00,2017-06-19 07:42:36,2017-06-19 16:34:49,2017-06-19 14:19:10,2017-06-19 11:00:00
4,BCP0247,01:21:10,23:32:44,12:57:44,11:00:00,2017-06-19 01:21:10,2017-06-19 23:32:44,2017-06-19 12:57:44,2017-06-19 11:00:00


In [104]:
# new dataframe 

device_conn_stats_tsec = pd.DataFrame()

device_conn_stats_tsec['user'] = device_conn_stats1['user']
device_conn_stats_tsec['con_min_ts'] = con_min_ts
device_conn_stats_tsec['con_max_ts'] = con_max_ts
device_conn_stats_tsec['con_mode_ts'] = con_mode_ts
device_conn_stats_tsec['con_mean_ts'] = con_mean_ts


In [105]:
device_conn_stats_tsec.head()

Unnamed: 0,user,con_min_ts,con_max_ts,con_mode_ts,con_mean_ts
0,AJQ0376,30401,68694,50310,43200
1,AJR0231,27100,56918,54113,36000
2,AOD0066,27693,55895,35768,36000
3,ARH0777,27756,59689,51550,39600
4,BCP0247,4870,84764,46664,39600


In [106]:
# save the data
device_conn_stats_tsec.to_csv('./Data_Subset/Input_features/device_conn_stats_tsec.csv', index = False)


In [107]:
device_disconn_stats1 = device_disconn_stats

In [108]:
device_disconn_stats1['min_dt'] = pd.to_datetime(device_disconn_stats['min'])
device_disconn_stats1['max_dt'] = pd.to_datetime(device_disconn_stats['max'])
device_disconn_stats1['mode_dt'] = pd.to_datetime(device_disconn_stats['mode'])
device_disconn_stats1['mean_dt'] = pd.to_datetime(device_disconn_stats['mean'])

In [109]:
device_disconn_stats1.head()

Unnamed: 0,user,min,max,mode,mean,min_dt,max_dt,mode_dt,mean_dt
0,AJQ0376,08:45:27,19:38:50,15:46:32,14:00:00,2017-06-19 08:45:27,2017-06-19 19:38:50,2017-06-19 15:46:32,2017-06-19 14:00:00
1,AJR0231,07:41:45,15:55:37,14:40:14,11:00:00,2017-06-19 07:41:45,2017-06-19 15:55:37,2017-06-19 14:40:14,2017-06-19 11:00:00
2,AOD0066,08:09:15,15:43:27,12:37:43,12:00:00,2017-06-19 08:09:15,2017-06-19 15:43:27,2017-06-19 12:37:43,2017-06-19 12:00:00
3,ARH0777,07:46:44,16:52:13,09:19:46,12:00:00,2017-06-19 07:46:44,2017-06-19 16:52:13,2017-06-19 09:19:46,2017-06-19 12:00:00
4,BCP0247,00:30:41,23:49:41,12:46:37,12:00:00,2017-06-19 00:30:41,2017-06-19 23:49:41,2017-06-19 12:46:37,2017-06-19 12:00:00


In [110]:
# use the function to generate time in sec value

dcon_min_ts = [dtt2timestamp(dtt) for dtt in device_disconn_stats1['min_dt']]
dcon_max_ts = [dtt2timestamp(dtt) for dtt in device_disconn_stats1['max_dt']]
dcon_mode_ts = [dtt2timestamp(dtt) for dtt in device_disconn_stats1['mode_dt']]
dcon_mean_ts = [dtt2timestamp(dtt) for dtt in device_disconn_stats1['mean_dt']]


In [111]:
# new dataframe 

device_disconn_stats_tsec = pd.DataFrame()

device_disconn_stats_tsec['user'] = device_disconn_stats1['user']
device_disconn_stats_tsec['dcon_min_ts'] = dcon_min_ts
device_disconn_stats_tsec['dcon_max_ts'] = dcon_max_ts
device_disconn_stats_tsec['dcon_mode_ts'] = dcon_mode_ts
device_disconn_stats_tsec['dcon_mean_ts'] = dcon_mean_ts

In [112]:
device_disconn_stats_tsec.head()

Unnamed: 0,user,dcon_min_ts,dcon_max_ts,dcon_mode_ts,dcon_mean_ts
0,AJQ0376,31527,70730,56792,50400
1,AJR0231,27705,57337,52814,39600
2,AOD0066,29355,56607,45463,43200
3,ARH0777,28004,60733,33586,43200
4,BCP0247,1841,85781,45997,43200


In [113]:
# save the data

device_disconn_stats_tsec.to_csv('./Data_Subset/Input_features/device_disconn_stats_tsec.csv', index = False)


In [115]:
files_per_day_stats.head()

Unnamed: 0,user,mode,max
0,AJQ0376,27,27
1,AJR0231,5,14
2,AOD0066,5,12
3,ARH0777,2,8
4,BCP0247,4,11


In [116]:
# Combine all the removable media (device) parameters

device_stats = pd.DataFrame() # new df

device_stats['user'] = device_conn_stats_tsec['user']

# connect stats
device_stats['con_min_ts'] = device_conn_stats_tsec['con_min_ts']
device_stats['con_max_ts'] = device_conn_stats_tsec['con_max_ts']
device_stats['con_mode_ts'] = device_conn_stats_tsec['con_mode_ts']
device_stats['con_mean_ts'] = device_conn_stats_tsec['con_mean_ts']

# disconnect stats
device_stats['dcon_min_ts'] = device_disconn_stats_tsec['dcon_min_ts']
device_stats['dcon_max_ts'] = device_disconn_stats_tsec['dcon_max_ts']
device_stats['dcon_mode_ts'] = device_disconn_stats_tsec['dcon_mode_ts']
device_stats['dcon_mean_ts'] = device_disconn_stats_tsec['dcon_mean_ts']

# files per day stats
device_stats['file_mode'] = files_per_day_stats['mode']
device_stats['file_max'] = files_per_day_stats['max']


In [117]:
device_stats.head()

Unnamed: 0,user,con_min_ts,con_max_ts,con_mode_ts,con_mean_ts,dcon_min_ts,dcon_max_ts,dcon_mode_ts,dcon_mean_ts,file_mode,file_max
0,AJQ0376,30401,68694,50310,43200,31527,70730,56792,50400,27,27
1,AJR0231,27100,56918,54113,36000,27705,57337,52814,39600,5,14
2,AOD0066,27693,55895,35768,36000,29355,56607,45463,43200,5,12
3,ARH0777,27756,59689,51550,39600,28004,60733,33586,43200,2,8
4,BCP0247,4870,84764,46664,39600,1841,85781,45997,43200,4,11


In [118]:
# save the data
device_stats.to_csv('./Data_Subset/Input_features/device_stats.csv', index = False)


- Fitting the model

In [137]:
# input array
device_stats.columns[1:]

device_params = device_stats.as_matrix(columns = device_stats.columns[1:])
device_params

array([[30401, 68694, 50310, 43200, 31527, 70730, 56792, 50400,    27,
           27],
       [27100, 56918, 54113, 36000, 27705, 57337, 52814, 39600,     5,
           14],
       [27693, 55895, 35768, 36000, 29355, 56607, 45463, 43200,     5,
           12],
       [27756, 59689, 51550, 39600, 28004, 60733, 33586, 43200,     2,
            8],
       [ 4870, 84764, 46664, 39600,  1841, 85781, 45997, 43200,     4,
           11],
       [33007, 61926, 35672, 39600, 35304, 62393, 59056, 46800,     4,
           13],
       [ 1185, 76882, 56810, 43200,  3000, 68624, 57911, 54000,     2,
           13],
       [28882, 59022, 50806, 39600, 29083, 59266, 49299, 39600,    27,
           27],
       [16018, 83814, 33156, 39600, 14887, 60913, 51885, 43200,     2,
           11],
       [29154, 57723, 31389, 39600, 31314, 59062, 41058, 43200,     1,
           11],
       [27480, 55931, 48979, 36000, 28088, 57315, 56067, 39600,     2,
            9],
       [28096, 61247, 45887, 39600, 28544, 

In [138]:
# fit the model
device_params = device_params #.reshape(-1,1)
forest = IsolationForest()

forest.fit(device_params)


IsolationForest(bootstrap=False, contamination=0.1, max_features=1.0,
        max_samples='auto', n_estimators=100, n_jobs=1, random_state=None,
        verbose=0)

In [139]:
# anomaly score

dev_file_ascore = forest.decision_function(device_params)
dev_file_ascore

array([ 0.01061819,  0.07490963,  0.0746807 ,  0.08874951,  0.02714255,
        0.04623366, -0.07313351,  0.06772047, -0.00146748,  0.06214182,
        0.07529703,  0.07239563, -0.05354237,  0.06691815,  0.0901711 ,
        0.09666711,  0.0547618 ,  0.05222577,  0.06583506,  0.05011917,
        0.04667266,  0.0503641 ,  0.08036392,  0.07615057,  0.08080458,
        0.08049453,  0.05546656,  0.08973955,  0.1145217 ,  0.02167804,
        0.04050933,  0.00563107,  0.02007753,  0.06111717,  0.05427255,
       -0.00731932,  0.04795068, -0.05184947,  0.06558586,  0.02208341,
       -0.06089685,  0.04443151,  0.07105566,  0.11431883,  0.10250222,
        0.05825164, -0.03805721])

In [140]:
dev_file_ascore.shape

(47,)

In [132]:
device_stats.shape

(47, 11)

In [142]:
# Save the result
device_file_result = pd.DataFrame()

device_file_result['user'] = device_stats['user']
device_file_result['ascore'] = dev_file_ascore


In [143]:
device_file_result.head()

Unnamed: 0,user,ascore
0,AJQ0376,0.010618
1,AJR0231,0.07491
2,AOD0066,0.074681
3,ARH0777,0.08875
4,BCP0247,0.027143


In [144]:
# save the result
device_file_result.to_csv('./Data_Subset/IFResult/device_file_result.csv', index = False)

- #### Model with Psychometric data

In [145]:
psychometric = pd.read_csv('./Data_Subset/fu2_psychometric.csv')

In [146]:
psychometric.head()

Unnamed: 0,employee_name,user_id,O,C,E,A,N
0,Hayden Brennan Browning,HBB0090,41,18,18,44,36
1,Quin Colette Fuller,QCF0390,42,50,22,20,26
2,Jermaine Dominic Dorsey,JDD0087,41,42,19,20,29
3,Allistair Akeem Nichols,AAN0077,42,15,32,22,29
4,Galvin Jordan Parsons,GJP0098,42,44,39,31,30


In [148]:
psychometric.shape

(232, 7)

In [151]:
# fit the model

# input array
psychometric_params = psychometric.as_matrix(columns = psychometric.columns[2:])
psychometric_params


array([[41, 18, 18, 44, 36],
       [42, 50, 22, 20, 26],
       [41, 42, 19, 20, 29],
       ..., 
       [35, 40, 45, 39, 31],
       [16, 14, 37, 43, 23],
       [22, 17, 39, 34, 18]])

In [152]:
#device_params = device_params
forest = IsolationForest()

forest.fit(psychometric_params)

IsolationForest(bootstrap=False, contamination=0.1, max_features=1.0,
        max_samples='auto', n_estimators=100, n_jobs=1, random_state=None,
        verbose=0)

In [154]:
# anomaly score
psych_ascore = forest.decision_function(psychometric_params)
psych_ascore


array([ 0.03035229,  0.0334336 ,  0.09724311,  0.06614153,  0.06187728,
        0.02497976,  0.06341521,  0.04461608,  0.03568697,  0.05787885,
       -0.00202239,  0.04886835, -0.04051539,  0.0502179 ,  0.04461547,
        0.08024991,  0.08669555,  0.04872239,  0.05940291,  0.00273841,
       -0.02350418,  0.05968872,  0.02115387,  0.05133515,  0.08456098,
        0.07822694,  0.04601829,  0.03632484,  0.05751695,  0.0755021 ,
        0.04582202,  0.06914013, -0.00888816,  0.07296075,  0.05648154,
        0.04919691,  0.05099657, -0.01153901,  0.06044029,  0.04032208,
        0.01233749,  0.07141743,  0.05050764,  0.05012355,  0.07396945,
        0.04005369,  0.00531456,  0.0725553 ,  0.06095211,  0.07304976,
        0.08463581,  0.01355371,  0.02516697,  0.00172579,  0.02862381,
        0.04475177,  0.06383215,  0.06098771,  0.04902803,  0.06039   ,
        0.03002834,  0.06915628, -0.00551379,  0.02448589,  0.05171851,
        0.0505288 , -0.00720233, -0.03005982, -0.00582019,  0.04

In [155]:
psych_ascore.shape

(232,)

In [156]:
# save the result
psychometric_result = pd.DataFrame()

psychometric_result['user'] = psychometric['user_id']
psychometric_result['ascore'] = psych_ascore


In [157]:
psychometric_result.head()

Unnamed: 0,user,ascore
0,HBB0090,0.030352
1,QCF0390,0.033434
2,JDD0087,0.097243
3,AAN0077,0.066142
4,GJP0098,0.061877


In [158]:
psychometric_result.to_csv('./Data_Subset/IFResult/psychometric_result.csv', index = False)

- #### Model with all the input features combined (Graph, Logon/Logoff, Device and File transfer, and Psychometric data)

In [177]:
dfmerge1 = pd.merge(ulog_on_off_stats, user_pc, on = 'user')
dfmerge1.head()

Unnamed: 0,user,on_min_ts,on_max_ts,on_mode_ts,on_mean_ts,off_min_ts,off_max_ts,off_mode_ts,off_mean_ts,pc_count
0,AAN0077,27900,29580,29580,25200,60420,62100,60960,57600,1
1,AAW0103,25200,26280,26100,25200,58320,59400,58440,57600,1
2,ACE0265,28800,30300,29700,28800,57900,59400,57960,57600,1
3,ACL0394,32400,60757,33300,32400,33235,63000,62100,57600,2
4,ADR0362,27000,28080,28080,25200,60120,61200,60300,57600,1


In [185]:
dfmerge2 = pd.merge(dfmerge1, psychometric, left_on = 'user', right_on = 'user_id')
dfmerge2.head()

Unnamed: 0,user,on_min_ts,on_max_ts,on_mode_ts,on_mean_ts,off_min_ts,off_max_ts,off_mode_ts,off_mean_ts,pc_count,employee_name,user_id,O,C,E,A,N
0,AAN0077,27900,29580,29580,25200,60420,62100,60960,57600,1,Allistair Akeem Nichols,AAN0077,42,15,32,22,29
1,AAW0103,25200,26280,26100,25200,58320,59400,58440,57600,1,Alexa Allegra Walton,AAW0103,39,42,47,39,28
2,ACE0265,28800,30300,29700,28800,57900,59400,57960,57600,1,Amos Carson Emerson,ACE0265,26,23,14,19,40
3,ACL0394,32400,60757,33300,32400,33235,63000,62100,57600,2,Adam Clark Lindsey,ACL0394,25,44,18,19,17
4,ADR0362,27000,28080,28080,25200,60120,61200,60300,57600,1,Amir Damian Rosario,ADR0362,32,42,35,38,26


In [186]:
dfmerge2.shape

(232, 17)

In [187]:
# drop the 'employee_name' and 'user_id' columns
dfmerge2.drop(['employee_name', 'user_id'], axis=1, inplace=True)  # axis: 1 for col, 0 for row

In [188]:
dfmerge2.head()

Unnamed: 0,user,on_min_ts,on_max_ts,on_mode_ts,on_mean_ts,off_min_ts,off_max_ts,off_mode_ts,off_mean_ts,pc_count,O,C,E,A,N
0,AAN0077,27900,29580,29580,25200,60420,62100,60960,57600,1,42,15,32,22,29
1,AAW0103,25200,26280,26100,25200,58320,59400,58440,57600,1,39,42,47,39,28
2,ACE0265,28800,30300,29700,28800,57900,59400,57960,57600,1,26,23,14,19,40
3,ACL0394,32400,60757,33300,32400,33235,63000,62100,57600,2,25,44,18,19,17
4,ADR0362,27000,28080,28080,25200,60120,61200,60300,57600,1,32,42,35,38,26


In [190]:
All_params = dfmerge2

# save
All_params.to_csv('./Data_Subset/Input_features/All_params.csv', index = False)


- Fit the model

In [191]:
#input array

All_params_input = All_params.as_matrix(columns = All_params.columns[1:])
All_params_input

array([[27900, 29580, 29580, ...,    32,    22,    29],
       [25200, 26280, 26100, ...,    47,    39,    28],
       [28800, 30300, 29700, ...,    14,    19,    40],
       ..., 
       [27000, 28020, 27900, ...,    16,    19,    40],
       [29700, 52585, 30600, ...,    44,    42,    28],
       [25200, 26760, 26100, ...,    17,    46,    26]])

In [192]:
forest = IsolationForest()

forest.fit(All_params_input)

IsolationForest(bootstrap=False, contamination=0.1, max_features=1.0,
        max_samples='auto', n_estimators=100, n_jobs=1, random_state=None,
        verbose=0)

In [194]:
# Anomaly score
All_params_ascore = forest.decision_function(All_params_input)
All_params_ascore

array([ 0.09306731,  0.04907915,  0.05230022,  0.01226161,  0.08859862,
        0.06478952,  0.06382545,  0.10821374,  0.06759128,  0.05793062,
        0.05437928,  0.04526372, -0.05702581, -0.06819795,  0.014173  ,
        0.05511947,  0.01639531,  0.01236846,  0.02868082,  0.01508705,
        0.02390975,  0.08153256,  0.04431548, -0.03220311,  0.02415096,
        0.04705111,  0.06824238,  0.00993244,  0.06054351,  0.01478441,
        0.03290786, -0.05301363,  0.0624407 ,  0.07754993,  0.05802697,
        0.0634078 ,  0.0549517 ,  0.05030033,  0.05402928,  0.09800174,
        0.03593456,  0.07007377,  0.03242095,  0.05241744, -0.04400098,
        0.06607366,  0.04937949,  0.07777261,  0.07190937,  0.00620378,
        0.0118727 ,  0.04399525,  0.03898421,  0.10801991,  0.07764995,
        0.03252773, -0.08949721,  0.0758242 , -0.01302638,  0.05081461,
        0.10623094,  0.06250043,  0.01948568,  0.07469456,  0.07398662,
        0.02543436,  0.01091637,  0.05775381,  0.06201844,  0.00

In [195]:
# save the result

All_params_result = pd.DataFrame()

All_params_result['user'] = All_params['user']
All_params_result['ascore'] = All_params_ascore


In [196]:
All_params_result.head()

Unnamed: 0,user,ascore
0,AAN0077,0.093067
1,AAW0103,0.049079
2,ACE0265,0.0523
3,ACL0394,0.012262
4,ADR0362,0.088599


In [197]:
All_params_result.to_csv('./Data_Subset/IFResult/All_params_result.csv', index = False)