In [1]:
import pandas as pd

In [4]:
help(pd.DataFrame)

Help on class DataFrame in module pandas.core.frame:

class DataFrame(pandas.core.generic.NDFrame, pandas.core.arraylike.OpsMixin)
 |  DataFrame(data=None, index: 'Optional[Axes]' = None, columns: 'Optional[Axes]' = None, dtype: 'Optional[Dtype]' = None, copy: 'bool' = False)
 |  
 |  Two-dimensional, size-mutable, potentially heterogeneous tabular data.
 |  
 |  Data structure also contains labeled axes (rows and columns).
 |  Arithmetic operations align on both row and column labels. Can be
 |  thought of as a dict-like container for Series objects. The primary
 |  pandas data structure.
 |  
 |  Parameters
 |  ----------
 |  data : ndarray (structured or homogeneous), Iterable, dict, or DataFrame
 |      Dict can contain Series, arrays, constants, dataclass or list-like objects. If
 |      data is a dict, column order follows insertion-order.
 |  
 |      .. versionchanged:: 0.25.0
 |         If data is a list of dicts, column order follows insertion-order.
 |  
 |  index : Index or

In [136]:
# We're going to read another data set in with more variety
logons_full_df = pd.read_pickle("./data/host_logons.pkl")
net_full_df = pd.read_pickle("./data/az_net_comms_df.pkl")

# also create a demo version with just 3 rows
logons_df = logons_full_df.sample(20)
logons_df.head(5)

Unnamed: 0,TenantId,Account,EventID,TimeGenerated,SourceComputerId,Computer,SubjectUserName,SubjectDomainName,SubjectUserSid,TargetUserName,TargetDomainName,TargetUserSid,TargetLogonId,LogonProcessName,LogonType,AuthenticationPackageName,Status,IpAddress,WorkstationName,TimeCreatedUtc
99,52b1ab41-869e-4138-9e40-2a4457f09bf0,Window Manager\DWM-2,4624,2019-02-12 22:22:21.240,263a788b-6526-4cdc-8ed9-d79402fe4aa0,MSTICAlertsWin1,MSTICAlertsWin1$,WORKGROUP,S-1-5-18,DWM-2,Window Manager,S-1-5-90-0-2,0x106b458,Advapi,2,Negotiate,,-,-,2019-02-12 22:22:21.240
89,52b1ab41-869e-4138-9e40-2a4457f09bf0,NT AUTHORITY\SYSTEM,4624,2019-02-14 04:20:55.763,263a788b-6526-4cdc-8ed9-d79402fe4aa0,MSTICAlertsWin1,MSTICAlertsWin1$,WORKGROUP,S-1-5-18,SYSTEM,NT AUTHORITY,S-1-5-18,0x3e7,Advapi,5,Negotiate,,-,-,2019-02-14 04:20:55.763
32,52b1ab41-869e-4138-9e40-2a4457f09bf0,NT AUTHORITY\SYSTEM,4624,2019-02-12 06:42:08.110,263a788b-6526-4cdc-8ed9-d79402fe4aa0,MSTICAlertsWin1,MSTICAlertsWin1$,WORKGROUP,S-1-5-18,SYSTEM,NT AUTHORITY,S-1-5-18,0x3e7,Advapi,5,Negotiate,,-,-,2019-02-12 06:42:08.110
3,52b1ab41-869e-4138-9e40-2a4457f09bf0,MSTICAlertsWin1\MSTICAdmin,4624,2019-02-12 04:38:16.550,263a788b-6526-4cdc-8ed9-d79402fe4aa0,MSTICAlertsWin1,-,-,S-1-0-0,MSTICAdmin,MSTICAlertsWin1,S-1-5-21-996632719-2361334927-4038480536-500,0xc912d62,NtLmSsp,3,NTLM,,131.107.147.209,IANHELLE-DEV17,2019-02-12 04:38:16.550
123,52b1ab41-869e-4138-9e40-2a4457f09bf0,NT AUTHORITY\SYSTEM,4624,2019-02-12 20:39:14.110,263a788b-6526-4cdc-8ed9-d79402fe4aa0,MSTICAlertsWin1,MSTICAlertsWin1$,WORKGROUP,S-1-5-18,SYSTEM,NT AUTHORITY,S-1-5-18,0x3e7,Advapi,5,Negotiate,,-,-,2019-02-12 20:39:14.110


### Rows and Columns of a DataFrame are pandas *Series*

In [9]:
print(type(logons_df.iloc[0]))

<class 'pandas.core.series.Series'>


In [12]:
print("logons_df", type(logons_df))
print("logons_df.Account", type(logons_df.Account)) # "Account" row
print("logons_df.iloc[0])", type(logons_df.iloc[0])) # First row

# At the intersection of a row and column we get a simple type - the cell content
print("logons_df.iloc[0].Account", type(logons_df.iloc[0].Account), logons_df.iloc[0].Account)

logons_df <class 'pandas.core.frame.DataFrame'>
logons_df.Account <class 'pandas.core.series.Series'>
logons_df.iloc[0]) <class 'pandas.core.series.Series'>
logons_df.iloc[0].Account <class 'str'> NT AUTHORITY\SYSTEM


### What does a series look like?

In [None]:
# Column
logons_df.Account

In [14]:
# Row
logons_df.iloc[0]

TenantId                     52b1ab41-869e-4138-9e40-2a4457f09bf0
Account                                       NT AUTHORITY\SYSTEM
EventID                                                      4624
TimeGenerated                          2019-02-12 04:56:34.307000
SourceComputerId             263a788b-6526-4cdc-8ed9-d79402fe4aa0
Computer                                          MSTICAlertsWin1
SubjectUserName                                  MSTICAlertsWin1$
SubjectDomainName                                       WORKGROUP
SubjectUserSid                                           S-1-5-18
TargetUserName                                             SYSTEM
TargetDomainName                                     NT AUTHORITY
TargetUserSid                                            S-1-5-18
TargetLogonId                                               0x3e7
LogonProcessName                                         Advapi  
LogonType                                                       5
Authentica

## Selecting Columns

Selecting a single column

In [13]:
logons_df.Account

0           NT AUTHORITY\SYSTEM
1    MSTICAlertsWin1\MSTICAdmin
2    MSTICAlertsWin1\MSTICAdmin
3    MSTICAlertsWin1\MSTICAdmin
4    MSTICAlertsWin1\MSTICAdmin
5           NT AUTHORITY\SYSTEM
6           NT AUTHORITY\SYSTEM
7           NT AUTHORITY\SYSTEM
8           NT AUTHORITY\SYSTEM
9           NT AUTHORITY\SYSTEM
Name: Account, dtype: object

More general syntax and mandatory if column name has spaces or other illegal chars (like ".")

In [None]:
logons_df["Account"]

To select multiple columns you use a Python list

In [None]:
my_cols = ["Account", "TimeGenerated"]
logons_df[my_cols]

In [92]:
# Or just
logons_df[["Account", "TimeGenerated"]]

Unnamed: 0,Account,TimeGenerated
0,NT AUTHORITY\SYSTEM,2019-02-12 04:56:34.307
1,MSTICAlertsWin1\MSTICAdmin,2019-02-12 04:37:25.340
2,MSTICAlertsWin1\MSTICAdmin,2019-02-12 04:37:27.997
3,MSTICAlertsWin1\MSTICAdmin,2019-02-12 04:38:16.550
4,MSTICAlertsWin1\MSTICAdmin,2019-02-12 04:38:21.370
5,NT AUTHORITY\SYSTEM,2019-02-12 04:50:09.713
6,NT AUTHORITY\SYSTEM,2019-02-12 04:50:18.660
7,NT AUTHORITY\SYSTEM,2019-02-12 04:43:56.327
8,NT AUTHORITY\SYSTEM,2019-02-12 04:44:10.343
9,NT AUTHORITY\SYSTEM,2019-02-12 04:40:11.867


### Use the columns property to get the column names

In [None]:
logons_df.columns

In [None]:
logons_df[[]]

## Indexes - brief introduction

In [20]:
logons_df.index

RangeIndex(start=0, stop=10, step=1)


In [21]:
# Access a row at an index location
logons_df.loc[3]

TenantId                             52b1ab41-869e-4138-9e40-2a4457f09bf0
Account                                        MSTICAlertsWin1\MSTICAdmin
EventID                                                              4624
TimeGenerated                                  2019-02-12 04:38:16.550000
SourceComputerId                     263a788b-6526-4cdc-8ed9-d79402fe4aa0
Computer                                                  MSTICAlertsWin1
SubjectUserName                                                         -
SubjectDomainName                                                       -
SubjectUserSid                                                    S-1-0-0
TargetUserName                                                 MSTICAdmin
TargetDomainName                                          MSTICAlertsWin1
TargetUserSid                S-1-5-21-996632719-2361334927-4038480536-500
TargetLogonId                                                   0xc912d62
LogonProcessName                      

In [22]:
# Access a row at a physical row location
logons_df.iloc[3]

TenantId                             52b1ab41-869e-4138-9e40-2a4457f09bf0
Account                                        MSTICAlertsWin1\MSTICAdmin
EventID                                                              4624
TimeGenerated                                  2019-02-12 04:38:16.550000
SourceComputerId                     263a788b-6526-4cdc-8ed9-d79402fe4aa0
Computer                                                  MSTICAlertsWin1
SubjectUserName                                                         -
SubjectDomainName                                                       -
SubjectUserSid                                                    S-1-0-0
TargetUserName                                                 MSTICAdmin
TargetDomainName                                          MSTICAlertsWin1
TargetUserSid                S-1-5-21-996632719-2361334927-4038480536-500
TargetLogonId                                                   0xc912d62
LogonProcessName                      

In [24]:
indexed_logons_df = logons_df.set_index("Account")
display(logons_df.head(3))
display(indexed_logons_df.head(3))

Unnamed: 0,TenantId,Account,EventID,TimeGenerated,SourceComputerId,Computer,SubjectUserName,SubjectDomainName,SubjectUserSid,TargetUserName,TargetDomainName,TargetUserSid,TargetLogonId,LogonProcessName,LogonType,AuthenticationPackageName,Status,IpAddress,WorkstationName,TimeCreatedUtc
0,52b1ab41-869e-4138-9e40-2a4457f09bf0,NT AUTHORITY\SYSTEM,4624,2019-02-12 04:56:34.307,263a788b-6526-4cdc-8ed9-d79402fe4aa0,MSTICAlertsWin1,MSTICAlertsWin1$,WORKGROUP,S-1-5-18,SYSTEM,NT AUTHORITY,S-1-5-18,0x3e7,Advapi,5,Negotiate,,-,-,2019-02-12 04:56:34.307
1,52b1ab41-869e-4138-9e40-2a4457f09bf0,MSTICAlertsWin1\MSTICAdmin,4624,2019-02-12 04:37:25.340,263a788b-6526-4cdc-8ed9-d79402fe4aa0,MSTICAlertsWin1,-,-,S-1-0-0,MSTICAdmin,MSTICAlertsWin1,S-1-5-21-996632719-2361334927-4038480536-500,0xc90e957,NtLmSsp,3,NTLM,,131.107.147.209,IANHELLE-DEV17,2019-02-12 04:37:25.340
2,52b1ab41-869e-4138-9e40-2a4457f09bf0,MSTICAlertsWin1\MSTICAdmin,4624,2019-02-12 04:37:27.997,263a788b-6526-4cdc-8ed9-d79402fe4aa0,MSTICAlertsWin1,-,-,S-1-0-0,MSTICAdmin,MSTICAlertsWin1,S-1-5-21-996632719-2361334927-4038480536-500,0xc90ea44,NtLmSsp,3,NTLM,,131.107.147.209,IANHELLE-DEV17,2019-02-12 04:37:27.997


Unnamed: 0_level_0,TenantId,EventID,TimeGenerated,SourceComputerId,Computer,SubjectUserName,SubjectDomainName,SubjectUserSid,TargetUserName,TargetDomainName,TargetUserSid,TargetLogonId,LogonProcessName,LogonType,AuthenticationPackageName,Status,IpAddress,WorkstationName,TimeCreatedUtc
Account,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
NT AUTHORITY\SYSTEM,52b1ab41-869e-4138-9e40-2a4457f09bf0,4624,2019-02-12 04:56:34.307,263a788b-6526-4cdc-8ed9-d79402fe4aa0,MSTICAlertsWin1,MSTICAlertsWin1$,WORKGROUP,S-1-5-18,SYSTEM,NT AUTHORITY,S-1-5-18,0x3e7,Advapi,5,Negotiate,,-,-,2019-02-12 04:56:34.307
MSTICAlertsWin1\MSTICAdmin,52b1ab41-869e-4138-9e40-2a4457f09bf0,4624,2019-02-12 04:37:25.340,263a788b-6526-4cdc-8ed9-d79402fe4aa0,MSTICAlertsWin1,-,-,S-1-0-0,MSTICAdmin,MSTICAlertsWin1,S-1-5-21-996632719-2361334927-4038480536-500,0xc90e957,NtLmSsp,3,NTLM,,131.107.147.209,IANHELLE-DEV17,2019-02-12 04:37:25.340
MSTICAlertsWin1\MSTICAdmin,52b1ab41-869e-4138-9e40-2a4457f09bf0,4624,2019-02-12 04:37:27.997,263a788b-6526-4cdc-8ed9-d79402fe4aa0,MSTICAlertsWin1,-,-,S-1-0-0,MSTICAdmin,MSTICAlertsWin1,S-1-5-21-996632719-2361334927-4038480536-500,0xc90ea44,NtLmSsp,3,NTLM,,131.107.147.209,IANHELLE-DEV17,2019-02-12 04:37:27.997


In [47]:
indexed_logons_df.loc["NT AUTHORITY\\SYSTEM"].head()

Unnamed: 0_level_0,TenantId,EventID,TimeGenerated,SourceComputerId,Computer,SubjectUserName,SubjectDomainName,SubjectUserSid,TargetUserName,TargetDomainName,TargetUserSid,TargetLogonId,LogonProcessName,LogonType,AuthenticationPackageName,Status,IpAddress,WorkstationName,TimeCreatedUtc
Account,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
NT AUTHORITY\SYSTEM,52b1ab41-869e-4138-9e40-2a4457f09bf0,4624,2019-02-12 04:56:34.307,263a788b-6526-4cdc-8ed9-d79402fe4aa0,MSTICAlertsWin1,MSTICAlertsWin1$,WORKGROUP,S-1-5-18,SYSTEM,NT AUTHORITY,S-1-5-18,0x3e7,Advapi,5,Negotiate,,-,-,2019-02-12 04:56:34.307
NT AUTHORITY\SYSTEM,52b1ab41-869e-4138-9e40-2a4457f09bf0,4624,2019-02-12 04:50:09.713,263a788b-6526-4cdc-8ed9-d79402fe4aa0,MSTICAlertsWin1,MSTICAlertsWin1$,WORKGROUP,S-1-5-18,SYSTEM,NT AUTHORITY,S-1-5-18,0x3e7,Advapi,5,Negotiate,,-,-,2019-02-12 04:50:09.713
NT AUTHORITY\SYSTEM,52b1ab41-869e-4138-9e40-2a4457f09bf0,4624,2019-02-12 04:50:18.660,263a788b-6526-4cdc-8ed9-d79402fe4aa0,MSTICAlertsWin1,MSTICAlertsWin1$,WORKGROUP,S-1-5-18,SYSTEM,NT AUTHORITY,S-1-5-18,0x3e7,Advapi,5,Negotiate,,-,-,2019-02-12 04:50:18.660
NT AUTHORITY\SYSTEM,52b1ab41-869e-4138-9e40-2a4457f09bf0,4624,2019-02-12 04:43:56.327,263a788b-6526-4cdc-8ed9-d79402fe4aa0,MSTICAlertsWin1,MSTICAlertsWin1$,WORKGROUP,S-1-5-18,SYSTEM,NT AUTHORITY,S-1-5-18,0x3e7,Advapi,5,Negotiate,,-,-,2019-02-12 04:43:56.327
NT AUTHORITY\SYSTEM,52b1ab41-869e-4138-9e40-2a4457f09bf0,4624,2019-02-12 04:44:10.343,263a788b-6526-4cdc-8ed9-d79402fe4aa0,MSTICAlertsWin1,MSTICAlertsWin1$,WORKGROUP,S-1-5-18,SYSTEM,NT AUTHORITY,S-1-5-18,0x3e7,Advapi,5,Negotiate,,-,-,2019-02-12 04:44:10.343


In [48]:
# Physical row indexing works as before
indexed_logons_df.iloc[3]

TenantId                             52b1ab41-869e-4138-9e40-2a4457f09bf0
EventID                                                              4624
TimeGenerated                                  2019-02-12 04:38:16.550000
SourceComputerId                     263a788b-6526-4cdc-8ed9-d79402fe4aa0
Computer                                                  MSTICAlertsWin1
SubjectUserName                                                         -
SubjectDomainName                                                       -
SubjectUserSid                                                    S-1-0-0
TargetUserName                                                 MSTICAdmin
TargetDomainName                                          MSTICAlertsWin1
TargetUserSid                S-1-5-21-996632719-2361334927-4038480536-500
TargetLogonId                                                   0xc912d62
LogonProcessName                                                 NtLmSsp 
LogonType                             

## Accessing individual values

In [40]:
print("iloc + named column", logons_df.iloc[0].Account)
print("at - row idx + named column", logons_df.at[0, "Account"])
print("iat - row idx + column idx", logons_df.iat[0, 1])


print("\nBut if the index is not unique 'at' returns a series\n")

print(
    "at - row idx + named column",
    "Type:",
    type(indexed_logons_df.at["NT AUTHORITY\\SYSTEM", "EventID"]),
    "Result:",
    indexed_logons_df.at["NT AUTHORITY\\SYSTEM", "EventID"],
    sep="\n",
)

iloc + named column NT AUTHORITY\SYSTEM
at - row idx + named column NT AUTHORITY\SYSTEM
iat - row idx + column idx NT AUTHORITY\SYSTEM

But if the index is not unique 'at' returns a series

at - row idx + named column
Type:
<class 'pandas.core.series.Series'>
Result:
Account
NT AUTHORITY\SYSTEM    4624
NT AUTHORITY\SYSTEM    4624
NT AUTHORITY\SYSTEM    4624
NT AUTHORITY\SYSTEM    4624
NT AUTHORITY\SYSTEM    4624
NT AUTHORITY\SYSTEM    4624
Name: EventID, dtype: int64


# Selecting/Searching

## Specific row by number

In [49]:
logons_df.iloc[2].Account

'MSTICAlertsWin1\\MSTICAdmin'

In [55]:
logons_df.iloc[3:6]

Unnamed: 0,TenantId,Account,EventID,TimeGenerated,SourceComputerId,Computer,SubjectUserName,SubjectDomainName,SubjectUserSid,TargetUserName,TargetDomainName,TargetUserSid,TargetLogonId,LogonProcessName,LogonType,AuthenticationPackageName,Status,IpAddress,WorkstationName,TimeCreatedUtc
3,52b1ab41-869e-4138-9e40-2a4457f09bf0,MSTICAlertsWin1\MSTICAdmin,4624,2019-02-12 04:38:16.550,263a788b-6526-4cdc-8ed9-d79402fe4aa0,MSTICAlertsWin1,-,-,S-1-0-0,MSTICAdmin,MSTICAlertsWin1,S-1-5-21-996632719-2361334927-4038480536-500,0xc912d62,NtLmSsp,3,NTLM,,131.107.147.209,IANHELLE-DEV17,2019-02-12 04:38:16.550
4,52b1ab41-869e-4138-9e40-2a4457f09bf0,MSTICAlertsWin1\MSTICAdmin,4624,2019-02-12 04:38:21.370,263a788b-6526-4cdc-8ed9-d79402fe4aa0,MSTICAlertsWin1,-,-,S-1-0-0,MSTICAdmin,MSTICAlertsWin1,S-1-5-21-996632719-2361334927-4038480536-500,0xc913737,NtLmSsp,3,NTLM,,131.107.147.209,IANHELLE-DEV17,2019-02-12 04:38:21.370
5,52b1ab41-869e-4138-9e40-2a4457f09bf0,NT AUTHORITY\SYSTEM,4624,2019-02-12 04:50:09.713,263a788b-6526-4cdc-8ed9-d79402fe4aa0,MSTICAlertsWin1,MSTICAlertsWin1$,WORKGROUP,S-1-5-18,SYSTEM,NT AUTHORITY,S-1-5-18,0x3e7,Advapi,5,Negotiate,,-,-,2019-02-12 04:50:09.713


In [56]:
logons_df.head(5) == logons_df.iloc[0:5]

Unnamed: 0,TenantId,Account,EventID,TimeGenerated,SourceComputerId,Computer,SubjectUserName,SubjectDomainName,SubjectUserSid,TargetUserName,TargetDomainName,TargetUserSid,TargetLogonId,LogonProcessName,LogonType,AuthenticationPackageName,Status,IpAddress,WorkstationName,TimeCreatedUtc
0,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True
1,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True
2,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True
3,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True
4,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True


## Select by content - Boolean expression

- ==
- !=
- \>, <, >=, <=

In [67]:
logons_df["Account"] == "MSTICAlertsWin1\\MSTICAdmin"

0     False
1      True
2      True
3      True
4      True
5     False
6     False
7     False
8     False
9     False
10    False
11    False
12    False
13    False
14    False
15    False
16    False
17    False
18    False
19    False
Name: Account, dtype: bool

### Use boolean result of expression to filter DataFrame

In [69]:
logons_df.loc[logons_df["Account"] == "MSTICAlertsWin1\\MSTICAdmin"]

Unnamed: 0,TenantId,Account,EventID,TimeGenerated,SourceComputerId,Computer,SubjectUserName,SubjectDomainName,SubjectUserSid,TargetUserName,TargetDomainName,TargetUserSid,TargetLogonId,LogonProcessName,LogonType,AuthenticationPackageName,Status,IpAddress,WorkstationName,TimeCreatedUtc
1,52b1ab41-869e-4138-9e40-2a4457f09bf0,MSTICAlertsWin1\MSTICAdmin,4624,2019-02-12 04:37:25.340,263a788b-6526-4cdc-8ed9-d79402fe4aa0,MSTICAlertsWin1,-,-,S-1-0-0,MSTICAdmin,MSTICAlertsWin1,S-1-5-21-996632719-2361334927-4038480536-500,0xc90e957,NtLmSsp,3,NTLM,,131.107.147.209,IANHELLE-DEV17,2019-02-12 04:37:25.340
2,52b1ab41-869e-4138-9e40-2a4457f09bf0,MSTICAlertsWin1\MSTICAdmin,4624,2019-02-12 04:37:27.997,263a788b-6526-4cdc-8ed9-d79402fe4aa0,MSTICAlertsWin1,-,-,S-1-0-0,MSTICAdmin,MSTICAlertsWin1,S-1-5-21-996632719-2361334927-4038480536-500,0xc90ea44,NtLmSsp,3,NTLM,,131.107.147.209,IANHELLE-DEV17,2019-02-12 04:37:27.997
3,52b1ab41-869e-4138-9e40-2a4457f09bf0,MSTICAlertsWin1\MSTICAdmin,4624,2019-02-12 04:38:16.550,263a788b-6526-4cdc-8ed9-d79402fe4aa0,MSTICAlertsWin1,-,-,S-1-0-0,MSTICAdmin,MSTICAlertsWin1,S-1-5-21-996632719-2361334927-4038480536-500,0xc912d62,NtLmSsp,3,NTLM,,131.107.147.209,IANHELLE-DEV17,2019-02-12 04:38:16.550
4,52b1ab41-869e-4138-9e40-2a4457f09bf0,MSTICAlertsWin1\MSTICAdmin,4624,2019-02-12 04:38:21.370,263a788b-6526-4cdc-8ed9-d79402fe4aa0,MSTICAlertsWin1,-,-,S-1-0-0,MSTICAdmin,MSTICAlertsWin1,S-1-5-21-996632719-2361334927-4038480536-500,0xc913737,NtLmSsp,3,NTLM,,131.107.147.209,IANHELLE-DEV17,2019-02-12 04:38:21.370


## Other operators with boolean indexing

Operators vary depending on data type!!!

In [70]:
logons_df.dtypes

TenantId                             object
Account                              object
EventID                               int64
TimeGenerated                datetime64[ns]
SourceComputerId                     object
Computer                             object
SubjectUserName                      object
SubjectDomainName                    object
SubjectUserSid                       object
TargetUserName                       object
TargetDomainName                     object
TargetUserSid                        object
TargetLogonId                        object
LogonProcessName                     object
LogonType                             int64
AuthenticationPackageName            object
Status                               object
IpAddress                            object
WorkstationName                      object
TimeCreatedUtc               datetime64[ns]
dtype: object

In [71]:
logons_df[logons_df["Account"].endswith("MSTICAdmin")]

AttributeError: 'Series' object has no attribute 'endswith'

In [72]:
logons_df["Account"]

0              NT AUTHORITY\SYSTEM
1       MSTICAlertsWin1\MSTICAdmin
2       MSTICAlertsWin1\MSTICAdmin
3       MSTICAlertsWin1\MSTICAdmin
4       MSTICAlertsWin1\MSTICAdmin
5              NT AUTHORITY\SYSTEM
6              NT AUTHORITY\SYSTEM
7              NT AUTHORITY\SYSTEM
8              NT AUTHORITY\SYSTEM
9              NT AUTHORITY\SYSTEM
10               NT AUTHORITY\IUSR
11             NT AUTHORITY\SYSTEM
12             NT AUTHORITY\SYSTEM
13             NT AUTHORITY\SYSTEM
14    NT AUTHORITY\NETWORK SERVICE
15            Window Manager\DWM-1
16            Window Manager\DWM-1
17      NT AUTHORITY\LOCAL SERVICE
18             NT AUTHORITY\SYSTEM
19             NT AUTHORITY\SYSTEM
Name: Account, dtype: object

### We need to tell pandas to treat the series as a string
(a bit like tostring(dynamic) in KQL)

logons_df[logons_df["Account"].**str**.contains("MSTICAdmin")]

In [73]:
logons_df[logons_df["Account"].str.endswith("MSTICAdmin")]

Unnamed: 0,TenantId,Account,EventID,TimeGenerated,SourceComputerId,Computer,SubjectUserName,SubjectDomainName,SubjectUserSid,TargetUserName,TargetDomainName,TargetUserSid,TargetLogonId,LogonProcessName,LogonType,AuthenticationPackageName,Status,IpAddress,WorkstationName,TimeCreatedUtc
1,52b1ab41-869e-4138-9e40-2a4457f09bf0,MSTICAlertsWin1\MSTICAdmin,4624,2019-02-12 04:37:25.340,263a788b-6526-4cdc-8ed9-d79402fe4aa0,MSTICAlertsWin1,-,-,S-1-0-0,MSTICAdmin,MSTICAlertsWin1,S-1-5-21-996632719-2361334927-4038480536-500,0xc90e957,NtLmSsp,3,NTLM,,131.107.147.209,IANHELLE-DEV17,2019-02-12 04:37:25.340
2,52b1ab41-869e-4138-9e40-2a4457f09bf0,MSTICAlertsWin1\MSTICAdmin,4624,2019-02-12 04:37:27.997,263a788b-6526-4cdc-8ed9-d79402fe4aa0,MSTICAlertsWin1,-,-,S-1-0-0,MSTICAdmin,MSTICAlertsWin1,S-1-5-21-996632719-2361334927-4038480536-500,0xc90ea44,NtLmSsp,3,NTLM,,131.107.147.209,IANHELLE-DEV17,2019-02-12 04:37:27.997
3,52b1ab41-869e-4138-9e40-2a4457f09bf0,MSTICAlertsWin1\MSTICAdmin,4624,2019-02-12 04:38:16.550,263a788b-6526-4cdc-8ed9-d79402fe4aa0,MSTICAlertsWin1,-,-,S-1-0-0,MSTICAdmin,MSTICAlertsWin1,S-1-5-21-996632719-2361334927-4038480536-500,0xc912d62,NtLmSsp,3,NTLM,,131.107.147.209,IANHELLE-DEV17,2019-02-12 04:38:16.550
4,52b1ab41-869e-4138-9e40-2a4457f09bf0,MSTICAlertsWin1\MSTICAdmin,4624,2019-02-12 04:38:21.370,263a788b-6526-4cdc-8ed9-d79402fe4aa0,MSTICAlertsWin1,-,-,S-1-0-0,MSTICAdmin,MSTICAlertsWin1,S-1-5-21-996632719-2361334927-4038480536-500,0xc913737,NtLmSsp,3,NTLM,,131.107.147.209,IANHELLE-DEV17,2019-02-12 04:38:21.370


## Multiple conditions
```
& == AND
| == OR
~ == NOT
```

*Always use parentheses around individual expressions in composite logical expressions!*

In [None]:
logons_df[
    logons_df["Account"].str.contains("MSTICAdmin")
]

In [79]:
t1 = pd.Timestamp("2019-02-12 04:37:25")
t2 = pd.to_datetime("2019-02-12 04:37:26")
t1, t2

(Timestamp('2019-02-12 04:37:25'), Timestamp('2019-02-12 04:37:26'))

In [80]:
logons_df[
    (logons_df["Account"].str.contains("MSTICAdmin"))
    &
    (logons_df["TimeGenerated"] >= t1)
    &
    (logons_df["TimeGenerated"] <= t2)
]

Unnamed: 0,TenantId,Account,EventID,TimeGenerated,SourceComputerId,Computer,SubjectUserName,SubjectDomainName,SubjectUserSid,TargetUserName,TargetDomainName,TargetUserSid,TargetLogonId,LogonProcessName,LogonType,AuthenticationPackageName,Status,IpAddress,WorkstationName,TimeCreatedUtc
1,52b1ab41-869e-4138-9e40-2a4457f09bf0,MSTICAlertsWin1\MSTICAdmin,4624,2019-02-12 04:37:25.340,263a788b-6526-4cdc-8ed9-d79402fe4aa0,MSTICAlertsWin1,-,-,S-1-0-0,MSTICAdmin,MSTICAlertsWin1,S-1-5-21-996632719-2361334927-4038480536-500,0xc90e957,NtLmSsp,3,NTLM,,131.107.147.209,IANHELLE-DEV17,2019-02-12 04:37:25.340


In [82]:
logons_df[
    ~(logons_df["Account"].str.contains("MSTICAdmin"))
    &
    (logons_df["TimeGenerated"] >= t2)
].head(5)

Unnamed: 0,TenantId,Account,EventID,TimeGenerated,SourceComputerId,Computer,SubjectUserName,SubjectDomainName,SubjectUserSid,TargetUserName,TargetDomainName,TargetUserSid,TargetLogonId,LogonProcessName,LogonType,AuthenticationPackageName,Status,IpAddress,WorkstationName,TimeCreatedUtc
0,52b1ab41-869e-4138-9e40-2a4457f09bf0,NT AUTHORITY\SYSTEM,4624,2019-02-12 04:56:34.307,263a788b-6526-4cdc-8ed9-d79402fe4aa0,MSTICAlertsWin1,MSTICAlertsWin1$,WORKGROUP,S-1-5-18,SYSTEM,NT AUTHORITY,S-1-5-18,0x3e7,Advapi,5,Negotiate,,-,-,2019-02-12 04:56:34.307
5,52b1ab41-869e-4138-9e40-2a4457f09bf0,NT AUTHORITY\SYSTEM,4624,2019-02-12 04:50:09.713,263a788b-6526-4cdc-8ed9-d79402fe4aa0,MSTICAlertsWin1,MSTICAlertsWin1$,WORKGROUP,S-1-5-18,SYSTEM,NT AUTHORITY,S-1-5-18,0x3e7,Advapi,5,Negotiate,,-,-,2019-02-12 04:50:09.713
6,52b1ab41-869e-4138-9e40-2a4457f09bf0,NT AUTHORITY\SYSTEM,4624,2019-02-12 04:50:18.660,263a788b-6526-4cdc-8ed9-d79402fe4aa0,MSTICAlertsWin1,MSTICAlertsWin1$,WORKGROUP,S-1-5-18,SYSTEM,NT AUTHORITY,S-1-5-18,0x3e7,Advapi,5,Negotiate,,-,-,2019-02-12 04:50:18.660
7,52b1ab41-869e-4138-9e40-2a4457f09bf0,NT AUTHORITY\SYSTEM,4624,2019-02-12 04:43:56.327,263a788b-6526-4cdc-8ed9-d79402fe4aa0,MSTICAlertsWin1,MSTICAlertsWin1$,WORKGROUP,S-1-5-18,SYSTEM,NT AUTHORITY,S-1-5-18,0x3e7,Advapi,5,Negotiate,,-,-,2019-02-12 04:43:56.327
8,52b1ab41-869e-4138-9e40-2a4457f09bf0,NT AUTHORITY\SYSTEM,4624,2019-02-12 04:44:10.343,263a788b-6526-4cdc-8ed9-d79402fe4aa0,MSTICAlertsWin1,MSTICAlertsWin1$,WORKGROUP,S-1-5-18,SYSTEM,NT AUTHORITY,S-1-5-18,0x3e7,Advapi,5,Negotiate,,-,-,2019-02-12 04:44:10.343


In [78]:
logons_df[
    (logons_df["LogonType"] == 3)
    &
    (logons_df["TimeGenerated"].dt.hour == 4)
    &
    (logons_df["TimeGenerated"].dt.minute == 37)
]

Unnamed: 0,TenantId,Account,EventID,TimeGenerated,SourceComputerId,Computer,SubjectUserName,SubjectDomainName,SubjectUserSid,TargetUserName,TargetDomainName,TargetUserSid,TargetLogonId,LogonProcessName,LogonType,AuthenticationPackageName,Status,IpAddress,WorkstationName,TimeCreatedUtc
1,52b1ab41-869e-4138-9e40-2a4457f09bf0,MSTICAlertsWin1\MSTICAdmin,4624,2019-02-12 04:37:25.340,263a788b-6526-4cdc-8ed9-d79402fe4aa0,MSTICAlertsWin1,-,-,S-1-0-0,MSTICAdmin,MSTICAlertsWin1,S-1-5-21-996632719-2361334927-4038480536-500,0xc90e957,NtLmSsp,3,NTLM,,131.107.147.209,IANHELLE-DEV17,2019-02-12 04:37:25.340
2,52b1ab41-869e-4138-9e40-2a4457f09bf0,MSTICAlertsWin1\MSTICAdmin,4624,2019-02-12 04:37:27.997,263a788b-6526-4cdc-8ed9-d79402fe4aa0,MSTICAlertsWin1,-,-,S-1-0-0,MSTICAdmin,MSTICAlertsWin1,S-1-5-21-996632719-2361334927-4038480536-500,0xc90ea44,NtLmSsp,3,NTLM,,131.107.147.209,IANHELLE-DEV17,2019-02-12 04:37:27.997


### Boolean indexes are Pandas series - you can save and re-use

In [86]:
logon_type_3 = logons_df["LogonType"] == 3
hour_4 = logons_df["TimeGenerated"].dt.hour == 4
minute_37 = logons_df["TimeGenerated"].dt.minute == 37

logons_df[logon_type_3 & hour_4 & minute_37]

Unnamed: 0,TenantId,Account,EventID,TimeGenerated,SourceComputerId,Computer,SubjectUserName,SubjectDomainName,SubjectUserSid,TargetUserName,TargetDomainName,TargetUserSid,TargetLogonId,LogonProcessName,LogonType,AuthenticationPackageName,Status,IpAddress,WorkstationName,TimeCreatedUtc
1,52b1ab41-869e-4138-9e40-2a4457f09bf0,MSTICAlertsWin1\MSTICAdmin,4624,2019-02-12 04:37:25.340,263a788b-6526-4cdc-8ed9-d79402fe4aa0,MSTICAlertsWin1,-,-,S-1-0-0,MSTICAdmin,MSTICAlertsWin1,S-1-5-21-996632719-2361334927-4038480536-500,0xc90e957,NtLmSsp,3,NTLM,,131.107.147.209,IANHELLE-DEV17,2019-02-12 04:37:25.340
2,52b1ab41-869e-4138-9e40-2a4457f09bf0,MSTICAlertsWin1\MSTICAdmin,4624,2019-02-12 04:37:27.997,263a788b-6526-4cdc-8ed9-d79402fe4aa0,MSTICAlertsWin1,-,-,S-1-0-0,MSTICAdmin,MSTICAlertsWin1,S-1-5-21-996632719-2361334927-4038480536-500,0xc90ea44,NtLmSsp,3,NTLM,,131.107.147.209,IANHELLE-DEV17,2019-02-12 04:37:27.997


### Pandas `str` and `dt` (datetime) accessor functions

In [83]:
str_funcs = [func for func in dir(logons_df["Account"].str) if not func.startswith("_")]
print("Pandas 'str' functions")
print("----------------------")
print(", ".join(str_funcs))
print("\nRead more here")
print("https://pandas.pydata.org/pandas-docs/stable/user_guide/text.html#method-summary")
dt_funcs = [func for func in dir(logons_df["TimeGenerated"].dt) if not func.startswith("_")]
print("\nPandas 'dt' (datetime) functions")
print("----------------------------------")
print(", ".join(dt_funcs))

Pandas 'str' functions
----------------------
capitalize, casefold, cat, center, contains, count, decode, encode, endswith, extract, extractall, find, findall, fullmatch, get, get_dummies, index, isalnum, isalpha, isdecimal, isdigit, islower, isnumeric, isspace, istitle, isupper, join, len, ljust, lower, lstrip, match, normalize, pad, partition, repeat, replace, rfind, rindex, rjust, rpartition, rsplit, rstrip, slice, slice_replace, split, startswith, strip, swapcase, title, translate, upper, wrap, zfill

Read more here
https://pandas.pydata.org/pandas-docs/stable/user_guide/text.html#method-summary

Pandas 'dt' (datetime) functions
----------------------------------
ceil, date, day, day_name, day_of_week, day_of_year, dayofweek, dayofyear, days_in_month, daysinmonth, floor, freq, hour, is_leap_year, is_month_end, is_month_start, is_quarter_end, is_quarter_start, is_year_end, is_year_start, isocalendar, microsecond, minute, month, month_name, nanosecond, normalize, quarter, round, seco

## `isin` operator/function

In [None]:
logons_df[logons_df["TargetUserName"].isin(["MSTICAdmin", "SYSTEM"])]

## query function
Useful for simpler queries - and definitely nicer-looking but some limitations - only simple operators supported.

Good for quick things but I prefer the boolean stuff for more complex queries.

To reference Python variables prefix the variable name with "@" (see second example)

In [87]:
logons_df.query("TargetUserName == 'MSTICAdmin' and TargetLogonId == '0xc90ea44'")

logons_df.query("TargetUserName == 'MSTICAdmin' and TargetLogonId == '0xc90ea44' and TimeGenerated > @t2")

Unnamed: 0,TenantId,Account,EventID,TimeGenerated,SourceComputerId,Computer,SubjectUserName,SubjectDomainName,SubjectUserSid,TargetUserName,TargetDomainName,TargetUserSid,TargetLogonId,LogonProcessName,LogonType,AuthenticationPackageName,Status,IpAddress,WorkstationName,TimeCreatedUtc
2,52b1ab41-869e-4138-9e40-2a4457f09bf0,MSTICAlertsWin1\MSTICAdmin,4624,2019-02-12 04:37:27.997,263a788b-6526-4cdc-8ed9-d79402fe4aa0,MSTICAlertsWin1,-,-,S-1-0-0,MSTICAdmin,MSTICAlertsWin1,S-1-5-21-996632719-2361334927-4038480536-500,0xc90ea44,NtLmSsp,3,NTLM,,131.107.147.209,IANHELLE-DEV17,2019-02-12 04:37:27.997


In [90]:
(
    logons_df
    .query("TargetLogonId == '0xc90ea44' and TimeGenerated > @t2")
    [logons_df["Account"].str.match("MST.*")]
)

  after removing the cwd from sys.path.


Unnamed: 0,TenantId,Account,EventID,TimeGenerated,SourceComputerId,Computer,SubjectUserName,SubjectDomainName,SubjectUserSid,TargetUserName,TargetDomainName,TargetUserSid,TargetLogonId,LogonProcessName,LogonType,AuthenticationPackageName,Status,IpAddress,WorkstationName,TimeCreatedUtc
2,52b1ab41-869e-4138-9e40-2a4457f09bf0,MSTICAlertsWin1\MSTICAdmin,4624,2019-02-12 04:37:27.997,263a788b-6526-4cdc-8ed9-d79402fe4aa0,MSTICAlertsWin1,-,-,S-1-0-0,MSTICAdmin,MSTICAlertsWin1,S-1-5-21-996632719-2361334927-4038480536-500,0xc90ea44,NtLmSsp,3,NTLM,,131.107.147.209,IANHELLE-DEV17,2019-02-12 04:37:27.997


## Combing Column Select and Query

In [91]:
(
    logons_df[logons_df["Account"].str.contains("MSTICAdmin")]
    [["Account", "TimeGenerated"]]
)

Unnamed: 0,Account,TimeGenerated
1,MSTICAlertsWin1\MSTICAdmin,2019-02-12 04:37:25.340
2,MSTICAlertsWin1\MSTICAdmin,2019-02-12 04:37:27.997
3,MSTICAlertsWin1\MSTICAdmin,2019-02-12 04:38:16.550
4,MSTICAlertsWin1\MSTICAdmin,2019-02-12 04:38:21.370


### and with column rename

In [None]:
(
    logons_df[logons_df["Account"].str.contains("MSTICAdmin")]
    [["Account", "TimeGenerated"]]
    .rename(columns={"Account": "User", "TimeGenerated": "Time"})
)

# Sorting and removing duplicates

In [122]:
logons_df.sort_values("TimeGenerated", ascending=False).head(3)

Unnamed: 0,TenantId,Account,EventID,TimeGenerated,SourceComputerId,Computer,SubjectUserName,SubjectDomainName,SubjectUserSid,TargetUserName,TargetDomainName,TargetUserSid,TargetLogonId,LogonProcessName,LogonType,AuthenticationPackageName,Status,IpAddress,WorkstationName,TimeCreatedUtc
0,52b1ab41-869e-4138-9e40-2a4457f09bf0,NT AUTHORITY\SYSTEM,4624,2019-02-12 04:56:34.307,263a788b-6526-4cdc-8ed9-d79402fe4aa0,MSTICAlertsWin1,MSTICAlertsWin1$,WORKGROUP,S-1-5-18,SYSTEM,NT AUTHORITY,S-1-5-18,0x3e7,Advapi,5,Negotiate,,-,-,2019-02-12 04:56:34.307
6,52b1ab41-869e-4138-9e40-2a4457f09bf0,NT AUTHORITY\SYSTEM,4624,2019-02-12 04:50:18.660,263a788b-6526-4cdc-8ed9-d79402fe4aa0,MSTICAlertsWin1,MSTICAlertsWin1$,WORKGROUP,S-1-5-18,SYSTEM,NT AUTHORITY,S-1-5-18,0x3e7,Advapi,5,Negotiate,,-,-,2019-02-12 04:50:18.660
5,52b1ab41-869e-4138-9e40-2a4457f09bf0,NT AUTHORITY\SYSTEM,4624,2019-02-12 04:50:09.713,263a788b-6526-4cdc-8ed9-d79402fe4aa0,MSTICAlertsWin1,MSTICAlertsWin1$,WORKGROUP,S-1-5-18,SYSTEM,NT AUTHORITY,S-1-5-18,0x3e7,Advapi,5,Negotiate,,-,-,2019-02-12 04:50:09.713


In [126]:
(
    logons_df[["Account", "LogonType"]]
    .drop_duplicates()
    .sort_values("Account")
)

Unnamed: 0,Account,LogonType
1,MSTICAlertsWin1\MSTICAdmin,3
10,NT AUTHORITY\IUSR,5
17,NT AUTHORITY\LOCAL SERVICE,5
14,NT AUTHORITY\NETWORK SERVICE,5
0,NT AUTHORITY\SYSTEM,5
12,NT AUTHORITY\SYSTEM,0
15,Window Manager\DWM-1,2


# Grouping and Aggregation
<p style="color: white; background-color: green; text-align: center">New</p>

In [94]:
logons_df.groupby("Account")

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x000001ABCCC48788>

### You need an aggregator (or iterator) make use of grouping

In [95]:
logons_df.groupby("Account").count() # Yuk!

Unnamed: 0_level_0,TenantId,EventID,TimeGenerated,SourceComputerId,Computer,SubjectUserName,SubjectDomainName,SubjectUserSid,TargetUserName,TargetDomainName,TargetUserSid,TargetLogonId,LogonProcessName,LogonType,AuthenticationPackageName,Status,IpAddress,WorkstationName,TimeCreatedUtc
Account,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
MSTICAlertsWin1\MSTICAdmin,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4
NT AUTHORITY\IUSR,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
NT AUTHORITY\LOCAL SERVICE,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
NT AUTHORITY\NETWORK SERVICE,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
NT AUTHORITY\SYSTEM,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11
Window Manager\DWM-1,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2


In [96]:
(
    logons_df[["TimeGenerated", "Account"]]
    .groupby("Account")
    .count()
    .rename(columns={"TimeGenerated": "Count"})
)

Unnamed: 0_level_0,Count
Account,Unnamed: 1_level_1
MSTICAlertsWin1\MSTICAdmin,4
NT AUTHORITY\IUSR,1
NT AUTHORITY\LOCAL SERVICE,1
NT AUTHORITY\NETWORK SERVICE,1
NT AUTHORITY\SYSTEM,11
Window Manager\DWM-1,2


In [107]:
for name, logon_group in logons_df.groupby("Account"):
    print(name, type(logon_group), "size", logon_group.shape)

print("\nCollect in dictionary")
df_dict = {name: df for name, df in logons_df.groupby("Account")}

[print(f"{name}: df") for name in df_dict]


MSTICAlertsWin1\MSTICAdmin <class 'pandas.core.frame.DataFrame'> size (4, 20)
NT AUTHORITY\IUSR <class 'pandas.core.frame.DataFrame'> size (1, 20)
NT AUTHORITY\LOCAL SERVICE <class 'pandas.core.frame.DataFrame'> size (1, 20)
NT AUTHORITY\NETWORK SERVICE <class 'pandas.core.frame.DataFrame'> size (1, 20)
NT AUTHORITY\SYSTEM <class 'pandas.core.frame.DataFrame'> size (11, 20)
Window Manager\DWM-1 <class 'pandas.core.frame.DataFrame'> size (2, 20)

Collect in dictionary
MSTICAlertsWin1\MSTICAdmin: df
NT AUTHORITY\IUSR: df
NT AUTHORITY\LOCAL SERVICE: df
NT AUTHORITY\NETWORK SERVICE: df
NT AUTHORITY\SYSTEM: df
Window Manager\DWM-1: df


[None, None, None, None, None, None]

## Grouping with Multiple aggregation functions

In [None]:
(
    logons_df[["TimeGenerated", "EventID", "Account"]]
    .groupby("Account")
    .agg({"TimeGenerated": "max", "EventID": "sum"})
    .rename(columns={"TimeGenerated": "LastTime"})
)

Unnamed: 0_level_0,LastTime,EventID
Account,Unnamed: 1_level_1,Unnamed: 2_level_1
MSTICAlertsWin1\MSTICAdmin,2019-02-15 03:57:02.593,83232
MSTICAlertsWin1\ian,2019-02-15 03:56:34.440,36992
NT AUTHORITY\IUSR,2019-02-14 04:20:56.110,9248
NT AUTHORITY\LOCAL SERVICE,2019-02-14 04:20:54.803,9248
NT AUTHORITY\NETWORK SERVICE,2019-02-14 04:20:54.630,9248
NT AUTHORITY\SYSTEM,2019-02-15 11:51:37.597,564128
Window Manager\DWM-1,2019-02-14 04:20:54.773,18496
Window Manager\DWM-2,2019-02-15 03:57:01.903,27744


## Grouping with multiple columns

In [115]:
(
    logons_full_df[["TimeGenerated", "EventID", "Account", "LogonType"]]      # DF input fields
    .groupby(["Account", "LogonType"])                                        # Grouping fields
    .agg({"TimeGenerated": "max", "EventID": "count"})                        # aggregate operations
    .rename(columns={"TimeGenerated": "LastTime", "EventID": "Count"})        # Rename output
)


Unnamed: 0_level_0,Unnamed: 1_level_0,LastTime,Count
Account,LogonType,Unnamed: 2_level_1,Unnamed: 3_level_1
MSTICAlertsWin1\MSTICAdmin,3,2019-02-15 03:57:00.207,8
MSTICAlertsWin1\MSTICAdmin,4,2019-02-14 11:51:37.603,8
MSTICAlertsWin1\MSTICAdmin,10,2019-02-15 03:57:02.593,2
MSTICAlertsWin1\ian,2,2019-02-12 20:29:51.030,2
MSTICAlertsWin1\ian,3,2019-02-15 03:56:34.440,5
MSTICAlertsWin1\ian,4,2019-02-12 20:41:17.310,1
NT AUTHORITY\IUSR,5,2019-02-14 04:20:56.110,2
NT AUTHORITY\LOCAL SERVICE,5,2019-02-14 04:20:54.803,2
NT AUTHORITY\NETWORK SERVICE,5,2019-02-14 04:20:54.630,2
NT AUTHORITY\SYSTEM,0,2019-02-14 04:20:54.370,2


## Using pd.Grouper to group by time interval

In [118]:
(
    logons_full_df[["TimeGenerated", "EventID", "Account", "LogonType"]]
    .groupby(["Account", pd.Grouper(key="TimeGenerated", freq="1D")])
    .agg({"TimeGenerated": "max", "EventID": "count"})
    .rename(columns={"TimeGenerated": "LastTime", "EventID": "Count"})
)

Unnamed: 0_level_0,Unnamed: 1_level_0,LastTime,Count
Account,TimeGenerated,Unnamed: 2_level_1,Unnamed: 3_level_1
MSTICAlertsWin1\MSTICAdmin,2019-02-09,2019-02-09 23:26:47.700,1
MSTICAlertsWin1\MSTICAdmin,2019-02-11,2019-02-11 22:47:53.750,4
MSTICAlertsWin1\MSTICAdmin,2019-02-12,2019-02-12 20:19:44.767,7
MSTICAlertsWin1\MSTICAdmin,2019-02-13,2019-02-13 23:07:23.823,2
MSTICAlertsWin1\MSTICAdmin,2019-02-14,2019-02-14 11:51:37.603,1
MSTICAlertsWin1\MSTICAdmin,2019-02-15,2019-02-15 03:57:02.593,3
MSTICAlertsWin1\ian,2019-02-12,2019-02-12 20:41:17.310,3
MSTICAlertsWin1\ian,2019-02-13,2019-02-13 00:57:37.187,3
MSTICAlertsWin1\ian,2019-02-15,2019-02-15 03:56:34.440,2
NT AUTHORITY\IUSR,2019-02-12,2019-02-12 04:40:12.360,1


# Adding and removing columns

In [148]:
new_df = logons_df.copy()

# Adding a static value
new_df["StaticValue"] = "A logon"
# Extracting a substring (there are several ways to do this)
new_df["NTDomain"] = new_df.Account.str.split("\\", 1, expand=True)[0]
# Transforming using an accessor
new_df["DayOfWeek"] = new_df.TimeGenerated.dt.day_name()
# Arithmetic calculations
new_df["BigEventID"] = new_df.EventID * 1000000
new_df["SameTimeTomorrow"] = new_df.TimeGenerated + pd.Timedelta("1D")

new_df[[
    "Account", "TimeGenerated", "StaticValue", "NTDomain", "DayOfWeek", "BigEventID", "SameTimeTomorrow"
]].head()

Unnamed: 0,Account,TimeGenerated,StaticValue,NTDomain,DayOfWeek,BigEventID,SameTimeTomorrow
99,Window Manager\DWM-2,2019-02-12 22:22:21.240,A logon,Window Manager,Tuesday,4624000000,2019-02-13 22:22:21.240
89,NT AUTHORITY\SYSTEM,2019-02-14 04:20:55.763,A logon,NT AUTHORITY,Thursday,4624000000,2019-02-15 04:20:55.763
32,NT AUTHORITY\SYSTEM,2019-02-12 06:42:08.110,A logon,NT AUTHORITY,Tuesday,4624000000,2019-02-13 06:42:08.110
3,MSTICAlertsWin1\MSTICAdmin,2019-02-12 04:38:16.550,A logon,MSTICAlertsWin1,Tuesday,4624000000,2019-02-13 04:38:16.550
123,NT AUTHORITY\SYSTEM,2019-02-12 20:39:14.110,A logon,NT AUTHORITY,Tuesday,4624000000,2019-02-13 20:39:14.110


In [138]:
new_df[["Account", "TimeGenerated", "StaticValue", "NTDomain", "DayOfWeek"]].drop(columns=["NTDomain"])

Unnamed: 0,Account,TimeGenerated,StaticValue,DayOfWeek
99,Window Manager\DWM-2,2019-02-12 22:22:21.240,A logon,Tuesday
89,NT AUTHORITY\SYSTEM,2019-02-14 04:20:55.763,A logon,Thursday
32,NT AUTHORITY\SYSTEM,2019-02-12 06:42:08.110,A logon,Tuesday
3,MSTICAlertsWin1\MSTICAdmin,2019-02-12 04:38:16.550,A logon,Tuesday
123,NT AUTHORITY\SYSTEM,2019-02-12 20:39:14.110,A logon,Tuesday
47,NT AUTHORITY\SYSTEM,2019-02-10 00:48:31.607,A logon,Sunday
45,NT AUTHORITY\SYSTEM,2019-02-10 20:03:57.417,A logon,Sunday
1,MSTICAlertsWin1\MSTICAdmin,2019-02-12 04:37:25.340,A logon,Tuesday
81,NT AUTHORITY\SYSTEM,2019-02-14 04:20:55.503,A logon,Thursday
158,Window Manager\DWM-2,2019-02-15 03:57:01.903,A logon,Friday


## Some other quick ways of filtering out (in) columns

In [139]:
logons_df.filter(regex="Target.*", axis=1).head()

Unnamed: 0,TargetUserName,TargetDomainName,TargetUserSid,TargetLogonId
99,DWM-2,Window Manager,S-1-5-90-0-2,0x106b458
89,SYSTEM,NT AUTHORITY,S-1-5-18,0x3e7
32,SYSTEM,NT AUTHORITY,S-1-5-18,0x3e7
3,MSTICAdmin,MSTICAlertsWin1,S-1-5-21-996632719-2361334927-4038480536-500,0xc912d62
123,SYSTEM,NT AUTHORITY,S-1-5-18,0x3e7


In [146]:
logons_df.select_dtypes(include="datetime").head()  # also "number", "object"

Unnamed: 0,TimeGenerated,TimeCreatedUtc
99,2019-02-12 22:22:21.240,2019-02-12 22:22:21.240
89,2019-02-14 04:20:55.763,2019-02-14 04:20:55.763
32,2019-02-12 06:42:08.110,2019-02-12 06:42:08.110
3,2019-02-12 04:38:16.550,2019-02-12 04:38:16.550
123,2019-02-12 20:39:14.110,2019-02-12 20:39:14.110


# Cleaning Data

- Nans
- Transforming
- Reshaping

# Simple Joins

(relational joins tomorrow)

In [166]:
df1 = logons_full_df[0:10]
df2 = logons_full_df[100:110]

print("df1:", df1.shape, "df2:", df2.shape)
df1.head(3)

df1: (10, 20) df2: (10, 20)


Unnamed: 0,TenantId,Account,EventID,TimeGenerated,SourceComputerId,Computer,SubjectUserName,SubjectDomainName,SubjectUserSid,TargetUserName,TargetDomainName,TargetUserSid,TargetLogonId,LogonProcessName,LogonType,AuthenticationPackageName,Status,IpAddress,WorkstationName,TimeCreatedUtc
0,52b1ab41-869e-4138-9e40-2a4457f09bf0,NT AUTHORITY\SYSTEM,4624,2019-02-12 04:56:34.307,263a788b-6526-4cdc-8ed9-d79402fe4aa0,MSTICAlertsWin1,MSTICAlertsWin1$,WORKGROUP,S-1-5-18,SYSTEM,NT AUTHORITY,S-1-5-18,0x3e7,Advapi,5,Negotiate,,-,-,2019-02-12 04:56:34.307
1,52b1ab41-869e-4138-9e40-2a4457f09bf0,MSTICAlertsWin1\MSTICAdmin,4624,2019-02-12 04:37:25.340,263a788b-6526-4cdc-8ed9-d79402fe4aa0,MSTICAlertsWin1,-,-,S-1-0-0,MSTICAdmin,MSTICAlertsWin1,S-1-5-21-996632719-2361334927-4038480536-500,0xc90e957,NtLmSsp,3,NTLM,,131.107.147.209,IANHELLE-DEV17,2019-02-12 04:37:25.340
2,52b1ab41-869e-4138-9e40-2a4457f09bf0,MSTICAlertsWin1\MSTICAdmin,4624,2019-02-12 04:37:27.997,263a788b-6526-4cdc-8ed9-d79402fe4aa0,MSTICAlertsWin1,-,-,S-1-0-0,MSTICAdmin,MSTICAlertsWin1,S-1-5-21-996632719-2361334927-4038480536-500,0xc90ea44,NtLmSsp,3,NTLM,,131.107.147.209,IANHELLE-DEV17,2019-02-12 04:37:27.997


In [168]:
joined_df = pd.concat([df1, df2])

print(joined_df.shape)
joined_df


(20, 20)


Unnamed: 0,TenantId,Account,EventID,TimeGenerated,SourceComputerId,Computer,SubjectUserName,SubjectDomainName,SubjectUserSid,TargetUserName,TargetDomainName,TargetUserSid,TargetLogonId,LogonProcessName,LogonType,AuthenticationPackageName,Status,IpAddress,WorkstationName,TimeCreatedUtc
0,52b1ab41-869e-4138-9e40-2a4457f09bf0,NT AUTHORITY\SYSTEM,4624,2019-02-12 04:56:34.307,263a788b-6526-4cdc-8ed9-d79402fe4aa0,MSTICAlertsWin1,MSTICAlertsWin1$,WORKGROUP,S-1-5-18,SYSTEM,NT AUTHORITY,S-1-5-18,0x3e7,Advapi,5,Negotiate,,-,-,2019-02-12 04:56:34.307
1,52b1ab41-869e-4138-9e40-2a4457f09bf0,MSTICAlertsWin1\MSTICAdmin,4624,2019-02-12 04:37:25.340,263a788b-6526-4cdc-8ed9-d79402fe4aa0,MSTICAlertsWin1,-,-,S-1-0-0,MSTICAdmin,MSTICAlertsWin1,S-1-5-21-996632719-2361334927-4038480536-500,0xc90e957,NtLmSsp,3,NTLM,,131.107.147.209,IANHELLE-DEV17,2019-02-12 04:37:25.340
2,52b1ab41-869e-4138-9e40-2a4457f09bf0,MSTICAlertsWin1\MSTICAdmin,4624,2019-02-12 04:37:27.997,263a788b-6526-4cdc-8ed9-d79402fe4aa0,MSTICAlertsWin1,-,-,S-1-0-0,MSTICAdmin,MSTICAlertsWin1,S-1-5-21-996632719-2361334927-4038480536-500,0xc90ea44,NtLmSsp,3,NTLM,,131.107.147.209,IANHELLE-DEV17,2019-02-12 04:37:27.997
3,52b1ab41-869e-4138-9e40-2a4457f09bf0,MSTICAlertsWin1\MSTICAdmin,4624,2019-02-12 04:38:16.550,263a788b-6526-4cdc-8ed9-d79402fe4aa0,MSTICAlertsWin1,-,-,S-1-0-0,MSTICAdmin,MSTICAlertsWin1,S-1-5-21-996632719-2361334927-4038480536-500,0xc912d62,NtLmSsp,3,NTLM,,131.107.147.209,IANHELLE-DEV17,2019-02-12 04:38:16.550
4,52b1ab41-869e-4138-9e40-2a4457f09bf0,MSTICAlertsWin1\MSTICAdmin,4624,2019-02-12 04:38:21.370,263a788b-6526-4cdc-8ed9-d79402fe4aa0,MSTICAlertsWin1,-,-,S-1-0-0,MSTICAdmin,MSTICAlertsWin1,S-1-5-21-996632719-2361334927-4038480536-500,0xc913737,NtLmSsp,3,NTLM,,131.107.147.209,IANHELLE-DEV17,2019-02-12 04:38:21.370
5,52b1ab41-869e-4138-9e40-2a4457f09bf0,NT AUTHORITY\SYSTEM,4624,2019-02-12 04:50:09.713,263a788b-6526-4cdc-8ed9-d79402fe4aa0,MSTICAlertsWin1,MSTICAlertsWin1$,WORKGROUP,S-1-5-18,SYSTEM,NT AUTHORITY,S-1-5-18,0x3e7,Advapi,5,Negotiate,,-,-,2019-02-12 04:50:09.713
6,52b1ab41-869e-4138-9e40-2a4457f09bf0,NT AUTHORITY\SYSTEM,4624,2019-02-12 04:50:18.660,263a788b-6526-4cdc-8ed9-d79402fe4aa0,MSTICAlertsWin1,MSTICAlertsWin1$,WORKGROUP,S-1-5-18,SYSTEM,NT AUTHORITY,S-1-5-18,0x3e7,Advapi,5,Negotiate,,-,-,2019-02-12 04:50:18.660
7,52b1ab41-869e-4138-9e40-2a4457f09bf0,NT AUTHORITY\SYSTEM,4624,2019-02-12 04:43:56.327,263a788b-6526-4cdc-8ed9-d79402fe4aa0,MSTICAlertsWin1,MSTICAlertsWin1$,WORKGROUP,S-1-5-18,SYSTEM,NT AUTHORITY,S-1-5-18,0x3e7,Advapi,5,Negotiate,,-,-,2019-02-12 04:43:56.327
8,52b1ab41-869e-4138-9e40-2a4457f09bf0,NT AUTHORITY\SYSTEM,4624,2019-02-12 04:44:10.343,263a788b-6526-4cdc-8ed9-d79402fe4aa0,MSTICAlertsWin1,MSTICAlertsWin1$,WORKGROUP,S-1-5-18,SYSTEM,NT AUTHORITY,S-1-5-18,0x3e7,Advapi,5,Negotiate,,-,-,2019-02-12 04:44:10.343
9,52b1ab41-869e-4138-9e40-2a4457f09bf0,NT AUTHORITY\SYSTEM,4624,2019-02-12 04:40:11.867,263a788b-6526-4cdc-8ed9-d79402fe4aa0,MSTICAlertsWin1,MSTICAlertsWin1$,WORKGROUP,S-1-5-18,SYSTEM,NT AUTHORITY,S-1-5-18,0x3e7,Advapi,5,Negotiate,,-,-,2019-02-12 04:40:11.867


In [172]:
df_list = [df1] * 3 + [df2] * 5
joined_df = pd.concat(df_list, ignore_index=True)

print(joined_df.shape)
joined_df.tail(5)

(80, 20)


Unnamed: 0,TenantId,Account,EventID,TimeGenerated,SourceComputerId,Computer,SubjectUserName,SubjectDomainName,SubjectUserSid,TargetUserName,TargetDomainName,TargetUserSid,TargetLogonId,LogonProcessName,LogonType,AuthenticationPackageName,Status,IpAddress,WorkstationName,TimeCreatedUtc
75,52b1ab41-869e-4138-9e40-2a4457f09bf0,NT AUTHORITY\SYSTEM,4624,2019-02-13 04:44:32.913,263a788b-6526-4cdc-8ed9-d79402fe4aa0,MSTICAlertsWin1,MSTICAlertsWin1$,WORKGROUP,S-1-5-18,SYSTEM,NT AUTHORITY,S-1-5-18,0x3e7,Advapi,5,Negotiate,,-,-,2019-02-13 04:44:32.913
76,52b1ab41-869e-4138-9e40-2a4457f09bf0,NT AUTHORITY\SYSTEM,4624,2019-02-13 03:15:18.813,263a788b-6526-4cdc-8ed9-d79402fe4aa0,MSTICAlertsWin1,MSTICAlertsWin1$,WORKGROUP,S-1-5-18,SYSTEM,NT AUTHORITY,S-1-5-18,0x3e7,Advapi,5,Negotiate,,-,-,2019-02-13 03:15:18.813
77,52b1ab41-869e-4138-9e40-2a4457f09bf0,NT AUTHORITY\SYSTEM,4624,2019-02-13 20:08:47.880,263a788b-6526-4cdc-8ed9-d79402fe4aa0,MSTICAlertsWin1,MSTICAlertsWin1$,WORKGROUP,S-1-5-18,SYSTEM,NT AUTHORITY,S-1-5-18,0x3e7,Advapi,5,Negotiate,,-,-,2019-02-13 20:08:47.880
78,52b1ab41-869e-4138-9e40-2a4457f09bf0,NT AUTHORITY\SYSTEM,4624,2019-02-12 21:53:36.280,263a788b-6526-4cdc-8ed9-d79402fe4aa0,MSTICAlertsWin1,MSTICAlertsWin1$,WORKGROUP,S-1-5-18,SYSTEM,NT AUTHORITY,S-1-5-18,0x3e7,Advapi,5,Negotiate,,-,-,2019-02-12 21:53:36.280
79,52b1ab41-869e-4138-9e40-2a4457f09bf0,NT AUTHORITY\SYSTEM,4624,2019-02-12 21:53:53.453,263a788b-6526-4cdc-8ed9-d79402fe4aa0,MSTICAlertsWin1,MSTICAlertsWin1$,WORKGROUP,S-1-5-18,SYSTEM,NT AUTHORITY,S-1-5-18,0x3e7,Advapi,5,Negotiate,,-,-,2019-02-12 21:53:53.453


## Joining columns (horizontal)

In [155]:
df_col_1 = logons_full_df[0:10].filter(regex="Subject.*")
df_col_2 = logons_full_df[0:10].filter(regex="Target.*")
print(df_col_1.shape, df_col_2.shape)
df_col_1.head()

(10, 3) (10, 4)


Unnamed: 0,SubjectUserName,SubjectDomainName,SubjectUserSid
0,MSTICAlertsWin1$,WORKGROUP,S-1-5-18
1,-,-,S-1-0-0
2,-,-,S-1-0-0
3,-,-,S-1-0-0
4,-,-,S-1-0-0


In [160]:
pd.concat([df_col_1, df_col_2], axis="columns")

Unnamed: 0,SubjectUserName,SubjectDomainName,SubjectUserSid,TargetUserName,TargetDomainName,TargetUserSid,TargetLogonId
0,MSTICAlertsWin1$,WORKGROUP,S-1-5-18,SYSTEM,NT AUTHORITY,S-1-5-18,0x3e7
1,-,-,S-1-0-0,MSTICAdmin,MSTICAlertsWin1,S-1-5-21-996632719-2361334927-4038480536-500,0xc90e957
2,-,-,S-1-0-0,MSTICAdmin,MSTICAlertsWin1,S-1-5-21-996632719-2361334927-4038480536-500,0xc90ea44
3,-,-,S-1-0-0,MSTICAdmin,MSTICAlertsWin1,S-1-5-21-996632719-2361334927-4038480536-500,0xc912d62
4,-,-,S-1-0-0,MSTICAdmin,MSTICAlertsWin1,S-1-5-21-996632719-2361334927-4038480536-500,0xc913737
5,MSTICAlertsWin1$,WORKGROUP,S-1-5-18,SYSTEM,NT AUTHORITY,S-1-5-18,0x3e7
6,MSTICAlertsWin1$,WORKGROUP,S-1-5-18,SYSTEM,NT AUTHORITY,S-1-5-18,0x3e7
7,MSTICAlertsWin1$,WORKGROUP,S-1-5-18,SYSTEM,NT AUTHORITY,S-1-5-18,0x3e7
8,MSTICAlertsWin1$,WORKGROUP,S-1-5-18,SYSTEM,NT AUTHORITY,S-1-5-18,0x3e7
9,MSTICAlertsWin1$,WORKGROUP,S-1-5-18,SYSTEM,NT AUTHORITY,S-1-5-18,0x3e7
