# Local: Tokenizing & Embedding
---
* Collaborators:
    * Roberto Rodriguez (@Cyb3rWard0g)
* References:
    * https://huggingface.co/docs/transformers/main/tokenizer_summary
    * https://huggingface.co/docs/transformers/model_doc/big_bird

## Download Data

In [56]:
import os

repository_url = 'https://github.com/OTRF/Security-Datasets/raw/master/datasets/compound/GoldenSAMLADFSMailAccess/Microsoft365DefenderEvents.Zip'
output_directory = './'  # Destination directory for unzipped files

# Extract the filename from the URL
file_name = repository_url.split('/')[-1]

# Check if the file already exists in the output directory
download_required = not os.path.exists(os.path.join(output_directory, file_name))

if download_required:
    import requests
    import zipfile
    import io

    # Download the zip file
    response = requests.get(repository_url)

    if response.status_code == 200:
        # Create the output directory if it doesn't exist
        if not os.path.exists(output_directory):
            os.makedirs(output_directory)

        # Unzip the downloaded file to the output directory
        with zipfile.ZipFile(io.BytesIO(response.content), 'r') as zip_ref:
            zip_ref.extractall(output_directory)

        print(f"Downloaded and unzipped '{file_name}' to '{output_directory}'")
    else:
        print(f"Failed to download '{file_name}' from GitHub. Status code: {response.status_code}")
else:
    print(f"File '{file_name}' already exists in '{output_directory}', no need to download.")

File 'Microsoft365DefenderEvents.Zip' already exists in './', no need to download.


## Read Event Logs

In [31]:
import json

file_path = "Microsoft365DefenderEvents.json"

# Open and read the JSON file
with open(file_path, 'r') as file:
    data = file.read()

# Parse the JSON data
json_data = [json.loads(line) for line in data.splitlines()]

In [32]:
json_data[7]

{'Timestamp': '2021-08-02T13:32:07Z',
 'ActionType': 'MailItemsAccessed',
 'Application': 'Microsoft Exchange Online',
 'ApplicationId': 20893,
 'AccountObjectId': '5a95e683-08ad-424e-a441-1d1aec52c02c',
 'AccountDisplayName': 'SimuLandApp',
 'IsAdminOperation': 0,
 'DeviceType': 'Other',
 'OSPlatform': 'Unknown',
 'IPAddress': '1.2.3.4',
 'IsAnonymousProxy': 0,
 'CountryCode': 'US',
 'City': 'chicago',
 'ISP': 'Microsoft 365 Common and Office Online server',
 'UserAgent_dynamic': None,
 'UserAgent_string': 'Client=REST;;',
 'ActivityType': 'Run',
 'ActivityObjects': [{'ServiceObjectType': 'Session ID',
   'Type': 'Structured object',
   'Role': 'Parameter'},
  {'Type': 'Task', 'Role': 'Target object', 'Name': 'MailItemsAccessed'},
  {'Type': 'Property',
   'Role': 'Parameter',
   'Name': 'MailAccessType',
   'Value': 'Bind'},
  {'Type': 'Property',
   'Role': 'Parameter',
   'Name': 'IsThrottled',
   'Value': 'False'},
  {'ApplicationInstance': 0,
   'ApplicationId': 11161,
   'Type':

## Tokenization

In [45]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("google/bigbird-roberta-base")

# Tokenize an event log
tokens = tokenizer.tokenize(json.dumps(json_data[7]))

### Raw Example Log - Index 7

{"Timestamp":"2021-08-02T13:32:07Z","ActionType":"MailItemsAccessed","Application":"Microsoft Exchange Online","ApplicationId":20893,"AccountObjectId":"5a95e683-08ad-424e-a441-1d1aec52c02c","AccountDisplayName":"SimuLandApp","IsAdminOperation":0,"DeviceType":"Other","OSPlatform":"Unknown","IPAddress":"1.2.3.4","IsAnonymousProxy":0,"CountryCode":"US","City":"chicago","ISP":"Microsoft 365 Common and Office Online server","UserAgent_dynamic":null,"UserAgent_string":"Client=REST;;","ActivityType":"Run","ActivityObjects":[{"ServiceObjectType":"Session ID","Type":"Structured object","Role":"Parameter"},{"Type":"Task","Role":"Target object","Name":"MailItemsAccessed"},{"Type":"Property","Role":"Parameter","Name":"MailAccessType","Value":"Bind"},{"Type":"Property","Role":"Parameter","Name":"IsThrottled","Value":"False"},{"ApplicationInstance":0,"ApplicationId":11161,"Type":"User","Role":"Parameter","Name":"Gustavo Pedro","Id":"aead923d-498b-4f64-a66c-2af91447a8b6"},{"ApplicationInstance":0,"ApplicationId":11161,"Type":"Account","Role":"Actor","Name":"SimuLandApp","Id":"5a95e683-08ad-424e-a441-1d1aec52c02c"}],"ObjectName":"MailItemsAccessed","ObjectType":"Task","ObjectId":"","ReportId":"106830890_20893_699e0b10-1c53-403e-976f-ce0847a92b44","AdditionalFields":{"IsSatelliteProvider":false},"UserId":"","Permissions":null,"PermissionsAddedTo":"","RawEventData":{"OrganizationId":"00000000-0000-0000-0000-000000000000","CreationTime":"2021-08-02T13:32:07.0000000Z","RecordType":50,"Operation":"MailItemsAccessed","UserType":0,"Workload":"Exchange","Version":1,"UserKey":"100320015858B802","UserId":"pgustavo@simulandlabs.com","OriginatingServer":"AB1CD23EF4567 (15.20.4200.000)\r\n","InternalLogonType":0,"OrganizationName":"simulandlabs.onmicrosoft.com","ClientInfoString":"Client=REST;;","MailboxOwnerSid":"S-1-5-21-1825954961-3338807533-2873504967-26087451","ClientIPAddress":"1.2.3.4","MailboxOwnerUPN":"pgustavo@simulandlabs.com","ExternalAccess":false,"ResultStatus":"Succeeded","Id":"699e0b10-1c53-403e-976f-ce0847a92b44","LogonUserSid":"S-1-5-21-1825954961-3338807533-2873504967-26087451","MailboxGuid":"d0c5f8ae-9ed7-4e46-bfdf-ea1460f5a31b","LogonType":0,"OperationProperties":["@{Value=Bind; Name=MailAccessType}","@{Value=False; Name=IsThrottled}"],"OperationCount":7,"AppId":"00000003-0000-0000-c000-000000000000","Folders":["@{Id=LgAAAAAM7KyTTmWeRac2KXBEz/7aAQARGHK+grzLTpRJraC1QR6kAAAAAAEMAAAB; Path=\\Inbox; FolderItems=System.Object[]}"],"ClientAppId":"5a95e683-08ad-424e-a441-1d1aec52c02c"},"spnID":"","rawData":{"OrganizationId":"00000000-0000-0000-0000-000000000000","CreationTime":"2021-08-02T13:32:07.0000000Z","RecordType":50,"Operation":"MailItemsAccessed","UserType":0,"Workload":"Exchange","Version":1,"UserKey":"100320015858B802","UserId":"pgustavo@simulandlabs.com","OriginatingServer":"AB1CD23EF4567 (15.20.4200.000)\r\n","InternalLogonType":0,"OrganizationName":"simulandlabs.onmicrosoft.com","ClientInfoString":"Client=REST;;","MailboxOwnerSid":"S-1-5-21-1825954961-3338807533-2873504967-26087451","ClientIPAddress":"1.2.3.4","MailboxOwnerUPN":"pgustavo@simulandlabs.com","ExternalAccess":false,"ResultStatus":"Succeeded","Id":"699e0b10-1c53-403e-976f-ce0847a92b44","LogonUserSid":"S-1-5-21-1825954961-3338807533-2873504967-26087451","MailboxGuid":"d0c5f8ae-9ed7-4e46-bfdf-ea1460f5a31b","LogonType":0,"OperationProperties":["@{Value=Bind; Name=MailAccessType}","@{Value=False; Name=IsThrottled}"],"OperationCount":7,"AppId":"00000003-0000-0000-c000-000000000000","Folders":["@{Id=LgAAAAAM7KyTTmWeRac2KXBEz/7aAQARGHK+grzLTpRJraC1QR6kAAAAAAEMAAAB; Path=\\Inbox; FolderItems=System.Object[]}"],"ClientAppId":"5a95e683-08ad-424e-a441-1d1aec52c02c"},"AppId":"00000003-0000-0000-c000-000000000000","OAuthAppId":"5a95e683-08ad-424e-a441-1d1aec52c02c","TargetAccountUpn":"","TargetAccountDisplayName":"","TargetDeviceName":"","DestinationDeviceName":"","DestinationIPAddress":"","DestinationPort":null,"Protocol":"","AccountName":"","AccountDomain":"","AccountUpn":"","AccountSid":"","DeviceName":"","Port":null,"Location":""}

In [46]:
tokens[:10]

['▁{"', 'Tim', 'estamp', '":', '▁"', '2', '021', '-', '08', '-']

In [47]:
import torch

# Convert tokens to input IDs
input_ids = tokenizer.convert_tokens_to_ids(tokens)

# Create a PyTorch tensor
input_ids = torch.tensor(input_ids).unsqueeze(0)  # Add a batch dimension

## Embeddings

In [49]:
from transformers import BigBirdModel

model = BigBirdModel.from_pretrained("google/bigbird-roberta-base")

# Get BigBird Embeddings
with torch.no_grad():
    outputs = model(input_ids)

embeddings = outputs.last_hidden_state

In [55]:
embeddings

tensor([[[ 0.1869,  0.0935, -0.0685,  ..., -0.0211, -0.1009, -0.0862],
         [ 0.0773, -0.1688, -0.1072,  ...,  0.1626,  0.1574,  0.1664],
         [ 0.1790, -0.0747,  0.0009,  ...,  0.2164,  0.1874,  0.1451],
         ...,
         [ 0.1302, -0.0763, -0.1211,  ...,  0.0049, -0.3677, -0.0096],
         [ 0.3301, -0.5417, -0.3299,  ..., -0.2153, -0.4639, -0.1000],
         [ 0.1222, -0.2770, -0.0157,  ...,  0.0253, -0.0452,  0.0553]]])