In [None]:
# Mount Google Drive at /content/drive; a later cell copies the fine-tuned
# GPT-2 directory into /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# **Training GPT2 with custom dataset**

In [None]:
# Install/upgrade the libraries used by the GPT-2 fine-tuning cells below.
# NOTE(review): versions are unpinned and -U upgrades to the latest release,
# so a rerun at a later date may resolve to different versions.
!pip install transformers torch accelerate -U

In [None]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel, TrainingArguments, Trainer

# 1. Load and Preprocess Data:

# Load the datasets as raw text. The context managers close each file handle
# as soon as it has been read instead of leaving it open for the whole session.
with open('/content/extended_logs.csv', 'r') as log_file:
    log_data = log_file.read()
with open('/content/extended_rules.csv', 'r') as rule_file:
    rule_data = rule_file.read()

# Combine the data into a single training corpus (logs first, then rules).
# NOTE(review): the two files are joined with no separator, so if the log file
# does not end with a newline its last row runs into the first row of the rules.
combined_data = log_data + rule_data

# Initialize the GPT-2 tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")


In [None]:
def chunk_data(data, chunk_size=1024, overlap=100):
    """Split ``data`` into overlapping text chunks of at most ``chunk_size`` tokens.

    The text is tokenized with the module-level ``tokenizer``. Windows of
    ``chunk_size`` tokens are taken, each starting ``chunk_size - overlap``
    tokens after the previous one, and every window is converted back to a
    string. Chunking stops with the first window that reaches the end of the
    token stream, so no trailing chunk made only of already-covered overlap
    tokens is produced.

    Args:
        data: Text to split.
        chunk_size: Maximum number of tokens per chunk; must be positive.
        overlap: Number of tokens shared by consecutive chunks; must satisfy
            ``0 <= overlap < chunk_size`` (otherwise the window would never
            advance).

    Returns:
        List of chunk strings in document order; empty when ``data`` produces
        no tokens.

    Raises:
        ValueError: If ``chunk_size`` or ``overlap`` is out of range.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    if not 0 <= overlap < chunk_size:
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < chunk_size, got overlap={overlap}, chunk_size={chunk_size}"
        )

    tokenized_data = tokenizer.tokenize(data)
    total_tokens = len(tokenized_data)
    stride = chunk_size - overlap  # distance between consecutive window starts
    chunked_data = []

    # Create overlapping segments of the data
    for start in range(0, total_tokens, stride):
        end = start + chunk_size
        chunk = tokenized_data[start:end]
        chunked_data.append(tokenizer.convert_tokens_to_string(chunk))
        if end >= total_tokens:
            # This window already covers the tail; a further window would only
            # repeat tokens that are part of this chunk.
            break

    return chunked_data

# Break the combined corpus into overlapping text chunks
chunked_combined_data = chunk_data(combined_data)

# Encode every chunk as a PyTorch tensor of token ids, truncated to 1024 tokens
tokenized_chunks = [
    tokenizer.encode(
        text_chunk,
        truncation=True,
        max_length=1024,
        return_tensors="pt",
    )
    for text_chunk in chunked_combined_data
]

In [None]:
# Load the pretrained "gpt2" checkpoint with its language-modeling head;
# this is the model the Trainer cell below fine-tunes.
model = GPT2LMHeadModel.from_pretrained("gpt2")
# Use the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token



In [None]:
from torch.utils.data import Dataset

class CustomDataset(Dataset):
    """Map-style dataset over pre-tokenized texts for causal-LM training.

    Each item exposes the same token ids under both ``input_ids`` and
    ``labels``.
    """

    def __init__(self, tokenized_texts):
        # Keep a reference to the caller's sequence; nothing is copied.
        self.tokenized_texts = tokenized_texts

    def __len__(self):
        """Return the number of stored examples."""
        return len(self.tokenized_texts)

    def __getitem__(self, idx):
        """Return example ``idx`` with the ids used as both inputs and labels."""
        token_ids = self.tokenized_texts[idx]
        return dict(input_ids=token_ids, labels=token_ids)


# Wrap the encoded chunks; this object is passed to the Trainer as train_dataset.
dataset = CustomDataset(tokenized_chunks)

In [None]:

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",  # output directory for this run
    overwrite_output_dir=True,
    num_train_epochs=1,  # a single pass over the chunked corpus
    per_device_train_batch_size=1,  # one chunk per device per step
    # NOTE(review): the recorded run below finished at global_step=6233, which
    # is below save_steps, so presumably no intermediate checkpoint was written
    # during training — confirm whether that is intended.
    save_steps=10_000,
    save_total_limit=2,
)

# Bundle the model, the arguments, the chunk dataset and the tokenizer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    tokenizer=tokenizer,
)

# Fine-tune the model
trainer.train()




Step,Training Loss
500,1.0402
1000,0.9985
1500,0.9873
2000,0.9807
2500,0.981
3000,0.9738
3500,0.9715
4000,0.97
4500,0.9681
5000,0.9679


TrainOutput(global_step=6233, training_loss=0.9801929803753711, metrics={'train_runtime': 2377.7394, 'train_samples_per_second': 2.621, 'train_steps_per_second': 2.621, 'total_flos': 3257149093632000.0, 'train_loss': 0.9801929803753711, 'epoch': 1.0})

In [None]:
# Save the fine-tuned model and the tokenizer into the same directory;
# later cells reload both from "./fine_tuned_gpt2".
model.save_pretrained("./fine_tuned_gpt2")
tokenizer.save_pretrained("./fine_tuned_gpt2")

# Load and use the fine-tuned model



('./fine_tuned_gpt2/tokenizer_config.json',
 './fine_tuned_gpt2/special_tokens_map.json',
 './fine_tuned_gpt2/vocab.json',
 './fine_tuned_gpt2/merges.txt',
 './fine_tuned_gpt2/added_tokens.json')

In [None]:
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Load the Fine-Tuned GPT-2 Model and Tokenizer
model = GPT2LMHeadModel.from_pretrained("./fine_tuned_gpt2")
tokenizer = GPT2Tokenizer.from_pretrained("./fine_tuned_gpt2")

# Define the log entry and create a more explicit prompt for insights
log_entry = "UserID: user5000 Action: ACCESS Resource: /resource/99 Timestamp: 2023-08-19 14:45:23 IP Address: 192.168.2.50 Status: FAILURE"
prompt = f"Analyze the log entry below:\n\n{log_entry}\n\nWhat does this log entry suggest about the user's activity and were there any issues?"

# Encode the prompt. It is a single unpadded sequence, so the attention mask
# is all ones with the same shape and dtype as the ids.
input_ids = tokenizer.encode(prompt, return_tensors="pt")
attention_mask = torch.ones_like(input_ids)

# Generate a continuation of the prompt
output = model.generate(input_ids,
                        attention_mask=attention_mask,
                        pad_token_id=tokenizer.eos_token_id,
                        max_length=300,  # Upper bound on prompt + generated tokens
                        do_sample=True,  # Without sampling, temperature is ignored (greedy decoding)
                        temperature=0.7,  # Below 1.0 sharpens the distribution, giving less random samples
                        num_return_sequences=1)

# Decode the output and keep only the text that follows the end of the prompt
insights = tokenizer.decode(output[0], skip_special_tokens=True)
generated_insights = insights.split("were there any issues?")[-1].strip()

print("Generated Insights:")
print(generated_insights)


Generated Insights:
Response time: 10:00:27.067897,user100,ACCESS,/resource/9,192.168.1.71,FAILURE,session91858,Chrome/90.0,POST,836,404
2023-08-02 06:33:27.067900,user100,ACCESS,/resource/7,192.168.1.71,FAILURE,session91858,Edge/18.0,PUT,868,400
2023-08-02 06:33:27.068019,user100,MODIFY,/resource/6,192.168.1.71,FAILURE,session91858,Edge/18.0,POST,868,404
2023-08-02 06:33:27.068019,user100,MODIFY,/resource/8,192.168.1.71,FAILURE,session91858,Edge/18.0,PUT,868


In [None]:
import re
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import torch

# Load the Fine-Tuned GPT-2 Model and Tokenizer
# (same "./fine_tuned_gpt2" directory that the save cell above writes to)
model = GPT2LMHeadModel.from_pretrained("./fine_tuned_gpt2")
tokenizer = GPT2Tokenizer.from_pretrained("./fine_tuned_gpt2")

# Sample rule patterns
# Each value is a regular expression searched against one raw log entry such as
#   "UserID: <id> Action: <action> Resource: <path> Timestamp: <date> <time> ..."
patterns = {
    # Access to /resource/X with a time whose hour is 20-23 or 00-06.
    # The three look-aheads make the rule independent of field order: the
    # previous pattern required "Timestamp" to come before "Resource", which is
    # the opposite of the order used by the log entries, so it never matched.
    # Group 1 captures the user id, group 2 the offending time.
    'after_hours_access': (
        r'^(?=.*\bUserID: (\S+))'
        r'(?=.*\bResource: /resource/X(?:\s|$))'
        r'(?=.*\bTimestamp: \S+ (2[0-3]:\d{2}:\d{2}|0[0-6]:\d{2}:\d{2})\b)'
    ),
    # ... add more patterns for different rules
}

def check_violations_with_llm(log_entry):
    """Check a log entry against the explicit rules, falling back to the LLM.

    Args:
        log_entry: A single-line log entry string.

    Returns:
        ``"Violation Detected: <rule name>"`` for the first pattern in
        ``patterns`` that matches; otherwise the last line of the text
        generated by the module-level ``model`` for an analysis prompt.
    """
    for rule, pattern in patterns.items():
        if re.search(pattern, log_entry):
            return f"Violation Detected: {rule}"

    # If no explicit rule matches, consult the LLM for insights
    prompt = f"Please provide a clear analysis of the following log entry for any potential compliance issues:\n{log_entry}"

    input_ids = tokenizer.encode(prompt, return_tensors="pt")
    attention_mask = torch.ones(input_ids.shape, dtype=torch.long)  # Single unpadded sequence: attend to every token

    output = model.generate(input_ids,
                            attention_mask=attention_mask,
                            pad_token_id=tokenizer.eos_token_id,
                            max_length=300,
                            do_sample=True,  # Without sampling, temperature is ignored (greedy decoding)
                            temperature=0.7,
                            num_return_sequences=1)

    # Keep only the final line of the decoded text (prompt + continuation)
    insights = tokenizer.decode(output[0], skip_special_tokens=True).split("\n")[-1]

    return insights

# Sample log entry for testing: an access to /resource/X timestamped 23:45:23
log_entry = "UserID: user123 Action: ACCESS Resource: /resource/X Timestamp: 2023-08-19 23:45:23 IP Address: 192.168.2.50 Status: SUCCESS"
result = check_violations_with_llm(log_entry)
print(result)


2023-08-02 06:33:27.549013,user77,MODIFY,/resource/9,192.168.1.71,SUCCESS,session91858,Edge/18


# **Using LSTM**

In [None]:
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

# Load data
logs_df = pd.read_csv('/content/extended_logs.csv')
rules_df = pd.read_csv('/content/extended_rules.csv')

# Combine columns to form a single log entry (all columns, space-separated)
logs_df["combined_logs"] = logs_df.apply(lambda row: ' '.join(row.astype(str)), axis=1)
logs = logs_df["combined_logs"].values

# Tokenization
# NOTE: this rebinds `tokenizer`, which earlier cells use for the GPT-2 tokenizer.
tokenizer = Tokenizer(num_words=10000, oov_token='<OOV>')
tokenizer.fit_on_texts(logs)
sequences = tokenizer.texts_to_sequences(logs)
padded_sequences = pad_sequences(sequences, padding='post')

# For simplicity, generate random labels; ideally, they should be derived from actual compliance checks.
import numpy as np
# Seed NumPy so the placeholder labels are the same on every run, matching the
# fixed random_state used for the split below.
np.random.seed(42)
labels = np.random.randint(0, 2, size=len(logs))

# Split data
X_train, X_val, y_train, y_val = train_test_split(padded_sequences, labels, test_size=0.2, random_state=42)


In [None]:
import tensorflow as tf

# Binary classifier over the padded token-id sequences:
# embedding -> two stacked bidirectional LSTMs -> dense head.
# NOTE: this rebinds `model`, which earlier cells use for the GPT-2 network.
model = tf.keras.Sequential([
    # Vocabulary size 10000 mirrors num_words given to the Keras Tokenizer; 64-dim embeddings
    tf.keras.layers.Embedding(10000, 64, input_length=padded_sequences.shape[1]),
    # return_sequences=True keeps the per-timestep outputs for the next LSTM layer
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences=True)),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)),
    tf.keras.layers.Dense(64, activation='relu'),
    # Single sigmoid unit, paired with the binary cross-entropy loss below
    tf.keras.layers.Dense(1, activation='sigmoid')
])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])


In [None]:
# Train for 5 epochs with batches of 64, passing the validation split for evaluation
history = model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=5, batch_size=64)


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [None]:
# Evaluate on the validation split.
# NOTE: the labels were generated randomly in the data-preparation cell, so an
# accuracy near 50% is the expected outcome, not a sign of a training problem.
loss, accuracy = model.evaluate(X_val, y_val)
print(f"Validation Accuracy: {accuracy*100:.2f}%")

# Predict on actual logs
sample_logs = logs[:5]  # Extracting the first 5 logs from the dataset for demonstration
sample_sequences = tokenizer.texts_to_sequences(sample_logs)
# Pad to the same length the model was built with
sample_padded_sequences = pad_sequences(sample_sequences, padding='post', maxlen=padded_sequences.shape[1])
predictions = model.predict(sample_padded_sequences)

# Displaying the predictions
for log, pred in zip(sample_logs, predictions):
    # A sigmoid output of 0.5 or more is reported as non-compliant
    status = "Non-compliant" if pred >= 0.5 else "Compliant"
    print(f"Log Entry: {log[:50]}... -> {status}")

Validation Accuracy: 49.44%
Log Entry: 2023-07-23 17:22:29.019727 user66 MODIFY /resource... -> Non-compliant
Log Entry: 2023-07-24 19:29:48.019763 user78 MODIFY /resource... -> Compliant
Log Entry: 2023-08-04 14:00:58.019776 user15 MODIFY /resource... -> Compliant
Log Entry: 2023-08-03 19:23:57.019787 user60 MODIFY /resource... -> Compliant
Log Entry: 2023-08-05 14:23:47.019800 user60 DELETE /resource... -> Compliant


In [None]:
# Copy the fine-tuned GPT-2 directory (saved in the GPT-2 section above) to the mounted Google Drive
!cp -r fine_tuned_gpt2 /content/drive/MyDrive

# **Training DistilBERT**

In [None]:
# NOTE(review): transformers is already installed by the first install cell of
# this notebook; this line only matters when this section is run on its own.
!pip install transformers



In [None]:
import pandas as pd
import numpy as np
from transformers import DistilBertTokenizer, TFDistilBertForSequenceClassification
from transformers import glue_convert_examples_to_features, InputExample
import tensorflow as tf
# Load the data
logs_df = pd.read_csv('/content/extended_logs.csv')
rules_df = pd.read_csv('/content/extended_rules.csv')

# Combine all the columns of logs to form a single log entry
logs_df["combined_logs"] = logs_df.apply(lambda row: ' '.join(row.astype(str)), axis=1)
logs = logs_df["combined_logs"].values
# NOTE(review): assumes the rule text is the second column of the rules CSV — confirm against the file.
rules = rules_df[rules_df.columns[1]].values


# Seed NumPy so the random log/rule pairing and the placeholder labels are
# identical on every run.
np.random.seed(42)

# Randomly pair logs with rules and generate random labels
paired_logs = []
paired_rules = []
labels = []

for log in logs:
    rule = np.random.choice(rules)
    paired_logs.append(log)
    paired_rules.append(rule)
    labels.append(np.random.randint(0, 2))  # Random label; 1 for non-compliant, 0 for compliant


In [None]:
# Load the tokenizer and model
# NOTE: this rebinds `tokenizer` and `model`, which earlier sections use for other models.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = TFDistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)
# Convert paired logs and rules to InputExample format (log as text_a, rule as text_b)
input_examples = [InputExample(guid=None, text_a=log, text_b=rule, label=label) for log, rule, label in zip(paired_logs, paired_rules, labels)]
# Tokenize the inputs (each pair is limited to max_length=128)
features = glue_convert_examples_to_features(examples=input_examples, tokenizer=tokenizer, max_length=128, task='mrpc', label_list=[0, 1])

# Extract features
input_ids = [f.input_ids for f in features]
attention_masks = [f.attention_mask for f in features]
# `labels` is rebound here from the raw random labels to the labels stored on the features
labels = [f.label for f in features]
# Convert to tf.data.Dataset with correct shapes
def gen():
    """Yield ``(inputs_dict, label)`` pairs, one per tokenized example.

    Reads the module-level ``input_ids``, ``attention_masks`` and ``labels``
    lists each time the generator is iterated.
    """
    for i in range(len(input_ids)):
        yield ({'input_ids': input_ids[i], 'attention_mask': attention_masks[i]}, labels[i])

# Second argument: output dtypes; third argument: output shapes
# (1-D id/mask vectors of unspecified length and a scalar label).
train_dataset = tf.data.Dataset.from_generator(gen,
                                               ({'input_ids': tf.int32, 'attention_mask': tf.int32}, tf.int64),
                                               ({'input_ids': tf.TensorShape([None]), 'attention_mask': tf.TensorShape([None])}, tf.TensorShape([])))
# Shuffle with a 100-element buffer, batch by 32, then repeat the batched data twice
train_dataset = train_dataset.shuffle(100).batch(32).repeat(2)
# Compile the model
optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5)
# from_logits=True: the loss is given raw logits rather than probabilities
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(optimizer=optimizer, loss=loss, metrics=[tf.keras.metrics.SparseCategoricalAccuracy()])



Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertForSequenceClassification: ['vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_transform.bias', 'vocab_projector.bias']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFDistilBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']
You should 

In [None]:
# 2 epochs x 115 steps consume 230 batches from the dataset, which was batched
# by 32 and repeated twice in the previous cell.
# NOTE(review): 115 is hard-coded; presumably it is the number of batches in one
# pass over the data — confirm against the actual number of log rows.
model.fit(train_dataset, epochs=2, steps_per_epoch=115)

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7fc34ba9ef50>

In [None]:
# Sample log and rule pairs for prediction
sample_logs = logs[:5]  # Extracting the first 5 logs from the dataset for demonstration
sample_rules = [np.random.choice(rules) for _ in range(5)]  # Randomly pairing with 5 rules for demonstration

# Convert the logs and rules to InputExamples
# (label=0 is a placeholder; the labels are not read when building predict_dataset below)
predict_examples = [InputExample(guid=None, text_a=log, text_b=rule, label=0) for log, rule in zip(sample_logs, sample_rules)]

# Tokenize the inputs
features = glue_convert_examples_to_features(examples=predict_examples, tokenizer=tokenizer, max_length=128, task='mrpc', label_list=[0, 1])
all_input_ids = [f.input_ids for f in features]
all_attention_masks = [f.attention_mask for f in features]

# Convert to tf.data.Dataset (inputs only, no labels)
predict_dataset = tf.data.Dataset.from_tensor_slices(({
    'input_ids': all_input_ids,
    'attention_mask': all_attention_masks
}))



In [None]:
# Predict with the model
predictions = model.predict(predict_dataset.batch(32))
# NOTE(review): predictions[0] is presumably the per-example logits — confirm;
# argmax over each row picks the predicted class index.
predicted_labels = [np.argmax(pred) for pred in predictions[0]]

# Display the predictions (class 1 is reported as non-compliant)
for log, rule, label in zip(sample_logs, sample_rules, predicted_labels):
    status = "Non-compliant" if label == 1 else "Compliant"
    print(f"Log: {log[:50]}... | Rule: {rule[:50]}... -> {status}")


Log: 2023-07-23 17:22:29.019727 user66 MODIFY /resource... | Rule: Users should not access /resource/9 more than 5 ti... -> Compliant
Log: 2023-07-24 19:29:48.019763 user78 MODIFY /resource... | Rule: Modification of /resource/4 should always be from ... -> Compliant
Log: 2023-08-04 14:00:58.019776 user15 MODIFY /resource... | Rule: Users should not modify /resource/1... -> Compliant
Log: 2023-08-03 19:23:57.019787 user60 MODIFY /resource... | Rule: Users should not access /resource/7 between 12:00 ... -> Compliant
Log: 2023-08-05 14:23:47.019800 user60 DELETE /resource... | Rule: Users should not access /resource/1 more than 5 ti... -> Compliant


In [None]:
# Sample logs and rules for evaluation
# NOTE: these rows were also part of the pairs used for training (the pairing
# cell loops over every log), so this is not a held-out set.
test_logs = logs[1000:1050]  # Taking up to 50 logs for evaluation as an example

# Size the rules and labels from the slice actually obtained, so the feature
# and label lists always have equal length even when fewer than 1050 logs exist.
n_test = len(test_logs)
assert n_test > 0, "logs[1000:1050] is empty: the dataset has 1000 rows or fewer"
test_rules = [np.random.choice(rules) for _ in range(n_test)]
test_labels = [np.random.randint(0, 2) for _ in range(n_test)]  # Random labels for demonstration; in practice, you'd use actual labels

# Convert the logs and rules to InputExamples
test_examples = [InputExample(guid=None, text_a=log, text_b=rule, label=label) for log, rule, label in zip(test_logs, test_rules, test_labels)]

# Tokenize the inputs
features = glue_convert_examples_to_features(examples=test_examples, tokenizer=tokenizer, max_length=128, task='mrpc', label_list=[0, 1])
test_input_ids = [f.input_ids for f in features]
test_attention_masks = [f.attention_mask for f in features]

# Convert to tf.data.Dataset
test_dataset = tf.data.Dataset.from_tensor_slices(({
    'input_ids': test_input_ids,
    'attention_mask': test_attention_masks
}, test_labels)).batch(32)
# With random labels the reported accuracy is expected to hover around chance
loss, accuracy = model.evaluate(test_dataset)
print(f"Model accuracy: {accuracy * 100:.2f}%")


Model accuracy: 44.00%


# **GPT2 without finetuning**

In [None]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Load pretrained GPT-2 model and tokenizer
# NOTE: this rebinds `model` and `tokenizer` used by the earlier sections.
model_name = "gpt2-large"  # You can use "gpt2", "gpt2-medium", "gpt2-large" or "gpt2-xl" based on available resources
model = GPT2LMHeadModel.from_pretrained(model_name)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)


Downloading (…)lve/main/config.json:   0%|          | 0.00/666 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/3.25G [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

In [None]:
def generate_insight(log, rule):
    """Ask the module-level GPT-2 ``model`` for an insight about a log/rule pair.

    Args:
        log: Log entry text inserted into the prompt.
        rule: Rule text inserted into the prompt.

    Returns:
        The generated text that follows the prompt's closing
        ``"provide an insight:"`` marker, stripped of surrounding whitespace.
    """
    # Format the prompt
    prompt = f"Given the log entry: {log} and associated rule: {rule}, provide an insight:"

    # Tokenize the prompt. Calling the tokenizer (rather than .encode) also
    # yields the attention mask, which generate() needs for reliable results.
    encoded = tokenizer(prompt, return_tensors='pt')

    # Generate the response
    output = model.generate(
        encoded["input_ids"],
        attention_mask=encoded["attention_mask"],
        pad_token_id=tokenizer.eos_token_id,  # set explicitly instead of relying on the library fallback
        max_length=200,  # upper bound on prompt + generated tokens
        num_return_sequences=1,
        no_repeat_ngram_size=2,
        do_sample=True,  # top_k and temperature are ignored unless sampling is enabled
        top_k=50,
        temperature=0.7,
    )

    # Decode and return the generated text that follows the prompt marker
    generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
    insight = generated_text.split("provide an insight:")[-1].strip()
    return insight

# Test the function: the log's 14:45:23 access falls after the 14:00 cutoff named in the rule
log_sample = "UserID: user5000 Action: ACCESS Resource: /resource/99 Timestamp: 2023-08-19 14:45:23 IP Address: 192.168.2.50 Status: FAILURE"
rule_sample = "No user should access /resource/99 after 14:00"
print(generate_insight(log_sample, rule_sample))


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


The above rule is a failure. The user is not allowed to access the resource after the 14th minute.
...
 the user can access it after 15 minutes. But the rule fails because the IP address is already in use. So the next rule will fail. And the last rule. It will be a success. This is the way to handle the situation. You can use the following rule to make sure that the users can't access a resource:. /resources/100/resource.json
This rule allows the access of the resources after a certain time. If the time is less than 15 min, the rules


# **Using GPT-2 Large**

In [None]:
# NOTE(review): transformers is already installed by earlier cells of this
# notebook; this line only matters when this section is run on its own.
!pip install transformers

Collecting transformers
  Downloading transformers-4.31.0-py3-none-any.whl (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m30.0 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m34.8 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m69.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m57.9 MB/s[0m eta [36m0:00:0

In [None]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import torch

# Load the GPT-2 large model and tokenizer
# (the same "gpt2-large" checkpoint the previous section loads)
model_name = "gpt2-large"
model = GPT2LMHeadModel.from_pretrained(model_name)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)


Downloading (…)lve/main/config.json:   0%|          | 0.00/666 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/3.25G [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

In [None]:
def generate_insight_gpt2_large(log, rule):
    """Ask the module-level GPT-2 ``model`` whether ``log`` complies with ``rule``.

    Args:
        log: Log entry text inserted into the prompt.
        rule: Rule text inserted into the prompt.

    Returns:
        Only the newly generated text (the prompt itself is not included),
        stripped of surrounding whitespace.
    """
    # Format the prompt
    prompt = f"Analyze the log given : '{log}' and the rule given: '{rule}', is the log compliant with the rule?"

    # Tokenize the prompt; the tokenizer call returns the ids and the matching attention mask
    encoded = tokenizer(prompt, return_tensors='pt')
    input_ids = encoded["input_ids"]
    prompt_length = len(input_ids[0])

    # Generate response
    output = model.generate(
        input_ids,
        attention_mask=encoded["attention_mask"],
        max_length=250,  # upper bound on prompt + generated tokens
        num_return_sequences=1,
        no_repeat_ngram_size=2,
        do_sample=True,  # top_k and temperature are ignored unless sampling is enabled
        top_k=50,
        temperature=0.8,
        pad_token_id=tokenizer.eos_token_id,
    )

    # The returned sequence starts with the prompt tokens, so decode only what
    # comes after them. The previous split on "infer?" never matched this
    # prompt, which made the function echo the whole prompt back.
    generated_text = tokenizer.decode(output[0][prompt_length:], skip_special_tokens=True)
    insight = generated_text.strip()
    return insight

# Test the function: the log's 14:45:23 access falls after the 14:00 cutoff named in the rule
log_sample = "UserID: user5000 Action: ACCESS Resource: /resource/99 Timestamp: 2023-08-19 14:45:23 IP Address: 192.168.2.50 Status: FAILURE"
rule_sample = "No user should access /resource/99 after 14:00"
print(generate_insight_gpt2_large(log_sample, rule_sample))


Analyze the log given : 'UserID: user5000 Action: ACCESS Resource: /resource/99 Timestamp: 2023-08-19 14:45:23 IP Address: 192.168.2.50 Status: FAILURE' and the rule given: 'No user should access /resource/99 after 14:00', is the log compliant with the rule?

The answer is yes.
...
 the answer to the question is no. The log is not compliant. It is a log that is being used to log the user's actions. This is why the action is logged. If the rules are not being followed, the logs are being ignored. In this case, it is important to understand that the actions are logged, not the users. So, if the policy is to be followed and a user is accessing the resource, then the logging should be done. But, in this example, there is nothing wrong with logging the access. There is also nothing to do with ignoring the requests.
