In [None]:
import os
import json
import re
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from transformers import BertTokenizer, BertForSequenceClassification, BertModel
from transformers import pipeline
import torch
from torch.utils.data import Dataset, DataLoader
import spacy
from dateutil import parser as date_parser
from functools import lru_cache
from tqdm import tqdm

# dostupnost gpu + mem.
torch.cuda.empty_cache()
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)
print(f"Memory allocated: {torch.cuda.memory_allocated() / 1024**2:.2f} MB")

class LogDataset(Dataset):
    """PyTorch Dataset for log data"""
    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length
        
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = self.labels[idx]
        
        encoding = self.tokenizer(
            text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)
        }

class BERTLogOnboarder:
    def __init__(self):
        #inicializace samotného BERT
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        self.event_classifier = None
        self.field_extractor = None
        self.label_encoder = None
        self.max_length = 128 # budu muset hodně omezovat kvůli max_memory 512 nedám a 256 už taky ne
        
        
        #feature extraction pro logy (parsování) <-- moc nefunguje bude třeba samostatný a custom       
        self.nlp = spacy.load("en_core_web_sm")
        self.ner_pipeline = pipeline(
            "ner",
            model="dslim/bert-base-NER",
            device=0 if torch.cuda.is_available() else -1
        )
        
        self.common_events = self._load_common_event_patterns()
        
    def _load_common_event_patterns(self): #tohle ještě převedu do samostatného souboru
        try:
            with open('event_patterns##.json') as f:
                return json.load(f)
        except FileNotFoundError:
            return {
                'authentication_success': [
                    "accepted password for",
                    "successful authentication",
                    "login successful for",
                    "authenticated successfully"
                ],
                'authentication_failure': [
                    "login failed for user",
                    "authentication failure",
                    "invalid credentials",
                    "failed password for",
                    "authentication failure; logname="
                ]
            }
    
    def train_event_classifier(self, texts, labels, epochs=3, batch_size=32): #epochs= 3, batch_size=16
        """
        Train BERT model for log event classification
        :param texts: List of log texts
        :param labels: List of string labels
        :param epochs: Number of training epochs
        :param batch_size: Batch size for training
        """
        # labels
        unique_labels = sorted(set(labels))
        self.label_encoder = {label: idx for idx, label in enumerate(unique_labels)}
        self.inv_label_encoder = {idx: label for label, idx in self.label_encoder.items()}
        num_labels = len(unique_labels)
        
        # encode labels
        encoded_labels = [self.label_encoder[label] for label in labels]
        
        # train test split klasicky
        train_texts, val_texts, train_labels, val_labels = train_test_split(texts, encoded_labels, test_size=0.2, random_state=1337)
        
        #datasety
        train_dataset = LogDataset(train_texts, train_labels, self.tokenizer, self.max_length)
        val_dataset = LogDataset(val_texts, val_labels, self.tokenizer, self.max_length)
        
        # Create dataloaders
        train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
        val_loader = DataLoader(val_dataset, batch_size=batch_size)
        
        # Initialize model with correct number of labels
        self.event_classifier = BertForSequenceClassification.from_pretrained(
            'bert-base-uncased',
            num_labels=num_labels
        ).to(device)
        
        # training setup + trénovací loop
        optimizer = torch.optim.AdamW(self.event_classifier.parameters(), lr=2e-5)
        for epoch in range(epochs):
            self.event_classifier.train()
            total_loss = 0
            
            for batch in tqdm(train_loader, desc=f"Epoch {epoch + 1}"):
                optimizer.zero_grad()
                
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['label'].to(device)
                
                outputs = self.event_classifier(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    labels=labels
                )
                
                loss = outputs.loss
                total_loss += loss.item()
                
                loss.backward()
                optimizer.step()
            
            # validace
            self.event_classifier.eval()
            val_loss = 0
            correct = 0
            total = 0
            
            with torch.no_grad():
                for batch in val_loader:
                    input_ids = batch['input_ids'].to(device)
                    attention_mask = batch['attention_mask'].to(device)
                    labels = batch['label'].to(device)
                    
                    outputs = self.event_classifier(
                        input_ids=input_ids,
                        attention_mask=attention_mask,
                        labels=labels
                    )
                    
                    val_loss += outputs.loss.item()
                    _, predicted = torch.max(outputs.logits, 1)
                    correct += (predicted == labels).sum().item()
                    total += labels.size(0)
            
            print(f"Epoch {epoch + 1}:")
            print(f"Train Loss: {total_loss / len(train_loader):.4f}")
            print(f"Val Loss: {val_loss / len(val_loader):.4f}")
            print(f"Val Accuracy: {correct / total:.4f}")
    
    def identify_event(self, log_line):
        """
        Identify security-relevant events using BERT
        :param log_line: Raw log text
        :return: Detected event type and confidence
        """
        # rule-based check paternu
        for event_type, patterns in self.common_events.items():
            for pattern in patterns:
                if pattern.lower() in log_line.lower():
                    return event_type, 1.0
        
        # použití Berta
        if self.event_classifier:
            encoding = self.tokenizer(
                log_line,
                max_length=self.max_length,
                padding='max_length',
                truncation=True,
                return_tensors='pt'
            )
            
            input_ids = encoding['input_ids'].to(device)
            attention_mask = encoding['attention_mask'].to(device)
            
            with torch.no_grad():
                outputs = self.event_classifier(input_ids=input_ids, attention_mask=attention_mask)
            
            probabilities = torch.softmax(outputs.logits, dim=1)
            confidence, predicted = torch.max(probabilities, dim=1)
            
            if self.label_encoder:
                inv_label_encoder = {v: k for k, v in self.label_encoder.items()}
                event_type = inv_label_encoder[predicted.item()]
            else:
                event_type = str(predicted.item())
            
            return event_type, confidence.item()
        
        return "unknown", 0.0
    
    def extract_fields_with_bert(self, log_line): # random paste NER od odborníka 🤓, ale tento je třeba udělat custom a odkazovat se na něj
        """
        Enhanced field extraction using BERT NER and custom patterns
        :param log_line: Raw log text
        :return: Dictionary of extracted fields
        """
        fields = {}
    
        # Timestamp extraction (keep existing)
        fields['timestamp'] = self._extract_timestamp(log_line)
        
        # Enhanced IP extraction
        ip_pattern = r'(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)'
        ip_addresses = re.findall(ip_pattern, log_line)
        if ip_addresses:
            fields['source_ip'] = ip_addresses[0]
        
        # Enhanced username extraction
        user_patterns = [
            r'user\s+([^\s]+)',  # "user jsmith"
            r'for\s+user\s+([^\s]+)',  # "for user admin"
            r'user=([^\s]+)',  # "user=root"
        ]
        for pattern in user_patterns:
            if match := re.search(pattern, log_line, re.IGNORECASE):
                fields['username'] = match.group(1)
                break
        
        # Use BERT NER for entity recognition
        ner_results = self.ner_pipeline(log_line)
        print(f"Tohle je výstup z extract_fields_with_bert  aka ner_results-->{ner_results}")
        # Process NER results
        for entity in ner_results:
            if entity['entity'] == 'B-PER' or entity['entity'] == 'I-PER':
                fields.setdefault('usernames', []).append(entity['word'])
            elif entity['entity'] == 'B-ORG' or entity['entity'] == 'I-ORG':
                fields.setdefault('organizations', []).append(entity['word'])
            elif entity['entity'] == 'B-LOC' or entity['entity'] == 'I-LOC':
                fields.setdefault('locations', []).append(entity['word'])
        
        # Extract error codes
        error_codes = re.findall(r'\b[A-Z]{2,5}-\d{3,5}\b', log_line)
        if error_codes:
            fields['error_code'] = error_codes[0]
        
        # Extract status indicators
        status_keywords = {
            'failed': 'failure',
            'accepted': 'success',
            'error': 'failure',
            'success': 'success',
            'completed': 'success',
            'denied': 'failure',
            'blocked': 'failure'
        }
        
        for keyword, status in status_keywords.items():
            if keyword in log_line.lower():
                fields['status'] = status
                break
        
        # Use spaCy for additional parsing
        doc = self.nlp(log_line)
        
        # Extract verbs as potential actions
        fields['actions'] = [token.lemma_ for token in doc if token.pos_ == "VERB"]
        
        # Extract numbers that might be counts or IDs
        fields['numbers'] = [ent.text for ent in doc.ents if ent.label_ == "CARDINAL"]
        
        return fields
    
    @lru_cache(maxsize=1000)
    def _extract_timestamp(self, text):
        """Enhanced timestamp extraction with multiple strategies"""
        # Try common log timestamp formats first
        common_formats = [
            r'\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}',
            r'\d{2}/\d{2}/\d{4} \d{2}:\d{2}:\d{2}',
            r'[A-Z][a-z]{2} \d{2} \d{4} \d{2}:\d{2}:\d{2}',
            r'\d{10}',  # Unix timestamp
            r'\d{13}',  # Unix timestamp with milliseconds
        ]
        
        for fmt in common_formats:
            match = re.search(fmt, text)
            if match:
                try:
                    return date_parser.parse(match.group(0)).isoformat()
                except:
                    continue
        
        # Fallback to dateutil's fuzzy parsing
        try:
            return date_parser.parse(text, fuzzy=True).isoformat()
        except:
            return None
    
    def process_logs(self, log_lines, batch_size=16):
        results = []
        
        # processování btache logů
        for i in tqdm(range(0, len(log_lines), batch_size), desc="Processing logs"):
            batch = log_lines[i:i + batch_size]
            
            for line in batch:
                if not line.strip():
                    continue
                    
                event_type, confidence = self.identify_event(line)
                fields = self.extract_fields_with_bert(line)
                
                result = {
                    'raw_log': line,
                    'event_type': event_type,
                    'confidence': confidence,
                    **fields
                }
                results.append(result)
        
        return pd.DataFrame(results)

if __name__ == "__main__":

    train_data = pd.read_csv("../data/train-logs/df_400logs.csv")
    labeled_data = {
        'texts': train_data['text'].tolist(),
        'labels': train_data['label'].tolist()
    }
    
    onboarder = BERTLogOnboarder()
    
    # trénovačřka
    print("Training event classifier...")
    onboarder.train_event_classifier(labeled_data['texts'], labeled_data['labels'], epochs=5, batch_size=64)
    
    # Process new logs
    new_logs = [
        "2023-05-16 09:15:33 - login failed for user jsmith from 10.1.2.3 with error AUTH-402",
        "May 16 10:22:18 - user mjohnson attempted to use sudo command to install package",
        "ERROR 2023-05-16 11:45:22: Disk utilization exceeded 95% on /dev/sdb1",
        "Dec 10 09:32:20 LabSZ sshd[24680]: Accepted password for fztu from 119.137.62.142 port 49116 ssh2",
        "WARNING: 5 consecutive authentication failures for user demo from 192.0.2.15",
        "2023-05-16 13:30:45 - connection refused from suspicious IP 198.51.100.7",
        "2023-05-15 14:30:45 - login failed for user jsmith from 10.1.2.3",
        "WARNING: 5 consecutive authentication failures for user demo",
        "Jun 14 15:16:01 combo sshd(pam_unix)[19939]: authentication failure; logname= uid=0 euid=0 tty=NODEVssh ruser= rhost=218.188.2.4", 
        "Jun 14 15:16:02 combo sshd(pam_unix)[19937]: check pass; user unknown",
        "Jun 14 15:16:02 combo sshd(pam_unix)[19937]: authentication failure; logname= uid=0 euid=0 tty=NODEVssh ruser= rhost=218.188.2.4",
        "Jun 15 02:04:59 combo sshd(pam_unix)[20882]: authentication failure; logname= uid=0 euid=0 tty=NODEVssh ruser= rhost=220-135-151-1.hinet-ip.hinet.net  user=root",
        "Jun 15 02:04:59 combo sshd(pam_unix)[20884]: authentication failure; logname= uid=0 euid=0 tty=NODEVssh ruser= rhost=220-135-151-1.hinet-ip.hinet.net  user=roo",
        "Dec 10 07:11:42 LabSZ sshd[24224]: pam_unix(sshd:auth): authentication failure; logname= uid=0 euid=0 tty=ssh ruser= rhost=202.100.179.208",
        "Dec 10 07:11:44 LabSZ sshd[24224]: Failed password for invalid user chen from 202.100.179.208 port 32484 ssh2",
        "Dec 10 07:11:44 LabSZ sshd[24224]: Received disconnect from 202.100.179.208: 11: Bye Bye [preauth]",
        "Dec 10 07:13:31 LabSZ sshd[24227]: pam_unix(sshd:auth): authentication failure; logname= uid=0 euid=0 tty=ssh ruser= rhost=5.36.59.76.dynamic-dsl-ip.omantel.net.om  user=root",
        "Dec 10 07:13:43 LabSZ sshd[24227]: Failed password for root from 5.36.59.76 port 42393 ssh2",
        "Dec 10 09:32:20 LabSZ sshd[24680]: Accepted password for fztu from 119.137.62.142 port 49116 ssh2",
    ]
    
    print("\nProcessing new logs...")
    processed_logs = onboarder.process_logs(new_logs)
    pd.set_option('display.max_columns', None)
    pd.set_option('display.width', 1000)
    pd.set_option('display.max_colwidth', 150)
    print(processed_logs[['raw_log', 'event_type', 'confidence', 'timestamp', 'source_ip', 'status']])

cuda
Memory allocated: 2518.35 MB


Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cuda:0


Training event classifier...


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1: 100%|██████████| 5/5 [00:42<00:00,  8.48s/it]


Epoch 1:
Train Loss: 1.0347
Val Loss: 0.8907
Val Accuracy: 0.7000


Epoch 2: 100%|██████████| 5/5 [00:53<00:00, 10.78s/it]


Epoch 2:
Train Loss: 0.8761
Val Loss: 0.7712
Val Accuracy: 0.6500


Epoch 3: 100%|██████████| 5/5 [02:06<00:00, 25.23s/it]


Epoch 3:
Train Loss: 0.7303
Val Loss: 0.6589
Val Accuracy: 0.7250


Epoch 4: 100%|██████████| 5/5 [01:17<00:00, 15.48s/it]


Epoch 4:
Train Loss: 0.6394
Val Loss: 0.5730
Val Accuracy: 0.7250


Epoch 5: 100%|██████████| 5/5 [01:21<00:00, 16.22s/it]


Epoch 5:
Train Loss: 0.5469
Val Loss: 0.5161
Val Accuracy: 0.7625

Processing new logs...


Processing logs:   0%|          | 0/2 [00:00<?, ?it/s]

Tohle je výstup z extract_fields_with_bert  aka ner_results-->[]
Tohle je výstup z extract_fields_with_bert  aka ner_results-->[{'entity': 'B-PER', 'score': 0.5425031, 'index': 11, 'word': '##jo', 'start': 24, 'end': 26}, {'entity': 'B-PER', 'score': 0.7847678, 'index': 12, 'word': '##hn', 'start': 26, 'end': 28}]
Tohle je výstup z extract_fields_with_bert  aka ner_results-->[{'entity': 'B-ORG', 'score': 0.9532609, 'index': 1, 'word': 'ER', 'start': 0, 'end': 2}, {'entity': 'I-ORG', 'score': 0.93673354, 'index': 2, 'word': '##RO', 'start': 2, 'end': 4}, {'entity': 'I-ORG', 'score': 0.8225565, 'index': 3, 'word': '##R', 'start': 4, 'end': 5}]
Tohle je výstup z extract_fields_with_bert  aka ner_results-->[{'entity': 'B-ORG', 'score': 0.9947536, 'index': 8, 'word': 'Lab', 'start': 16, 'end': 19}, {'entity': 'I-ORG', 'score': 0.97525024, 'index': 9, 'word': '##S', 'start': 19, 'end': 20}]
Tohle je výstup z extract_fields_with_bert  aka ner_results-->[]
Tohle je výstup z extract_fields_with

Processing logs:  50%|█████     | 1/2 [00:01<00:01,  1.47s/it]

Tohle je výstup z extract_fields_with_bert  aka ner_results-->[{'entity': 'B-ORG', 'score': 0.9931016, 'index': 8, 'word': 'Lab', 'start': 16, 'end': 19}, {'entity': 'I-ORG', 'score': 0.98801917, 'index': 9, 'word': '##S', 'start': 19, 'end': 20}, {'entity': 'I-ORG', 'score': 0.62680084, 'index': 10, 'word': '##Z', 'start': 20, 'end': 21}]
Tohle je výstup z extract_fields_with_bert  aka ner_results-->[{'entity': 'B-ORG', 'score': 0.99323386, 'index': 8, 'word': 'Lab', 'start': 16, 'end': 19}, {'entity': 'I-ORG', 'score': 0.9883812, 'index': 9, 'word': '##S', 'start': 19, 'end': 20}, {'entity': 'I-ORG', 'score': 0.5991771, 'index': 10, 'word': '##Z', 'start': 20, 'end': 21}]
Tohle je výstup z extract_fields_with_bert  aka ner_results-->[{'entity': 'B-ORG', 'score': 0.99341846, 'index': 8, 'word': 'Lab', 'start': 16, 'end': 19}, {'entity': 'I-ORG', 'score': 0.99155116, 'index': 9, 'word': '##S', 'start': 19, 'end': 20}, {'entity': 'I-ORG', 'score': 0.5445104, 'index': 10, 'word': '##Z', 

Processing logs: 100%|██████████| 2/2 [00:01<00:00,  1.13it/s]

Tohle je výstup z extract_fields_with_bert  aka ner_results-->[{'entity': 'B-ORG', 'score': 0.99566793, 'index': 8, 'word': 'Lab', 'start': 16, 'end': 19}, {'entity': 'I-ORG', 'score': 0.9934236, 'index': 9, 'word': '##S', 'start': 19, 'end': 20}, {'entity': 'I-ORG', 'score': 0.73691124, 'index': 10, 'word': '##Z', 'start': 20, 'end': 21}]
Tohle je výstup z extract_fields_with_bert  aka ner_results-->[{'entity': 'B-ORG', 'score': 0.99229336, 'index': 8, 'word': 'Lab', 'start': 16, 'end': 19}, {'entity': 'I-ORG', 'score': 0.9886895, 'index': 9, 'word': '##S', 'start': 19, 'end': 20}]
Tohle je výstup z extract_fields_with_bert  aka ner_results-->[{'entity': 'B-ORG', 'score': 0.9947536, 'index': 8, 'word': 'Lab', 'start': 16, 'end': 19}, {'entity': 'I-ORG', 'score': 0.97525024, 'index': 9, 'word': '##S', 'start': 19, 'end': 20}]
                                                                                                                                                  raw_log         




In [12]:


test_logs_bert = [
    # 5 successful login events
    "2025-05-18T09:00:12Z Windows Server: EventID=4624 Authentication granted for user021 IP=192.0.2.200",
    "May 18 09:05:23 ubuntu sshd[3456]: Access granted for user022 from 192.0.2.201 port 56321 ssh2",
    "2025-05-18 09:10:45 macOS loginwindow[6001]: Sign-in successful for user user023 (UID 560)",
    "2025-05-18 09:15:30 Cisco ASA: %ASA-6-722051: VPN Connection: credentials accepted: User=user024 IP=192.0.2.202 Duration=00:05:00",
    '{"eventTime":"2025-05-18T09:20:55Z","eventName":"ConsoleLogin","userIdentity":{"type":"IAMUser","userName":"user025"},"sourceIPAddress":"192.0.2.203","responseElements":{"ConsoleLogin":"Success"}}',

    # 10 failed login attempts
    "2025-05-18T10:00:01Z Windows Server: EventID=4625 Authentication failed for user026 IP=192.0.2.204",
    "May 18 10:05:12 ubuntu sshd[3567]: Access denied for user027 from 192.0.2.205 port 57322 ssh2",
    "2025-05-18 10:10:23 macOS loginwindow[6002]: Sign-in denied for user user028 (UID 561)",
    "2025-05-18 10:15:34 Cisco ASA: %ASA-6-722051: VPN Connection: authentication unsuccessful: User=user029 IP=192.0.2.206 Duration=00:00:30",
    "2025/05/18 10:20:45,001801000014,SYSTEM,gplogin,0,2025/05/18 10:20:45,192.0.2.207,10.0.0.2,Login,globalprotect,user=user030,tunnel=no,result=denied",
    '{"eventTime":"2025-05-18T10:25:56Z","eventName":"ConsoleLogin","userIdentity":{"type":"IAMUser","userName":"user031"},"sourceIPAddress":"192.0.2.208","responseElements":{"ConsoleLogin":"Failure"}}',
    '{"TimeGenerated":"2025-05-18T10:30:07Z","UserPrincipalName":"user032@example.com","AppDisplayName":"Azure Portal","Status":{"value":"1","additionalDetails":"Failure"},"IPAddress":"192.0.2.209"}',
    "timestamp=2025-05-18T10:35:18Z event=login login_type=authorized principal=user033@example.com ip_address=192.0.2.210 outcome=FAILURE",
    '{"published":"2025-05-18T10:40:29Z","eventType":"user.session.start","outcome":{"result":"DENIED"},"actor":{"displayName":"user034"},"client":{"ipAddress":"192.0.2.211"}}',
    "2025-05-18T10:45:40,user035@example.com,Login,login.salesforce.com,Failed,192.0.2.212,OAuth",

    # 15 other types of events
    "2025-05-18T11:00:00Z Windows Server: EventID=4647 User logoff for user021",
    "May 18 11:05:12 ubuntu sshd[4000]: session closed for user022",
    "2025-05-18 11:10:23 macOS loginwindow[6003]: User logout for user user023 (UID 562)",
    "2025-05-18 11:15:34 Cisco ASA: %ASA-6-722052: VPN Logout: User=user024 IP=192.0.2.202 Duration=00:05:10",
    "2025/05/18 11:20:45,001801000015,SYSTEM,gplogin,0,2025/05/18 11:20:45,192.0.2.207,10.0.0.2,Logout,globalprotect,user=user026,tunnel=yes",
    '{"eventTime":"2025-05-18T11:25:56Z","eventName":"ModifyUser","userIdentity":{"type":"IAMUser","userName":"user027"},"sourceIPAddress":"192.0.2.208","requestParameters":{"groupName":"Admins"}}',
    '{"TimeGenerated":"2025-05-18T11:30:07Z","Operation":"UserLoggedOut","UserId":"user028@example.com","ClientIP":"192.0.2.209"}',
    "timestamp=2025-05-18T11:35:18Z event=logout principal=user029@example.com ip_address=192.0.2.210",
    '{"published":"2025-05-18T11:40:29Z","eventType":"user.repository.delete","outcome":{"result":"SUCCESS"},"actor":{"displayName":"user030"},"client":{"ipAddress":"192.0.2.211"}}',
    "2025-05-18T11:45:40,user031@example.com,API,updateRecord,login.salesforce.com,Success",
    '{"CreationTime":"2025-05-18T11:50:00","Operation":"FileDownloaded","UserId":"user032@example.com","ClientIP":"192.0.2.212","ItemName":"report.pdf"}',
    "2025-05-18T11:55:00 UTC [3900]: [user033]@hrdb LOG:  statement: SELECT * FROM employees WHERE department='Sales';",
    "2025-05-18T12:00:00Z 50 Query user034@192.0.2.213 on payrolldb: execute UPDATE payroll SET amount=5000;",
    "192.0.2.91 - - [18/May/2025:12:05:00 +0000] \"GET /api/data HTTP/1.1\" 200 1285 \"-\" \"curl/7.68.0\"",
    "2025-05-18 12:10:00 Fortinet FortiGate device_id=FGT002 log_id=0100030001 type=event subtype=system level=notice action=system_reboot_initiated"
]

print("\nProcessing new logs...")
processed_logs = onboarder.process_logs(test_logs_bert)
print(processed_logs[['raw_log', 'event_type', 'confidence', 'timestamp', 'status']])




Processing new logs...


Processing logs:   0%|          | 0/2 [00:00<?, ?it/s]

Tohle je výstup z extract_fields_with_bert  aka ner_results-->[{'entity': 'B-MISC', 'score': 0.88491935, 'index': 15, 'word': 'Windows', 'start': 21, 'end': 28}, {'entity': 'I-MISC', 'score': 0.9660503, 'index': 16, 'word': 'Server', 'start': 29, 'end': 35}]
Tohle je výstup z extract_fields_with_bert  aka ner_results-->[]
Tohle je výstup z extract_fields_with_bert  aka ner_results-->[{'entity': 'B-MISC', 'score': 0.7843622, 'index': 12, 'word': 'mac', 'start': 20, 'end': 23}, {'entity': 'I-MISC', 'score': 0.8377228, 'index': 13, 'word': '##OS', 'start': 23, 'end': 25}]
Tohle je výstup z extract_fields_with_bert  aka ner_results-->[{'entity': 'B-ORG', 'score': 0.87390125, 'index': 12, 'word': 'C', 'start': 20, 'end': 21}, {'entity': 'B-ORG', 'score': 0.663588, 'index': 13, 'word': '##isco', 'start': 21, 'end': 25}, {'entity': 'I-MISC', 'score': 0.7831019, 'index': 14, 'word': 'AS', 'start': 26, 'end': 28}]
Tohle je výstup z extract_fields_with_bert  aka ner_results-->[]
Tohle je výstup 

Processing logs:  50%|█████     | 1/2 [00:01<00:01,  1.49s/it]

Tohle je výstup z extract_fields_with_bert  aka ner_results-->[{'entity': 'B-ORG', 'score': 0.9656732, 'index': 44, 'word': 'O', 'start': 86, 'end': 87}]
Tohle je výstup z extract_fields_with_bert  aka ner_results-->[{'entity': 'B-MISC', 'score': 0.86809117, 'index': 14, 'word': 'Windows', 'start': 21, 'end': 28}, {'entity': 'I-MISC', 'score': 0.9737371, 'index': 15, 'word': 'Server', 'start': 29, 'end': 35}]
Tohle je výstup z extract_fields_with_bert  aka ner_results-->[]
Tohle je výstup z extract_fields_with_bert  aka ner_results-->[{'entity': 'B-MISC', 'score': 0.79285204, 'index': 12, 'word': 'mac', 'start': 20, 'end': 23}, {'entity': 'I-MISC', 'score': 0.9040378, 'index': 13, 'word': '##OS', 'start': 23, 'end': 25}]
Tohle je výstup z extract_fields_with_bert  aka ner_results-->[{'entity': 'B-ORG', 'score': 0.6532236, 'index': 12, 'word': 'C', 'start': 20, 'end': 21}, {'entity': 'I-MISC', 'score': 0.7491876, 'index': 14, 'word': 'AS', 'start': 26, 'end': 28}, {'entity': 'I-MISC', '

Processing logs: 100%|██████████| 2/2 [00:02<00:00,  1.42s/it]

Tohle je výstup z extract_fields_with_bert  aka ner_results-->[{'entity': 'B-MISC', 'score': 0.5699082, 'index': 12, 'word': 'Fort', 'start': 20, 'end': 24}]
                                                                                                                                                  raw_log              event_type  confidence                  timestamp   status
0                                                     2025-05-18T09:00:12Z Windows Server: EventID=4624 Authentication granted for user021 IP=192.0.2.200           Success_login    0.526447                       None      NaN
1                                                          May 18 09:05:23 ubuntu sshd[3456]: Access granted for user022 from 192.0.2.201 port 56321 ssh2            Failed_login    0.487361                       None      NaN
2                                                              2025-05-18 09:10:45 macOS loginwindow[6001]: Sign-in successful for user user023 (UID 560)           


