In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from tokenizers import Tokenizer, models, trainers, pre_tokenizers
from transformers import RobertaForSequenceClassification


# Load the labelled log entries.
data = pd.read_csv("labeled_dataset.csv")
# "Unnamed: 0" is presumably a row index saved along with the CSV (TODO confirm).
# errors="ignore" keeps the cell runnable when the file does not contain it.
data = data.drop(columns="Unnamed: 0", errors="ignore")

# Build the category <-> integer-id lookups. `unique()` returns the categories
# in order of first appearance, so ids are assigned in that order.
category_names = data["error_cluster"].unique()
category_to_id = {name: idx for idx, name in enumerate(category_names)}
category_to_name = {idx: name for name, idx in category_to_id.items()}

# Ids of all known categories (0 .. n_categories - 1). Built as a fresh array
# instead of writing ints back into the array returned by `unique()`, which is
# not possible when pandas hands back a string-typed extension array.
label_overview = np.arange(len(category_to_id))

# Features: raw log text. Targets: the integer id of each row's error cluster.
# The targets are encoded here because the lookup tables alone leave Y_data
# holding the raw category values.
X_data = data["log_entry"].to_numpy()
Y_data = data["error_cluster"].map(category_to_id).to_numpy()

# Display dictionary (a bare expression is only rendered when it is the last
# line of a cell, so print it explicitly).
print(category_to_name)

# Split into train and test datasets; random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_data, Y_data, test_size=0.3, random_state=777
)

In [None]:
# Sample sentence used to sanity-check the tokenizer
text_to_encode = "well this is a success"

# Restore the custom tokenizer from its serialized JSON definition
custom_tokenizer = Tokenizer.from_file("custom_tokenizer.json")

# Tokenize the sample and show the tokens it was split into
encoded = custom_tokenizer.encode(text_to_encode)
print(encoded.tokens)

In [None]:
## model
class CustomRobertaModel(nn.Module):
    """Pretrained RoBERTa sequence classifier followed by a small dense head.

    The ``roberta-base`` classifier emits ``labels`` logits, which are passed
    through dropout -> Linear(labels, 256) -> ReLU -> Linear(256, labels) ->
    softmax.

    Args:
        labels: Number of target classes.
    """

    def __init__(self, labels):
        super().__init__()
        # The RoBERTa head must emit exactly `labels` logits because `dense1`
        # takes `labels` input features; a fixed width only works for one
        # specific class count.
        self.roberta = RobertaForSequenceClassification.from_pretrained("roberta-base", num_labels=labels)
        self.dropout = nn.Dropout(0.1)
        self.dense1 = nn.Linear(labels, 256)
        self.relu = nn.ReLU()
        self.dense2 = nn.Linear(256, labels)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, input_word_ids, input_mask, input_type_ids):
        """Run the classifier and return softmax-normalised class scores.

        Args:
            input_word_ids: Token ids, passed to RoBERTa as its input ids.
            input_mask: Attention mask, passed as ``attention_mask``.
            input_type_ids: Token type ids, passed as ``token_type_ids``.
                All three are assumed to be (batch, seq_len) integer tensors,
                as in the calls in this notebook -- TODO confirm.

        Returns:
            The output of ``Softmax(dim=1)`` applied to the dense head, i.e.
            values normalised along dim 1. These are probabilities, not raw
            logits, so take ``torch.log`` of them before a loss that expects
            unnormalised logits (such as ``nn.CrossEntropyLoss``).
        """
        outputs = self.roberta(input_word_ids, attention_mask=input_mask, token_type_ids=input_type_ids)
        x = outputs.logits
        x = self.dropout(x)
        x = self.dense1(x)
        x = self.relu(x)
        x = self.dense2(x)
        x = self.softmax(x)
        return x
    
# labels = number of labels for classification, taken from the category lookup
# built while loading the data so the cell works on a fresh kernel.
labels = len(category_to_id)
model = CustomRobertaModel(labels)

# Load the custom tokenizer from the JSON file
custom_tokenizer = Tokenizer.from_file("custom_tokenizer.json")

# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-5)

In [None]:
# Example training loop: repeatedly fits the model on a single labelled log entry.

# Everything up to the loop is loop-invariant, so it is prepared once instead
# of once per epoch.
MAX_LENGTH = 512
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# `tokenizers.Tokenizer` has no `encode_plus` (that is the `transformers`
# tokenizer API); truncation and padding are configured on the tokenizer itself.
# Note that this configuration persists for later `encode` calls.
custom_tokenizer.enable_truncation(max_length=MAX_LENGTH)
custom_tokenizer.enable_padding(
    length=MAX_LENGTH,
    # Pad with the id the RoBERTa model itself treats as padding.
    pad_id=model.roberta.config.pad_token_id,
    pad_token="<pad>",
)

# Example input: the first log entry together with its real class id.
text = data["log_entry"].iloc[0]
target_id = category_to_id[data["error_cluster"].iloc[0]]

# NOTE(review): the ids come from the custom tokenizer but are looked up in the
# pretrained roberta-base embedding table -- confirm the vocabularies are compatible.
encoding = custom_tokenizer.encode(text, add_special_tokens=True)

# Build batch-of-one tensors directly on the target device.
input_word_ids = torch.tensor([encoding.ids], device=device)
input_mask = torch.tensor([encoding.attention_mask], device=device)
input_type_ids = torch.tensor([encoding.type_ids], device=device)
# Named `targets` so the notebook-level `labels` (the class count) is not overwritten.
targets = torch.tensor([target_id], device=device)

for epoch in range(3):  # Number of epochs
    model.train()
    optimizer.zero_grad()

    # Forward pass
    outputs = model(input_word_ids, input_mask, input_type_ids)
    # The model returns softmax probabilities, while CrossEntropyLoss expects
    # unnormalised logits and applies log-softmax itself. Feeding log(p) makes
    # its internal softmax reproduce p, so the result is the true cross-entropy.
    # The clamp guards against log(0).
    loss = criterion(torch.log(outputs.clamp_min(1e-12)), targets)

    # Backward pass and optimization
    loss.backward()
    optimizer.step()

    print(f"Epoch {epoch+1}, Loss: {loss.item()}")

In [None]:
# Inspect the current model. `build_model(...)` / `model.summary()` are
# Keras-style calls: no `build_model` is defined in the visible notebook and
# `nn.Module` has no `summary()`. Printing a module lists its submodules, and
# the existing `model` is left untouched.
print(model)
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Trainable parameters: {trainable_params:,}")

In [None]:
# Model definition and forward-pass example

class CustomRobertaModel(nn.Module):
    """RoBERTa classifier whose logits are refined by a two-layer dense head.

    Args:
        labels: Number of target classes; used as the width of the RoBERTa
            logits and of the final output.
    """

    def __init__(self, labels):
        super().__init__()
        self.roberta = RobertaForSequenceClassification.from_pretrained(
            "roberta-base",
            num_labels=labels,
        )
        self.dropout = nn.Dropout(0.1)
        # Dense head: labels -> 256 -> labels
        self.dense1 = nn.Linear(labels, 256)
        self.relu = nn.ReLU()
        self.dense2 = nn.Linear(256, labels)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, input_word_ids, input_mask, input_type_ids):
        """Return the softmax (over dim 1) of the dense head's output.

        The three arguments are forwarded to RoBERTa as its input ids,
        ``attention_mask`` and ``token_type_ids`` respectively.
        """
        roberta_logits = self.roberta(
            input_word_ids,
            attention_mask=input_mask,
            token_type_ids=input_type_ids,
        ).logits
        hidden = self.relu(self.dense1(self.dropout(roberta_logits)))
        return self.softmax(self.dense2(hidden))

# Example usage: smoke-test the forward pass on random token ids.
labels = 3  # Number of labels for classification
model = CustomRobertaModel(labels)
# Inference demo: switch to eval mode so dropout does not randomly perturb the output.
model.eval()

# Example input data (batch of one sequence with 512 positions)
input_word_ids = torch.randint(0, 1000, (1, 512))  # Example batch of input IDs
input_mask = torch.ones((1, 512), dtype=torch.long)  # Example attention mask
input_type_ids = torch.zeros((1, 512), dtype=torch.long)  # Example token type IDs

# Forward pass; no gradients are needed just to look at the output.
with torch.no_grad():
    output = model(input_word_ids, input_mask, input_type_ids)
print(output)