In [26]:
!pip install evaluate datasets
!pip install transformers[torch]

import torch
import os
import torch.nn as nn
import evaluate
import pandas as pd
import numpy as np
from datasets import Dataset
from transformers import (AutoTokenizer, DataCollatorWithPadding,
                          AutoModelForSequenceClassification,
                          TrainingArguments, Trainer, pipeline)
from huggingface_hub import notebook_login



In [2]:
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

## Training dataset preparation

In [23]:
path  = './log_classification_data/'
data  = pd.DataFrame(columns=['logs', 'class'])
files = os.listdir(path)
files = [f for f in files if f.endswith('.parquet')]

for file in files:
    data = pd.concat([data, pd.read_parquet(path + file)])

data = data.rename(columns={'logs': 'text', 'class': 'label'})
for di, d in data.iterrows():
  data.at[di, 'label'] = int(d['label'])
training_dataset = Dataset.from_pandas(data)
dataset = training_dataset.train_test_split(test_size=0.2, shuffle=True)
dataset


DatasetDict({
    train: Dataset({
        features: ['text', 'label', '__index_level_0__'],
        num_rows: 595
    })
    test: Dataset({
        features: ['text', 'label', '__index_level_0__'],
        num_rows: 149
    })
})

## Binary classification setup

In [12]:
id2label = {0: "LOG", 1: "CODE"}
label2id = {"LOG": 0, "CODE": 1}
def tokenize_function(examples):
    return tokenizer(examples["text"], truncation=True)

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
collator = DataCollatorWithPadding(tokenizer=tokenizer)
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2, id2label=id2label, label2id=label2id)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'classifier.weight', 'pre_classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Training pipeline

In [27]:
acc = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return acc.compute(predictions=predictions, references=labels)


In [28]:
tokenized_dataset = dataset.map(tokenize_function, batched=True)



training_args = TrainingArguments(
    output_dir="log_classifier",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=True,
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=collator,
    compute_metrics=compute_metrics,
)

Map:   0%|          | 0/595 [00:00<?, ? examples/s]

Map:   0%|          | 0/149 [00:00<?, ? examples/s]

In [29]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.302527,0.892617
2,No log,0.297443,0.899329


TrainOutput(global_step=76, training_loss=0.16138876111883865, metrics={'train_runtime': 70.0476, 'train_samples_per_second': 16.988, 'train_steps_per_second': 1.085, 'total_flos': 155818917275328.0, 'train_loss': 0.16138876111883865, 'epoch': 2.0})

In [30]:
trainer.push_to_hub()

CommitInfo(commit_url='https://huggingface.co/SzymonSt2808/log_classifier/commit/1dc20ab7bf99abab7c889c2d5f8ba10a40d26bd9', commit_message='End of training', commit_description='', oid='1dc20ab7bf99abab7c889c2d5f8ba10a40d26bd9', pr_url=None, pr_revision=None, pr_num=None)

--------------------------------------------------------------------------------------------------

## Inferece pipeline

In [32]:
import time
path  = './extracted_logs/'
data  = pd.DataFrame(columns=['logs'])
files = os.listdir(path)
files = [f for f in files if f.endswith('.parquet')]

for file in files:
    data = pd.concat([data, pd.read_parquet(path + file)])

dataset = Dataset.from_pandas(data)



tokenizer = AutoTokenizer.from_pretrained("SzymonSt2808/log_classifier")
model = AutoModelForSequenceClassification.from_pretrained("SzymonSt2808/log_classifier")

classifier  = pipeline('text-classification', model=model, tokenizer=tokenizer)

for log in dataset['logs']:
    print(log)
    print(classifier(log))
    print('----------------------------------')
    time.sleep(1)



setTimeout(function() {
document.getElementById('audio_3').addEventListener('play', function(){
document.getElementById('audio_4').currentTime = 0;
document.getElementById('audio_4').pause();
}, false);

document.getElementById('audio_4').addEventListener('play', function(){
document.getElementById('audio_3').currentTime = 0;
document.getElementById('audio_3').pause();
}, false);
},1000);
[{'label': 'CODE', 'score': 0.9663130640983582}]
----------------------------------
??? Attempt to reference field of non-structure array.

    Error in ==> insertion_sort at 6
        for j=2:original.length
[{'label': 'LOG', 'score': 0.9623003602027893}]
----------------------------------
Try

        With IOStructure
            .overlap.hEvent = CreateEvent(Nothing, True, False, Nothing)
            If .overlap.hEvent = 0 Then
                writeSuccess = False
            Else
                writeSuccess = WriteFile(.hdevice, .writeBuf, .writeBuf.Length, .bytesWritten, .overlap)

             

Token indices sequence length is longer than the specified maximum sequence length for this model (946 > 512). Running this sequence through the model will result in indexing errors


org.hibernate.HibernateException: Unable to locate current JTA transaction
    at org.hibernate.context.JTASessionContext.currentSession(JTASessionContext.java:88)
    at org.hibernate.impl.SessionFactoryImpl.getCurrentSession(SessionFactoryImpl.java:574)
    at com.hibernate.EmpleadosHome.findById(EmpleadosHome.java:95)
    at org.apache.jsp.index_jsp._jspService(index_jsp.java:75)
    at org.apache.jasper.runtime.HttpJspBase.service(HttpJspBase.java:70)
    at javax.servlet.http.HttpServlet.service(HttpServlet.java:717)
    at org.apache.jasper.servlet.JspServletWrapper.service(JspServletWrapper.java:369)
    at org.apache.jasper.servlet.JspServlet.serviceJspFile(JspServlet.java:322)
    at org.apache.jasper.servlet.JspServlet.service(JspServlet.java:249)
    at javax.servlet.http.HttpServlet.service(HttpServlet.java:717)
    at org.apache.catalina.core.ApplicationFilterChain.internalDoFilter(ApplicationFilterChain.java:290)
    at org.apache.catalina.core.ApplicationFilterChain.doFi

RuntimeError: The size of tensor a (946) must match the size of tensor b (512) at non-singleton dimension 1