In [13]:
import torch
import os
import torch.nn as nn
import evaluate
import pandas as pd
from datasets import Dataset
from transformers import (AutoTokenizer, DataCollatorWithPadding, 
                          AutoModelForSequenceClassification, 
                          TrainingArguments, Trainer, pipeline)
from huggingface_hub import notebook_login

In [None]:
notebook_login()

## Training dataset preparation

In [14]:
path  = './log_classification_data/'
data  = pd.DataFrame(columns=['logs', 'class'])
files = os.listdir(path)
files = [f for f in files if f.endswith('.parquet')]

for file in files:
    data = pd.concat([data, pd.read_parquet(path + file)])

training_dataset = Dataset.from_pandas(data)
dataset = training_dataset.train_test_split(test_size=0.2, shuffle=True)
dataset


DatasetDict({
    train: Dataset({
        features: ['logs', 'class', '__index_level_0__'],
        num_rows: 595
    })
    test: Dataset({
        features: ['logs', 'class', '__index_level_0__'],
        num_rows: 149
    })
})

## Binary classification setup

In [18]:
id2label = {0: "LOG", 1: "CODE"}
label2id = {"LOG": 0, "CODE": 1}
def tokenize_function(examples):
    return tokenizer(examples["logs"], truncation=True)

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
collator = DataCollatorWithPadding(tokenizer=tokenizer)
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'pre_classifier.bias', 'pre_classifier.weight', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Training pipeline

In [16]:
acc = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = torch.argmax(predictions, dim=1)
    return acc.compute(predictions=predictions, references=labels)
    

Downloading builder script: 100%|██████████| 4.20k/4.20k [00:00<?, ?B/s]


In [19]:
tokenized_dataset = dataset.map(tokenize_function, batched=True)



training_args = TrainingArguments(
    output_dir="log_classifier",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=True,
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=collator,
    compute_metrics=compute_metrics,
)

Map: 100%|██████████| 595/595 [00:00<00:00, 1024.10 examples/s]
Map: 100%|██████████| 149/149 [00:00<00:00, 1330.38 examples/s]


In [None]:
trainer.train()

In [None]:
trainer.push_to_hub()

--------------------------------------------------------------------------------------------------

## Inferece pipeline

In [None]:
path  = './extracted_logs/'
data  = pd.DataFrame(columns=['logs'])
files = os.listdir(path)
files = [f for f in files if f.endswith('.parquet')]

for file in files:
    data = pd.concat([data, pd.read_parquet(path + file)])

dataset = Dataset.from_pandas(data)



tokenizer = AutoTokenizer.from_pretrained("SzymonSt2808/log_classifier")
model = AutoModelForSequenceClassification.from_pretrained("SzymonSt2808/log_classifier")

classifier  = pipeline('text-classification', model=model, tokenizer=tokenizer)

for log in dataset['logs']:
    print(classifier(log))

