In [1]:
!pip install evaluate datasets
!pip install transformers[torch]

import torch
import os
import torch.nn as nn
import evaluate
import pandas as pd
import numpy as np
from datasets import Dataset
from transformers import (AutoTokenizer, DataCollatorWithPadding,
                          AutoModelForSequenceClassification,
                          TrainingArguments, Trainer, pipeline)
from huggingface_hub import notebook_login



In [2]:
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

## Training dataset preparation

In [None]:
path  = './log_classification_data/'
data  = pd.DataFrame(columns=['logs', 'class'])
files = os.listdir(path)
files = [f for f in files if f.endswith('.parquet')]

for file in files:
    data = pd.concat([data, pd.read_parquet(path + file)])

data = data.rename(columns={'logs': 'text', 'class': 'label'})
for di, d in data.iterrows():
  data.at[di, 'label'] = int(d['label'])
training_dataset = Dataset.from_pandas(data)
dataset = training_dataset.train_test_split(test_size=0.2, shuffle=True)
dataset


DatasetDict({
    train: Dataset({
        features: ['text', 'label', '__index_level_0__'],
        num_rows: 595
    })
    test: Dataset({
        features: ['text', 'label', '__index_level_0__'],
        num_rows: 149
    })
})

## Binary classification setup

In [None]:
id2label = {0: "LOG", 1: "CODE"}
label2id = {"LOG": 0, "CODE": 1}
def tokenize_function(examples):
    return tokenizer(examples["text"], truncation=True, padding=True, max_length=512)

tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")
collator = DataCollatorWithPadding(tokenizer=tokenizer)
model = AutoModelForSequenceClassification.from_pretrained("google-bert/bert-base-uncased", num_labels=2, id2label=id2label, label2id=label2id)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Training pipeline

In [None]:
acc = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return acc.compute(predictions=predictions, references=labels)


In [None]:
tokenized_dataset = dataset.map(tokenize_function, batched=True)



training_args = TrainingArguments(
    output_dir="log_classifier",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=True,
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=collator,
    compute_metrics=compute_metrics,
)

Map:   0%|          | 0/595 [00:00<?, ? examples/s]

Map:   0%|          | 0/149 [00:00<?, ? examples/s]

In [None]:
trainer.train()

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.326921,0.879195
2,No log,0.278057,0.919463


TrainOutput(global_step=76, training_loss=0.31335491883127314, metrics={'train_runtime': 154.9411, 'train_samples_per_second': 7.68, 'train_steps_per_second': 0.491, 'total_flos': 313102155878400.0, 'train_loss': 0.31335491883127314, 'epoch': 2.0})

In [None]:
trainer.push_to_hub()

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

events.out.tfevents.1707848488.75dc389bcdc8.290.1:   0%|          | 0.00/4.19k [00:00<?, ?B/s]

events.out.tfevents.1707848228.75dc389bcdc8.290.0:   0%|          | 0.00/8.09k [00:00<?, ?B/s]

events.out.tfevents.1707848912.75dc389bcdc8.290.2:   0%|          | 0.00/4.19k [00:00<?, ?B/s]

Upload 9 LFS files:   0%|          | 0/9 [00:00<?, ?it/s]

events.out.tfevents.1707850182.75dc389bcdc8.290.3:   0%|          | 0.00/5.22k [00:00<?, ?B/s]

events.out.tfevents.1707852970.75dc389bcdc8.290.4:   0%|          | 0.00/4.31k [00:00<?, ?B/s]

events.out.tfevents.1707853238.75dc389bcdc8.290.6:   0%|          | 0.00/5.29k [00:00<?, ?B/s]

events.out.tfevents.1707853084.75dc389bcdc8.290.5:   0%|          | 0.00/4.31k [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/4.60k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/SzymonSt2808/log_classifier/commit/6f340620f402de65eed34cf031f162713ad1219e', commit_message='End of training', commit_description='', oid='6f340620f402de65eed34cf031f162713ad1219e', pr_url=None, pr_revision=None, pr_num=None)

--------------------------------------------------------------------------------------------------

## Inferece pipeline

In [3]:
!nvidia-smi

Fri Feb 16 16:30:42 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   45C    P8              10W /  70W |      0MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [2]:
import time
from datasets import load_dataset
from transformers.pipelines.pt_utils import KeyDataset

dataset = load_dataset('Signal0ne/issue-analysis-eval-logs', split=None)
dataset = dataset.rename_column("logs", "text")
print(dataset)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Resolving data files:   0%|          | 0/186 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/18 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 9948452
    })
})


In [None]:
pipe  = pipeline('text-classification', model="SzymonSt2808/log_classifier", device=0)

## set to non zero value if you want to start from checkpoint
slc = 300000
##
slc_dataset = dataset['train'].select(range(slc, len(dataset['train']['text'])))
print(slc_dataset)
ds_size = len(slc_dataset)
print(f"{ds_size} entries to be processed")
scores = pd.DataFrame(columns=['label','score'])
overall_progress_counter=0
log_counter=0
c=0
for out in pipe(KeyDataset(slc_dataset, "text"), batch_size=200, max_length=512,truncation=True):
    overall_progress_counter += 1
    tmp_score = pd.DataFrame([out])
    scores = pd.concat([scores,tmp_score])
    if (out['label'] == 'CODE' and out['score'] < 0.8) or (out['label'] == 'LOG' and out['score'] > 0.65):
      log_counter += 1
    if overall_progress_counter % 300000 == 0 and overall_progress_counter != 0:
      c += 1
      scores.to_parquet(f"./output/train-logs-labeled-{c}")
      scores = pd.DataFrame(columns=['label','score'])
      print(f"Found {log_counter} entries")
      print(f"checkpoint on {overall_progress_counter}")

if len(scores) > 0:
  c += 1
  scores.to_parquet(f"./output/train-logs-labeled-{c}")



Dataset({
    features: ['text'],
    num_rows: 9648452
})
9648452 entries to be processed
