In [26]:
!pip install evaluate datasets
!pip install transformers[torch]

import torch
import os
import torch.nn as nn
import evaluate
import pandas as pd
import numpy as np
from datasets import Dataset
from transformers import (AutoTokenizer, DataCollatorWithPadding,
                          AutoModelForSequenceClassification,
                          TrainingArguments, Trainer, pipeline)
from huggingface_hub import notebook_login



In [2]:
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

## Training dataset preparation

In [42]:
path  = './log_classification_data/'
data  = pd.DataFrame(columns=['logs', 'class'])
files = os.listdir(path)
files = [f for f in files if f.endswith('.parquet')]

for file in files:
    data = pd.concat([data, pd.read_parquet(path + file)])

data = data.rename(columns={'logs': 'text', 'class': 'label'})
for di, d in data.iterrows():
  data.at[di, 'label'] = int(d['label'])
training_dataset = Dataset.from_pandas(data)
dataset = training_dataset.train_test_split(test_size=0.2, shuffle=True)
dataset


DatasetDict({
    train: Dataset({
        features: ['text', 'label', '__index_level_0__'],
        num_rows: 595
    })
    test: Dataset({
        features: ['text', 'label', '__index_level_0__'],
        num_rows: 149
    })
})

## Binary classification setup

In [46]:
id2label = {0: "LOG", 1: "CODE"}
label2id = {"LOG": 0, "CODE": 1}
def tokenize_function(examples):
    return tokenizer(examples["text"], truncation=True, padding=True, max_length=512)

tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")
collator = DataCollatorWithPadding(tokenizer=tokenizer)
model = AutoModelForSequenceClassification.from_pretrained("google-bert/bert-base-uncased", num_labels=2, id2label=id2label, label2id=label2id)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Training pipeline

In [44]:
acc = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return acc.compute(predictions=predictions, references=labels)


In [47]:
tokenized_dataset = dataset.map(tokenize_function, batched=True)



training_args = TrainingArguments(
    output_dir="log_classifier",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=True,
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=collator,
    compute_metrics=compute_metrics,
)

Map:   0%|          | 0/595 [00:00<?, ? examples/s]

Map:   0%|          | 0/149 [00:00<?, ? examples/s]

In [48]:
trainer.train()

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.326921,0.879195
2,No log,0.278057,0.919463


TrainOutput(global_step=76, training_loss=0.31335491883127314, metrics={'train_runtime': 154.9411, 'train_samples_per_second': 7.68, 'train_steps_per_second': 0.491, 'total_flos': 313102155878400.0, 'train_loss': 0.31335491883127314, 'epoch': 2.0})

In [50]:
trainer.push_to_hub()

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

events.out.tfevents.1707848488.75dc389bcdc8.290.1:   0%|          | 0.00/4.19k [00:00<?, ?B/s]

events.out.tfevents.1707848228.75dc389bcdc8.290.0:   0%|          | 0.00/8.09k [00:00<?, ?B/s]

events.out.tfevents.1707848912.75dc389bcdc8.290.2:   0%|          | 0.00/4.19k [00:00<?, ?B/s]

Upload 9 LFS files:   0%|          | 0/9 [00:00<?, ?it/s]

events.out.tfevents.1707850182.75dc389bcdc8.290.3:   0%|          | 0.00/5.22k [00:00<?, ?B/s]

events.out.tfevents.1707852970.75dc389bcdc8.290.4:   0%|          | 0.00/4.31k [00:00<?, ?B/s]

events.out.tfevents.1707853238.75dc389bcdc8.290.6:   0%|          | 0.00/5.29k [00:00<?, ?B/s]

events.out.tfevents.1707853084.75dc389bcdc8.290.5:   0%|          | 0.00/4.31k [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/4.60k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/SzymonSt2808/log_classifier/commit/6f340620f402de65eed34cf031f162713ad1219e', commit_message='End of training', commit_description='', oid='6f340620f402de65eed34cf031f162713ad1219e', pr_url=None, pr_revision=None, pr_num=None)

--------------------------------------------------------------------------------------------------

## Inferece pipeline

In [54]:
import time
path  = './extracted_logs/'
data  = pd.DataFrame(columns=['logs'])
files = os.listdir(path)
files = [f for f in files if f.endswith('.parquet')]

for file in files:
    data = pd.concat([data, pd.read_parquet(path + file)])

dataset = Dataset.from_pandas(data)



tokenizer = AutoTokenizer.from_pretrained("SzymonSt2808/log_classifier")
model = AutoModelForSequenceClassification.from_pretrained("SzymonSt2808/log_classifier")

classifier  = pipeline('text-classification', model=model, tokenizer=tokenizer)

for log in dataset['logs']:
    print(log)
    print(classifier(log,max_length=512))
    print('----------------------------------')
    time.sleep(1)



Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


 (https://github.com/Neurosim-lab/netpyne/blob/development/netpyne/neuromlFuncs.py#L1033), which seems like is never met.

- Also, there seems to be 2 small bugs in https://github.com/Neurosim-lab/netpyne/blob/development/netpyne/cell.py#L1013:
1) self.stims[-1] is accessed before appending the stim (line 1017)
2) ['NeuroML2_stochastic_input_rand'] should start with an 'h' so that it is removed before gathering (otherwise will crash)
Both of these can easily be fixed by replacing with:
[{'label': 'LOG', 'score': 0.9595773816108704}]
----------------------------------
Error occurred type="error" text="Missing job runner for an existing job - #######" stackTrace="   at Kudu.Core.Jobs.ContinuousJobsManager.EnableJob(String jobName)
   at Kudu.Services.Jobs.JobsController.EnableContinuousJob(String jobName)
   at lambda_method(Closure , Object , Object[] )
   at System.Web.Http.Controllers.ReflectedHttpActionDescriptor.ActionExecutor.<>c__DisplayClass10.<GetExecutor>b__9(Object instan

KeyboardInterrupt: 