In [13]:
# This notebook downloads a pre-trained BERT model and evaluates it on a specified test data set, without any fine-tuning

In [1]:
from torch.utils.data import DataLoader
from sentence_transformers import models, losses, util, LoggingHandler, SentenceTransformer, evaluation
from sentence_transformers.cross_encoder import CrossEncoder
from sentence_transformers.cross_encoder.evaluation import CECorrelationEvaluator
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator, BinaryClassificationEvaluator
from sentence_transformers.readers import InputExample
from datetime import datetime
from zipfile import ZipFile
import logging
import csv
import sys
import torch
import math
import gzip
import os

In [2]:
# Configure root logging: timestamped messages at INFO level, emitted through
# the LoggingHandler imported from sentence_transformers.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
    handlers=[LoggingHandler()],
)
# End of logging configuration


## Read and prepare data sets

In [None]:
dataset_path = '/home/jupyter-sturm/datasets/Eventlogs/Split' # specify path to test data set

######### Read test data ##########

logging.info("Reading {} ".format('test data'))

# Each row of test.csv supplies a text pair (columns 'label1' and 'label2') and
# a 'score' column that is converted to float and used as the example's label.
# newline='' leaves line-ending handling to the csv module, as the csv
# documentation recommends for file objects handed to a reader.
with open(os.path.join(dataset_path, "test.csv"), encoding='utf8', newline='') as fIn:
    reader = csv.DictReader(fIn, delimiter=',', quoting=csv.QUOTE_NONE)
    test_samples = [
        InputExample(texts=[row['label1'], row['label2']], label=float(row['score']))
        for row in reader
    ]

# Report how many examples were loaded so an empty or truncated file is noticed early
logging.info("Loaded {} test samples".format(len(test_samples)))


## Specifying the model

In [7]:
# Pre-trained checkpoint to load: BERT in its base, uncased version
model_name = 'bert-base-uncased'
# Value handed to the transformer module as max_seq_length
max_seq_length = 128
# True when torch reports that CUDA is available
use_cuda = torch.cuda.is_available()

In [None]:
###### Model architecture (sentence-transformers) ######

logging.info("Loading model: {}".format(model_name))

# Token-level encoder: the pre-trained transformer identified by model_name
word_embedding_model = models.Transformer(
    model_name,
    max_seq_length=max_seq_length,
)

# Sentence-level pooling: only mean-over-tokens is enabled; CLS-token and
# max-over-tokens pooling are switched off
pooling_model = models.Pooling(
    word_embedding_model.get_word_embedding_dimension(),
    pooling_mode_cls_token=False,
    pooling_mode_max_tokens=False,
    pooling_mode_mean_tokens=True,
)

# Chain the encoder and the pooling layer into one SentenceTransformer
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

In [None]:
# Print the string representation of the assembled model
print(model)

## Evaluating the model on test data set

In [None]:
# Guard against an empty test set: evaluating zero pairs cannot yield a meaningful score
if not test_samples:
    raise ValueError("test_samples is empty; load the test data before evaluating")

logging.info("Evaluating model on test dataset")

# Build the evaluator from the loaded InputExample pairs and their float labels
test_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(test_samples)
# Last expression of the cell, so the value returned by evaluate() is displayed
model.evaluate(test_evaluator)