In [1]:
# Set the working directory to the top of the experiment repository so that relative paths used below (src/, data/) resolve
%cd ..
%pwd

/data_science/projects/LOGOSAI.TECH_external/CHALLENGE/Experiment-repository-template


  self.shell.db['dhist'] = compress_dhist(dhist)[-100:]


'/data_science/projects/LOGOSAI.TECH_external/CHALLENGE/Experiment-repository-template'

In [12]:
import mlflow
from datetime import datetime
from sklearn.metrics import matthews_corrcoef
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
from src.settings import (
    MLFLOW_TRACKING_USERNAME,
    EXPERIMENT_NAME,
    )


def timestamp():
    """Return the current local time as a ``YYYY_MM_DDHH_MM_SS`` string.

    The day and hour fields are written back-to-back (no separator between
    them); every other field is separated by an underscore.
    """
    now = datetime.now()
    return f"{now:%Y_%m_%d%H_%M_%S}"


# Candidate checkpoints, keyed by a short alias; the value is the model
# identifier handed to the model loaders in the cells below.
all_models = dict([
    ('all-mpnet-base-v2', 'sentence-transformers/all-mpnet-base-v2'),
    ('stsb-roberta-large', 'cross-encoder/stsb-roberta-large'),
    ('stsb-roberta-base', 'cross-encoder/stsb-roberta-base'),
    ('Legal-BERT', 'nlpaueb/legal-bert-base-uncased'),
    ('EURLEX-BERT', 'nlpaueb/bert-base-uncased-eurlex'),
    ('SciBERT', 'allenai/scibert_scivocab_uncased'),
])

# Checkpoint evaluated by the rest of the notebook; change the alias to switch.
selected_model = all_models['stsb-roberta-base']



In [4]:
# Read the cleaned test split from disk
df_test = pd.read_parquet('data/test_clean.parquet')

# Pair up the two text columns row by row: once as (text, text_b) tuples ...
sentence_pairs = [
    (text_a, text_b)
    for text_a, text_b in zip(df_test['text'].tolist(), df_test['text_b'].tolist())
]
# ... and once as {"text", "text_pair"} dicts (the form fed to the pipeline cell)
sentence_pairs_lds = [
    {"text": text_a, "text_pair": text_b}
    for text_a, text_b in sentence_pairs
]

# Ground-truth labels, in the same row order as the pairs
labels_true = df_test['label'].tolist()


## Sentence-Transformers

In [19]:
from sentence_transformers.cross_encoder import CrossEncoder


# Instantiate the pre-trained CrossEncoder for the selected checkpoint
model = CrossEncoder(selected_model)

# Score every sentence pair of the test set
scores = model.predict(sentence_pairs)

# Binarise the scores at several cut-offs and print the Matthews correlation
# coefficient obtained against the true labels for each one.
# (previously tried: threshold = 0.45)
for threshold in (0.51, 0.60, 0.75, 0.85, 0.90):
    # a score at or below the cut-off maps to class 0, anything else to class 1
    labels_pred = [0 if score_value <= threshold else 1 for score_value in scores]
    matthews_corrcoef_values = matthews_corrcoef(
        y_true=labels_true,
        y_pred=labels_pred,
    )
    print(matthews_corrcoef_values)
    



0.0
0.10206207261596577
0.0
0.0
0.0


In [None]:
# Save the score predictions.
# pd.DataFrame(scores) labels its columns with integers (0, 1, ...); parquet
# writers may reject non-string column labels, so give them string names
# ('score_0', ...) before writing.
df_scores = pd.DataFrame(scores).add_prefix('score_')
# Derive the file name from the evaluated checkpoint instead of hard-coding it,
# so switching `selected_model` does not overwrite or mislabel another model's
# scores. For 'cross-encoder/stsb-roberta-base' the path is unchanged:
# data/stsb-roberta-base_pretrain_test_scores.parquet
df_scores.to_parquet(f"data/{selected_model.split('/')[-1]}_pretrain_test_scores.parquet")


## HF Transformers

### test_1

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(selected_model)
model = AutoModelForSequenceClassification.from_pretrained(selected_model)
model.eval()  # make inference mode explicit, as the test_2 cell does

# One probability tensor per sentence pair, in the order of `sentence_pairs`
scores = list()

# Pure inference: without no_grad every stored score would keep its autograd
# graph alive, so memory would grow with the number of pairs.
with torch.no_grad():
    for sentence_pair in sentence_pairs:
        text1 = sentence_pair[0]
        text2 = sentence_pair[1]

        # Tokenize the two texts together as a single paired input
        inputs = tokenizer(text1, text2, return_tensors='pt', truncation=True, padding=True)

        # Get model predictions
        outputs = model(**inputs)
        logits = outputs.logits

        # Turn logits into scores. A softmax over a single logit is always 1.0,
        # so a one-output head needs a sigmoid; softmax is kept for heads with
        # several outputs.
        if logits.shape[-1] == 1:
            score = torch.sigmoid(logits)
        else:
            score = torch.softmax(logits, dim=1)

        scores.append(score)

In [None]:
# Preview only the first few entries rather than dumping the whole list
scores[:5]

### test_2

In [5]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import pipeline
import torch

# Load tokenizer and classification model for the selected checkpoint;
# .eval() returns the module itself after switching it to evaluation mode.
tokenizer = AutoTokenizer.from_pretrained(selected_model)
model = AutoModelForSequenceClassification.from_pretrained(selected_model).eval()

# Tokenize the first 10 sentence pairs as one padded, truncated batch
features = tokenizer(
    sentence_pairs[:10],
    padding=True,
    truncation=True,
    return_tensors="pt",
)

# Forward pass without gradient tracking; `scores` holds the raw logits
with torch.no_grad():
    scores = model(**features).logits
print(scores)

tensor([[-0.3469],
        [-0.4552],
        [-0.3044],
        [-0.1127],
        [ 0.3391],
        [-0.5461],
        [ 0.1714],
        [ 0.3955],
        [ 0.6687],
        [ 0.5301]])


### test with pipeline

In [25]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import pipeline
from transformers.pipelines.text_classification import ClassificationFunction


# Load tokenizer and classification model for the selected checkpoint
tokenizer = AutoTokenizer.from_pretrained(selected_model)
model = AutoModelForSequenceClassification.from_pretrained(selected_model)

# Text-classification pipeline configured to apply a sigmoid to the model output
pipe = pipeline(
    "text-classification",
    model=model,
    tokenizer=tokenizer,
    function_to_apply=ClassificationFunction.SIGMOID,
)

# Run the pipeline on the {"text", "text_pair"} dicts; the result is the cell output
pipe(sentence_pairs_lds)

[{'label': 'LABEL_0', 'score': 0.41412752866744995},
 {'label': 'LABEL_0', 'score': 0.38811999559402466},
 {'label': 'LABEL_0', 'score': 0.4244891107082367},
 {'label': 'LABEL_0', 'score': 0.4718495309352875},
 {'label': 'LABEL_0', 'score': 0.5839836597442627},
 {'label': 'LABEL_0', 'score': 0.3667718768119812},
 {'label': 'LABEL_0', 'score': 0.5427504181861877},
 {'label': 'LABEL_0', 'score': 0.5976123213768005},
 {'label': 'LABEL_0', 'score': 0.6612198352813721},
 {'label': 'LABEL_0', 'score': 0.629508376121521}]