In [2]:
import warnings
import numpy as np
import os
import pandas as pd
import openpyxl
import torch
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset, TensorDataset  
from torch.utils.data.dataset import random_split
import torchvision.transforms as transforms
import transformers
from transformers import BertForSequenceClassification , Trainer , TrainingArguments
from transformers import BertTokenizerFast
from sklearn.model_selection import train_test_split
from datasets import load_metric
from transformers import Trainer
from datasets import load_metric
import pandas as pd
import torch

  from pandas.core import (


In [3]:

df = pd.read_csv("embed_data")
df = df.loc[(df.label == "standard") | (df.label == "requirement")]
print(df.groupby("label").count())
print(df.head())

             text  document
label                      
requirement  1303      1303
standard      204       204
                                                text        label  \
0  CSP_Mid.CBF shall have a Maintenance Down Time...  requirement   
1  When commanded, CSP_Mid.CBF shall perform auto...  requirement   
2   Each box end end blast station paddle lift sh...  requirement   
3   Each pipe shall be transferred into the box e...  requirement   
4   Each of the vrollers shall be used on many st...  requirement   

         document  
0          SKAMid  
1          SKAMid  
2  JCanadaWelding  
3  JCanadaWelding  
4  JCanadaWelding  


In [4]:
requirements = df.text.tolist()
labels = df.label.map({'standard': 1, 'requirement': 0}).tolist()
#print(labels)

In [5]:
train_requs , test_requs , train_labels , test_labels = train_test_split (
requirements , labels , random_state =500 , test_size =.2)

train_requs , val_requs , train_labels , val_labels = train_test_split (
train_requs , train_labels , random_state =501 , test_size =.1)

In [6]:
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

train_encodings = tokenizer ( train_requs , truncation = True , padding = True )
val_encodings = tokenizer ( val_requs , truncation = True , padding = True )
test_encodings = tokenizer ( test_requs , truncation = True , padding = True )



In [7]:
class RequDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = RequDataset(train_encodings, train_labels)
val_dataset = RequDataset(val_encodings, val_labels)
test_dataset = RequDataset(test_encodings, test_labels)


In [8]:
import numpy as np
import torch
from transformers import BertForSequenceClassification, AdamW, BertTokenizer, TrainingArguments, Trainer, EarlyStoppingCallback, get_linear_schedule_with_warmup
from datasets import load_metric
from datasets import load_dataset
import ipywidgets as widgets
from IPython.display import display, clear_output

# Create collapsible output widget
output = widgets.Output()
output_collapsible = widgets.Accordion([output])
output_collapsible.set_title(0, 'Model Output')
display(output_collapsible)

# Function to display content in the output widget
def display_in_output(content):
    with output:
        clear_output(wait=True)
        display(content)



# Define your training arguments first
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy='epoch',
    save_strategy='epoch',
    num_train_epochs=6,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    learning_rate=2e-5,
    warmup_ratio=0.1,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_strategy='epoch',
    load_best_model_at_end=True,
)

# Define your model and optimizer
model = BertForSequenceClassification.from_pretrained("bert-base-uncased")
optimizer = AdamW(model.parameters(), lr=training_args.learning_rate, weight_decay=training_args.weight_decay)

# Load the metric
metric = load_metric("matthews_correlation")

# Define your compute_metrics function
def compute_metrics(eval_pred):
    predictions, labels_string = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return metric.compute(predictions=predictions, references=labels_string)

# Create Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

# Define your early stopping callback
early_stopping = EarlyStoppingCallback(early_stopping_patience=1, early_stopping_threshold=0.01)

# Manually calculate the total number of training steps
total_train_steps = len(train_dataset) // training_args.per_device_train_batch_size * training_args.num_train_epochs

# Define your learning rate scheduler
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=int(total_train_steps * training_args.warmup_ratio),
    num_training_steps=total_train_steps
)

# Initialize lists to store training and evaluation metrics
train_losses = []
eval_losses = []
eval_matthews_corrs = []

# Add the early stopping callback to the trainer
trainer.add_callback(early_stopping)

# Custom training loop
for epoch in range(training_args.num_train_epochs):
    # Training
    trainer.train()

    # Validation
    trainer.evaluate()

    # Create a collapsible widget for model output
    model_output = widgets.Output()
    model_output_collapsible = widgets.Accordion([model_output])
    model_output_collapsible.set_title(0, f'Epoch {epoch + 1}/{training_args.num_train_epochs}')
    
    # Function to display content in the model_output widget
    def display_in_model_output(content):
        with model_output:
            clear_output(wait=True)
            display(content)
    
    # Get training and evaluation metrics
    train_loss = trainer.callback_metrics['train_loss']
    eval_loss = trainer.callback_metrics['eval_loss']
    eval_matthews_corr = trainer.callback_metrics['eval_matthews_correlation']
    
    # Append metrics to lists
    train_losses.append(train_loss)
    eval_losses.append(eval_loss)
    eval_matthews_corrs.append(eval_matthews_corr)
    
    # Display training and validation metrics in model output
    display_in_model_output(f"Training Loss: {train_loss:.4f}")
    display_in_model_output(f"Validation Loss: {eval_loss:.4f}")
    display_in_model_output(f"Matthews Correlation: {eval_matthews_corr:.4f}")
    
    # Display the collapsible model output widget
    display(model_output_collapsible)

    # Check for early stopping
    if early_stopping.early_stopping.should_stop:
        display_in_output("Early stopping triggered.")
        break

    # Update the learning rate using the scheduler
    for _ in range(total_train_steps):
        scheduler.step()

# Access training and evaluation metrics from the lists
final_train_loss = train_losses[-1]
final_eval_loss = eval_losses[-1]
final_eval_matthews_corr = eval_matthews_corrs[-1]


# Print final metrics
print(f"Final Training Loss: {final_train_loss:.4f}")
print(f"Final Validation Loss: {final_eval_loss:.4f}")
print(f"Final Matthews Correlation: {final_eval_matthews_corr:.4f}")



Accordion(children=(Output(),), _titles={'0': 'Model Output'})

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  metric = load_metric("matthews_correlation")


Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563
2,0.0911,0.080259,0.882033


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563
2,0.0911,0.080259,0.882033


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563
2,0.0911,0.080259,0.882033


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563
2,0.0911,0.080259,0.882033


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563
2,0.0911,0.080259,0.882033


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563
2,0.0911,0.080259,0.882033


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563
2,0.0911,0.080259,0.882033


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563
2,0.0911,0.080259,0.882033


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563
2,0.0911,0.080259,0.882033


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563
2,0.0911,0.080259,0.882033


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563
2,0.0911,0.080259,0.882033


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563
2,0.0911,0.080259,0.882033


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563
2,0.0911,0.080259,0.882033


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563
2,0.0911,0.080259,0.882033


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563
2,0.0911,0.080259,0.882033


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563
2,0.0911,0.080259,0.882033


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563
2,0.0911,0.080259,0.882033


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563
2,0.0911,0.080259,0.882033


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563
2,0.0911,0.080259,0.882033


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563
2,0.0911,0.080259,0.882033


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563
2,0.0911,0.080259,0.882033


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563
2,0.0911,0.080259,0.882033


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563
2,0.0911,0.080259,0.882033


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563
2,0.0911,0.080259,0.882033


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563
2,0.0911,0.080259,0.882033


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563
2,0.0911,0.080259,0.882033


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563
2,0.0911,0.080259,0.882033


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563
2,0.0911,0.080259,0.882033


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563
2,0.0911,0.080259,0.882033


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563
2,0.0911,0.080259,0.882033


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563
2,0.0911,0.080259,0.882033


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563
2,0.0911,0.080259,0.882033


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563
2,0.0911,0.080259,0.882033


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563
2,0.0911,0.080259,0.882033


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563
2,0.0911,0.080259,0.882033


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563
2,0.0911,0.080259,0.882033


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563
2,0.0911,0.080259,0.882033


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563
2,0.0911,0.080259,0.882033


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563
2,0.0911,0.080259,0.882033


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563
2,0.0911,0.080259,0.882033


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563
2,0.0911,0.080259,0.882033


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563
2,0.0911,0.080259,0.882033


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563
2,0.0911,0.080259,0.882033


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563
2,0.0911,0.080259,0.882033


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563
2,0.0911,0.080259,0.882033


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563
2,0.0911,0.080259,0.882033


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563
2,0.0911,0.080259,0.882033


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563
2,0.0911,0.080259,0.882033


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563
2,0.0911,0.080259,0.882033


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563
2,0.0911,0.080259,0.882033


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563
2,0.0911,0.080259,0.882033


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563
2,0.0911,0.080259,0.882033


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563
2,0.0911,0.080259,0.882033


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563
2,0.0911,0.080259,0.882033


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563
2,0.0911,0.080259,0.882033


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563
2,0.0911,0.080259,0.882033


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563
2,0.0911,0.080259,0.882033


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563
2,0.0911,0.080259,0.882033


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563
2,0.0911,0.080259,0.882033


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563
2,0.0911,0.080259,0.882033


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563
2,0.0911,0.080259,0.882033


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563
2,0.0911,0.080259,0.882033


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563
2,0.0911,0.080259,0.882033


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563
2,0.0911,0.080259,0.882033


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563
2,0.0911,0.080259,0.882033


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563
2,0.0911,0.080259,0.882033


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563
2,0.0911,0.080259,0.882033


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563
2,0.0911,0.080259,0.882033


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563
2,0.0911,0.080259,0.882033


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563
2,0.0911,0.080259,0.882033


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563
2,0.0911,0.080259,0.882033


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563
2,0.0911,0.080259,0.882033


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563
2,0.0911,0.080259,0.882033


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563
2,0.0911,0.080259,0.882033


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563
2,0.0911,0.080259,0.882033


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563
2,0.0911,0.080259,0.882033


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563
2,0.0911,0.080259,0.882033
3,0.0154,0.051813,0.923899


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563
2,0.0911,0.080259,0.882033
3,0.0154,0.051813,0.923899


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563
2,0.0911,0.080259,0.882033
3,0.0154,0.051813,0.923899


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563
2,0.0911,0.080259,0.882033
3,0.0154,0.051813,0.923899


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563
2,0.0911,0.080259,0.882033
3,0.0154,0.051813,0.923899


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563
2,0.0911,0.080259,0.882033
3,0.0154,0.051813,0.923899


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563
2,0.0911,0.080259,0.882033
3,0.0154,0.051813,0.923899


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563
2,0.0911,0.080259,0.882033
3,0.0154,0.051813,0.923899


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563
2,0.0911,0.080259,0.882033
3,0.0154,0.051813,0.923899


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563
2,0.0911,0.080259,0.882033
3,0.0154,0.051813,0.923899


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563
2,0.0911,0.080259,0.882033
3,0.0154,0.051813,0.923899


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563
2,0.0911,0.080259,0.882033
3,0.0154,0.051813,0.923899


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563
2,0.0911,0.080259,0.882033
3,0.0154,0.051813,0.923899


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563
2,0.0911,0.080259,0.882033
3,0.0154,0.051813,0.923899


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563
2,0.0911,0.080259,0.882033
3,0.0154,0.051813,0.923899


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563
2,0.0911,0.080259,0.882033
3,0.0154,0.051813,0.923899


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563
2,0.0911,0.080259,0.882033
3,0.0154,0.051813,0.923899


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563
2,0.0911,0.080259,0.882033
3,0.0154,0.051813,0.923899


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563
2,0.0911,0.080259,0.882033
3,0.0154,0.051813,0.923899


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563
2,0.0911,0.080259,0.882033
3,0.0154,0.051813,0.923899


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563
2,0.0911,0.080259,0.882033
3,0.0154,0.051813,0.923899


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563
2,0.0911,0.080259,0.882033
3,0.0154,0.051813,0.923899


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563
2,0.0911,0.080259,0.882033
3,0.0154,0.051813,0.923899


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563
2,0.0911,0.080259,0.882033
3,0.0154,0.051813,0.923899


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563
2,0.0911,0.080259,0.882033
3,0.0154,0.051813,0.923899


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563
2,0.0911,0.080259,0.882033
3,0.0154,0.051813,0.923899


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563
2,0.0911,0.080259,0.882033
3,0.0154,0.051813,0.923899


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563
2,0.0911,0.080259,0.882033
3,0.0154,0.051813,0.923899


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563
2,0.0911,0.080259,0.882033
3,0.0154,0.051813,0.923899


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563
2,0.0911,0.080259,0.882033
3,0.0154,0.051813,0.923899


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563
2,0.0911,0.080259,0.882033
3,0.0154,0.051813,0.923899


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563
2,0.0911,0.080259,0.882033
3,0.0154,0.051813,0.923899


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563
2,0.0911,0.080259,0.882033
3,0.0154,0.051813,0.923899


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563
2,0.0911,0.080259,0.882033
3,0.0154,0.051813,0.923899


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563
2,0.0911,0.080259,0.882033
3,0.0154,0.051813,0.923899


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563
2,0.0911,0.080259,0.882033
3,0.0154,0.051813,0.923899


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563
2,0.0911,0.080259,0.882033
3,0.0154,0.051813,0.923899


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563
2,0.0911,0.080259,0.882033
3,0.0154,0.051813,0.923899


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563
2,0.0911,0.080259,0.882033
3,0.0154,0.051813,0.923899


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563
2,0.0911,0.080259,0.882033
3,0.0154,0.051813,0.923899


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563
2,0.0911,0.080259,0.882033
3,0.0154,0.051813,0.923899


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563
2,0.0911,0.080259,0.882033
3,0.0154,0.051813,0.923899


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563
2,0.0911,0.080259,0.882033
3,0.0154,0.051813,0.923899


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563
2,0.0911,0.080259,0.882033
3,0.0154,0.051813,0.923899


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563
2,0.0911,0.080259,0.882033
3,0.0154,0.051813,0.923899


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563
2,0.0911,0.080259,0.882033
3,0.0154,0.051813,0.923899


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563
2,0.0911,0.080259,0.882033
3,0.0154,0.051813,0.923899


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563
2,0.0911,0.080259,0.882033
3,0.0154,0.051813,0.923899


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563
2,0.0911,0.080259,0.882033
3,0.0154,0.051813,0.923899


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563
2,0.0911,0.080259,0.882033
3,0.0154,0.051813,0.923899


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563
2,0.0911,0.080259,0.882033
3,0.0154,0.051813,0.923899


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563
2,0.0911,0.080259,0.882033
3,0.0154,0.051813,0.923899


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563
2,0.0911,0.080259,0.882033
3,0.0154,0.051813,0.923899


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563
2,0.0911,0.080259,0.882033
3,0.0154,0.051813,0.923899


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563
2,0.0911,0.080259,0.882033
3,0.0154,0.051813,0.923899


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563
2,0.0911,0.080259,0.882033
3,0.0154,0.051813,0.923899


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563
2,0.0911,0.080259,0.882033
3,0.0154,0.051813,0.923899


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563
2,0.0911,0.080259,0.882033
3,0.0154,0.051813,0.923899


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563
2,0.0911,0.080259,0.882033
3,0.0154,0.051813,0.923899


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563
2,0.0911,0.080259,0.882033
3,0.0154,0.051813,0.923899


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563
2,0.0911,0.080259,0.882033
3,0.0154,0.051813,0.923899


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563
2,0.0911,0.080259,0.882033
3,0.0154,0.051813,0.923899


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563
2,0.0911,0.080259,0.882033
3,0.0154,0.051813,0.923899


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563
2,0.0911,0.080259,0.882033
3,0.0154,0.051813,0.923899


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563
2,0.0911,0.080259,0.882033
3,0.0154,0.051813,0.923899


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563
2,0.0911,0.080259,0.882033
3,0.0154,0.051813,0.923899


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563
2,0.0911,0.080259,0.882033
3,0.0154,0.051813,0.923899


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563
2,0.0911,0.080259,0.882033
3,0.0154,0.051813,0.923899


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563
2,0.0911,0.080259,0.882033
3,0.0154,0.051813,0.923899


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563
2,0.0911,0.080259,0.882033
3,0.0154,0.051813,0.923899


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563
2,0.0911,0.080259,0.882033
3,0.0154,0.051813,0.923899


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563
2,0.0911,0.080259,0.882033
3,0.0154,0.051813,0.923899


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563
2,0.0911,0.080259,0.882033
3,0.0154,0.051813,0.923899


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563
2,0.0911,0.080259,0.882033
3,0.0154,0.051813,0.923899


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563
2,0.0911,0.080259,0.882033
3,0.0154,0.051813,0.923899


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563
2,0.0911,0.080259,0.882033
3,0.0154,0.051813,0.923899


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563
2,0.0911,0.080259,0.882033
3,0.0154,0.051813,0.923899
4,0.0039,0.050473,0.923899


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.3955,0.146477,0.753563
2,0.0911,0.080259,0.882033
3,0.0154,0.051813,0.923899
4,0.0039,0.050473,0.923899


AttributeError: 'Trainer' object has no attribute 'callback_metrics'

In [9]:

predictions = trainer.predict(test_dataset)
preds = np.argmax(predictions.predictions, axis=-1)

In [10]:
from sklearn . metrics import matthews_corrcoef as mcc
mcc_test_labels = [ label for label in test_labels ]
mcc_preds = [ label for label in preds . tolist () ]
print ( mcc_preds )
print ( mcc_test_labels )
print ( mcc ( mcc_test_labels , mcc_preds ) )

[0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0]
[0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0,

In [11]:
from sklearn import metrics
print(metrics.confusion_matrix(test_labels, preds))
print(metrics.classification_report(test_labels, preds, target_names=['Standard', 'Requirement']))

[[252   4]
 [  4  42]]
              precision    recall  f1-score   support

    Standard       0.98      0.98      0.98       256
 Requirement       0.91      0.91      0.91        46

    accuracy                           0.97       302
   macro avg       0.95      0.95      0.95       302
weighted avg       0.97      0.97      0.97       302



In [12]:
import os
output_dir = './model_save_trained/'
# Create output directory if needed
if not os.path.exists(output_dir):
    os.makedirs(output_dir)
model_to_save = model.module if hasattr(model, 'module') else model
# Take care of distributed / parallel training
model_to_save.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

('./model_save_trained/tokenizer_config.json',
 './model_save_trained/special_tokens_map.json',
 './model_save_trained/vocab.txt',
 './model_save_trained/added_tokens.json',
 './model_save_trained/tokenizer.json')

In [3]:

from transformers import BertTokenizerFast, BertForSequenceClassification

# Define the directory where you saved the model and tokenizer
#output_dir = './model_save_trained/'


from transformers import BertTokenizer, BertModel

# Load the tokenizer
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

# Load the model
model = BertForSequenceClassification.from_pretrained("bert-base-uncased")


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
df = pd.read_csv("embed_data")
df = df.loc[(df.label == "standard") | (df.label == "requirement")]
print(df.groupby("document").count())
print(df.head())

                text  label
document                   
AASHTO            27     27
ASABE             39     39
ATSM              99     99
IEEE              13     13
ISO               17     17
JCanadaWelding   350    350
Jpierburg        214    214
Jtoho            159    159
MilSpec            9      9
SKADish          289    289
SKAMid           291    291
                                                text        label  \
0  CSP_Mid.CBF shall have a Maintenance Down Time...  requirement   
1  When commanded, CSP_Mid.CBF shall perform auto...  requirement   
2   Each box end end blast station paddle lift sh...  requirement   
3   Each pipe shall be transferred into the box e...  requirement   
4   Each of the vrollers shall be used on many st...  requirement   

         document  
0          SKAMid  
1          SKAMid  
2  JCanadaWelding  
3  JCanadaWelding  
4  JCanadaWelding  


In [5]:
requirements = df.text.tolist()
labels = df.label.tolist()
documents = df.document.tolist()
#print(documents)

In [6]:
def requirement_to_embedding(model, tokenizer, requirement):
    input = tokenizer(requirement, padding=True, truncation=True, return_tensors="pt")
    input = input.to("cpu")  # copy input to CPU
    output = model(**input)  # run model without labels to get logits & encoded layers
    hidden_states = output.hidden_states
    embedding = hidden_states[12][0][0]  # each layer has output of size (batch_size, sequence_length, hidden_size); here we are getting the [CLS] token from the final layer
    embedding = embedding.detach().numpy()
    return embedding

In [7]:
import torch
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
import numpy as np
embeddings = [ requirement_to_embedding ( model , tokenizer , requirement ) for
requirement in requirements ]
embedd_array = np . stack ( embeddings )
embedd_array . shape

TypeError: 'NoneType' object is not subscriptable

In [8]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE

# Specify the perplexity and learning rate values
perplexity_value = 50  # You can adjust this value
learning_rate_value = 10  # You can adjust this value

# Initialize the t-SNE model with specified perplexity and learning rate
tsne = TSNE(n_components=2, perplexity=perplexity_value, learning_rate=learning_rate_value, random_state=42)

# Fit the t-SNE model to your data
embeddings_2d = tsne.fit_transform(embedd_array)

# Create binary labels as a list (0 for 'requirement', 1 for 'standard')
labels = df.label.map({'standard': 1, 'requirement': 0}).tolist()

# Define colors and markers for the two classes
colors = sns.color_palette('Set1', n_colors=2)  # Custom color palette
markers = 'o'  # Circle markers for both classes

# Convert embeddings_2d array into a DataFrame with index values
df = pd.DataFrame(embeddings_2d, columns=['Dimension 1', 'Dimension 2'])
df['label'] = labels

# Create a scatter plot of the t-SNE embeddings for both classes
plt.figure(figsize=(10, 8))
sns.set(style='whitegrid')  # Set Seaborn style with gridlines

for label in set(labels):
    df_label = df[df['label'] == label]
    sns.scatterplot(data=df_label, x='Dimension 1', y='Dimension 2', hue='label', palette=[colors[int(label)]], marker=markers, edgecolor='k', s=100)
    


# Create legends for the classes
class_labels = ['requirement', 'standard']  # Map the labels back to their original names
legend_handles = [plt.Line2D([0], [0], marker=markers, color='w', label=class_labels[int(label)], markersize=10,
                              markerfacecolor=colors[int(label)]) for label in set(labels)]
plt.legend(handles=legend_handles, title='Classes')

plt.title("t-SNE Visualization of Embeddings with Class Labels and IDs", fontsize=16)
#plt.xlabel("t-SNE Dimension 1", fontsize=12)
#plt.ylabel("t-SNE Dimension 2", fontsize=12)
plt.xticks(fontsize=10)
plt.yticks(fontsize=10)
plt.grid(False)  # Turn off gridlines
plt.gca().spines['top'].set_visible(False)
plt.gca().spines['right'].set_visible(False)

# Save the plot as a PNG image
plt.savefig('tsne_visualization_perp6_nottrained.png', dpi=300, bbox_inches='tight')

plt.show()

NameError: name 'embedd_array' is not defined