In [105]:

from transformers import AutoModelForSequenceClassification, AutoTokenizer

from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from transformers import Trainer, TrainingArguments
from datasets import Dataset, DatasetDict
import torch, csv, ast

from sklearn.metrics import accuracy_score  # Example metric
from transformers import Trainer, TrainingArguments
from transformers import EarlyStoppingCallback  # Import EarlyStoppingCallback

import pandas as pd



In [62]:
import torch
print(torch.__version__)
print(torch.version.cuda)
print(torch.cuda.is_available())


2.3.1
11.8
True


## Prepare Dataset

In [86]:
# Read the TSV file
df = pd.read_csv(
    "merged_output_7.tsv",
    delimiter="\t",
    dtype=str,  # Read two first columns as strings labels as int {'patent_id': str, 'text': str, 'label': }
    quoting=csv.QUOTE_NONE,
)

# Convert the 'label' column from strings to lists of integers
df['label'] = df['label'].apply(ast.literal_eval)

# Print and inspect the resulting DataFrame
print(df.head(10))
print(df.shape)
df.info()


  patent_id                                               text  \
0  10000379  The invention is directed to a process for the...   
1  10001403  This invention relates to a system including a...   
2  10002404  A graphics processing unit (GPU) includes prog...   
3  10002897  Provided is a solid-state imaging device inclu...   
4  10003740  The present disclosure involves systems, softw...   
5  10004066  To handle different Quality of Service (QoS) r...   
6  10004246  Disclosed are hydrated fat compositions compri...   
7  10006614  A lighting device includes a primary housing h...   
8  10008125  Multi-user portable electronic devices for imp...   
9  10008416  A gate structure is formed over a substrate. T...   

                      label  
0  [0, 0, 1, 0, 0, 0, 0, 0]  
1  [0, 0, 0, 0, 0, 0, 1, 0]  
2  [0, 0, 0, 0, 0, 0, 1, 0]  
3  [0, 0, 0, 0, 0, 0, 0, 1]  
4  [0, 0, 0, 0, 0, 0, 1, 1]  
5  [0, 0, 0, 0, 0, 0, 0, 1]  
6  [1, 0, 0, 0, 0, 0, 0, 0]  
7  [0, 1, 0, 0, 0, 1, 0, 0]  
8  

## Subset data

In [87]:
data = df.iloc[:, 1:3]  # Rows 0 to 9, Columns 1 and 2
print(data.head())
print(data.info())
print(data.shape)
print(type(df['label'].iloc[0])) # Output: <class 'list'>

                                                text                     label
0  The invention is directed to a process for the...  [0, 0, 1, 0, 0, 0, 0, 0]
1  This invention relates to a system including a...  [0, 0, 0, 0, 0, 0, 1, 0]
2  A graphics processing unit (GPU) includes prog...  [0, 0, 0, 0, 0, 0, 1, 0]
3  Provided is a solid-state imaging device inclu...  [0, 0, 0, 0, 0, 0, 0, 1]
4  The present disclosure involves systems, softw...  [0, 0, 0, 0, 0, 0, 1, 1]
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8175 entries, 0 to 8174
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    8175 non-null   object
 1   label   8175 non-null   object
dtypes: object(2)
memory usage: 127.9+ KB
None
(8175, 2)
<class 'list'>


## Tokenize Dataset
text — The original abstract text.  
label — The binary vector labels.  
input_ids — The tokenized IDs for the abstract.  
attention_mask — The attention mask for the input sequences.  

In [88]:
# Create Hugging Face Dataset
dataset = Dataset.from_dict(data)
dataset.info

DatasetInfo(description='', citation='', homepage='', license='', features={'text': Value(dtype='string', id=None), 'label': Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None)}, post_processed=None, supervised_keys=None, builder_name=None, dataset_name=None, config_name=None, version=None, splits=None, download_checksums=None, download_size=None, post_processing_size=None, dataset_size=None, size_in_bytes=None)

In [89]:
print(dataset.info.features)  
print("\n")
# Print the first few rows of the dataset
print(dataset[:1])  # or dataset.head(10) if using pandas
print(dataset)

{'text': Value(dtype='string', id=None), 'label': Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None)}


{'text': ['The invention is directed to a process for the preparation of a syngas comprising hydrogen and carbon monoxide from a methane comprising gas, which process comprises the steps of: (a) reacting the methane comprising gas with an oxidizing gas in an autothermal reformer to obtain a hot raw syngas comprising carbon monoxide and hydrogen; (b) cooling the hot raw syngas resulting from step (a) to obtain the syngas, wherein step (b) comprises cooling the hot raw syngas by indirect heat exchange against the methane comprising gas used in step (a) and wherein sulphur is added upstream of cooling step (b). The invention also relates to a process for the preparation of hydrocarbon products in which a feed syngas is prepared in the process as described above followed by a desulphurization treatment and the desulphurized syngas is subsequently converted into hydrocarb

In [90]:
print(dataset)

Dataset({
    features: ['text', 'label'],
    num_rows: 8175
})


In [91]:
# Determine length of the longest abstract
longest_abstract_length = df['text'].apply(len).max()

print(f"The longest abstract has {longest_abstract_length} characters.")


The longest abstract has 4678 characters.


In [92]:
# Tokenizer
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

# Tokenize abstracts and include labels
def tokenize_fn(example):
    tokens = tokenizer(example['text'], truncation=True, padding='max_length', max_length=512)   # I work in pycharm, where the map progression bar visually is bugged - therefore I ignore that it says 0%. I can verify that the function tokenized_fn work by printing the tokenized_dataset
    tokens["label"] = example['label']  # Add labels
    return tokens

# Tokenize the dataset and keep labels in the same dataset
tokenized_dataset = dataset.map(tokenize_fn, batched=False)
print(tokenized_dataset)
print(tokenized_dataset.column_names)


Map:   0%|          | 0/8175 [00:00<?, ? examples/s]

Dataset({
    features: ['text', 'label', 'input_ids', 'attention_mask'],
    num_rows: 8175
})
['text', 'label', 'input_ids', 'attention_mask']


In [93]:
#Print first row of the tokenized data
print(tokenized_dataset[0])


{'text': 'The invention is directed to a process for the preparation of a syngas comprising hydrogen and carbon monoxide from a methane comprising gas, which process comprises the steps of: (a) reacting the methane comprising gas with an oxidizing gas in an autothermal reformer to obtain a hot raw syngas comprising carbon monoxide and hydrogen; (b) cooling the hot raw syngas resulting from step (a) to obtain the syngas, wherein step (b) comprises cooling the hot raw syngas by indirect heat exchange against the methane comprising gas used in step (a) and wherein sulphur is added upstream of cooling step (b). The invention also relates to a process for the preparation of hydrocarbon products in which a feed syngas is prepared in the process as described above followed by a desulphurization treatment and the desulphurized syngas is subsequently converted into hydrocarbon products in a Fischer-Tropsch process.', 'label': [0, 0, 1, 0, 0, 0, 0, 0], 'input_ids': [101, 1996, 11028, 2003, 2856,

In [94]:
#print(tokenized_dataset['label'][:5])  # Print the first 5 labels
#print(tokenized_dataset['input_ids'][:5])  # Print the first 5 tokenized texts (input_ids)


## Split tokenized data into train, val and test data

In [95]:
# First, split the dataset into training and testing (80% train, 20% test)
train_test_split = tokenized_dataset.train_test_split(test_size=0.2)
print(train_test_split["train"].shape)
print(train_test_split["test"].shape)

# Further split the training set into training and validation (80% train, 20% validation)
train_val_split = train_test_split['train'].train_test_split(test_size=0.2)
print(train_val_split["train"].shape)
print(train_val_split["test"].shape)

# Combining everything into a DatasetDict for easier handling
split_datasets = DatasetDict({
    'train': train_val_split['train'],
    'validation': train_val_split['test'],
    'test': train_test_split['test']
})

(6540, 4)
(1635, 4)
(5232, 4)
(1308, 4)


## Define Model and Training Arguments
multi-label classification

In [96]:
# Define the model for multi-label classification
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased", 
    num_labels=8,  # Number of classes
    problem_type="multi_label_classification"  # Specify multi-label classification
)

print(model)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


In [97]:
# Freeze all layers except the classification head
for param in model.distilbert.parameters():
    param.requires_grad = False
# Only the classifier layers will be trainable
for param in model.pre_classifier.parameters():
    param.requires_grad = True
for param in model.classifier.parameters():
    param.requires_grad = True

print(model)

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


In [24]:
# Print DistilBert model architecture 
print(model)

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


Global step: 3 (3 training steps since the dataset is small and batch size is 2).  
Training loss: 0.6537 (lower is better; indicates the model is learning).  
Samples per second: 2.71 (training speed, works well for small data).  
Steps per second: 1.355 (batch processing rate).  
Epoch: 3 (completed all 3 epochs).  

In [103]:
# ChatGPT
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions
    preds = preds.argmax(axis=1)
    # Use sklearn's accuracy_score or other metrics as needed
    from sklearn.metrics import accuracy_score
    return {'accuracy': accuracy_score(labels, preds)}

In [99]:
# ChatGPT

# Training settingswith 5 epochs
training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=2,  # Match dataset size
    per_device_eval_batch_size=2,
    num_train_epochs=5,
    eval_strategy="steps",  # Evaluate after a certain number of steps
    logging_dir="./logs",
    logging_steps=500,  # Adjust logging steps if necessary
    save_steps=1000,  # Set save_steps to be a multiple of eval_steps
    eval_steps=1000,  # Evaluate every 1000 steps
    load_best_model_at_end=True,  # Load the best model at the end of training
    metric_for_best_model="accuracy",  # Monitor the accuracy metric
    greater_is_better=True,  # Higher accuracy is better
)

# Setup EarlyStoppingCallback
early_stopping_callback = EarlyStoppingCallback(early_stopping_patience=2)

# Trainer setup with evaluation
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=split_datasets['train'],
    eval_dataset=split_datasets['validation'],
    compute_metrics=compute_metrics,  # Optional metric computation
    callbacks=[early_stopping_callback]  # Add early stopping callback
)

# Train
trainer.train()


Step,Training Loss,Validation Loss,Accuracy
1000,0.3303,0.316379,0.069572
2000,0.2769,0.282315,0.158257
3000,0.2668,0.271083,0.211774
4000,0.2748,0.262882,0.244648
5000,0.2533,0.262433,0.285168
6000,0.259,0.254996,0.308104
7000,0.2527,0.252236,0.305046
8000,0.2513,0.254157,0.335627
9000,0.2509,0.252678,0.336391
10000,0.246,0.250173,0.331804


TrainOutput(global_step=13080, training_loss=0.2617912671617047, metrics={'train_runtime': 4195.7689, 'train_samples_per_second': 6.235, 'train_steps_per_second': 3.117, 'total_flos': 3465717946122240.0, 'train_loss': 0.2617912671617047, 'epoch': 5.0})

In [100]:
# Evaluate the model on the validation set
results = trainer.evaluate(split_datasets['validation'])
print(results)


{'eval_loss': 0.24888338148593903, 'eval_accuracy': 0.34938837920489296, 'eval_runtime': 123.4734, 'eval_samples_per_second': 10.593, 'eval_steps_per_second': 5.297, 'epoch': 5.0}


## Evaluate the Model

trainer.evaluate(): calculates metrics (e.g., loss) on the validation dataset or any dataset provided. It does not return the predicted labels; it only provides evaluation metrics (like loss) based on the true labels and the model's output. Typically used for monitoring the model during training.

trainer.predict(): returns the predicted labels (along with other information like logits) for a given dataset (e.g., test data). Allows for specific metrics like accuracy, precision, recall, etc., on the predictions. Typically used after training is complete to evaluate how well the model performs on unseen data.

In [101]:
# Get predictions on the test set
predictions = trainer.predict(split_datasets['test'])

# Extract predicted labels and true labels
predicted_labels = (predictions.predictions > 0.5).astype(int)
true_labels = predictions.label_ids

# Compute accuracy
accuracy = accuracy_score(true_labels, predicted_labels)
print("Test Accuracy:", accuracy)


Test Accuracy: 0.35718654434250763


## Save the model

In [102]:
# ChatGPT
# Save the model and tokenizer
model.save_pretrained("./trained_model")
tokenizer.save_pretrained("./trained_model")


('./trained_model\\tokenizer_config.json',
 './trained_model\\special_tokens_map.json',
 './trained_model\\vocab.txt',
 './trained_model\\added_tokens.json')

## Load former model

In [83]:
# ChatGPT

# Load the trained model and tokenizer
model = AutoModelForSequenceClassification.from_pretrained("./trained_model")
tokenizer = AutoTokenizer.from_pretrained("./trained_model")


## Predictions

In [84]:
# ChatGPT
# Example input text for prediction
input_text = "A multilayered tube for transporting fuel including an innermost layer (A), an outermost layer (B) and an intermediate layer (C)..."

# Tokenize the input
inputs = tokenizer(input_text, return_tensors="pt", truncation=True, padding=True, max_length=512)

# Make predictions
with torch.no_grad():
    logits = model(**inputs).logits
    predictions = torch.sigmoid(logits).cpu().numpy()

# Convert logits to binary predictions (if it's multi-label classification)
predicted_labels = (predictions > 0.5).astype(int)

print(predicted_labels)


[[0 0 0 0 0 0 0 0]]
