<a href="https://colab.research.google.com/github/SarathiPrabu/sentiment-analysis-app/blob/main/training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install torch torchvision torchaudio
!pip install transformers dataset evaluate



In [2]:
from huggingface_hub import notebook_login
# Render the Hugging Face login widget in the notebook so a Hub access token
# can be entered interactively (no token is hardcoded in this file).
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [3]:
from datasets import load_dataset

# Options forwarded to the HUPD loader: the 'sample' configuration, the
# metadata feather file, and the filing-date windows that define the
# training and validation splits.
hupd_options = {
    'name': 'sample',
    'data_files': "https://huggingface.co/datasets/HUPD/hupd/blob/main/hupd_metadata_2022-02-22.feather",
    'icpr_label': None,
    'train_filing_start_date': '2016-01-01',
    'train_filing_end_date': '2016-01-21',
    'val_filing_start_date': '2016-01-22',
    'val_filing_end_date': '2016-01-31',
}

dataset_dict = load_dataset('HUPD/hupd', **hupd_options)


In [4]:
# Show the available splits with their column names and row counts.
print(dataset_dict)

DatasetDict({
    train: Dataset({
        features: ['patent_number', 'decision', 'title', 'abstract', 'claims', 'background', 'summary', 'description', 'cpc_label', 'ipc_label', 'filing_date', 'patent_issue_date', 'date_published', 'examiner_id'],
        num_rows: 16153
    })
    validation: Dataset({
        features: ['patent_number', 'decision', 'title', 'abstract', 'claims', 'background', 'summary', 'description', 'cpc_label', 'ipc_label', 'filing_date', 'patent_issue_date', 'date_published', 'examiner_id'],
        num_rows: 9094
    })
})


In [5]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
# Load the tokenizer and a two-label sequence classifier from the same
# pretrained checkpoint.
# NOTE(review): `model` is assigned again in a later cell that reloads this
# checkpoint with id2label/label2id, so this instance is replaced before training.
checkpoint_name = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint_name)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint_name, num_labels=2)

In [6]:
def preprocess(example):
    """Tokenize one example's abstract and attach its integer class label.

    Args:
        example: A mapping with at least the keys ``'decision'`` and
            ``'abstract'``.

    Returns:
        A dict with ``'input_ids'`` and ``'attention_mask'`` (plain Python
        lists padded/truncated to 512 tokens) and ``'label'`` (0 for
        ``'ACCEPTED'``, 1 for ``'REJECTED'``), or ``None`` when the decision
        is neither of those two values.
    """
    decision_to_label = {'ACCEPTED': 0, 'REJECTED': 1}
    label = decision_to_label.get(example['decision'])
    if label is None:
        return None

    # Call the tokenizer directly (``encode_plus`` is deprecated) and keep the
    # result as Python lists: building a torch tensor per example only to
    # index it with [0] is wasted work when the output is stored in a dataset.
    encoded = tokenizer(
        example['abstract'],
        max_length=512,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
    )
    return {
        'input_ids': encoded['input_ids'],
        'attention_mask': encoded['attention_mask'],
        'label': label,
    }

def _is_accepted_or_rejected(example):
    """Return True when the example's decision is 'ACCEPTED' or 'REJECTED'."""
    return example['decision'] in ['ACCEPTED', 'REJECTED']


# Drop examples with any other decision, then tokenize and label what remains.
train_split = dataset_dict['train'].filter(_is_accepted_or_rejected)
train_dataset = train_split.map(preprocess)

validation_split = dataset_dict['validation'].filter(_is_accepted_or_rejected)
val_dataset = validation_split.map(preprocess)

Map:   0%|          | 0/4888 [00:00<?, ? examples/s]

In [8]:
from transformers import DataCollatorWithPadding
# Batch collator built around the notebook's tokenizer.
# NOTE(review): presumably pads each batch to a common length — confirm
# against the DataCollatorWithPadding documentation.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [9]:
import evaluate
# Load the "accuracy" metric; compute_metrics calls accuracy.compute(...) on it.
accuracy = evaluate.load("accuracy")

In [10]:
import numpy as np


def compute_metrics(eval_pred):
    """Score evaluation outputs with the loaded accuracy metric.

    Args:
        eval_pred: A two-item iterable ``(predictions, labels)``. The
            predictions are reduced to class ids with an argmax over axis 1.

    Returns:
        The result of ``accuracy.compute`` for the predicted class ids
        against the reference labels.
    """
    scores, references = eval_pred
    predicted_classes = np.argmax(scores, axis=1)
    return accuracy.compute(references=references, predictions=predicted_classes)

In [11]:
# Class-id <-> class-name mappings stored on the model.
# These must agree with the integer labels produced by `preprocess`
# (ACCEPTED -> 0, REJECTED -> 1); the previous values were swapped, which
# would make the trained model report the opposite class name.
id2label = {0: "ACCEPTED", 1: "REJECTED"}
# Derive the inverse so the two mappings cannot drift apart.
label2id = {name: idx for idx, name in id2label.items()}

In [12]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# Reload the classifier, this time passing the class-name mappings along with
# the number of labels.
label_options = {"num_labels": 2, "id2label": id2label, "label2id": label2id}
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased-finetuned-sst-2-english",
    **label_options,
)

In [13]:
!pip install accelerate -U
!pip install transformers[torch] accelerate -U




In [14]:
import torch

In [15]:
# Upgrade accelerate to the newest available release (-U).
# NOTE(review): accelerate is already installed by an earlier cell, so this
# repeat looks redundant — confirm and remove if so.
!pip install accelerate -U



In [None]:
# Fine-tuning hyperparameters, gathered in one place so they are easy to tune.
# Evaluation and checkpointing both run once per epoch, and the best
# checkpoint is reloaded when training finishes; nothing is pushed to the Hub.
training_options = {
    "output_dir": "patent_model",
    "learning_rate": 5e-5,
    "per_device_train_batch_size": 16,
    "per_device_eval_batch_size": 16,
    "num_train_epochs": 2,
    "weight_decay": 0.01,
    "evaluation_strategy": "epoch",
    "save_strategy": "epoch",
    "load_best_model_at_end": True,
    "push_to_hub": False,
}
training_args = TrainingArguments(**training_options)
# Build the Trainer and fine-tune the classifier.
# `compute_metrics` and `data_collator` are defined in earlier cells but were
# never handed to the Trainer, so each evaluation reported only the loss.
# Passing them here adds accuracy to the per-epoch evaluation results.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss


In [None]:
# NOTE(review): this cell builds a second Trainer around the same `model`
# object. If the previous training cell has already run, this continues
# training the already fine-tuned weights for another `num_train_epochs` —
# confirm that is intended.
# The tokenizer and metric function are passed so that this Trainer is
# configured like the one in the previous training cell.
trainer = Trainer(
    model=model,  # the instantiated 🤗 Transformers model to be trained
    args=training_args,  # training arguments, defined above
    train_dataset=train_dataset,  # training dataset
    eval_dataset=val_dataset,  # evaluation dataset
    tokenizer=tokenizer,  # tokenizer used to build the datasets
    compute_metrics=compute_metrics,  # report accuracy at each evaluation
)

# Train the model
trainer.train()

# Save the model weights/config together with the tokenizer files, so that
# './model' can be loaded on its own for inference.
trainer.model.save_pretrained('./model')
tokenizer.save_pretrained('./model')