In [None]:
!pip install transformers[torch] accelerate -U torch --upgrade datasets


Collecting datasets
  Downloading datasets-2.19.1-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m11.4 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m20.4 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m26.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m22.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: xxhash, dill, multiprocess, datasets
Successfully installed datase

In [None]:
# Verify the installation: import each freshly installed package and print its
# version so the environment is recorded next to the results.
# NOTE: the `torch` name imported here is the one ReviewDataset later relies on
# (`torch.tensor`, `torch.long`); the main import cell only pulls names from
# `torch.utils.data`, so this cell must run before the dataset is used.
import transformers
import torch
import accelerate

print(f"Transformers version: {transformers.__version__}")
print(f"Torch version: {torch.__version__}")
print(f"Accelerate version: {accelerate.__version__}")


Transformers version: 4.41.0
Torch version: 2.3.0+cu121
Accelerate version: 0.30.1


In [None]:
import json
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from google.colab import drive
from datasets import load_metric


In [None]:
# Make Google Drive visible under /content/drive (Colab only).
drive.mount('/content/drive')

# Define the path to your file
file_path = '/content/drive/My Drive/review_corpus_en.json'

# Load the data from the specified path. The file is read as JSON Lines: one
# JSON object per line. The encoding is spelled out so decoding does not depend
# on the runtime's locale, and whitespace-only lines (e.g. a trailing newline
# at end of file) are skipped instead of crashing json.loads.
with open(file_path, encoding='utf-8') as f:
    reviews = [json.loads(line) for line in f if line.strip()]


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
class ReviewDataset(Dataset):
    """Map-style dataset that tokenizes one review record per lookup.

    Every record is indexed with the keys 'title', 'body' and 'rating'.
    Title and body are joined with a single space to form the model input;
    a rating equal to 'pos' becomes label 1, anything else label 0.
    """

    def __init__(self, reviews, tokenizer, max_len):
        """Keep references to the records, the tokenizer and the pad/truncate length."""
        self.reviews, self.tokenizer, self.max_len = reviews, tokenizer, max_len

    def __len__(self):
        """Number of review records."""
        return len(self.reviews)

    def __getitem__(self, idx):
        """Return the raw text, flattened token tensors and label for record `idx`."""
        record = self.reviews[idx]
        headline = record['title']
        content = record['body']
        text = f"{headline} {content}"

        if record['rating'] == 'pos':
            target = 1
        else:
            target = 0

        # Pad/truncate every example to exactly `max_len` tokens and ask for
        # PyTorch tensors; token type ids are not requested.
        tokenizer_options = {
            'add_special_tokens': True,
            'max_length': self.max_len,
            'return_token_type_ids': False,
            'padding': 'max_length',
            'truncation': True,
            'return_attention_mask': True,
            'return_tensors': 'pt',
        }
        encoded = self.tokenizer.encode_plus(text, **tokenizer_options)

        # The tokenizer output is collapsed to 1-D per example.
        token_ids = encoded['input_ids'].reshape(-1)
        mask = encoded['attention_mask'].reshape(-1)

        return {
            'text': text,
            'input_ids': token_ids,
            'attention_mask': mask,
            'labels': torch.tensor(target, dtype=torch.long),
        }


In [None]:
# Checkpoint shared by the tokenizer and the classifier, and the fixed
# sequence length every example is padded/truncated to.
MODEL_NAME = 'bert-base-uncased'
MAX_LEN = 128

# Hold out 20% of the reviews for evaluation; the seed keeps the split repeatable.
train_reviews, test_reviews = train_test_split(reviews, test_size=0.2, random_state=42)

tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)

# Wrap both splits with the same tokenizer and sequence length.
train_dataset, test_dataset = (
    ReviewDataset(split, tokenizer, max_len=MAX_LEN)
    for split in (train_reviews, test_reviews)
)

# Two output labels: ReviewDataset emits 1 for a 'pos' rating and 0 otherwise.
model = BertForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
def compute_metrics(p):
    """Compute classification accuracy for one evaluation pass.

    Args:
        p: A ``(predictions, labels)`` pair. ``predictions`` must expose
            ``.argmax(-1)`` (class scores along the last axis) and ``labels``
            holds the reference class ids, one per example.

    Returns:
        dict: ``{"accuracy": <float in [0, 1]>}``.
    """
    predictions, labels = p
    preds = predictions.argmax(-1)
    # Accuracy is computed directly rather than via datasets.load_metric: that
    # call re-loaded the metric script (with a deprecation warning and a
    # possible download) on every evaluation, and the API is deprecated.
    return {"accuracy": float((preds == labels).mean())}


In [None]:
# Fine-tuning configuration.
training_args = TrainingArguments(
    output_dir='./results',           # checkpoints are written here
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    # Warm up over the first 10% of training. The previous fixed
    # `warmup_steps=500` was longer than the whole run (the recorded
    # TrainOutput reports global_step=450), so warm-up never completed and the
    # learning rate never reached its configured peak.
    warmup_ratio=0.1,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    # Evaluate after each epoch. `eval_strategy` is the current name of the
    # deprecated `evaluation_strategy` argument (requires transformers >= 4.41,
    # which matches the version printed by the version-check cell).
    eval_strategy="epoch",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,  # held-out split, scored at every evaluation
    compute_metrics=compute_metrics
)

# Last expression of the cell, so the returned TrainOutput is displayed.
trainer.train()




Epoch,Training Loss,Validation Loss,Accuracy
1,0.4329,0.370848,0.843333
2,0.3239,0.356114,0.853333
3,0.307,0.370402,0.865


  metric = load_metric("accuracy")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Downloading builder script:   0%|          | 0.00/1.65k [00:00<?, ?B/s]

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


TrainOutput(global_step=450, training_loss=0.3751769420835707, metrics={'train_runtime': 222.1202, 'train_samples_per_second': 32.415, 'train_steps_per_second': 2.026, 'total_flos': 473599899648000.0, 'train_loss': 0.3751769420835707, 'epoch': 3.0})

In [None]:
# Score the fine-tuned model on the evaluation split configured on the Trainer.
eval_result = trainer.evaluate()
print(f"Evaluation results: {eval_result}")

# Persist the model and the tokenizer to their own directories on local disk.
model_dir = './sentiment-model'
tokenizer_dir = './sentiment-tokenizer'
model.save_pretrained(model_dir)
# Last expression of the cell: the tuple of written tokenizer files is displayed.
tokenizer.save_pretrained(tokenizer_dir)


You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Evaluation results: {'eval_loss': 0.37040162086486816, 'eval_accuracy': 0.865, 'eval_runtime': 7.0795, 'eval_samples_per_second': 84.752, 'eval_steps_per_second': 1.413, 'epoch': 3.0}


('./sentiment-tokenizer/tokenizer_config.json',
 './sentiment-tokenizer/special_tokens_map.json',
 './sentiment-tokenizer/vocab.txt',
 './sentiment-tokenizer/added_tokens.json')