# Transfer Learning: BERT

In [1]:
import torch
print(torch.__version__)


2.3.1+cpu


In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import torch
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from transformers import pipeline


  from .autonotebook import tqdm as notebook_tqdm





In [3]:

# Load the cleaned and processed dataset
file_path = 'cleaned_movie_data.csv'
cleaned_data = pd.read_csv(file_path)

In [4]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(cleaned_data['processed_text'], cleaned_data['label'], test_size=0.2, random_state=42)

In [5]:
# Initialize BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
# Tokenize the text data
train_encodings = tokenizer(X_train.tolist(), truncation=True, padding=True, max_length=128)
test_encodings = tokenizer(X_test.tolist(), truncation=True, padding=True, max_length=128)

class SentimentDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = SentimentDataset(train_encodings, y_train.tolist())
test_dataset = SentimentDataset(test_encodings, y_test.tolist())

In [7]:
import transformers
import accelerate

print(transformers.__version__)
print(accelerate.__version__)


4.42.4
0.32.1


In [8]:
# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=1,
    per_device_train_batch_size=100,
    per_device_eval_batch_size=640,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

In [9]:
# Train the model
trainer.train()

  3%|▎         | 10/320 [07:23<3:49:24, 44.40s/it]

{'loss': 0.7956, 'grad_norm': 6.093982219696045, 'learning_rate': 1.0000000000000002e-06, 'epoch': 0.03}


  6%|▋         | 20/320 [14:36<3:33:35, 42.72s/it]

{'loss': 0.7336, 'grad_norm': 2.5426721572875977, 'learning_rate': 2.0000000000000003e-06, 'epoch': 0.06}


  9%|▉         | 30/320 [23:50<4:41:50, 58.31s/it]

{'loss': 0.7062, 'grad_norm': 2.6184616088867188, 'learning_rate': 3e-06, 'epoch': 0.09}


 12%|█▎        | 40/320 [32:42<4:01:34, 51.77s/it]

{'loss': 0.6891, 'grad_norm': 1.6252832412719727, 'learning_rate': 4.000000000000001e-06, 'epoch': 0.12}


 16%|█▌        | 50/320 [39:33<3:15:02, 43.34s/it]

{'loss': 0.6797, 'grad_norm': 1.1084074974060059, 'learning_rate': 5e-06, 'epoch': 0.16}


 19%|█▉        | 60/320 [46:41<3:02:19, 42.07s/it]

{'loss': 0.6456, 'grad_norm': 3.460432291030884, 'learning_rate': 6e-06, 'epoch': 0.19}


 22%|██▏       | 70/320 [53:36<2:55:46, 42.19s/it]

{'loss': 0.6132, 'grad_norm': 2.380405902862549, 'learning_rate': 7.000000000000001e-06, 'epoch': 0.22}


 25%|██▌       | 80/320 [1:00:55<2:54:39, 43.67s/it]

{'loss': 0.623, 'grad_norm': 2.2648324966430664, 'learning_rate': 8.000000000000001e-06, 'epoch': 0.25}


 28%|██▊       | 90/320 [1:08:00<2:39:42, 41.66s/it]

{'loss': 0.5751, 'grad_norm': 2.9812510013580322, 'learning_rate': 9e-06, 'epoch': 0.28}


 31%|███▏      | 100/320 [1:15:32<2:45:19, 45.09s/it]

{'loss': 0.5236, 'grad_norm': 4.567686080932617, 'learning_rate': 1e-05, 'epoch': 0.31}


 34%|███▍      | 110/320 [1:22:54<2:35:10, 44.34s/it]

{'loss': 0.5258, 'grad_norm': 7.1903181076049805, 'learning_rate': 1.1000000000000001e-05, 'epoch': 0.34}


 38%|███▊      | 120/320 [1:30:40<2:41:30, 48.45s/it]

{'loss': 0.5048, 'grad_norm': 3.559196949005127, 'learning_rate': 1.2e-05, 'epoch': 0.38}


 41%|████      | 130/320 [1:38:04<2:20:26, 44.35s/it]

{'loss': 0.4941, 'grad_norm': 4.153372287750244, 'learning_rate': 1.3000000000000001e-05, 'epoch': 0.41}


 44%|████▍     | 140/320 [1:45:35<2:07:49, 42.61s/it]

{'loss': 0.5011, 'grad_norm': 4.560493469238281, 'learning_rate': 1.4000000000000001e-05, 'epoch': 0.44}


 47%|████▋     | 150/320 [1:53:23<2:02:40, 43.30s/it]

{'loss': 0.4324, 'grad_norm': 3.77274227142334, 'learning_rate': 1.5e-05, 'epoch': 0.47}


 50%|█████     | 160/320 [2:01:18<2:05:50, 47.19s/it]

{'loss': 0.474, 'grad_norm': 3.562633514404297, 'learning_rate': 1.6000000000000003e-05, 'epoch': 0.5}


 53%|█████▎    | 170/320 [2:08:17<1:44:22, 41.75s/it]

{'loss': 0.4393, 'grad_norm': 4.135149955749512, 'learning_rate': 1.7000000000000003e-05, 'epoch': 0.53}


 56%|█████▋    | 180/320 [2:16:32<1:42:37, 43.98s/it]

{'loss': 0.4458, 'grad_norm': 6.607757091522217, 'learning_rate': 1.8e-05, 'epoch': 0.56}


 59%|█████▉    | 190/320 [2:24:20<1:40:10, 46.24s/it]

{'loss': 0.4295, 'grad_norm': 8.887730598449707, 'learning_rate': 1.9e-05, 'epoch': 0.59}


 62%|██████▎   | 200/320 [2:32:14<1:31:21, 45.68s/it]

{'loss': 0.384, 'grad_norm': 3.340078115463257, 'learning_rate': 2e-05, 'epoch': 0.62}


 66%|██████▌   | 210/320 [2:39:33<1:19:53, 43.58s/it]

{'loss': 0.3736, 'grad_norm': 8.277885437011719, 'learning_rate': 2.1e-05, 'epoch': 0.66}


 69%|██████▉   | 220/320 [2:47:16<1:16:50, 46.11s/it]

{'loss': 0.3963, 'grad_norm': 2.452183246612549, 'learning_rate': 2.2000000000000003e-05, 'epoch': 0.69}


 72%|███████▏  | 230/320 [2:54:31<1:05:45, 43.84s/it]

{'loss': 0.3996, 'grad_norm': 3.4846110343933105, 'learning_rate': 2.3000000000000003e-05, 'epoch': 0.72}


 75%|███████▌  | 240/320 [3:02:04<1:03:31, 47.64s/it]

{'loss': 0.3799, 'grad_norm': 3.4714858531951904, 'learning_rate': 2.4e-05, 'epoch': 0.75}


 78%|███████▊  | 250/320 [3:09:18<51:48, 44.40s/it]  

{'loss': 0.4082, 'grad_norm': 4.5460968017578125, 'learning_rate': 2.5e-05, 'epoch': 0.78}


 81%|████████▏ | 260/320 [3:16:54<45:34, 45.57s/it]

{'loss': 0.3777, 'grad_norm': 8.31108570098877, 'learning_rate': 2.6000000000000002e-05, 'epoch': 0.81}


 84%|████████▍ | 270/320 [3:24:15<36:57, 44.34s/it]

{'loss': 0.3849, 'grad_norm': 3.0011048316955566, 'learning_rate': 2.7000000000000002e-05, 'epoch': 0.84}


 88%|████████▊ | 280/320 [3:31:46<28:48, 43.21s/it]

{'loss': 0.4142, 'grad_norm': 5.446024417877197, 'learning_rate': 2.8000000000000003e-05, 'epoch': 0.88}


 91%|█████████ | 290/320 [3:38:52<21:50, 43.67s/it]

{'loss': 0.4264, 'grad_norm': 2.425049304962158, 'learning_rate': 2.9e-05, 'epoch': 0.91}


 94%|█████████▍| 300/320 [3:46:20<14:43, 44.16s/it]

{'loss': 0.3947, 'grad_norm': 8.80986213684082, 'learning_rate': 3e-05, 'epoch': 0.94}


 97%|█████████▋| 310/320 [3:53:36<07:12, 43.26s/it]

{'loss': 0.4434, 'grad_norm': 7.078861236572266, 'learning_rate': 3.1e-05, 'epoch': 0.97}


100%|██████████| 320/320 [4:00:47<00:00, 43.42s/it]

{'loss': 0.4128, 'grad_norm': 8.834904670715332, 'learning_rate': 3.2000000000000005e-05, 'epoch': 1.0}


100%|██████████| 320/320 [4:00:51<00:00, 45.16s/it]


{'train_runtime': 14450.576, 'train_samples_per_second': 2.214, 'train_steps_per_second': 0.022, 'train_loss': 0.5008510336279869, 'epoch': 1.0}


TrainOutput(global_step=320, training_loss=0.5008510336279869, metrics={'train_runtime': 14450.576, 'train_samples_per_second': 2.214, 'train_steps_per_second': 0.022, 'total_flos': 2104888442880000.0, 'train_loss': 0.5008510336279869, 'epoch': 1.0})

In [10]:
# Evaluate the model
trainer.evaluate()

100%|██████████| 13/13 [15:06<00:00, 69.72s/it]


{'eval_loss': 0.37715137004852295,
 'eval_runtime': 984.2814,
 'eval_samples_per_second': 8.128,
 'eval_steps_per_second': 0.013,
 'epoch': 1.0}

In [14]:
max_length = 512

In [19]:
import pickle
# Save the model using pickle
with open('bert_model.pkl', 'wb') as f:
    pickle.dump(model, f)

In [None]:
# Make predictions and evaluate the model
bert_pipeline = pipeline('sentiment-analysis', model=model, tokenizer=tokenizer)
X_test_str = X_test.tolist()
predictions = bert_pipeline(X_test_str, truncation=True, padding=True, max_length=max_length)

# Convert predictions to labels
y_pred = [1 if pred['label'] == 'LABEL_1' else 0 for pred in predictions]

# Print the accuracy and classification report
print("BERT Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))