In [1]:
#install libraries
!pip install transformers torch pandas numpy scikit-learn matplotlib seaborn tqdm



In [2]:
#import libraries
import pandas as pd
import numpy as np
import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import time

from transformers import BertTokenizer, BertForSequenceClassification, get_linear_schedule_with_warmup
#pytorch
from torch.utils.data import DataLoader, TensorDataset, RandomSampler, SequentialSampler
from torch.optim import AdamW

In [3]:
#check gpu
if torch.cuda.is_available():
    device = torch.device("cuda")
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print('Device name:', torch.cuda.get_device_name(0))

else:
  print("No gpu detected")
  device = torch.device("cpu")


There are 1 GPU(s) available.
Device name: Tesla T4


In [4]:
#install kagglehub
!pip install kagglehub



In [5]:
import kagglehub
import bz2
import os

#download dataset
path = kagglehub.dataset_download("bittlingmayer/amazonreviews")
print("Dataset downloaded to:", path)

Using Colab cache for faster access to the 'amazonreviews' dataset.
Dataset downloaded to: /kaggle/input/amazonreviews


In [6]:


def get_amazon_data(file_path, num_rows):
    texts = []
    labels = []

    print(f"Loading data from {os.path.basename(file_path)}...")

    with bz2.BZ2File(file_path, "r") as f:
        for i, line in tqdm(enumerate(f)):

            if i >= num_rows:
                break


            line = line.decode("utf-8")

            if "__label__2" in line:
                labels.append(1)  # Positive
            else:
                labels.append(0)  # Negative

            text_part = line.split("__label__")[1][1:].strip()
            texts.append(text_part)

    return pd.DataFrame({'text': texts, 'label': labels})

In [7]:

train_file = os.path.join(path, "train.ft.txt.bz2")
test_file = os.path.join(path, "test.ft.txt.bz2")

df_train = get_amazon_data(train_file, num_rows=50000)
df_test = get_amazon_data(test_file, num_rows=10000)

print("\n✅ Data Loading Completed!")

Loading data from train.ft.txt.bz2...


50000it [00:01, 27651.71it/s]


Loading data from test.ft.txt.bz2...


10000it [00:00, 25777.87it/s]


✅ Data Loading Completed!





In [8]:
print("--- Training Data Head ---")
display(df_train.head())

print("\n--- Class Distribution (Train) ---")
print(df_train['label'].value_counts())

print("\n--- Sample Review Text ---")
sample_text = df_train['text'].iloc[0]
sample_label = df_train['label'].iloc[0]
print(f"Label: {sample_label} ({'Positive' if sample_label==1 else 'Negative'})")
print(f"Review: {sample_text}")

--- Training Data Head ---


Unnamed: 0,text,label
0,Stuning even for the non-gamer: This sound tra...,1
1,The best soundtrack ever to anything.: I'm rea...,1
2,Amazing!: This soundtrack is my favorite music...,1
3,Excellent Soundtrack: I truly like this soundt...,1
4,"Remember, Pull Your Jaw Off The Floor After He...",1



--- Class Distribution (Train) ---
label
1    25506
0    24494
Name: count, dtype: int64

--- Sample Review Text ---
Label: 1 (Positive)
Review: Stuning even for the non-gamer: This sound track was beautiful! It paints the senery in your mind so well I would recomend it even to people who hate vid. game music! I have played the game Chrono Cross but out of all of the games I have ever played it has the best music! It backs away from crude keyboarding and takes a fresher step with grate guitars and soulful orchestras. It would impress anyone who cares to listen! ^_^


In [9]:
#load bert tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

def preprocessing_for_bert(data):
    """
    data: sentences
    """
    input_ids = []
    attention_masks = []

    print(f"Tokenizing {len(data)} sentences...")

    for sent in tqdm(data):
        encoded_sent = tokenizer.encode_plus(
            text=sent,
            add_special_tokens=True,
            max_length=128,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True
        )


        input_ids.append(encoded_sent.get('input_ids'))
        attention_masks.append(encoded_sent.get('attention_mask'))


    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)

    return input_ids, attention_masks

#process train data
train_inputs, train_masks = preprocessing_for_bert(df_train['text'].values)
train_labels = torch.tensor(df_train['label'].values)

#process test data
test_inputs, test_masks = preprocessing_for_bert(df_test['text'].values)
test_labels = torch.tensor(df_test['label'].values)

print("\n✅ Preprocessing Completed!")
print("original text: ",df_train['text'].values[0])
print("tokenized text: ",train_inputs[0])


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Tokenizing 50000 sentences...


100%|██████████| 50000/50000 [01:34<00:00, 531.32it/s]


Tokenizing 10000 sentences...


100%|██████████| 10000/10000 [00:16<00:00, 615.89it/s]



✅ Preprocessing Completed!
original text:  Stuning even for the non-gamer: This sound track was beautiful! It paints the senery in your mind so well I would recomend it even to people who hate vid. game music! I have played the game Chrono Cross but out of all of the games I have ever played it has the best music! It backs away from crude keyboarding and takes a fresher step with grate guitars and soulful orchestras. It would impress anyone who cares to listen! ^_^
tokenized text:  tensor([  101, 24646,  5582,  2130,  2005,  1996,  2512,  1011, 27911,  1024,
         2023,  2614,  2650,  2001,  3376,   999,  2009, 23262,  1996, 12411,
         7301,  1999,  2115,  2568,  2061,  2092,  1045,  2052, 28667,  8462,
         4859,  2009,  2130,  2000,  2111,  2040,  5223,  6819,  2094,  1012,
         2208,  2189,   999,  1045,  2031,  2209,  1996,  2208, 10381,  4948,
         2080,  2892,  2021,  2041,  1997,  2035,  1997,  1996,  2399,  1045,
         2031,  2412,  2209,  2009,  2038,  

In [10]:
batch_size = 32

train_data = TensorDataset(train_inputs, train_masks, train_labels)

train_sampler = RandomSampler(train_data)

train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

test_data = TensorDataset(test_inputs, test_masks, test_labels)
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)

print("\n✅ DataLoaders Created Successfully!")
print(f"Number of batches in Train Loader: {len(train_dataloader)}")
print(f"Number of batches in Test Loader: {len(test_dataloader)}")


✅ DataLoaders Created Successfully!
Number of batches in Train Loader: 1563
Number of batches in Test Loader: 313


In [11]:
print("Loading BERT Model...")
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels = 2,
    output_attentions = False,
    output_hidden_states = False,
)

model.to(device)

print("\n✅ Model loaded and sent to GPU!")

Loading BERT Model...


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



✅ Model loaded and sent to GPU!


In [12]:
optimizer = AdamW(model.parameters(),
                  lr = 2e-5,
                  eps = 1e-8
                )

epochs = 2

total_steps = len(train_dataloader) * epochs

scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps = 0,
                                            num_training_steps = total_steps)

print("✅ Optimizer and Scheduler setup completed!")
print(f"Total Training Steps: {total_steps}")

✅ Optimizer and Scheduler setup completed!
Total Training Steps: 3126


In [13]:
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

print("🚀 Starting Training...")

training_stats = []
total_t0 = time.time()

for epoch_i in range(0, epochs):


    print(f'\n======== Epoch {epoch_i + 1} / {epochs} ========')
    print('Training...')

    t0 = time.time()
    total_train_loss = 0
    model.train()

    for step, batch in enumerate(train_dataloader):

        if step % 40 == 0 and not step == 0:
            elapsed = time.time() - t0
            print(f'  Batch {step}  of  {len(train_dataloader)}.    Elapsed: {elapsed:.2f}s.')

        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        model.zero_grad()

        result = model(b_input_ids,
                       token_type_ids=None,
                       attention_mask=b_input_mask,
                       labels=b_labels,
                       return_dict=True)

        loss = result.loss
        logits = result.logits

        total_train_loss += loss.item()

        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()

    avg_train_loss = total_train_loss / len(train_dataloader)
    training_time = time.time() - t0

    print(f"\n  Average training loss: {avg_train_loss:.2f}")
    print(f"  Training epoch took: {training_time:.2f}s")


    print("\nRunning Validation...")
    t0 = time.time()

    model.eval()

    total_eval_accuracy = 0
    total_eval_loss = 0

    for batch in test_dataloader:

        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        with torch.no_grad():
            result = model(b_input_ids,
                           token_type_ids=None,
                           attention_mask=b_input_mask,
                           labels=b_labels,
                           return_dict=True)

        loss = result.loss
        logits = result.logits

        total_eval_loss += loss.item()

        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        total_eval_accuracy += flat_accuracy(logits, label_ids)

    avg_val_accuracy = total_eval_accuracy / len(test_dataloader)
    print(f"  Accuracy: {avg_val_accuracy:.2f}")

    training_stats.append(
        {
            'epoch': epoch_i + 1,
            'Training Loss': avg_train_loss,
            'Valid. Loss': total_eval_loss / len(test_dataloader),
            'Valid. Accur.': avg_val_accuracy,
            'Training Time': training_time
        }
    )

print("\n✅ Training complete!")
print(f"Total training took {time.time()-total_t0:.2f}s")

🚀 Starting Training...

Training...
  Batch 40  of  1563.    Elapsed: 24.52s.
  Batch 80  of  1563.    Elapsed: 48.79s.
  Batch 120  of  1563.    Elapsed: 74.16s.
  Batch 160  of  1563.    Elapsed: 100.79s.
  Batch 200  of  1563.    Elapsed: 126.56s.
  Batch 240  of  1563.    Elapsed: 152.25s.
  Batch 280  of  1563.    Elapsed: 178.43s.
  Batch 320  of  1563.    Elapsed: 204.45s.
  Batch 360  of  1563.    Elapsed: 230.08s.
  Batch 400  of  1563.    Elapsed: 256.02s.
  Batch 440  of  1563.    Elapsed: 282.03s.
  Batch 480  of  1563.    Elapsed: 307.87s.
  Batch 520  of  1563.    Elapsed: 333.81s.
  Batch 560  of  1563.    Elapsed: 359.78s.
  Batch 600  of  1563.    Elapsed: 385.64s.
  Batch 640  of  1563.    Elapsed: 411.52s.
  Batch 680  of  1563.    Elapsed: 437.41s.
  Batch 720  of  1563.    Elapsed: 463.33s.
  Batch 760  of  1563.    Elapsed: 489.25s.
  Batch 800  of  1563.    Elapsed: 515.23s.
  Batch 840  of  1563.    Elapsed: 541.14s.
  Batch 880  of  1563.    Elapsed: 567.06s.
 

In [18]:
import torch.nn.functional as F

def predict_sentiment(text):
    model.eval()

    encoded_review = tokenizer.encode_plus(
        text,
        max_length=128,
        add_special_tokens=True,
        return_token_type_ids=False,
        pad_to_max_length=True,
        return_attention_mask=True,
        return_tensors='pt',
        truncation=True
    )

    input_ids = encoded_review['input_ids'].to(device)
    attention_mask = encoded_review['attention_mask'].to(device)

    with torch.no_grad():
        output = model(input_ids, attention_mask=attention_mask)

    probs = F.softmax(output.logits, dim=1)

    _, prediction = torch.max(probs, dim=1)

    sentiment = "Positive 😃" if prediction.item() == 1 else "Negative 😞"
    confidence = probs.max().item() * 100

    print(f"Review: {text}")
    print(f"Sentiment: {sentiment} (Confidence: {confidence:.2f}%)")
    print("-" * 50)


predict_sentiment("This product is amazing! I really loved it. Highly recommended.")

predict_sentiment("Worst experience ever. The quality is very bad and useless.")

predict_sentiment(" quite bad.")

Keyword arguments {'pad_to_max_length': True} not recognized.
Keyword arguments {'pad_to_max_length': True} not recognized.
Keyword arguments {'pad_to_max_length': True} not recognized.


Review: This product is amazing! I really loved it. Highly recommended.
Sentiment: Positive 😃 (Confidence: 99.76%)
--------------------------------------------------
Review: Worst experience ever. The quality is very bad and useless.
Sentiment: Negative 😞 (Confidence: 99.92%)
--------------------------------------------------
Review:  quite bad.
Sentiment: Negative 😞 (Confidence: 99.32%)
--------------------------------------------------


In [19]:
import os

output_dir = './model_save/'

if not os.path.exists(output_dir):
    os.makedirs(output_dir)

print("Saving model to %s" % output_dir)

model_to_save = model.module if hasattr(model, 'module') else model
model_to_save.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

print("✅ Model saved successfully!")

Saving model to ./model_save/
✅ Model saved successfully!
