In [1]:
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import Trainer, TrainingArguments
import torch

# Both the tokenizer and the classifier are loaded from the same pretrained
# checkpoint, so the name is spelled once and shared.
checkpoint_name = 'bert-base-uncased'

tokenizer = BertTokenizer.from_pretrained(checkpoint_name)
# num_labels=2 attaches a two-class classification head on top of the encoder.
model = BertForSequenceClassification.from_pretrained(
    checkpoint_name,
    num_labels=2,
)

  from .autonotebook import tqdm as notebook_tqdm
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [2]:
import pandas as pd
import numpy as np

# Seeded generator for the row sampling below, so that Restart & Run All
# selects the same rows every time (the global np.random state is not touched).
sample_rng = np.random.default_rng(42)

# Load an approximate random sample of the training file.
# Every row after the first is kept independently with probability
# 100000/4000000, so the resulting row count only lands near the target rather
# than being exact (a recorded run produced 90,284 rows). nrows is an upper
# bound on the rows returned, not a guarantee.
train_data = pd.read_csv(
    r"C:\Users\ramee\Desktop\AI Lab\Project\Dataset\train.csv",
    header=None,
    nrows=100000,
    skiprows=lambda i: i > 0 and sample_rng.random() > 100000/4000000,
)

# Load up to 1,000 sampled rows from the test file using the same scheme.
# NOTE(review): a recorded run returned exactly 1,000 rows, i.e. the nrows cap
# was reached; when that happens reading stops early and the sample comes only
# from the leading part of the file -- confirm this is acceptable.
test_data = pd.read_csv(
    r"C:\Users\ramee\Desktop\AI Lab\Project\Dataset\test.csv",
    header=None,
    nrows=1000,
    skiprows=lambda i: i > 0 and sample_rng.random() > 1000/50000,
)

print(train_data.shape)
print(test_data.shape)


(90284, 3)
(1000, 3)


In [3]:
# Preview the first rows of the raw training sample (columns are positional: 0, 1, 2).
train_data.head()

Unnamed: 0,0,1,2
0,2,Stuning even for the non-gamer,This sound track was beautiful! It paints the ...
1,2,Alaska sourdough,REad most of the book while visiting my brothe...
2,1,Not professional quality,I first tried buying this tape from a Marketpl...
3,2,Rare find,"A good book for Audi owners, and fans in gener..."
4,2,Great little surge protector,Great value for a surge protector. I've got 'e...


In [4]:
def build_review_frame(raw):
    """Turn a raw three-column review frame into a (rating, review) frame.

    Parameters
    ----------
    raw : pandas.DataFrame
        Frame with positional columns 0 (label), 1 and 2 (the two text
        fields that are joined into the review).

    Returns
    -------
    pandas.DataFrame
        Columns ``rating`` (column 0 minus 1) and ``review`` (column 1 and
        column 2 joined by a single space), indexed like ``raw``.
    """
    # Missing text must be replaced before concatenating: NaN + " " + text is
    # NaN, which would otherwise discard the whole review for that row.
    first_text = raw[1].fillna("").astype(str)
    second_text = raw[2].fillna("").astype(str)
    return pd.DataFrame({
        "rating": raw[0] - 1,
        # strip() removes the stray separator left when one side was empty.
        "review": (first_text + " " + second_text).str.strip(),
    })


df = build_review_frame(train_data)
test_df = build_review_frame(test_data)

In [5]:
# Preview the first rows of the prepared training frame (rating, review).
df.head()

Unnamed: 0,rating,review
0,1,Stuning even for the non-gamer This sound trac...
1,1,Alaska sourdough REad most of the book while v...
2,0,Not professional quality I first tried buying ...
3,1,"Rare find A good book for Audi owners, and fan..."
4,1,Great little surge protector Great value for a...


In [6]:
# Tokenize the reviews
def tokenize_function(review):
    """Encode one review with the notebook-level ``tokenizer``.

    The text is truncated and padded to a fixed length of 512, and the
    tokenizer's return value is passed back unchanged.
    """
    encode_options = dict(padding='max_length', truncation=True, max_length=512)
    return tokenizer(review, **encode_options)

In [7]:
def _tokenize_as_text(value):
    """Coerce a cell value to str and tokenize it."""
    return tokenize_function(str(value))


# Add a 'tokenized' column to both the training and the test frame.
for review_frame in (df, test_df):
    review_frame['tokenized'] = review_frame['review'].apply(_tokenize_as_text)

In [8]:
from torch.utils.data import Dataset, DataLoader

In [20]:
class ProductReviewDataset(Dataset):
    """Map-style dataset over a frame with 'tokenized' and 'rating' columns.

    Each item is a dict of tensors built from the row's tokenized mapping,
    plus a 'labels' tensor holding the row's rating.

    Rows whose tokenized mapping is empty get a placeholder made of all-zero
    tensors with the same keys and lengths as the first non-empty row, so
    every item has the same structure and items can be stacked into a batch.
    """

    def __init__(self, dataframe):
        """Keep references to the 'tokenized' and 'rating' columns of ``dataframe``."""
        self.encodings = dataframe['tokenized']
        self.labels = dataframe['rating']
        # Field name -> sequence length, used to build placeholder items.
        self._field_lengths = self._infer_field_lengths()

    def _infer_field_lengths(self):
        """Return {field: length} taken from the first non-empty encoding."""
        for encoding in self.encodings:
            if encoding:
                return {key: len(val) for key, val in encoding.items()}
        # No usable row at all: fall back to the two zero-length fields.
        return {'input_ids': 0, 'attention_mask': 0}

    def __len__(self):
        """Number of rows in the dataset."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the tensors and label for the row at position ``idx``."""
        encoding = self.encodings.iloc[idx]
        if encoding:
            item = {key: torch.tensor(val) for key, val in encoding.items()}
        else:
            # Placeholder with the same keys and shapes as a real item; a
            # zero-length tensor here would make batch collation fail.
            item = {
                key: torch.zeros(length, dtype=torch.long)
                for key, length in self._field_lengths.items()
            }
        # Explicit integer dtype for the class index.
        item['labels'] = torch.tensor(self.labels.iloc[idx], dtype=torch.long)
        return item

# Wrap the training and test frames in ProductReviewDataset, in that order.
train_dataset, test_dataset = (
    ProductReviewDataset(review_frame) for review_frame in (df, test_df)
)

In [21]:
# Batch loaders over the two datasets; only the training loader shuffles.
loader_batch_size = 16

train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=loader_batch_size,
    shuffle=True,
)
test_loader = DataLoader(
    dataset=test_dataset,
    batch_size=loader_batch_size,
    shuffle=False,
)

In [22]:
# Training settings gathered in one mapping and expanded into TrainingArguments.
training_config = {
    'output_dir': './results',
    'evaluation_strategy': 'epoch',
    'learning_rate': 2e-5,
    'per_device_train_batch_size': 16,
    'per_device_eval_batch_size': 16,
    'num_train_epochs': 3,
    'weight_decay': 0.01,
}

training_args = TrainingArguments(**training_config)

In [23]:
def compute_metrics(eval_pred):
    """Compute classification accuracy for one evaluation pass.

    Parameters
    ----------
    eval_pred
        Object exposing ``predictions`` (model logits) and ``label_ids``
        (true class indices), as passed by the Trainer to this callback.

    Returns
    -------
    dict
        ``{"accuracy": fraction of rows whose arg-max logit equals the label}``.
    """
    logits, labels = eval_pred.predictions, eval_pred.label_ids
    # Some models return several outputs as a tuple; the logits come first.
    if isinstance(logits, tuple):
        logits = logits[0]
    predicted = np.argmax(logits, axis=-1)
    return {"accuracy": float((predicted == labels).mean())}


# Fine-tune `model` on the training dataset and evaluate on the test dataset.
# compute_metrics adds accuracy to the per-evaluation report, which would
# otherwise contain only the loss.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

trainer.train()

Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 