# Setup and imports

In [None]:
%%capture
!pip install transformers

In [None]:
# Mount Google Drive (Colab only) so the CSV files referenced below under
# /content/drive/MyDrive are readable from this runtime.
from google.colab import drive
drive.mount('/content/drive')

# Helper Function

In [None]:
def tokenize_and_convert_to_tensors(text, labels, max_len=512):
    """Tokenize sentences and pack them, together with their labels, into tensors.

    Relies on the notebook-level names ``tokenizer``, ``torch`` and ``tqdm``,
    which must be defined before this function is called.

    Args:
        text: Sequence of raw strings to tokenize.
        labels: Sequence of labels, one per entry of ``text``.
        max_len: Length every sequence is padded or truncated to.

    Returns:
        Tuple ``(input_ids, attention_masks, labels)``. The first two are the
        per-sentence encodings concatenated along dim 0; the last is
        ``torch.tensor(labels)``.

    Raises:
        ValueError: If ``text`` and ``labels`` differ in length.
    """
    # Fail early: a silent mismatch would only surface later as a size error
    # when the tensors are combined into a dataset.
    if len(text) != len(labels):
        raise ValueError(
            f"text and labels must have the same length, got {len(text)} and {len(labels)}"
        )

    input_ids = []
    attention_masks = []

    for sentence in tqdm(text, total=len(text), desc="Tokenizing"):
        # Calling the tokenizer directly is the current API (encode_plus is the
        # legacy spelling) and matches how OGAmazonDataset tokenizes.
        encoded = tokenizer(
            sentence,
            add_special_tokens=True,
            max_length=max_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        input_ids.append(encoded['input_ids'])
        attention_masks.append(encoded['attention_mask'])

    # Concatenate the per-sentence tensors along dim 0.
    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)
    labels = torch.tensor(labels)

    return input_ids, attention_masks, labels

# DistilBERT with fine-tuning for sentiment classification

In [None]:
import pandas as pd
import torch
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, AdamW
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from tqdm import tqdm

# Read the preprocessed IMDB reviews and binarize the sentiment column:
# 'pos' becomes 1, every other value becomes 0.
PATH_OF_TRAINING_DATA = '/content/drive/MyDrive/CS505_final_project/data/IMDB_fine_tune_data.csv'
df = pd.read_csv(PATH_OF_TRAINING_DATA)
df['sentiment'] = (df['sentiment'] == 'pos').astype(int)

# Hold out 10% of the rows for validation; the fixed seed keeps the split repeatable.
train_df, val_df = train_test_split(df, test_size=0.1, random_state=42)

# Plain Python lists of review texts and integer labels for each split.
X_train = train_df['text'].tolist()
y_train = train_df['sentiment'].tolist()
X_val = val_df['text'].tolist()
y_val = val_df['sentiment'].tolist()

# Tokenizer for the same checkpoint that is fine-tuned below.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Encode each split into (input_ids, attention_masks, labels) tensors.
train_tensors = tokenize_and_convert_to_tensors(X_train, y_train)
val_tensors = tokenize_and_convert_to_tensors(X_val, y_val)
train_input_ids, train_attention_masks, train_labels = train_tensors
val_input_ids, val_attention_masks, val_labels = val_tensors

# Wrap the tensors in datasets.
train_dataset = torch.utils.data.TensorDataset(
    train_input_ids, train_attention_masks, train_labels
)
val_dataset = torch.utils.data.TensorDataset(
    val_input_ids, val_attention_masks, val_labels
)

# Batches of 8; only the training data is shuffled.
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=8)
val_dataloader = DataLoader(val_dataset, shuffle=False, batch_size=8)

# Load pre-trained DistilBERT with a 2-class sequence-classification head.
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)

# Optimizer: use PyTorch's AdamW. The AdamW class exported by transformers is
# deprecated in favor of torch.optim.AdamW. eps and weight_decay are passed
# explicitly to match the defaults of the transformers implementation
# (PyTorch defaults differ: eps=1e-8, weight_decay=1e-2).
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)
epochs = 3
# The scheduler is stepped once per training batch, so this is its full length.
total_steps = len(train_dataloader) * epochs

# One-cycle learning-rate schedule over the whole run, peaking at 5e-5.
scheduler = torch.optim.lr_scheduler.OneCycleLR(optimizer, max_lr=5e-5, total_steps=total_steps)

# Fine-tune on the GPU when one is available, otherwise on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

for epoch in range(epochs):
    # ---- training pass ----
    model.train()
    total_loss = 0.0
    progress = tqdm(train_dataloader, desc=f"Epoch {epoch + 1}/{epochs}")

    for input_ids, attention_mask, targets in progress:
        optimizer.zero_grad()
        # Passing labels makes the model return the classification loss.
        outputs = model(
            input_ids=input_ids.to(device),
            attention_mask=attention_mask.to(device),
            labels=targets.to(device),
        )
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
        optimizer.step()
        scheduler.step()  # the schedule advances once per batch

    avg_train_loss = total_loss / len(train_dataloader)
    print(f"Average training loss: {avg_train_loss}")

    # ---- validation pass ----
    model.eval()
    val_preds, val_true = [], []

    with torch.no_grad():
        for input_ids, attention_mask, targets in tqdm(val_dataloader, desc="Validation"):
            targets = targets.to(device)
            outputs = model(
                input_ids=input_ids.to(device),
                attention_mask=attention_mask.to(device),
                labels=targets,
            )
            # Predicted class = index of the largest logit.
            val_preds.extend(torch.argmax(outputs.logits, dim=1).cpu().numpy())
            val_true.extend(targets.cpu().numpy())

    val_accuracy = accuracy_score(val_true, val_preds)
    print(f"Validation Accuracy: {val_accuracy}")

# Save the model after the final epoch to a local directory.
model.save_pretrained('fine_tuned_distilbert_imdb')


Tokenizing: 100%|██████████| 22500/22500 [02:23<00:00, 156.35it/s]
Tokenizing: 100%|██████████| 2500/2500 [00:16<00:00, 149.41it/s]


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'classifier.bias', 'classifier.weight', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1/3: 100%|██████████| 2813/2813 [17:48<00:00,  2.63it/s]


Average training loss: 0.3185298505026968


Validation: 100%|██████████| 313/313 [00:40<00:00,  7.68it/s]


Validation Accuracy: 0.9028


Epoch 2/3: 100%|██████████| 2813/2813 [17:48<00:00,  2.63it/s]


Average training loss: 0.17377062002031368


Validation: 100%|██████████| 313/313 [00:41<00:00,  7.63it/s]


Validation Accuracy: 0.9244


Epoch 3/3: 100%|██████████| 2813/2813 [17:48<00:00,  2.63it/s]


Average training loss: 0.04199532235924494


Validation: 100%|██████████| 313/313 [00:40<00:00,  7.64it/s]


Validation Accuracy: 0.9252


In [None]:
import pandas as pd
import torch
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm
from sklearn.metrics import accuracy_score

In [None]:
# Location of the Amazon review CSV that is evaluated below as the test set.
# NOTE(review): pd.read_csv treats the first row as a header by default. The
# next cell selects columns by position and renames them, which suggests the
# file may have no header row. If so, the first review is silently dropped;
# confirm and pass header=None if needed.
PATH_OF_AMAZON_DATA = '/content/drive/MyDrive/CS505_final_project/data/Amazon_review_testing_data.csv'  # Replace with the actual path
amazon_df = pd.read_csv(PATH_OF_AMAZON_DATA)

In [None]:
# Select the first and third columns (label and review text).
# .copy() makes the result an independent frame, so the column assignment
# below does not write into a slice of the loaded frame
# (avoids pandas' SettingWithCopyWarning).
amazon_df = amazon_df.iloc[:, [0, 2]].copy()

# Rename the columns
amazon_df.columns = ['sentiment', 'review']

# Recode the labels from {1, 2} to {0, 1} so they use the same 0/1 encoding
# as the IMDB training labels.
amazon_df['sentiment'] = amazon_df['sentiment'].replace({1: 0, 2: 1})

# Display the resulting DataFrame
print(amazon_df.head())

   sentiment                                             review
0          1  Despite the fact that I have only played a sma...
1          0  I bought this charger in Jul 2003 and it worke...
2          1  Check out Maha Energy's website. Their Powerex...
3          1  Reviewed quite a bit of the combo players and ...
4          0  I also began having the incorrect disc problem...


In [None]:
# Review texts and 0/1 labels as plain Python lists.
X_test = amazon_df['review'].tolist()
y_test = amazon_df['sentiment'].tolist()

# Tokenizer
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Encode the reviews into (input_ids, attention_masks, labels) tensors.
test_input_ids, test_attention_masks, test_labels = tokenize_and_convert_to_tensors(X_test, y_test)

# Batches of 8, in the original row order.
test_dataset = TensorDataset(test_input_ids, test_attention_masks, test_labels)
test_dataloader = DataLoader(test_dataset, shuffle=False, batch_size=8)

# This cell evaluates the `model` and `device` objects left in the kernel by the
# fine-tuning cell. To evaluate from the saved checkpoint instead, uncomment:
# model = DistilBertForSequenceClassification.from_pretrained('fine_tuned_distilbert_imdb')  # Use the path where you saved the fine-tuned model
# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# model.to(device)

# Prediction loop
model.eval()
test_preds, test_true = [], []

with torch.no_grad():
    for input_ids, attention_mask, targets in tqdm(test_dataloader, desc="Testing"):
        targets = targets.to(device)
        outputs = model(
            input_ids=input_ids.to(device),
            attention_mask=attention_mask.to(device),
            labels=targets,
        )
        # Predicted class = index of the largest logit.
        test_preds.extend(torch.argmax(outputs.logits, dim=1).cpu().numpy())
        test_true.extend(targets.cpu().numpy())

# Fraction of reviews whose predicted class equals the label.
test_accuracy = accuracy_score(test_true, test_preds)
print(f"Test Accuracy: {test_accuracy}")

Tokenizing: 100%|██████████| 399999/399999 [15:11<00:00, 438.68it/s]
Testing: 100%|██████████| 50000/50000 [1:47:43<00:00,  7.74it/s]


Test Accuracy: 0.8827947069867674


# DistilBERT without fine-tuning for sentiment classification

In [None]:
import torch
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from sklearn.metrics import accuracy_score
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
import pandas as pd

# Load your dataset
PATH_OF_AMAZON_DATA = '/content/drive/MyDrive/CS505_final_project/data/Amazon_review_testing_data.csv'  # Replace with the actual path
og_amazon_df = pd.read_csv(PATH_OF_AMAZON_DATA)

# Select the first and third columns (label and review text).
# .copy() makes the result an independent frame, so the column assignment
# below does not write into a slice of the loaded frame
# (avoids pandas' SettingWithCopyWarning).
og_amazon_df = og_amazon_df.iloc[:, [0, 2]].copy()

# Rename the columns
og_amazon_df.columns = ['sentiment', 'review']

# Recode the labels from {1, 2} to {0, 1}.
og_amazon_df['sentiment'] = og_amazon_df['sentiment'].replace({1: 0, 2: 1})

# Split the dataset into training and testing sets (80/20, fixed seed).
# Only the 20% test part is evaluated below, so this is a different sample
# from the full file scored by the fine-tuned model earlier in the notebook.
origin_train_df, origin_test_df = train_test_split(og_amazon_df, test_size=0.2, random_state=42)

class OGAmazonDataset(Dataset):
    """Dataset that tokenizes one review per item, at access time.

    Each item is a dict with keys ``'input_ids'``, ``'attention_mask'`` and
    ``'label'``.
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        """Store the raw inputs; nothing is tokenized until an item is requested.

        Args:
            texts: Indexable collection of review texts.
            labels: Indexable collection of labels, aligned with ``texts``.
            tokenizer: Callable applied to each text in ``__getitem__``.
            max_len: Length each encoding is padded or truncated to.
        """
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of texts."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize and return the example at position ``idx``."""
        text = str(self.texts[idx])
        label = int(self.labels[idx])
        encoding = self.tokenizer(text, truncation=True, padding='max_length', max_length=self.max_len, return_tensors='pt')
        # return_tensors='pt' adds a leading batch dimension of size 1. Drop only
        # that dimension: a bare squeeze() would also collapse the sequence
        # dimension when max_len == 1.
        return {'input_ids': encoding['input_ids'].squeeze(0),
                'attention_mask': encoding['attention_mask'].squeeze(0),
                'label': torch.tensor(label, dtype=torch.long)}

# Load the pre-trained DistilBERT model and tokenizer
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
origin_amazon_model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)  # 2 labels for binary classification
origin_amazon_model.to(device)

# Define the dataset and dataloaders
max_len = 200  # You can adjust this based on your needs
test_dataset = OGAmazonDataset(origin_test_df['review'].values, origin_test_df['sentiment'].values, tokenizer, max_len)

test_dataloader = DataLoader(test_dataset, batch_size=16, shuffle=False)

# Evaluation
origin_amazon_model.eval()
all_predictions = []
all_labels = []

with torch.no_grad():
    for batch in test_dataloader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        # Must be origin_amazon_model, not the fine-tuned `model` that may still be
        # in the kernel from the cells above. Using that one would report the
        # fine-tuned accuracy as the "without fine-tuning" baseline.
        outputs = origin_amazon_model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=1).cpu().numpy()

        all_predictions.extend(predictions)
        all_labels.extend(labels.cpu().numpy())

accuracy = accuracy_score(all_labels, all_predictions)
print(f'Accuracy: {accuracy * 100:.2f}%')


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'classifier.bias', 'classifier.weight', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Accuracy: 88.20%
