In [1]:
!pip install transformers



In [5]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sklearn.metrics import accuracy_score
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, AdamW, RobertaTokenizer, RobertaForSequenceClassification, GPT2Tokenizer, GPT2ForSequenceClassification

# Load training dataset
train_df = pd.read_csv("/kaggle/input/plmdataset/train.csv/train.csv")  # Replace "train_dataset.csv" with the actual path to your training dataset file

# Load testing dataset
test_df = pd.read_csv("/kaggle/input/plmdataset/test.csv/test.csv")  # Replace "test_dataset.csv" with the actual path to your testing dataset file

# Extract texts and labels from training dataset
train_texts = train_df['review']
train_labels = train_df['sentiment']

# Extract texts and labels from testing dataset
test_texts = test_df['review']
test_labels = test_df['sentiment']

# Define device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Function to tokenize and encode the text
def tokenize_and_encode(tokenizer, texts, max_length):
    input_ids = []
    attention_masks = []

    for text in texts:
        encoded_dict = tokenizer.encode_plus(
                            text,
                            add_special_tokens = True,
                            max_length = max_length,
                            padding = 'max_length',
                            truncation = True,
                            return_attention_mask = True,
                            return_tensors = 'pt'
                       )
        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])

    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)

    return input_ids, attention_masks

# Function to train the model
def train_model(model, train_dataloader, optimizer, criterion, num_epochs=3):
    model.train()
    for epoch in range(num_epochs):
        running_loss = 0.0
        for batch in train_dataloader:
            input_ids = batch[0].to(device)
            attention_masks = batch[1].to(device)
            labels = batch[2].to(device)

            optimizer.zero_grad()

            outputs = model(input_ids=input_ids, attention_mask=attention_masks, labels=labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()

            running_loss += loss.item()

        print(f"Epoch {epoch+1}/{num_epochs} - Loss: {running_loss/len(train_dataloader)}")

# Function to evaluate the model
def evaluate_model(model, test_dataloader):
    model.eval()
    predictions = []
    true_labels = []

    with torch.no_grad():
        for batch in test_dataloader:
            input_ids = batch[0].to(device)
            attention_masks = batch[1].to(device)
            labels = batch[2].numpy()

            outputs = model(input_ids=input_ids, attention_mask=attention_masks)
            logits = outputs.logits
            _, predicted = torch.max(logits, dim=1)

            predictions.extend(predicted.cpu().numpy())
            true_labels.extend(labels)

    accuracy = accuracy_score(true_labels, predictions)
    print(f"Accuracy: {accuracy}")

# Tokenize and encode the texts
max_length = 128
distilbert_tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
train_input_ids, train_attention_masks = tokenize_and_encode(distilbert_tokenizer, train_texts, max_length)
test_input_ids, test_attention_masks = tokenize_and_encode(distilbert_tokenizer, test_texts, max_length)

# Assuming the labels are strings ('positive' and 'negative'), you can convert them to integers
label_map = {'positive': 1, 'negative': 0}
train_labels = train_labels.map(label_map)
test_labels = test_labels.map(label_map)

# Convert labels to tensor
train_labels = torch.tensor(train_labels.values, dtype=torch.long)
test_labels = torch.tensor(test_labels.values, dtype=torch.long)

# Create dataloaders
batch_size = 16
train_data = torch.utils.data.TensorDataset(train_input_ids, train_attention_masks, train_labels)
train_dataloader = torch.utils.data.DataLoader(train_data, batch_size=batch_size)
test_data = torch.utils.data.TensorDataset(test_input_ids, test_attention_masks, test_labels)
test_dataloader = torch.utils.data.DataLoader(test_data, batch_size=batch_size)

# Fine-tune DistilBERT
distilbert_model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)
distilbert_model.to(device)
optimizer = AdamW(distilbert_model.parameters(), lr=2e-5)
criterion = nn.CrossEntropyLoss()
train_model(distilbert_model, train_dataloader, optimizer, criterion)
evaluate_model(distilbert_model, test_dataloader)

# Repeat the same process for RoBERTa and GPT-2
# For RoBERTa
roberta_tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
train_input_ids, train_attention_masks = tokenize_and_encode(roberta_tokenizer, train_texts, max_length)
test_input_ids, test_attention_masks = tokenize_and_encode(roberta_tokenizer, test_texts, max_length)
roberta_model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=2)
roberta_model.to(device)
optimizer = AdamW(roberta_model.parameters(), lr=2e-5)
train_model(roberta_model, train_dataloader, optimizer, criterion)
evaluate_model(roberta_model, test_dataloader)

# For GPT-2
gpt2_tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
train_input_ids, train_attention_masks = tokenize_and_encode(gpt2_tokenizer, train_texts, max_length)
test_input_ids, test_attention_masks = tokenize_and_encode(gpt2_tokenizer, test_texts, max_length)
gpt2_model = GPT2ForSequenceClassification.from_pretrained('gpt2', num_labels=2)
gpt2_model.to(device)
optimizer = AdamW(gpt2_model.parameters(), lr=2e-5)
train_model(gpt2_model, train_dataloader, optimizer, criterion)
evaluate_model(gpt2_model, test_dataloader)


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3 - Loss: 0.3393317736208439
Epoch 2/3 - Loss: 0.20182555637632807
Epoch 3/3 - Loss: 0.11509952055377265
Accuracy: 0.85695


vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3 - Loss: 0.6953605392773946
Epoch 2/3 - Loss: 0.6947482631047567
Epoch 3/3 - Loss: 0.6946594745000203
Accuracy: 0.49675


vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

ValueError: Asking to pad but the tokenizer does not have a padding token. Please select a token to use as `pad_token` `(tokenizer.pad_token = tokenizer.eos_token e.g.)` or add a new pad token via `tokenizer.add_special_tokens({'pad_token': '[PAD]'})`.

In [9]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sklearn.metrics import accuracy_score
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, AdamW, RobertaTokenizer, RobertaForSequenceClassification, GPT2Tokenizer, GPT2ForSequenceClassification
from torch.utils.data import DataLoader, TensorDataset
from transformers import get_linear_schedule_with_warmup
from torch.cuda.amp import GradScaler, autocast

# Load training dataset
train_df = pd.read_csv("/kaggle/input/plmdataset/train.csv/train.csv")  # Replace "train_dataset.csv" with the actual path to your training dataset file

# Load testing dataset
test_df = pd.read_csv("/kaggle/input/plmdataset/test.csv/test.csv")  # Replace "test_dataset.csv" with the actual path to your testing dataset file

# Extract texts and labels from training dataset
train_texts = train_df['review']
train_labels = train_df['sentiment']

# Extract texts and labels from testing dataset
test_texts = test_df['review']
test_labels = test_df['sentiment']

# Define device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Function to tokenize and encode the text
def tokenize_and_encode(tokenizer, texts, max_length):
    input_ids = []
    attention_masks = []

    for text in texts:
        encoded_dict = tokenizer.encode_plus(
                            text,
                            add_special_tokens=True,
                            max_length=max_length,
                            padding='max_length',
                            truncation=True,
                            return_attention_mask=True,
                            return_tensors='pt'
                       )
        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])

    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)

    return input_ids, attention_masks

# Function to train the model with mixed precision
def train_model_with_mixed_precision(model, train_dataloader, optimizer, scheduler, criterion, scaler, num_epochs):
    model.train()
    for epoch in range(num_epochs):
        running_loss = 0.0
        for i, batch in enumerate(train_dataloader):
            input_ids = batch[0].to(device)
            attention_masks = batch[1].to(device)
            labels = batch[2].to(device)

            optimizer.zero_grad()

            with autocast():  # Use mixed precision
                outputs = model(input_ids=input_ids, attention_mask=attention_masks, labels=labels)
                loss = outputs.loss

            scaler.scale(loss).backward()  # Scale loss to prevent overflow
            scaler.unscale_(optimizer)  # Unscales the gradients of optimizer's assigned params in-place
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)  # Clip gradients to prevent explosion
            scaler.step(optimizer)  # Take a step using the optimizer
            scaler.update()  # Updates the scale for next iteration

            scheduler.step()  # Update learning rate scheduler

            running_loss += loss.item()

            if i % 100 == 99:  # Print every 100 mini-batches
                print(f"Epoch {epoch+1}/{num_epochs}, Batch {i+1}/{len(train_dataloader)}, Loss: {running_loss/100}")
                running_loss = 0.0

# Tokenize and encode the texts
max_length = 128

# DistilBERT
distilbert_tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
train_input_ids, train_attention_masks = tokenize_and_encode(distilbert_tokenizer, train_texts, max_length)
test_input_ids, test_attention_masks = tokenize_and_encode(distilbert_tokenizer, test_texts, max_length)

# Assuming the labels are strings ('positive' and 'negative'), you can convert them to integers
label_map = {'positive': 1, 'negative': 0}
train_labels = train_labels.map(label_map)
test_labels = test_labels.map(label_map)

# Convert labels to tensor
train_labels = torch.tensor(train_labels.values, dtype=torch.long)
test_labels = torch.tensor(test_labels.values, dtype=torch.long)

# Create dataloaders
batch_size = 16
train_data = TensorDataset(train_input_ids, train_attention_masks, train_labels)
train_dataloader = DataLoader(train_data, batch_size=batch_size)
test_data = TensorDataset(test_input_ids, test_attention_masks, test_labels)
test_dataloader = DataLoader(test_data, batch_size=batch_size)

# Define hyperparameters
num_epochs = 3
max_grad_norm = 1.0
warmup_steps = 0.1 * len(train_dataloader)  # 10% of total training steps

# Create scheduler
optimizer = AdamW(distilbert_model.parameters(), lr=2e-5)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps, num_training_steps=len(train_dataloader) * num_epochs)

# Define scaler for mixed precision training
scaler = GradScaler()

# Fine-tune DistilBERT with mixed precision
train_model_with_mixed_precision(distilbert_model, train_dataloader, optimizer, scheduler, criterion, scaler, num_epochs)
evaluate_model(distilbert_model, test_dataloader)

# Repeat the same process for RoBERTa and GPT-2
# For RoBERTa
roberta_tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
train_input_ids, train_attention_masks = tokenize_and_encode(roberta_tokenizer, train_texts, max_length)
test_input_ids, test_attention_masks = tokenize_and_encode(roberta_tokenizer, test_texts, max_length)
roberta_model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=2)
roberta_model.to(device)
optimizer = AdamW(roberta_model.parameters(), lr=2e-5)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps, num_training_steps=len(train_dataloader) * num_epochs)
train_model_with_mixed_precision(roberta_model, train_dataloader, optimizer, scheduler, criterion, scaler, num_epochs)
evaluate_model(roberta_model, test_dataloader)

# # For GPT-2
# gpt2_tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
# train_input_ids, train_attention_masks = tokenize_and_encode(gpt2_tokenizer, train_texts, max_length)
# test_input_ids, test_attention_masks = tokenize_and_encode(gpt2_tokenizer, test_texts, max_length)
# gpt2_model = GPT2ForSequenceClassification.from_pretrained('gpt2', num_labels=2)
# gpt2_model.to(device)
# optimizer = AdamW(gpt2_model.parameters(), lr=2e-5)
# scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps, num_training_steps=len(train_dataloader) * num_epochs)
# train_model_with_mixed_precision(gpt2_model, train_dataloader, optimizer, scheduler, criterion, scaler, num_epochs)
# evaluate_model(gpt2_model, test_dataloader)




Epoch 1/3, Batch 100/1875, Loss: 0.10172989226877689
Epoch 1/3, Batch 200/1875, Loss: 0.0676270891726017
Epoch 1/3, Batch 300/1875, Loss: 0.06075821131467819
Epoch 1/3, Batch 400/1875, Loss: 0.21413344100117684
Epoch 1/3, Batch 500/1875, Loss: 0.07729954890906811
Epoch 1/3, Batch 600/1875, Loss: 0.07085053972899914
Epoch 1/3, Batch 700/1875, Loss: 0.0842372076958418
Epoch 1/3, Batch 800/1875, Loss: 0.06937024764716625
Epoch 1/3, Batch 900/1875, Loss: 0.12046432785689831
Epoch 1/3, Batch 1000/1875, Loss: 0.06073136806488037
Epoch 1/3, Batch 1100/1875, Loss: 0.06664066761732101
Epoch 1/3, Batch 1200/1875, Loss: 0.07201315812766552
Epoch 1/3, Batch 1300/1875, Loss: 0.08882725395262242
Epoch 1/3, Batch 1400/1875, Loss: 0.11559346415102482
Epoch 1/3, Batch 1500/1875, Loss: 0.07386761747300624
Epoch 1/3, Batch 1600/1875, Loss: 0.07036987073719501
Epoch 1/3, Batch 1700/1875, Loss: 0.08533724144101143
Epoch 1/3, Batch 1800/1875, Loss: 0.06804419979453087
Epoch 2/3, Batch 100/1875, Loss: 0.0868

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3, Batch 100/1875, Loss: 0.6939346313476562
Epoch 1/3, Batch 200/1875, Loss: 0.7001318359375
Epoch 1/3, Batch 300/1875, Loss: 0.6976708984375
Epoch 1/3, Batch 400/1875, Loss: 0.6888579559326172
Epoch 1/3, Batch 500/1875, Loss: 0.6961116790771484
Epoch 1/3, Batch 600/1875, Loss: 0.6953240966796875
Epoch 1/3, Batch 700/1875, Loss: 0.6958424377441407
Epoch 1/3, Batch 800/1875, Loss: 0.6959896850585937
Epoch 1/3, Batch 900/1875, Loss: 0.6935813903808594
Epoch 1/3, Batch 1000/1875, Loss: 0.6943620300292969
Epoch 1/3, Batch 1100/1875, Loss: 0.6938188171386719
Epoch 1/3, Batch 1200/1875, Loss: 0.6974050903320312
Epoch 1/3, Batch 1300/1875, Loss: 0.6944967651367188
Epoch 1/3, Batch 1400/1875, Loss: 0.6878280639648438
Epoch 1/3, Batch 1500/1875, Loss: 0.6747649765014648
Epoch 1/3, Batch 1600/1875, Loss: 0.6593856048583985
Epoch 1/3, Batch 1700/1875, Loss: 0.6286680984497071
Epoch 1/3, Batch 1800/1875, Loss: 0.5979949188232422
Epoch 2/3, Batch 100/1875, Loss: 0.5938976287841797
Epoch 2/3

In [10]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sklearn.metrics import accuracy_score
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, AdamW, RobertaTokenizer, RobertaForSequenceClassification, GPT2Tokenizer, GPT2ForSequenceClassification
from torch.utils.data import DataLoader, TensorDataset
from transformers import get_linear_schedule_with_warmup
from torch.cuda.amp import GradScaler, autocast

# Load training dataset
train_df = pd.read_csv("/kaggle/input/plmdataset/train.csv/train.csv")  # Replace "train_dataset.csv" with the actual path to your training dataset file

# Load testing dataset
test_df = pd.read_csv("/kaggle/input/plmdataset/test.csv/test.csv")  # Replace "test_dataset.csv" with the actual path to your testing dataset file

# Extract texts and labels from training dataset
train_texts = train_df['review']
train_labels = train_df['sentiment']

# Extract texts and labels from testing dataset
test_texts = test_df['review']
test_labels = test_df['sentiment']

# Define device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Function to tokenize and encode the text
def tokenize_and_encode(tokenizer, texts, max_length):
    input_ids = []
    attention_masks = []

    for text in texts:
        encoded_dict = tokenizer.encode_plus(
                            text,
                            add_special_tokens=True,
                            max_length=max_length,
                            padding='max_length',
                            truncation=True,
                            return_attention_mask=True,
                            return_tensors='pt'
                       )
        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])

    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)

    return input_ids, attention_masks

# Function to train the model with mixed precision
def train_model_with_mixed_precision(model, train_dataloader, optimizer, scheduler, criterion, scaler, num_epochs):
    model.train()
    for epoch in range(num_epochs):
        running_loss = 0.0
        for i, batch in enumerate(train_dataloader):
            input_ids = batch[0].to(device)
            attention_masks = batch[1].to(device)
            labels = batch[2].to(device)

            optimizer.zero_grad()

            with autocast():  # Use mixed precision
                outputs = model(input_ids=input_ids, attention_mask=attention_masks, labels=labels)
                loss = outputs.loss

            scaler.scale(loss).backward()  # Scale loss to prevent overflow
            scaler.unscale_(optimizer)  # Unscales the gradients of optimizer's assigned params in-place
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)  # Clip gradients to prevent explosion
            scaler.step(optimizer)  # Take a step using the optimizer
            scaler.update()  # Updates the scale for next iteration

            scheduler.step()  # Update learning rate scheduler

            running_loss += loss.item()

            if i % 100 == 99:  # Print every 100 mini-batches
                print(f"Epoch {epoch+1}/{num_epochs}, Batch {i+1}/{len(train_dataloader)}, Loss: {running_loss/100}")
                running_loss = 0.0

# Tokenize and encode the texts
max_length = 128

# DistilBERT
distilbert_tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
train_input_ids, train_attention_masks = tokenize_and_encode(distilbert_tokenizer, train_texts, max_length)
test_input_ids, test_attention_masks = tokenize_and_encode(distilbert_tokenizer, test_texts, max_length)

# Assuming the labels are strings ('positive' and 'negative'), you can convert them to integers
label_map = {'positive': 1, 'negative': 0}
train_labels = train_labels.map(label_map)
test_labels = test_labels.map(label_map)

# Convert labels to tensor
train_labels = torch.tensor(train_labels.values, dtype=torch.long)
test_labels = torch.tensor(test_labels.values, dtype=torch.long)

# Create dataloaders
batch_size = 16
train_data = TensorDataset(train_input_ids, train_attention_masks, train_labels)
train_dataloader = DataLoader(train_data, batch_size=batch_size)
test_data = TensorDataset(test_input_ids, test_attention_masks, test_labels)
test_dataloader = DataLoader(test_data, batch_size=batch_size)

# Define hyperparameters
num_epochs = 3
max_grad_norm = 1.0
warmup_steps = 0.1 * len(train_dataloader)  # 10% of total training steps

# Create scheduler
optimizer = AdamW(distilbert_model.parameters(), lr=2e-5)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps, num_training_steps=len(train_dataloader) * num_epochs)

# Define scaler for mixed precision training
scaler = GradScaler()

# Fine-tune DistilBERT with mixed precision
train_model_with_mixed_precision(distilbert_model, train_dataloader, optimizer, scheduler, criterion, scaler, num_epochs)
evaluate_model(distilbert_model, test_dataloader)

# Repeat the same process for RoBERTa and GPT-2
# For RoBERTa
roberta_tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
train_input_ids, train_attention_masks = tokenize_and_encode(roberta_tokenizer, train_texts, max_length)
test_input_ids, test_attention_masks = tokenize_and_encode(roberta_tokenizer, test_texts, max_length)
roberta_model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=2)
roberta_model.to(device)
optimizer = AdamW(roberta_model.parameters(), lr=2e-5)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps, num_training_steps=len(train_dataloader) * num_epochs)
train_model_with_mixed_precision(roberta_model, train_dataloader, optimizer, scheduler, criterion, scaler, num_epochs)
evaluate_model(roberta_model, test_dataloader)

# # For GPT-2
# gpt2_tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
# train_input_ids, train_attention_masks = tokenize_and_encode(gpt2_tokenizer, train_texts, max_length)
# test_input_ids, test_attention_masks = tokenize_and_encode(gpt2_tokenizer, test_texts, max_length)
# gpt2_model = GPT2ForSequenceClassification.from_pretrained('gpt2', num_labels=2)
# gpt2_model.to(device)
# optimizer = AdamW(gpt2_model.parameters(), lr=2e-5)
# scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps, num_training_steps=len(train_dataloader) * num_epochs)
# train_model_with_mixed_precision(gpt2_model, train_dataloader, optimizer, scheduler, criterion, scaler, num_epochs)
# evaluate_model(gpt2_model, test_dataloader)




Epoch 1/3, Batch 100/1875, Loss: 0.038977860137820244
Epoch 1/3, Batch 200/1875, Loss: 0.018567416667938232
Epoch 1/3, Batch 300/1875, Loss: 0.03453112371265888
Epoch 1/3, Batch 400/1875, Loss: 0.0348983533680439
Epoch 1/3, Batch 500/1875, Loss: 0.036452372744679454
Epoch 1/3, Batch 600/1875, Loss: 0.0335359850525856
Epoch 1/3, Batch 700/1875, Loss: 0.0476680451631546
Epoch 1/3, Batch 800/1875, Loss: 0.030840462371706963
Epoch 1/3, Batch 900/1875, Loss: 0.04182672463357449
Epoch 1/3, Batch 1000/1875, Loss: 0.03341382563114166
Epoch 1/3, Batch 1100/1875, Loss: 0.03000076524913311
Epoch 1/3, Batch 1200/1875, Loss: 0.05138696871697903
Epoch 1/3, Batch 1300/1875, Loss: 0.03074098937213421
Epoch 1/3, Batch 1400/1875, Loss: 0.05903112329542637
Epoch 1/3, Batch 1500/1875, Loss: 0.0674786216020584
Epoch 1/3, Batch 1600/1875, Loss: 0.03453420706093311
Epoch 1/3, Batch 1700/1875, Loss: 0.04805536761879921
Epoch 1/3, Batch 1800/1875, Loss: 0.050163709595799445
Epoch 2/3, Batch 100/1875, Loss: 0.0

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3, Batch 100/1875, Loss: 0.6971086120605469
Epoch 1/3, Batch 200/1875, Loss: 0.6948506164550782
Epoch 1/3, Batch 300/1875, Loss: 0.6959030151367187
Epoch 1/3, Batch 400/1875, Loss: 0.69581298828125
Epoch 1/3, Batch 500/1875, Loss: 0.6953398132324219
Epoch 1/3, Batch 600/1875, Loss: 0.6969720458984375
Epoch 1/3, Batch 700/1875, Loss: 0.6973811340332031
Epoch 1/3, Batch 800/1875, Loss: 0.6933233642578125
Epoch 1/3, Batch 900/1875, Loss: 0.6957020568847656
Epoch 1/3, Batch 1000/1875, Loss: 0.6940127563476562
Epoch 1/3, Batch 1100/1875, Loss: 0.6964111328125
Epoch 1/3, Batch 1200/1875, Loss: 0.6905145645141602
Epoch 1/3, Batch 1300/1875, Loss: 0.6935882568359375
Epoch 1/3, Batch 1400/1875, Loss: 0.6881367492675782
Epoch 1/3, Batch 1500/1875, Loss: 0.6878263854980469
Epoch 1/3, Batch 1600/1875, Loss: 0.6957080078125
Epoch 1/3, Batch 1700/1875, Loss: 0.6936155700683594
Epoch 1/3, Batch 1800/1875, Loss: 0.6919789123535156
Epoch 2/3, Batch 100/1875, Loss: 0.6870713806152344
Epoch 2/3, 

In [14]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sklearn.metrics import accuracy_score
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, RobertaTokenizer, RobertaForSequenceClassification
from torch.utils.data import DataLoader, TensorDataset
from transformers import get_linear_schedule_with_warmup
from torch.cuda.amp import GradScaler, autocast
from torch.optim import SGD

# Load training dataset
train_df = pd.read_csv("/kaggle/input/plmdataset/train.csv/train.csv")  # Replace "train_dataset.csv" with the actual path to your training dataset file

# Load testing dataset
test_df = pd.read_csv("/kaggle/input/plmdataset/test.csv/test.csv")  # Replace "test_dataset.csv" with the actual path to your testing dataset file

# Extract texts and labels from training dataset
train_texts = train_df['review']
train_labels = train_df['sentiment']

# Extract texts and labels from testing dataset
test_texts = test_df['review']
test_labels = test_df['sentiment']

# Define device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Function to tokenize and encode the text
def tokenize_and_encode(tokenizer, texts, max_length):
    input_ids = []
    attention_masks = []

    for text in texts:
        encoded_dict = tokenizer.encode_plus(
                            text,
                            add_special_tokens=True,
                            max_length=max_length,
                            padding='max_length',
                            truncation=True,
                            return_attention_mask=True,
                            return_tensors='pt'
                       )
        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])

    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)

    return input_ids, attention_masks

# Function to train the model with mixed precision
def train_model_with_mixed_precision(model, train_dataloader, optimizer, scheduler, criterion, scaler, num_epochs):
    model.train()
    for epoch in range(num_epochs):
        running_loss = 0.0
        for i, batch in enumerate(train_dataloader):
            input_ids = batch[0].to(device)
            attention_masks = batch[1].to(device)
            labels = batch[2].to(device)

            optimizer.zero_grad()

            with autocast():  # Use mixed precision
                outputs = model(input_ids=input_ids, attention_mask=attention_masks, labels=labels)
                loss = outputs.loss

            scaler.scale(loss).backward()  # Scale loss to prevent overflow
            scaler.unscale_(optimizer)  # Unscales the gradients of optimizer's assigned params in-place
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)  # Clip gradients to prevent explosion
            scaler.step(optimizer)  # Take a step using the optimizer
            scaler.update()  # Updates the scale for next iteration

            scheduler.step()  # Update learning rate scheduler

            running_loss += loss.item()

            if i % 100 == 99:  # Print every 100 mini-batches
                print(f"Epoch {epoch+1}/{num_epochs}, Batch {i+1}/{len(train_dataloader)}, Loss: {running_loss/100}")
                running_loss = 0.0

# Tokenize and encode the texts
max_length = 128

# DistilBERT
distilbert_tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
train_input_ids, train_attention_masks = tokenize_and_encode(distilbert_tokenizer, train_texts, max_length)
test_input_ids, test_attention_masks = tokenize_and_encode(distilbert_tokenizer, test_texts, max_length)

# Assuming the labels are strings ('positive' and 'negative'), you can convert them to integers
label_map = {'positive': 1, 'negative': 0}
train_labels = train_labels.map(label_map)
test_labels = test_labels.map(label_map)

# Convert labels to tensor
train_labels = torch.tensor(train_labels.values, dtype=torch.long)
test_labels = torch.tensor(test_labels.values, dtype=torch.long)

# Create dataloaders
batch_size = 16
train_data = TensorDataset(train_input_ids, train_attention_masks, train_labels)
train_dataloader = DataLoader(train_data, batch_size=batch_size)
test_data = TensorDataset(test_input_ids, test_attention_masks, test_labels)
test_dataloader = DataLoader(test_data, batch_size=batch_size)

# Define hyperparameters
num_epochs = 3
max_grad_norm = 1.0
warmup_steps = 0.1 * len(train_dataloader)  # 10% of total training steps

# Create DistilBERT model
distilbert_model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2).to(device)

# Create optimizer and scheduler
optimizer_distilbert = SGD(distilbert_model.parameters(), lr=0.01, momentum=0.9)
scheduler_distilbert = get_linear_schedule_with_warmup(optimizer_distilbert, num_warmup_steps=warmup_steps, num_training_steps=len(train_dataloader) * num_epochs)

# Define scaler for mixed precision training
scaler = GradScaler()

# Fine-tune DistilBERT with mixed precision
train_model_with_mixed_precision(distilbert_model, train_dataloader, optimizer_distilbert, scheduler_distilbert, nn.CrossEntropyLoss(), scaler, num_epochs)
evaluate_model(distilbert_model, test_dataloader)

# Repeat the same process for RoBERTa
roberta_tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
train_input_ids, train_attention_masks = tokenize_and_encode(roberta_tokenizer, train_texts, max_length)
test_input_ids, test_attention_masks = tokenize_and_encode(roberta_tokenizer, test_texts, max_length)

# Create RoBERTa model
roberta_model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=2).to(device)

# Create optimizer and scheduler
optimizer_roberta = SGD(roberta_model.parameters(), lr=0.01, momentum=0.9)
scheduler_roberta = get_linear_schedule_with_warmup(optimizer_roberta, num_warmup_steps=warmup_steps, num_training_steps=len(train_dataloader) * num_epochs)

# Fine-tune RoBERTa with mixed precision
train_model_with_mixed_precision(roberta_model, train_dataloader, optimizer_roberta, scheduler_roberta, nn.CrossEntropyLoss(), scaler, num_epochs)
evaluate_model(roberta_model, test_dataloader)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3, Batch 100/1875, Loss: 0.638399543762207
Epoch 1/3, Batch 200/1875, Loss: 0.494405632019043
Epoch 1/3, Batch 300/1875, Loss: 0.422783145904541
Epoch 1/3, Batch 400/1875, Loss: 0.4123725938796997
Epoch 1/3, Batch 500/1875, Loss: 0.3828009366989136
Epoch 1/3, Batch 600/1875, Loss: 0.38030768394470216
Epoch 1/3, Batch 700/1875, Loss: 0.37837531328201296
Epoch 1/3, Batch 800/1875, Loss: 0.4078387808799744
Epoch 1/3, Batch 900/1875, Loss: 0.3658698034286499
Epoch 1/3, Batch 1000/1875, Loss: 0.3521576488018036
Epoch 1/3, Batch 1100/1875, Loss: 0.344450261592865
Epoch 1/3, Batch 1200/1875, Loss: 0.3409921288490295
Epoch 1/3, Batch 1300/1875, Loss: 0.3452486753463745
Epoch 1/3, Batch 1400/1875, Loss: 0.36082704782485964
Epoch 1/3, Batch 1500/1875, Loss: 0.32019894003868105
Epoch 1/3, Batch 1600/1875, Loss: 0.33297763466835023
Epoch 1/3, Batch 1700/1875, Loss: 0.36296723008155823
Epoch 1/3, Batch 1800/1875, Loss: 0.3260021793842316
Epoch 2/3, Batch 100/1875, Loss: 0.32985604763031007


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3, Batch 100/1875, Loss: 0.7025511932373046
Epoch 1/3, Batch 200/1875, Loss: 0.7114691925048828
Epoch 1/3, Batch 300/1875, Loss: 0.7070757293701172
Epoch 1/3, Batch 400/1875, Loss: 0.7067838287353516
Epoch 1/3, Batch 500/1875, Loss: 0.6982167053222657
Epoch 1/3, Batch 600/1875, Loss: 0.6987605285644531
Epoch 1/3, Batch 700/1875, Loss: 0.6948300170898437
Epoch 1/3, Batch 800/1875, Loss: 0.69453857421875
Epoch 1/3, Batch 900/1875, Loss: 0.69548095703125
Epoch 1/3, Batch 1000/1875, Loss: 0.6946369934082032
Epoch 1/3, Batch 1100/1875, Loss: 0.6942010498046876
Epoch 1/3, Batch 1200/1875, Loss: 0.6946176147460937
Epoch 1/3, Batch 1300/1875, Loss: 0.6954307556152344
Epoch 1/3, Batch 1400/1875, Loss: 0.694906005859375
Epoch 1/3, Batch 1500/1875, Loss: 0.6949349975585938
Epoch 1/3, Batch 1600/1875, Loss: 0.6954974365234375
Epoch 1/3, Batch 1700/1875, Loss: 0.69441162109375
Epoch 1/3, Batch 1800/1875, Loss: 0.69405029296875
Epoch 2/3, Batch 100/1875, Loss: 0.694393310546875
Epoch 2/3, Ba

In [13]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sklearn.metrics import accuracy_score
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, AdamW, RobertaTokenizer, RobertaForSequenceClassification
from torch.utils.data import DataLoader, TensorDataset
from transformers import get_linear_schedule_with_warmup
from torch.cuda.amp import GradScaler, autocast
from torch.optim.lr_scheduler import ExponentialLR

# Load training dataset
train_df = pd.read_csv("/kaggle/input/plmdataset/train.csv/train.csv")  # Replace "train_dataset.csv" with the actual path to your training dataset file

# Load testing dataset
test_df = pd.read_csv("/kaggle/input/plmdataset/test.csv/test.csv")  # Replace "test_dataset.csv" with the actual path to your testing dataset file

# Extract texts and labels from training dataset
train_texts = train_df['review']
train_labels = train_df['sentiment']

# Extract texts and labels from testing dataset
test_texts = test_df['review']
test_labels = test_df['sentiment']

# Define device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Function to tokenize and encode the text
def tokenize_and_encode(tokenizer, texts, max_length):
    input_ids = []
    attention_masks = []

    for text in texts:
        encoded_dict = tokenizer.encode_plus(
                            text,
                            add_special_tokens=True,
                            max_length=max_length,
                            padding='max_length',
                            truncation=True,
                            return_attention_mask=True,
                            return_tensors='pt'
                       )
        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])

    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)

    return input_ids, attention_masks

# Function to train the model with mixed precision
def train_model_with_mixed_precision(model, train_dataloader, optimizer, scheduler, criterion, scaler, num_epochs):
    model.train()
    for epoch in range(num_epochs):
        running_loss = 0.0
        for i, batch in enumerate(train_dataloader):
            input_ids = batch[0].to(device)
            attention_masks = batch[1].to(device)
            labels = batch[2].to(device)

            optimizer.zero_grad()

            with autocast():  # Use mixed precision
                outputs = model(input_ids=input_ids, attention_mask=attention_masks, labels=labels)
                loss = outputs.loss

            scaler.scale(loss).backward()  # Scale loss to prevent overflow
            scaler.unscale_(optimizer)  # Unscales the gradients of optimizer's assigned params in-place
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)  # Clip gradients to prevent explosion
            scaler.step(optimizer)  # Take a step using the optimizer
            scaler.update()  # Updates the scale for next iteration

            scheduler.step()  # Update learning rate scheduler

            running_loss += loss.item()

            if i % 100 == 99:  # Print every 100 mini-batches
                print(f"Epoch {epoch+1}/{num_epochs}, Batch {i+1}/{len(train_dataloader)}, Loss: {running_loss/100}")
                running_loss = 0.0

# Tokenize and encode the texts
max_length = 128

# DistilBERT
distilbert_tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
train_input_ids, train_attention_masks = tokenize_and_encode(distilbert_tokenizer, train_texts, max_length)
test_input_ids, test_attention_masks = tokenize_and_encode(distilbert_tokenizer, test_texts, max_length)

# RoBERTa
roberta_tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
train_input_ids_roberta, train_attention_masks_roberta = tokenize_and_encode(roberta_tokenizer, train_texts, max_length)
test_input_ids_roberta, test_attention_masks_roberta = tokenize_and_encode(roberta_tokenizer, test_texts, max_length)

# Assuming the labels are strings ('positive' and 'negative'), you can convert them to integers
label_map = {'positive': 1, 'negative': 0}
train_labels = train_labels.map(label_map)
test_labels = test_labels.map(label_map)

# Convert labels to tensor
train_labels = torch.tensor(train_labels.values, dtype=torch.long)
test_labels = torch.tensor(test_labels.values, dtype=torch.long)

# Create dataloaders
batch_size = 16
train_data = TensorDataset(train_input_ids, train_attention_masks, train_labels)
train_dataloader = DataLoader(train_data, batch_size=batch_size)
test_data = TensorDataset(test_input_ids, test_attention_masks, test_labels)
test_dataloader = DataLoader(test_data, batch_size=batch_size)

# Create dataloaders for RoBERTa
train_data_roberta = TensorDataset(train_input_ids_roberta, train_attention_masks_roberta, train_labels)
train_dataloader_roberta = DataLoader(train_data_roberta, batch_size=batch_size)
test_data_roberta = TensorDataset(test_input_ids_roberta, test_attention_masks_roberta, test_labels)
test_dataloader_roberta = DataLoader(test_data_roberta, batch_size=batch_size)

# Define hyperparameters
num_epochs = 3
max_grad_norm = 1.0
warmup_steps = 0.1 * len(train_dataloader)  # 10% of total training steps

# Define learning rate and schedule for DistilBERT
initial_learning_rate_distilbert = 2e-5
# Create optimizer with adjustable learning rate for DistilBERT
distilbert_model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)
distilbert_model.to(device)
optimizer_distilbert = AdamW(distilbert_model.parameters(), lr=initial_learning_rate_distilbert)

# Create scheduler with exponential decay for DistilBERT
scheduler_distilbert = get_linear_schedule_with_warmup(optimizer_distilbert, num_warmup_steps=warmup_steps, num_training_steps=len(train_dataloader) * num_epochs)
scheduler_distilbert = ExponentialLR(optimizer_distilbert, gamma=0.9) # ExponentialLR for exponential decay

# Define scaler for mixed precision training
scaler = GradScaler()

# Fine-tune DistilBERT with mixed precision
train_model_with_mixed_precision(distilbert_model, train_dataloader, optimizer_distilbert, scheduler_distilbert, nn.CrossEntropyLoss(), scaler, num_epochs)

# Define learning rate and schedule for RoBERTa
initial_learning_rate_roberta = 2e-5
# Create optimizer with adjustable learning rate for RoBERTa
roberta_model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=2)
roberta_model.to(device)
optimizer_roberta = AdamW(roberta_model.parameters(), lr=initial_learning_rate_roberta)

# Create scheduler with exponential decay for RoBERTa
scheduler_roberta = get_linear_schedule_with_warmup(optimizer_roberta, num_warmup_steps=warmup_steps, num_training_steps=len(train_dataloader_roberta) * num_epochs)
scheduler_roberta = ExponentialLR(optimizer_roberta, gamma=0.9) # ExponentialLR for exponential decay

# Fine-tune RoBERTa with mixed precision
train_model_with_mixed_precision(roberta_model, train_dataloader_roberta, optimizer_roberta, scheduler_roberta, nn.CrossEntropyLoss(), scaler, num_epochs)

# Evaluate DistilBERT
def evaluate_model(model, dataloader):
    model.eval()
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for batch in dataloader:
            input_ids, attention_mask, labels = batch
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            labels = labels.to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            _, preds = torch.max(outputs.logits, dim=1)

            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    accuracy = accuracy_score(all_labels, all_preds)
    print(f"Accuracy: {accuracy}")

evaluate_model(distilbert_model, test_dataloader)

# Evaluate RoBERTa
evaluate_model(roberta_model, test_dataloader_roberta)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3, Batch 100/1875, Loss: 0.6952365112304687
Epoch 1/3, Batch 200/1875, Loss: 0.696234130859375
Epoch 1/3, Batch 300/1875, Loss: 0.6951004028320312
Epoch 1/3, Batch 400/1875, Loss: 0.6953091430664062
Epoch 1/3, Batch 500/1875, Loss: 0.6928515625
Epoch 1/3, Batch 600/1875, Loss: 0.694931640625
Epoch 1/3, Batch 700/1875, Loss: 0.696068115234375
Epoch 1/3, Batch 800/1875, Loss: 0.695142822265625
Epoch 1/3, Batch 900/1875, Loss: 0.6942910766601562
Epoch 1/3, Batch 1000/1875, Loss: 0.69353759765625
Epoch 1/3, Batch 1100/1875, Loss: 0.6954696655273438
Epoch 1/3, Batch 1200/1875, Loss: 0.6924240112304687
Epoch 1/3, Batch 1300/1875, Loss: 0.6942092895507812
Epoch 1/3, Batch 1400/1875, Loss: 0.69363525390625
Epoch 1/3, Batch 1500/1875, Loss: 0.6938516235351563
Epoch 1/3, Batch 1600/1875, Loss: 0.694998779296875
Epoch 1/3, Batch 1700/1875, Loss: 0.6942047119140625
Epoch 1/3, Batch 1800/1875, Loss: 0.6917852783203124
Epoch 2/3, Batch 100/1875, Loss: 0.695845947265625
Epoch 2/3, Batch 200/1

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3, Batch 100/1875, Loss: 0.7008372497558594
Epoch 1/3, Batch 200/1875, Loss: 0.6993682861328125
Epoch 1/3, Batch 300/1875, Loss: 0.6985882568359375
Epoch 1/3, Batch 400/1875, Loss: 0.7031089782714843
Epoch 1/3, Batch 500/1875, Loss: 0.6975228881835938
Epoch 1/3, Batch 600/1875, Loss: 0.6984939575195312
Epoch 1/3, Batch 700/1875, Loss: 0.7010128784179688
Epoch 1/3, Batch 800/1875, Loss: 0.6990983581542969
Epoch 1/3, Batch 900/1875, Loss: 0.692613525390625
Epoch 1/3, Batch 1000/1875, Loss: 0.6938954162597656
Epoch 1/3, Batch 1100/1875, Loss: 0.6946134948730469
Epoch 1/3, Batch 1200/1875, Loss: 0.6980857849121094
Epoch 1/3, Batch 1300/1875, Loss: 0.6969267272949219
Epoch 1/3, Batch 1400/1875, Loss: 0.6992776489257813
Epoch 1/3, Batch 1500/1875, Loss: 0.6999372863769531
Epoch 1/3, Batch 1600/1875, Loss: 0.6993745422363281
Epoch 1/3, Batch 1700/1875, Loss: 0.6991278076171875
Epoch 1/3, Batch 1800/1875, Loss: 0.6999179077148437
Epoch 2/3, Batch 100/1875, Loss: 0.7030986022949218
Epoc

Hyperparameter tuning of roberta and distilbert 

In [2]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sklearn.metrics import accuracy_score
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, AdamW, RobertaTokenizer, RobertaForSequenceClassification
from torch.utils.data import DataLoader, TensorDataset
from transformers import get_linear_schedule_with_warmup
from torch.cuda.amp import GradScaler, autocast
from sklearn.model_selection import GridSearchCV

# Load training dataset
train_df = pd.read_csv("/kaggle/input/plmdataset/train.csv/train.csv")  # Replace "train_dataset.csv" with the actual path to your training dataset file

# Load testing dataset
test_df = pd.read_csv("/kaggle/input/plmdataset/test.csv/test.csv")  # Replace "test_dataset.csv" with the actual path to your testing dataset file

# Extract texts and labels from training dataset
train_texts = train_df['review']
train_labels = train_df['sentiment']

# Extract texts and labels from testing dataset
test_texts = test_df['review']
test_labels = test_df['sentiment']

# Define device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Function to tokenize and encode the text
def tokenize_and_encode(tokenizer, texts, max_length):
    input_ids = []
    attention_masks = []

    for text in texts:
        encoded_dict = tokenizer.encode_plus(
                            text,
                            add_special_tokens=True,
                            max_length=max_length,
                            padding='max_length',
                            truncation=True,
                            return_attention_mask=True,
                            return_tensors='pt'
                       )
        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])

    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)

    return input_ids, attention_masks

# Tokenize and encode the texts
max_length = 128

# Define function to create dataloaders
def create_data_loader(tokenizer, texts, labels, max_length, batch_size):
    input_ids, attention_masks = tokenize_and_encode(tokenizer, texts, max_length)
    labels = torch.tensor(labels.map(label_map).values, dtype=torch.long)

    dataset = TensorDataset(input_ids, attention_masks, labels)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

    return dataloader

# Define hyperparameters to search
param_grid = {
    'learning_rate': [5e-5, 3e-5, 2e-5],
    'batch_size': [16, 32],
    'num_epochs': [2, 3, 4]
}

# DistilBERT
distilbert_tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
train_input_ids, train_attention_masks = tokenize_and_encode(distilbert_tokenizer, train_texts, max_length)
test_input_ids, test_attention_masks = tokenize_and_encode(distilbert_tokenizer, test_texts, max_length)

# Define label mapping
label_map = {'positive': 1, 'negative': 0}

# Assuming the labels are strings ('positive' and 'negative'), you can convert them to integers
train_labels = train_labels.map(label_map)
test_labels = test_labels.map(label_map)

# Convert labels to tensor
train_labels = torch.tensor(train_labels.values, dtype=torch.long)
test_labels = torch.tensor(test_labels.values, dtype=torch.long)

# Initialize DistilBERT model
distilbert_model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)

# Create dataloaders for DistilBERT
train_dataloader = create_data_loader(distilbert_tokenizer, train_texts, train_labels, max_length, batch_size=param_grid['batch_size'][0])
test_dataloader = create_data_loader(distilbert_tokenizer, test_texts, test_labels, max_length, batch_size=param_grid['batch_size'][0])

# Initialize GridSearchCV for DistilBERT
grid_search_distilbert = GridSearchCV(estimator=None, param_grid=param_grid, scoring='accuracy', cv=3)

# Run GridSearchCV for DistilBERT
grid_search_distilbert.fit(train_dataloader, test_dataloader)

# Get the best parameters and accuracy for DistilBERT
best_params_distilbert = grid_search_distilbert.best_params_
best_accuracy_distilbert = grid_search_distilbert.best_score_

print("Best Parameters (DistilBERT):", best_params_distilbert)
print("Best Accuracy (DistilBERT):", best_accuracy_distilbert)

# Instantiate RoBERTa tokenizer
roberta_tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

# Initialize RoBERTa model
roberta_model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=2)

# Create dataloaders for RoBERTa
train_dataloader = create_data_loader(roberta_tokenizer, train_texts, train_labels, max_length, batch_size=param_grid['batch_size'][0])
test_dataloader = create_data_loader(roberta_tokenizer, test_texts, test_labels, max_length, batch_size=param_grid['batch_size'][0])

# Initialize GridSearchCV for RoBERTa
grid_search_roberta = GridSearchCV(estimator=None, param_grid=param_grid, scoring='accuracy', cv=3)

# Run GridSearchCV for RoBERTa
grid_search_roberta.fit(train_dataloader, test_dataloader)

# Get the best parameters and accuracy for RoBERTa
best_params_roberta = grid_search_roberta.best_params_
best_accuracy_roberta = grid_search_roberta.best_score_

print("Best Parameters (RoBERTa):", best_params_roberta)
print("Best Accuracy (RoBERTa):", best_accuracy_roberta)


KeyboardInterrupt: 

In [4]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaTokenizerFast, RobertaForSequenceClassification#, AdamW
from sklearn.model_selection import train_test_split
import pandas as pd
from torch.optim import AdamW

import time
# Record start time
start_time = time.time()


# Load data
data = pd.read_csv('/kaggle/input/plmdataset/train.csv/train.csv')
data['sentiment'] = data['sentiment'].map({'positive': 1, 'negative': 0})
reviews = data['review'].tolist()
labels = data['sentiment'].tolist()  # assuming sentiment is encoded as 0 (negative) and 1 (positive)

# Split data into training and validation sets
train_texts, val_texts, train_labels, val_labels = train_test_split(reviews, labels, test_size=0.2)

# Initialize tokenizer
tokenizer = RobertaTokenizerFast.from_pretrained('roberta-base')

# Tokenize data
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=512)
val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=512)

# Create torch dataset
class ReviewDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

# Create dataloaders
train_dataset = ReviewDataset(train_encodings, train_labels)
val_dataset = ReviewDataset(val_encodings, val_labels)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)

# Initialize model
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=2)
model = model.to('cuda')  # if GPU is available

# Initialize optimizer
optimizer = AdamW(model.parameters(), lr=1e-5)

# Training loop
for epoch in range(3):  # number of epochs
    model.train()
    for batch in train_loader:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to('cuda')
        attention_mask = batch['attention_mask'].to('cuda')
        labels = batch['labels'].to('cuda')
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

# Save the model
model.save_pretrained('sentiment_model_RoBERTa')

# Record end time
end_time = time.time()

print("Time required to fine-tune: ", end_time - start_time)

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Time required to fine-tune:  6715.162095785141


In [6]:
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
from transformers import RobertaTokenizerFast, RobertaForSequenceClassification
from torch.utils.data import DataLoader, Dataset
import pandas as pd
import torch

# Load the model
model = RobertaForSequenceClassification.from_pretrained('sentiment_model_RoBERTa')
model = model.to('cuda')  # if GPU is available

# Load validation data
val_data = pd.read_csv('/kaggle/input/plmdataset/test.csv/test.csv')
val_texts = val_data['review'].tolist()
val_labels = val_data['sentiment'].map({'positive': 1, 'negative': 0}).tolist()  # convert sentiment to numeric

# Initialize tokenizer
tokenizer = RobertaTokenizerFast.from_pretrained('roberta-base')

# Tokenize data
val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=512)

# Create torch dataset for validation
class ReviewDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)


val_dataset = ReviewDataset(val_encodings, val_labels)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)

# Evaluate the model
model.eval()
predictions = []
true_labels = []
for batch in val_loader:
    input_ids = batch['input_ids'].to('cuda')
    attention_mask = batch['attention_mask'].to('cuda')
    labels = batch['labels'].to('cuda')

    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)

    logits = outputs.logits
    predicted_labels = torch.argmax(logits, dim=1).cpu().numpy()
    predictions.extend(predicted_labels)
    true_labels.extend(labels.cpu().numpy())

# Calculate metrics
accuracy = accuracy_score(true_labels, predictions)
f1 = f1_score(true_labels, predictions)
conf_matrix = confusion_matrix(true_labels, predictions)

print(f'Accuracy: {accuracy}')
print(f'F1-score: {f1}')
print(f'Confusion matrix:\n {conf_matrix}')

Accuracy: 0.95255
F1-score: 0.9531149646756584
Confusion matrix:
 [[9405  530]
 [ 419 9646]]


In [7]:
from sklearn.metrics import classification_report
# Print classification report
print(classification_report(true_labels, predictions, target_names=['negative', 'positive']))

              precision    recall  f1-score   support

    negative       0.96      0.95      0.95      9935
    positive       0.95      0.96      0.95     10065

    accuracy                           0.95     20000
   macro avg       0.95      0.95      0.95     20000
weighted avg       0.95      0.95      0.95     20000



In [10]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification#, AdamW
from sklearn.model_selection import train_test_split
import pandas as pd
from torch.optim import AdamW


import time

# Record start time
start_time = time.time()

# Load data
data = pd.read_csv('/kaggle/input/plmdataset/train.csv/train.csv')
data['sentiment'] = data['sentiment'].map({'positive': 1, 'negative': 0})
reviews = data['review'].tolist()
labels = data['sentiment'].tolist()  # assuming sentiment is encoded as 0 (negative) and 1 (positive)

# Split data into training and validation sets
train_texts, val_texts, train_labels, val_labels = train_test_split(reviews, labels, test_size=0.2)

# Initialize tokenizer
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Tokenize data
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=512)
val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=512)

    
# Create torch dataset
class ReviewDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

# Create dataloaders
train_dataset = ReviewDataset(train_encodings, train_labels)
val_dataset = ReviewDataset(val_encodings, val_labels)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)

# Initialize model
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)
model = model.to('cuda')  # if GPU is available

# Initialize optimizer
optimizer = AdamW(model.parameters(), lr=1e-5)

# Training loop
for epoch in range(3):  # number of epochs
    model.train()
    for batch in train_loader:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to('cuda')
        attention_mask = batch['attention_mask'].to('cuda')
        labels = batch['labels'].to('cuda')
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

# Save the model
model.save_pretrained('sentiment_model_DistilBERT')

# Record end time
end_time = time.time()

print("Time required to fine-tune: ", end_time - start_time)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Time required to fine-tune:  3672.2959537506104


In [13]:
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification
from torch.utils.data import DataLoader, Dataset
import pandas as pd
import torch

# Load the model
model = DistilBertForSequenceClassification.from_pretrained('sentiment_model_DistilBERT')
model = model.to('cuda')  # if GPU is available

# Load validation data
val_data = pd.read_csv('/kaggle/input/plmdataset/test.csv/test.csv')
val_texts = val_data['review'].tolist()
val_labels = val_data['sentiment'].map({'positive': 1, 'negative': 0}).tolist()  # convert sentiment to numeric

# Initialize tokenizer
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

# Tokenize data
val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=512)

# Create torch dataset for validation
class ReviewDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

val_dataset = ReviewDataset(val_encodings, val_labels)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)

# Evaluate the model
model.eval()
predictions = []
true_labels = []
for batch in val_loader:
    input_ids = batch['input_ids'].to('cuda')
    attention_mask = batch['attention_mask'].to('cuda')
    labels = batch['labels'].to('cuda')

    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)

    logits = outputs.logits
    predicted_labels = torch.argmax(logits, dim=1).cpu().numpy()
    predictions.extend(predicted_labels)
    true_labels.extend(labels.cpu().numpy())

# Calculate metrics
accuracy = accuracy_score(true_labels, predictions)
f1 = f1_score(true_labels, predictions)
conf_matrix = confusion_matrix(true_labels, predictions)

print(f'Accuracy: {accuracy}')
print(f'F1-score: {f1}')
print(f'Confusion matrix:\n {conf_matrix}')

Accuracy: 0.93105
F1-score: 0.9313349599163471
Confusion matrix:
 [[9269  666]
 [ 713 9352]]


In [14]:
from sklearn.metrics import classification_report
# Print classification report
print(classification_report(true_labels, predictions, target_names=['negative', 'positive']))

              precision    recall  f1-score   support

    negative       0.93      0.93      0.93      9935
    positive       0.93      0.93      0.93     10065

    accuracy                           0.93     20000
   macro avg       0.93      0.93      0.93     20000
weighted avg       0.93      0.93      0.93     20000



glove

In [3]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Embedding, LSTM, Conv1D, GlobalMaxPooling1D, Concatenate
from tensorflow.keras.initializers import Constant

# Parameters for the model and training
max_features = 20000  # Number of words to consider as features
maxlen = 200  # Max length of individual reviews
embedding_dim = 100  # Dimension of word embeddings
lstm_units = 128  # Number of LSTM units
filters = 64  # Number of filters for CNN
kernel_size = 5  # Kernel size for CNN
batch_size = 32
epochs = 5

# Load train and test datasets from CSV files
train_data = pd.read_csv('/kaggle/input/plmdataset/train.csv/train.csv')
test_data = pd.read_csv('/kaggle/input/plmdataset/test.csv/test.csv')

# Preprocess train data
X_train = train_data['review'].values
y_train = train_data['sentiment'].values

# Preprocess test data
X_test = test_data['review'].values
y_test = test_data['sentiment'].values

# Convert labels to numerical format
y_train[y_train == 'negative'] = 0
y_train[y_train == 'positive'] = 1
y_train = y_train.astype(int)

y_test[y_test == 'negative'] = 0
y_test[y_test == 'positive'] = 1
y_test = y_test.astype(int)

# Tokenize and pad sequences
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(X_train)

X_train_tokenized = tokenizer.texts_to_sequences(X_train)
X_test_tokenized = tokenizer.texts_to_sequences(X_test)
X_train = pad_sequences(X_train_tokenized, maxlen=maxlen)
X_test = pad_sequences(X_test_tokenized, maxlen=maxlen)

# Load GloVe embeddings
embeddings_index = {}
with open('/kaggle/input/glove6/glove.6B.100d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

# Create embedding matrix
num_words = min(max_features, len(tokenizer.word_index) + 1)
embedding_matrix = np.zeros((num_words, embedding_dim))
for word, i in tokenizer.word_index.items():
    if i >= max_features:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

# Define the model using functional API
inputs = Input(shape=(maxlen,))
embedding_layer = Embedding(num_words, embedding_dim,
                            embeddings_initializer=Constant(embedding_matrix),
                            input_length=maxlen,
                            trainable=False)(inputs)

# LSTM branch
lstm_branch = LSTM(lstm_units, dropout=0.2, recurrent_dropout=0.2)(embedding_layer)

# CNN branch
cnn_branch = Conv1D(filters, kernel_size, activation='relu')(embedding_layer)
cnn_branch = GlobalMaxPooling1D()(cnn_branch)

# Concatenate both branches
merged = Concatenate()([lstm_branch, cnn_branch])
output = Dense(1, activation='sigmoid')(merged)

# Create model
model = Model(inputs=inputs, outputs=output)

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, batch_size=batch_size, epochs=epochs, validation_split=0.2)

# Evaluate the model on test data
loss, accuracy = model.evaluate(X_test, y_test)
print('Test accuracy:', accuracy)


Epoch 1/5


I0000 00:00:1708622911.969708      96 device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Test accuracy: 0.8751999735832214


In [6]:
import numpy as np
import pandas as pd
import time
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Embedding, LSTM, Conv1D, GlobalMaxPooling1D, Concatenate
from tensorflow.keras.initializers import Constant

# Record start time
start_time = time.time()
# Parameters for the model and training
max_features = 20000  # Number of words to consider as features
maxlen = 200  # Max length of individual reviews
embedding_dim = 200  # Dimension of word embeddings
lstm_units = 128  # Number of LSTM units
filters = 64  # Number of filters for CNN
kernel_size = 5  # Kernel size for CNN
batch_size = 32
epochs = 5

# Load train and test datasets from CSV files
train_data = pd.read_csv('/kaggle/input/plmdataset/train.csv/train.csv')
test_data = pd.read_csv('/kaggle/input/plmdataset/test.csv/test.csv')

# Preprocess train data
X_train = train_data['review'].values
y_train = train_data['sentiment'].values

# Preprocess test data
X_test = test_data['review'].values
y_test = test_data['sentiment'].values

# Convert labels to numerical format
y_train[y_train == 'negative'] = 0
y_train[y_train == 'positive'] = 1
y_train = y_train.astype(int)

y_test[y_test == 'negative'] = 0
y_test[y_test == 'positive'] = 1
y_test = y_test.astype(int)

# Tokenize and pad sequences
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(X_train)

X_train_tokenized = tokenizer.texts_to_sequences(X_train)
X_test_tokenized = tokenizer.texts_to_sequences(X_test)
X_train = pad_sequences(X_train_tokenized, maxlen=maxlen)
X_test = pad_sequences(X_test_tokenized, maxlen=maxlen)

# Load GloVe embeddings
embeddings_index = {}
with open('/kaggle/input/glove6/glove.6B.200d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

# Create embedding matrix
num_words = min(max_features, len(tokenizer.word_index) + 1)
embedding_matrix = np.zeros((num_words, embedding_dim))
for word, i in tokenizer.word_index.items():
    if i >= max_features:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

# Define the model using functional API
inputs = Input(shape=(maxlen,))
embedding_layer = Embedding(num_words, embedding_dim,
                            embeddings_initializer=Constant(embedding_matrix),
                            input_length=maxlen,
                            trainable=False)(inputs)

# LSTM branch
lstm_branch = LSTM(lstm_units, dropout=0.2, recurrent_dropout=0.2)(embedding_layer)

# CNN branch
cnn_branch = Conv1D(filters, kernel_size, activation='relu')(embedding_layer)
cnn_branch = GlobalMaxPooling1D()(cnn_branch)

# Concatenate both branches
merged = Concatenate()([lstm_branch, cnn_branch])
output = Dense(1, activation='sigmoid')(merged)

# Create model
model = Model(inputs=inputs, outputs=output)

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, batch_size=batch_size, epochs=epochs, validation_split=0.2)

# Evaluate the model on test data
loss, accuracy = model.evaluate(X_test, y_test)
print('Test accuracy:', accuracy)
end_time = time.time()

print("Time required to fine-tune: ", end_time - start_time)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Test accuracy: 0.8841500282287598
Time required to fine-tune:  970.7984492778778


In [7]:
import numpy as np
import pandas as pd
import time
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Embedding, LSTM, Conv1D, GlobalMaxPooling1D, Concatenate
from tensorflow.keras.initializers import Constant

# Record start time
start_time = time.time()
# Parameters for the model and training
max_features = 20000  # Number of words to consider as features
maxlen = 200  # Max length of individual reviews
embedding_dim = 300  # Dimension of word embeddings
lstm_units = 128  # Number of LSTM units
filters = 64  # Number of filters for CNN
kernel_size = 5  # Kernel size for CNN
batch_size = 32
epochs = 5

# Load train and test datasets from CSV files
train_data = pd.read_csv('/kaggle/input/plmdataset/train.csv/train.csv')
test_data = pd.read_csv('/kaggle/input/plmdataset/test.csv/test.csv')

# Preprocess train data
X_train = train_data['review'].values
y_train = train_data['sentiment'].values

# Preprocess test data
X_test = test_data['review'].values
y_test = test_data['sentiment'].values

# Convert labels to numerical format
y_train[y_train == 'negative'] = 0
y_train[y_train == 'positive'] = 1
y_train = y_train.astype(int)

y_test[y_test == 'negative'] = 0
y_test[y_test == 'positive'] = 1
y_test = y_test.astype(int)

# Tokenize and pad sequences
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(X_train)

X_train_tokenized = tokenizer.texts_to_sequences(X_train)
X_test_tokenized = tokenizer.texts_to_sequences(X_test)
X_train = pad_sequences(X_train_tokenized, maxlen=maxlen)
X_test = pad_sequences(X_test_tokenized, maxlen=maxlen)

# Load GloVe embeddings
embeddings_index = {}
with open('/kaggle/input/glove6/glove.6B.300d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

# Create embedding matrix
num_words = min(max_features, len(tokenizer.word_index) + 1)
embedding_matrix = np.zeros((num_words, embedding_dim))
for word, i in tokenizer.word_index.items():
    if i >= max_features:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

# Define the model using functional API
inputs = Input(shape=(maxlen,))
embedding_layer = Embedding(num_words, embedding_dim,
                            embeddings_initializer=Constant(embedding_matrix),
                            input_length=maxlen,
                            trainable=False)(inputs)

# LSTM branch
lstm_branch = LSTM(lstm_units, dropout=0.2, recurrent_dropout=0.2)(embedding_layer)

# CNN branch
cnn_branch = Conv1D(filters, kernel_size, activation='relu')(embedding_layer)
cnn_branch = GlobalMaxPooling1D()(cnn_branch)

# Concatenate both branches
merged = Concatenate()([lstm_branch, cnn_branch])
output = Dense(1, activation='sigmoid')(merged)

# Create model
model = Model(inputs=inputs, outputs=output)

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, batch_size=batch_size, epochs=epochs, validation_split=0.2)

# Evaluate the model on test data
loss, accuracy = model.evaluate(X_test, y_test)
print('Test accuracy:', accuracy)
end_time = time.time()

print("Time required to fine-tune: ", end_time - start_time)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Test accuracy: 0.8880500197410583
Time required to fine-tune:  1002.8629622459412


10 epochs

In [8]:
import numpy as np
import pandas as pd
import time
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Embedding, LSTM, Conv1D, GlobalMaxPooling1D, Concatenate
from tensorflow.keras.initializers import Constant

# Record start time
start_time = time.time()
# Parameters for the model and training
max_features = 20000  # Number of words to consider as features
maxlen = 200  # Max length of individual reviews
embedding_dim = 300  # Dimension of word embeddings
lstm_units = 128  # Number of LSTM units
filters = 64  # Number of filters for CNN
kernel_size = 5  # Kernel size for CNN
batch_size = 32
epochs = 10

# Load train and test datasets from CSV files
train_data = pd.read_csv('/kaggle/input/plmdataset/train.csv/train.csv')
test_data = pd.read_csv('/kaggle/input/plmdataset/test.csv/test.csv')

# Preprocess train data
X_train = train_data['review'].values
y_train = train_data['sentiment'].values

# Preprocess test data
X_test = test_data['review'].values
y_test = test_data['sentiment'].values

# Convert labels to numerical format
y_train[y_train == 'negative'] = 0
y_train[y_train == 'positive'] = 1
y_train = y_train.astype(int)

y_test[y_test == 'negative'] = 0
y_test[y_test == 'positive'] = 1
y_test = y_test.astype(int)

# Tokenize and pad sequences
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(X_train)

X_train_tokenized = tokenizer.texts_to_sequences(X_train)
X_test_tokenized = tokenizer.texts_to_sequences(X_test)
X_train = pad_sequences(X_train_tokenized, maxlen=maxlen)
X_test = pad_sequences(X_test_tokenized, maxlen=maxlen)

# Load GloVe embeddings
embeddings_index = {}
with open('/kaggle/input/glove6/glove.6B.300d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

# Create embedding matrix
num_words = min(max_features, len(tokenizer.word_index) + 1)
embedding_matrix = np.zeros((num_words, embedding_dim))
for word, i in tokenizer.word_index.items():
    if i >= max_features:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

# Define the model using functional API
inputs = Input(shape=(maxlen,))
embedding_layer = Embedding(num_words, embedding_dim,
                            embeddings_initializer=Constant(embedding_matrix),
                            input_length=maxlen,
                            trainable=False)(inputs)

# LSTM branch
lstm_branch = LSTM(lstm_units, dropout=0.2, recurrent_dropout=0.2)(embedding_layer)

# CNN branch
cnn_branch = Conv1D(filters, kernel_size, activation='relu')(embedding_layer)
cnn_branch = GlobalMaxPooling1D()(cnn_branch)

# Concatenate both branches
merged = Concatenate()([lstm_branch, cnn_branch])
output = Dense(1, activation='sigmoid')(merged)

# Create model
model = Model(inputs=inputs, outputs=output)

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, batch_size=batch_size, epochs=epochs, validation_split=0.2)

# Evaluate the model on test data
loss, accuracy = model.evaluate(X_test, y_test)
print('Test accuracy:', accuracy)
end_time = time.time()

print("Time required to fine-tune: ", end_time - start_time)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test accuracy: 0.8853999972343445
Time required to fine-tune:  1885.5962204933167


20 epochs

In [9]:
import numpy as np
import pandas as pd
import time
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Embedding, LSTM, Conv1D, GlobalMaxPooling1D, Concatenate
from tensorflow.keras.initializers import Constant
import tensorflow as tf

# Record start time
start_time = time.time()

# Parameters for the model and training
max_features = 20000  # Number of words to consider as features
maxlen = 200  # Max length of individual reviews
embedding_dim = 300  # Dimension of word embeddings
lstm_units = 128  # Number of LSTM units
filters = 64  # Number of filters for CNN
kernel_size = 5  # Kernel size for CNN
batch_size = 32
epochs = 20  # Increase the number of epochs
early_stopping_patience = 3  # Early stopping patience

# Load train and test datasets from CSV files
train_data = pd.read_csv('/kaggle/input/plmdataset/train.csv/train.csv')
test_data = pd.read_csv('/kaggle/input/plmdataset/test.csv/test.csv')

# Preprocess train data
X_train = train_data['review'].values
y_train = train_data['sentiment'].values

# Preprocess test data
X_test = test_data['review'].values
y_test = test_data['sentiment'].values

# Convert labels to numerical format
y_train[y_train == 'negative'] = 0
y_train[y_train == 'positive'] = 1
y_train = y_train.astype(int)

y_test[y_test == 'negative'] = 0
y_test[y_test == 'positive'] = 1
y_test = y_test.astype(int)

# Tokenize and pad sequences
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(X_train)

X_train_tokenized = tokenizer.texts_to_sequences(X_train)
X_test_tokenized = tokenizer.texts_to_sequences(X_test)
X_train = pad_sequences(X_train_tokenized, maxlen=maxlen)
X_test = pad_sequences(X_test_tokenized, maxlen=maxlen)

# Load GloVe embeddings
embeddings_index = {}
with open('/kaggle/input/glove6/glove.6B.300d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

# Create embedding matrix
num_words = min(max_features, len(tokenizer.word_index) + 1)
embedding_matrix = np.zeros((num_words, embedding_dim))
for word, i in tokenizer.word_index.items():
    if i >= max_features:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

# Define the model using functional API
inputs = Input(shape=(maxlen,))
embedding_layer = Embedding(num_words, embedding_dim,
                            embeddings_initializer=Constant(embedding_matrix),
                            input_length=maxlen,
                            trainable=False)(inputs)

# LSTM branch
lstm_branch = LSTM(lstm_units, dropout=0.2, recurrent_dropout=0.2)(embedding_layer)

# CNN branch
cnn_branch = Conv1D(filters, kernel_size, activation='relu')(embedding_layer)
cnn_branch = GlobalMaxPooling1D()(cnn_branch)

# Concatenate both branches
merged = Concatenate()([lstm_branch, cnn_branch])
output = Dense(1, activation='sigmoid')(merged)

# Create model
model = Model(inputs=inputs, outputs=output)

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model with early stopping
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss', patience=early_stopping_patience, restore_best_weights=True
)

# Train the model with modified epochs and callbacks
model.fit(X_train, y_train, batch_size=batch_size, epochs=epochs, 
          validation_split=0.2, callbacks=[early_stopping])

# Evaluate the model on test data
loss, accuracy = model.evaluate(X_test, y_test)
print('Test accuracy:', accuracy)
end_time = time.time()

print("Time required to fine-tune: ", end_time - start_time)


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Test accuracy: 0.8873999714851379
Time required to fine-tune:  985.5678837299347


l2 regularizatiom

In [10]:
import numpy as np
import pandas as pd
import time
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Embedding, LSTM, Conv1D, GlobalMaxPooling1D, Concatenate, Dropout
from tensorflow.keras.initializers import Constant
from tensorflow.keras import regularizers

# Record start time
start_time = time.time()

# Parameters for the model and training
max_features = 20000  # Number of words to consider as features
maxlen = 200  # Max length of individual reviews
embedding_dim = 300  # Dimension of word embeddings
lstm_units = 128  # Number of LSTM units
filters = 64  # Number of filters for CNN
kernel_size = 5  # Kernel size for CNN
batch_size = 32
epochs = 5  # Increase the number of epochs
early_stopping_patience = 3  # Early stopping patience
dropout_rate = 0.2  # Dropout rate for regularization

# Load train and test datasets from CSV files
train_data = pd.read_csv('/kaggle/input/plmdataset/train.csv/train.csv')
test_data = pd.read_csv('/kaggle/input/plmdataset/test.csv/test.csv')

# Preprocess train data
X_train = train_data['review'].values
y_train = train_data['sentiment'].values

# Preprocess test data
X_test = test_data['review'].values
y_test = test_data['sentiment'].values

# Convert labels to numerical format
y_train[y_train == 'negative'] = 0
y_train[y_train == 'positive'] = 1
y_train = y_train.astype(int)

y_test[y_test == 'negative'] = 0
y_test[y_test == 'positive'] = 1
y_test = y_test.astype(int)

# Tokenize and pad sequences
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(X_train)

X_train_tokenized = tokenizer.texts_to_sequences(X_train)
X_test_tokenized = tokenizer.texts_to_sequences(X_test)
X_train = pad_sequences(X_train_tokenized, maxlen=maxlen)
X_test = pad_sequences(X_test_tokenized, maxlen=maxlen)

# Load GloVe embeddings
embeddings_index = {}
with open('/kaggle/input/glove6/glove.6B.300d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

# Create embedding matrix
num_words = min(max_features, len(tokenizer.word_index) + 1)
embedding_matrix = np.zeros((num_words, embedding_dim))
for word, i in tokenizer.word_index.items():
    if i >= max_features:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

# Define the model using functional API
inputs = Input(shape=(maxlen,))
embedding_layer = Embedding(num_words, embedding_dim,
                            embeddings_initializer=Constant(embedding_matrix),
                            input_length=maxlen,
                            trainable=False)(inputs)

# LSTM branch with dropout
lstm_branch = LSTM(lstm_units, dropout=dropout_rate, recurrent_dropout=dropout_rate,
                   kernel_regularizer=regularizers.l2(0.001))(embedding_layer)

# CNN branch with dropout
cnn_branch = Conv1D(filters, kernel_size, activation='relu')(embedding_layer)
cnn_branch = GlobalMaxPooling1D()(cnn_branch)
cnn_branch = Dropout(rate=dropout_rate)(cnn_branch)

# Concatenate both branches
merged = Concatenate()([lstm_branch, cnn_branch])
output = Dense(1, activation='sigmoid')(merged)

# Create model
model = Model(inputs=inputs, outputs=output)

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model with early stopping
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss', patience=early_stopping_patience, restore_best_weights=True
)

# Train the model with modified epochs and callbacks
model.fit(X_train, y_train, batch_size=batch_size, epochs=epochs, 
          validation_split=0.2, callbacks=[early_stopping])

# Evaluate the model on test data
loss, accuracy = model.evaluate(X_test, y_test)
print('Test accuracy:', accuracy)
end_time = time.time()

print("Time required to fine-tune: ", end_time - start_time)


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Test accuracy: 0.8726000189781189
Time required to fine-tune:  1000.0464890003204
