In [1]:
import pandas as pd
import re
import torch
from transformers import BertTokenizer, BertModel, BertForSequenceClassification, AdamW, get_scheduler
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset

# Load and clean the dataset
train_file_path = '/kaggle/input/sentiment-analysis/DVD11.csv'  # Use your domain-specific file
train_data = pd.read_csv(train_file_path, nrows=20000)

# Clean text data
def clean_text(text):
    if isinstance(text, str):
        text = text.lower()
        text = re.sub(r'\s+', ' ', text)
        text = re.sub(r'[^\w\s]', '', text)
        return text
    return None

train_data['cleaned_review'] = train_data['review_body'].apply(clean_text)
train_data = train_data.dropna(subset=['cleaned_review'])

# Initialize BERT tokenizer
bert_tokenizer = BertTokenizer.from_pretrained('bert-large-cased')

# Tokenize and encode data
max_length = 128

def encode_review_bert(text):
    return bert_tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt'
    )

input_ids = []
attention_masks = []

for review in train_data['cleaned_review']:
    encoded_review = encode_review_bert(review)
    input_ids.append(encoded_review['input_ids'])
    attention_masks.append(encoded_review['attention_mask'])

# Convert lists to tensors
input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
labels = torch.tensor(train_data['star_rating'].values)

# Split data into training and validation sets
train_inputs, val_inputs, train_masks, val_masks, train_labels, val_labels = train_test_split(
    input_ids, attention_masks, labels, test_size=0.1, random_state=42
)

# Create DataLoader
batch_size = 16
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_dataloader = DataLoader(train_data, batch_size=batch_size, shuffle=True)

val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_dataloader = DataLoader(val_data, batch_size=batch_size)


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/762 [00:00<?, ?B/s]



In [2]:
# Load pre-trained BERT for sequence classification
bert_model = BertForSequenceClassification.from_pretrained('bert-large-cased', num_labels=5)
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
bert_model.to(device)

# Define optimizer and scheduler
optimizer = AdamW(bert_model.parameters(), lr=2e-5)
epochs = 3
num_training_steps = epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps
)

# Fine-tuning loop
from tqdm.auto import tqdm

progress_bar = tqdm(range(num_training_steps))
bert_model.train()

for epoch in range(epochs):
    for batch in train_dataloader:
        batch_input_ids, batch_masks, batch_labels = [b.to(device) for b in batch]
        outputs = bert_model(batch_input_ids, attention_mask=batch_masks, labels=batch_labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

    print(f"Epoch {epoch+1}/{epochs} completed. Loss: {loss.item()}")

# Save the fine-tuned BERT model
bert_model.save_pretrained('./fine_tuned_bert_model')
bert_tokenizer.save_pretrained('./fine_tuned_bert_model')


model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-large-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/2103 [00:00<?, ?it/s]

Epoch 1/3 completed. Loss: 0.1300184279680252
Epoch 2/3 completed. Loss: 0.806008517742157
Epoch 3/3 completed. Loss: 0.00026544384309090674


('./fine_tuned_bert_model/tokenizer_config.json',
 './fine_tuned_bert_model/special_tokens_map.json',
 './fine_tuned_bert_model/vocab.txt',
 './fine_tuned_bert_model/added_tokens.json')

In [3]:
import torch
from transformers import BertForSequenceClassification, BertTokenizer
from tqdm import tqdm
import pandas as pd

# Define device
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Define the directory where the model and tokenizer are saved
model_dir = './fine_tuned_bert_model'

# Load the fine-tuned BERT model
bert_model = BertForSequenceClassification.from_pretrained(model_dir)
bert_model.to(device)
bert_model.eval()

# Load the tokenizer from the same directory
bert_tokenizer = BertTokenizer.from_pretrained(model_dir)

# Access the underlying BERT model for embeddings
bert_base_model = bert_model.bert
bert_base_model.to(device)
bert_base_model.eval()

# Function to get BERT embeddings in batches
max_length = 128
batch_size = 32

def get_bert_embeddings_batch(batch_texts):
    encoded_reviews = bert_tokenizer.batch_encode_plus(
        batch_texts,
        add_special_tokens=True,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt'
    )

    input_ids = encoded_reviews['input_ids'].to(device)
    attention_mask = encoded_reviews['attention_mask'].to(device)

    with torch.no_grad():
        outputs = bert_base_model(input_ids, attention_mask=attention_mask)
        cls_embeddings = outputs.last_hidden_state[:, 0, :]  # Shape: [batch_size, hidden_dim]
        return cls_embeddings

# Reload the training data to get the 'cleaned_review' column
train_file_path = '/kaggle/input/sentiment-analysis/DVD11.csv'
train_data_df = pd.read_csv(train_file_path, nrows=20000) # Reload the dataframe

# Ensure 'review_body' is string and handle missing values
train_data_df['review_body'] = train_data_df['review_body'].astype(str).fillna('') # Use the DataFrame here

bert_embeddings = []

# Process in batches
for i in tqdm(range(0, len(train_data_df), batch_size), desc="Extracting BERT embeddings"): # Iterate over the DataFrame
    batch_reviews = train_data_df['review_body'][i:i+batch_size].tolist() #Extract review_body instead of cleaned_review
    embeddings = get_bert_embeddings_batch(batch_reviews)
    bert_embeddings.append(embeddings)

# Concatenate all embeddings
bert_embeddings = torch.cat(bert_embeddings, dim=0)  # Shape: [num_samples, hidden_dim]

# Optionally, convert embeddings to NumPy for further processing
bert_embeddings_np = bert_embeddings.cpu().numpy()

# Print the shape of the embeddings to verify
print(f"BERT Embeddings Shape: {bert_embeddings_np.shape}")


Extracting BERT embeddings:   0%|          | 0/390 [00:00<?, ?it/s][A
Extracting BERT embeddings:   1%|          | 2/390 [00:00<02:28,  2.62it/s][A
Extracting BERT embeddings:   1%|          | 3/390 [00:01<03:18,  1.95it/s][A
Extracting BERT embeddings:   1%|          | 4/390 [00:02<03:39,  1.76it/s][A
Extracting BERT embeddings:   1%|▏         | 5/390 [00:02<03:53,  1.65it/s][A
Extracting BERT embeddings:   2%|▏         | 6/390 [00:03<04:03,  1.57it/s][A
Extracting BERT embeddings:   2%|▏         | 7/390 [00:04<04:08,  1.54it/s][A
Extracting BERT embeddings:   2%|▏         | 8/390 [00:04<04:10,  1.52it/s][A
Extracting BERT embeddings:   2%|▏         | 9/390 [00:05<04:14,  1.50it/s][A
Extracting BERT embeddings:   3%|▎         | 10/390 [00:06<04:15,  1.49it/s][A
Extracting BERT embeddings:   3%|▎         | 11/390 [00:06<04:15,  1.48it/s][A
Extracting BERT embeddings:   3%|▎         | 12/390 [00:07<04:16,  1.48it/s][A
Extracting BERT embeddings:   3%|▎         | 13/390 [00:

BERT Embeddings Shape: (12450, 1024)





In [4]:
# Convert BERT embeddings to NumPy for SVM
bert_embeddings_np = bert_embeddings.cpu().numpy()
labels = train_data_df['star_rating'].values # Use train_data_df to access the star_rating column

# Split data into training and validation sets
train_inputs, val_inputs, train_labels, val_labels = train_test_split(
    bert_embeddings_np, labels, test_size=0.1, random_state=42
)

In [5]:
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

# Initialize SVM with standard scaling
svm_classifier = make_pipeline(StandardScaler(), SVC(kernel='linear', C=1.0))

# Train the SVM classifier
svm_classifier.fit(train_inputs, train_labels)


In [6]:
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score # Import the necessary functions from sklearn.metrics

# Initialize SVM with standard scaling
svm_classifier = make_pipeline(StandardScaler(), SVC(kernel='linear', C=1.0))

# Train the SVM classifier
svm_classifier.fit(train_inputs, train_labels)

# Make predictions on the validation set
val_preds = svm_classifier.predict(val_inputs)

# Calculate metrics for validation set
accuracy = accuracy_score(val_labels, val_preds)
precision = precision_score(val_labels, val_preds, average='macro')
recall = recall_score(val_labels, val_preds, average='macro')
f1 = f1_score(val_labels, val_preds, average='macro')

# Print metrics
print(f"Accuracy: {accuracy:.2f}, Precision: {precision:.2f}, Recall: {recall:.2f}, F1-score: {f1:.2f}")

Accuracy: 0.96, Precision: 0.91, Recall: 0.91, F1-score: 0.91


In [7]:
import pandas as pd
import torch
from torch.utils.data import DataLoader, Dataset
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from tqdm import tqdm

# Assuming you have already defined get_bert_embeddings_batch and loaded bert_base_model and svm_classifier

# Define a custom Dataset
class ReviewDataset(Dataset):
    def __init__(self, reviews):
        self.reviews = reviews

    def __len__(self):
        return len(self.reviews)

    def __getitem__(self, idx):
        return self.reviews[idx]

# Load test data
test_file_path = '/kaggle/input/sentiment-analysis/Books11.csv'  # Replace with your test file path
test_data = pd.read_csv(test_file_path, nrows=20000)

# Clean the test data
test_data = test_data.dropna(subset=['review_body'])

# Prepare DataLoader with a suitable batch size
batch_size = 32  # Adjust based on your GPU memory
dataset = ReviewDataset(test_data['review_body'].tolist())
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False)

test_embeddings = []
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

bert_base_model.to(device)
bert_base_model.eval()  # Set model to evaluation mode

with torch.no_grad():
    for batch in tqdm(dataloader, desc="Processing Batches"):
        # Get embeddings
        embeddings = get_bert_embeddings_batch(batch)  # Ensure this function handles batch inputs
        # Move embeddings to CPU
        embeddings = embeddings.cpu()
        test_embeddings.append(embeddings)
        # Optional: Clear cache to free up GPU memory
        torch.cuda.empty_cache()

# Concatenate all embeddings
test_embeddings = torch.cat(test_embeddings, dim=0)  # Shape: [num_samples, hidden_dim]
test_embeddings_np = test_embeddings.numpy()

# Use existing sentiment labels for the test set
test_labels = test_data['star_rating'].values

# Make predictions on the test set
test_preds = svm_classifier.predict(test_embeddings_np)

# Calculate metrics for test set
accuracy = accuracy_score(test_labels, test_preds)
precision = precision_score(test_labels, test_preds, average='macro')
recall = recall_score(test_labels, test_preds, average='macro')
f1 = f1_score(test_labels, test_preds, average='macro')

# Print metrics
print(f"Test Accuracy: {accuracy:.2f}, Test Precision: {precision:.2f}, Test Recall: {recall:.2f}, Test F1-score: {f1:.2f}")



Processing Batches:   0%|          | 0/625 [00:00<?, ?it/s][A
Processing Batches:   0%|          | 1/625 [00:01<10:35,  1.02s/it][A
Processing Batches:   0%|          | 2/625 [00:01<09:45,  1.06it/s][A
Processing Batches:   0%|          | 3/625 [00:02<09:10,  1.13it/s][A
Processing Batches:   1%|          | 4/625 [00:03<08:53,  1.16it/s][A
Processing Batches:   1%|          | 5/625 [00:04<08:49,  1.17it/s][A
Processing Batches:   1%|          | 6/625 [00:05<08:50,  1.17it/s][A
Processing Batches:   1%|          | 7/625 [00:06<08:49,  1.17it/s][A
Processing Batches:   1%|▏         | 8/625 [00:06<08:40,  1.18it/s][A
Processing Batches:   1%|▏         | 9/625 [00:07<08:48,  1.16it/s][A
Processing Batches:   2%|▏         | 10/625 [00:08<08:48,  1.16it/s][A
Processing Batches:   2%|▏         | 11/625 [00:09<08:42,  1.17it/s][A
Processing Batches:   2%|▏         | 12/625 [00:10<08:38,  1.18it/s][A
Processing Batches:   2%|▏         | 13/625 [00:11<08:28,  1.20it/s][A
Processin

Test Accuracy: 0.87, Test Precision: 0.69, Test Recall: 0.70, Test F1-score: 0.69
