In [None]:
import pandas as pd
import re
import torch
from transformers import BertTokenizer, BertModel
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


In [None]:
# Training set: only the first 20,000 rows of the CSV are read.
train_file_path = '/content/DVD11.csv'  # Replace with your train file path
train_data = pd.read_csv(
    train_file_path,
    nrows=20000,
)

# Text cleaning function
def clean_text(text):
    """Normalise a raw review string before it is tokenised.

    The text is lowercased, every character that is neither a word
    character nor whitespace is removed, runs of whitespace are collapsed
    to a single space, and leading/trailing whitespace is trimmed.

    Args:
        text: The raw review. Anything that is not a ``str`` (for example
            the float NaN pandas uses for missing cells) is rejected.

    Returns:
        The cleaned string (possibly empty), or ``None`` when ``text`` is
        not a string so that callers can drop the row with ``dropna``.
    """
    if not isinstance(text, str):
        return None

    text = text.lower()
    # Punctuation must be stripped BEFORE whitespace is collapsed: removing a
    # punctuation mark that sits between two spaces ("good , movie") would
    # otherwise leave a double space behind.
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\s+', ' ', text)
    # Collapsing leaves at most one space at either end; trim it.
    return text.strip()

# Add the cleaned text as a new column, then discard rows for which
# clean_text returned None (i.e. review_body was not a string).
train_data = (
    train_data
    .assign(cleaned_review=train_data['review_body'].apply(clean_text))
    .dropna(subset=['cleaned_review'])
)


In [None]:
# Pick the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Tokenizer and bare encoder (no classification head) from the same checkpoint.
# NOTE(review): this is a *cased* checkpoint while clean_text lowercases its
# input -- confirm this pairing is intended.
bert_tokenizer = BertTokenizer.from_pretrained('bert-large-cased')
bert_model = BertModel.from_pretrained('bert-large-cased').to(device)

# Inference mode: disables dropout. Left as the last expression so the cell
# still displays the model architecture.
bert_model.eval()


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/762 [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(28996, 1024, padding_idx=0)
    (position_embeddings): Embedding(512, 1024)
    (token_type_embeddings): Embedding(2, 1024)
    (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-23): 24 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=1024, out_features=1024, bias=True)
            (key): Linear(in_features=1024, out_features=1024, bias=True)
            (value): Linear(in_features=1024, out_features=1024, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=1024, out_features=1024, bias=True)
            (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, 

In [None]:
max_length = 128  # every review is truncated/padded to this many tokens

def get_bert_embeddings(text):
    """Return the final-layer [CLS] embedding for ``text``.

    The text is tokenised with ``bert_tokenizer`` (special tokens added,
    truncated and padded to ``max_length``), run through ``bert_model``
    with gradient tracking disabled, and the hidden state at position 0
    (the [CLS] token) is returned.

    Args:
        text: A single review string. A list of strings is also accepted
            and is tokenised as one batch.

    Returns:
        A tensor on ``device`` with one row per input text: position 0 of
        ``last_hidden_state`` for each sequence.
    """
    # Call the tokenizer directly: ``encode_plus`` is documented as
    # deprecated in favour of ``__call__``, which takes the same arguments.
    encoded_review = bert_tokenizer(
        text,
        add_special_tokens=True,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )

    input_ids = encoded_review['input_ids'].to(device)
    attention_mask = encoded_review['attention_mask'].to(device)

    # Feature extraction only, so no autograd graph is needed.
    with torch.no_grad():
        outputs = bert_model(input_ids, attention_mask=attention_mask)

    # Position 0 of every sequence is the [CLS] token.
    return outputs.last_hidden_state[:, 0, :]

# Embed every cleaned review one at a time and stack the per-review rows
# into a single tensor.  Shape: [num_samples, hidden_dim]
bert_embeddings = torch.cat(
    [get_bert_embeddings(review) for review in train_data['cleaned_review']],
    dim=0,
)


In [None]:
# sklearn works on NumPy arrays, so bring the embeddings back to host memory.
bert_embeddings_np = bert_embeddings.cpu().numpy()

# Targets are the existing star_rating column
# (adjust if necessary for binary classification).
labels = train_data['star_rating'].values

# Hold out 10% of the samples for validation; the fixed seed keeps the
# split reproducible.
(train_inputs, val_inputs,
 train_labels, val_labels) = train_test_split(
    bert_embeddings_np,
    labels,
    test_size=0.1,
    random_state=42,
)


In [None]:
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

# Pipeline: standardise each embedding dimension, then a linear-kernel SVM.
svm_classifier = make_pipeline(
    StandardScaler(),
    SVC(C=1.0, kernel='linear'),
)

# Fit scaler and classifier on the training split only.
svm_classifier.fit(train_inputs, train_labels)


In [None]:
# Predict the held-out validation split.
val_preds = svm_classifier.predict(val_inputs)

# Binary-classification metrics (adjust `average` for multi-class labels).
# NOTE(review): average='binary' requires two-class labels -- this assumes
# star_rating has already been reduced to two classes; verify against the data.
accuracy = accuracy_score(val_labels, val_preds)
precision, recall, f1 = [
    score_fn(val_labels, val_preds, average='binary')
    for score_fn in (precision_score, recall_score, f1_score)
]

# Report all four metrics on one line.
print(f"Accuracy: {accuracy:.2f}, Precision: {precision:.2f}, Recall: {recall:.2f}, F1-score: {f1:.2f}")


Accuracy: 0.91, Precision: 0.94, Recall: 0.95, F1-score: 0.95


In [None]:
# Test set: the first 20,000 rows of a different product category.
test_file_path = '/content/Books11.csv'  # Replace with your test file path
test_data = pd.read_csv(test_file_path, nrows=20000)

# Same cleaning as the training data; rows whose review_body is not a
# string come back as None from clean_text and are dropped.
test_data = (
    test_data
    .assign(cleaned_review=test_data['review_body'].apply(clean_text))
    .dropna(subset=['cleaned_review'])
)

# Embed each cleaned test review and stack the rows.
# Shape: [num_samples, hidden_dim]
test_embeddings = torch.cat(
    [get_bert_embeddings(review) for review in test_data['cleaned_review']],
    dim=0,
)

# NumPy copy on the host for scikit-learn.
test_embeddings_np = test_embeddings.cpu().numpy()

# Ground truth for the test set comes from the existing star_rating column.
test_labels = test_data['star_rating'].values

# Score the test embeddings with the already-fitted pipeline.
test_preds = svm_classifier.predict(test_embeddings_np)

# Binary-classification metrics.
# NOTE(review): average='binary' requires two-class labels -- this assumes
# star_rating has already been reduced to two classes; verify against the data.
accuracy = accuracy_score(test_labels, test_preds)
precision, recall, f1 = [
    score_fn(test_labels, test_preds, average='binary')
    for score_fn in (precision_score, recall_score, f1_score)
]

# Report all four metrics on one line.
print(f"Test Accuracy: {accuracy:.2f}, Test Precision: {precision:.2f}, Test Recall: {recall:.2f}, Test F1-score: {f1:.2f}")


Test Accuracy: 0.86, Test Precision: 0.91, Test Recall: 0.92, Test F1-score: 0.92
