# Reviews

In [13]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset, Dataset
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from collections import Counter
import re
import warnings
import matplotlib.pyplot as plt
import seaborn as sns
warnings.filterwarnings('ignore')

# GPU setup: allocator configuration, cache reset, and device selection.
import os
# Configure the CUDA caching allocator via environment variable.
# NOTE(review): torch is already imported above; this presumably still takes effect because
# no CUDA work has happened yet at this point — confirm against the PyTorch version in use.
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'
# Release cached GPU memory that may be left over from earlier runs in this kernel.
torch.cuda.empty_cache()

# Use the GPU when one is available, otherwise fall back to the CPU.
# Models and tensors in the cells below are moved to this device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

from transformers import BertTokenizer, BertForSequenceClassification
from torch.optim import AdamW as TorchAdamW  # Explicit full import
import torch.nn.functional as F

# =========================
# LOAD DATA
# =========================

# Read the raw review export into a DataFrame.
df = pd.read_csv("Appliances_Reviews.csv")

# Binary target: 1 when the star rating is above 3, otherwise 0.
df["sentiment"] = df["overall"].gt(3).astype(int)

# Model input: review body, a space, then the summary; missing values count as empty strings.
df["text"] = df["reviewText"].fillna("").add(" ").add(df["summary"].fillna(""))

def clean_text(text):
    """Normalize a raw review string before tokenization.

    The text is lower-cased, every run of whitespace (newlines, tabs,
    repeated spaces) is turned into a single space, and then every character
    that is not an ASCII letter or a space is removed.

    Args:
        text: Raw review text (a ``str``).

    Returns:
        The cleaned string, containing only lowercase ASCII letters and spaces.
    """
    text = text.lower()
    # Convert newlines/tabs to spaces first. Without this the filter below deletes
    # them outright and glues the neighbouring words together ("good\nproduct" -> "goodproduct").
    text = re.sub(r"\s+", " ", text)
    text = re.sub(r"[^a-zA-Z ]", "", text)
    return text

# Clean every review in place, one string at a time.
df["text"] = df["text"].map(clean_text)


In [10]:
# =========================
# LSTM PIPELINE: vocabulary, padded token sequences, data loaders
# =========================

# Tally word frequencies over the first 100 whitespace tokens of every review.
counter = Counter(
    token
    for review in df["text"]
    for token in review.split()[:100]
)

# Keep the 5000 most frequent words (word -> count).
vocab = dict(counter.most_common(5000))

# Ids 1..len(vocab) in frequency order; id 0 is used for both padding and unknown words.
word2idx = {word: rank for rank, word in enumerate(vocab, start=1)}

# One extra embedding row for id 0.
vocab_size = 1 + len(vocab)

def text_to_sequence(text):
    """Encode a cleaned review as a list of vocabulary ids.

    Only the first 100 whitespace-separated tokens are encoded. A token that
    is missing from the module-level ``word2idx`` mapping is encoded as 0.
    """
    token_ids = []
    for token in text.split()[:100]:
        token_ids.append(word2idx.get(token, 0))
    return token_ids


# Encode every review as a list of at most 100 vocabulary ids.
sequences = df["text"].apply(text_to_sequence)

# Pad with id 0 on the LEFT up to length 100. The LSTM classifier below reads only the
# output at the final timestep, so the real tokens must sit at the end of the sequence;
# right-padding would make that final step the result of a long run of padding ids.
X = np.array([[0] * (100 - len(seq)) + seq for seq in sequences])
y = df['sentiment'].values

# Hold out 20% of the reviews for testing (fixed seed for a reproducible split).
X_train, X_test, y_train, y_test = train_test_split(X, y,
test_size=0.2, random_state=42)

# Token ids must be int64 for nn.Embedding; targets are float for BCELoss.
X_train = torch.tensor(X_train, dtype=torch.long).to(device)
X_test  = torch.tensor(X_test, dtype=torch.long).to(device)
y_train = torch.tensor(y_train, dtype=torch.float32).to(device)
y_test  = torch.tensor(y_test, dtype=torch.float32).to(device)

train_loader = DataLoader(TensorDataset(X_train,y_train), batch_size=128, shuffle=True)
test_loader  = DataLoader(TensorDataset(X_test,y_test), batch_size=128)

In [11]:
# =========================
# LSTM MODEL
# =========================

class LSTMModel(nn.Module):
    """Single-layer LSTM binary classifier over integer token ids.

    Pipeline: embedding (128-d) -> LSTM (128 hidden units) -> output at the
    last timestep -> dropout -> linear layer -> sigmoid.

    The constructor does not place the module on a device; callers move the
    whole model with ``.to(device)`` after construction.

    Args:
        vocab_size: Number of rows in the embedding table (largest token id + 1).
    """

    def __init__(self,vocab_size):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, 128)
        # nn.LSTM's `dropout` argument only acts between stacked layers, so with a single
        # layer it does nothing; regularisation is applied through self.dropout instead.
        self.lstm = nn.LSTM(128, 128, batch_first=True)
        self.fc = nn.Linear(128, 1)
        self.dropout = nn.Dropout(0.3)

    def forward(self,x):
        """Return positive-class probabilities.

        Args:
            x: Integer tensor of token ids with shape (batch, seq_len).

        Returns:
            Float tensor of shape (batch,) with values in [0, 1].
        """
        x = self.embedding(x)
        x,_ = self.lstm(x)
        # Keep only the LSTM output at the final timestep.
        x = x[:,-1,:]
        x = self.dropout(x)
        x = self.fc(x)
        # squeeze(-1) drops only the feature axis, so a batch of one stays shape (1,)
        # and still matches the target shape expected by BCELoss.
        return torch.sigmoid(x.squeeze(-1))

# Build the classifier on the selected device together with its loss and optimizer.
lstm_model = LSTMModel(vocab_size).to(device)
criterion = nn.BCELoss()
lstm_optimizer = optim.Adam(lstm_model.parameters(), lr=0.001, weight_decay=0.01)

# Mean training loss of each epoch, in order.
lstm_losses = []

for epoch in range(8):
    lstm_model.train()
    batch_losses = []
    for xb, yb in train_loader:
        lstm_optimizer.zero_grad()
        loss = criterion(lstm_model(xb), yb)
        loss.backward()
        # Cap the global gradient norm at 1.0 before the parameter update.
        torch.nn.utils.clip_grad_norm_(lstm_model.parameters(), 1.0)
        lstm_optimizer.step()
        batch_losses.append(loss.item())
    # Average over the number of batches in the loader.
    lstm_losses.append(sum(batch_losses) / len(train_loader))

print("LSTM training complete")

LSTM training complete


In [2]:
# GPU EVALUATION FUNCTION
def evaluate_gpu(model, test_loader, is_bert=False):
    """Score a binary classifier on a test loader.

    Args:
        model: The model to evaluate. With ``is_bert=True`` it is called as
            ``model(input_ids, attention_mask=...)`` and its ``.logits`` are
            passed through a sigmoid; otherwise it is called as ``model(xb)``
            and its output is used directly as a probability.
        test_loader: Iterable of batches. With ``is_bert=True`` each batch is
            a dict with ``'input_ids'``, ``'attention_mask'`` and ``'labels'``;
            otherwise each batch is indexable as ``(inputs, labels)``.
        is_bert: Selects the batch format and output handling described above.

    Returns:
        Dict with ``'accuracy'``, ``'precision'``, ``'recall'``, ``'f1'``,
        plus the thresholded ``'predictions'`` (0/1 ints) and ``'true_labels'``
        as 1-D NumPy arrays.
    """
    model.eval()
    predictions, true_labels = [], []

    with torch.no_grad():
        for batch in test_loader:
            if is_bert:
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['labels']
                outputs = model(input_ids, attention_mask=attention_mask)
                probs = torch.sigmoid(outputs.logits)
            else:
                xb = batch[0].to(device)
                labels = batch[1]
                probs = model(xb)

            # reshape(-1) always yields a 1-D vector. A bare squeeze() turns a batch of
            # one into a 0-d array, which list.extend cannot iterate over.
            predictions.extend(probs.reshape(-1).cpu().numpy())
            true_labels.extend(labels.reshape(-1).cpu().numpy())

    # Threshold probabilities at 0.5 to obtain hard class predictions.
    predictions = (np.array(predictions) > 0.5).astype(int)
    true_labels = np.array(true_labels)

    return {
        'accuracy': accuracy_score(true_labels, predictions),
        'precision': precision_score(true_labels, predictions),
        'recall': recall_score(true_labels, predictions),
        'f1': f1_score(true_labels, predictions),
        'predictions': predictions,
        'true_labels': true_labels
    }

In [None]:


# EVALUATE BOTH
# This cell needs the BERT cell to have run first (it creates `bert_model`). That cell
# also rebinds `test_loader` to the BERT test DataLoader, therefore:
#   * the LSTM test loader is rebuilt here from X_test / y_test, and
#   * BERT is evaluated on `test_loader`, the name the BERT cell binds its test loader to.
lstm_test_loader = DataLoader(TensorDataset(X_test, y_test), batch_size=128)
lstm_results = evaluate_gpu(lstm_model, lstm_test_loader)
bert_results = evaluate_gpu(bert_model, test_loader, is_bert=True)

print(f"\n✅ GPU RESULTS:")
print(f"LSTM:  Acc={lstm_results['accuracy']:.3f} F1={lstm_results['f1']:.3f}")
print(f"BERT:  Acc={bert_results['accuracy']:.3f} F1={bert_results['f1']:.3f}")

In [None]:
# RESULTS TABLE
results_df = pd.DataFrame({
    'Model': ['LSTM (GPU)', 'BERT (GPU)'],
    'Accuracy': [lstm_results['accuracy'], bert_results['accuracy']],
    'Precision': [lstm_results['precision'], bert_results['precision']],
    'Recall': [lstm_results['recall'], bert_results['recall']],
    'F1-Score': [lstm_results['f1'], bert_results['f1']]
})
print("\n📊 FINAL GPU RESULTS")
print(results_df.round(4))

# GPU OPTIMIZED GRAPHS
fig, axes = plt.subplots(2, 2, figsize=(16, 12))

# Performance Comparison
metrics = ['Accuracy', 'Precision', 'Recall', 'F1-Score']
# Result-dict key for each display label. The keys are listed explicitly because
# 'F1-Score'.lower() is 'f1-score', while the result dicts store the value under 'f1'.
metric_keys = ['accuracy', 'precision', 'recall', 'f1']
x = np.arange(len(metrics))
width = 0.4
colors = ['#FF6B6B', '#4ECDC4']
axes[0,0].bar(x - width/2, [lstm_results[k] for k in metric_keys], width,
              label='LSTM', color=colors[0], alpha=0.8)
axes[0,0].bar(x + width/2, [bert_results[k] for k in metric_keys], width,
              label='BERT', color=colors[1], alpha=0.8)
axes[0,0].set_title('🚀 GPU Model Performance', fontsize=14, fontweight='bold')
axes[0,0].set_xticks(x)
axes[0,0].set_xticklabels(metrics, rotation=45)
axes[0,0].legend()

# Loss Curves
# Epoch numbers come from the length of each loss list, so the x and y sizes always match
# regardless of how many epochs each model was trained for.
axes[0,1].plot(range(1, len(lstm_losses) + 1), lstm_losses, 'o-', linewidth=3,
               label='LSTM', color=colors[0])
try:
    bert_loss_curve = list(bert_losses)
except NameError:
    # No per-epoch BERT losses were recorded in this kernel; draw the LSTM curve only.
    bert_loss_curve = []
    print("bert_losses is not defined; skipping the BERT loss curve")
if bert_loss_curve:
    axes[0,1].plot(range(1, len(bert_loss_curve) + 1), bert_loss_curve, 's-', linewidth=3,
                   label='BERT', color=colors[1])
axes[0,1].set_title('📈 Training Loss (GPU)', fontsize=14, fontweight='bold')
axes[0,1].set_xlabel('Epoch')
axes[0,1].legend()

# Confusion Matrices
sns.heatmap(confusion_matrix(lstm_results['true_labels'], lstm_results['predictions']),
            annot=True, fmt='d', cmap='Reds', ax=axes[1,0])
axes[1,0].set_title('LSTM Confusion Matrix')

sns.heatmap(confusion_matrix(bert_results['true_labels'], bert_results['predictions']),
            annot=True, fmt='d', cmap='Blues', ax=axes[1,1])
axes[1,1].set_title('BERT Confusion Matrix')

plt.tight_layout()
plt.savefig('gpu_model_comparison.png', dpi=300, bbox_inches='tight')
plt.show()

print("🎉 GPU Training Complete! Check 'gpu_model_comparison.png'")
torch.cuda.empty_cache()

In [None]:
# BERT MODEL PIPELINE
# Runs after the LSTM section: it reuses `df`, `device`, and the imports from the first cell.

from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import Dataset
import torch.nn.functional as F

class ReviewDataset(Dataset):
    """Map-style dataset that tokenizes one review per item.

    Args:
        texts: Review texts; a pandas Series or any positionally indexable sequence.
        labels: Target values aligned with ``texts`` by position.
        tokenizer: Object exposing ``encode_plus`` (e.g. a ``BertTokenizer``).
        max_len: Length every encoded sequence is padded/truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    @staticmethod
    def _item_at(values, idx):
        """Return the element at position ``idx`` for pandas objects and plain sequences."""
        # Series[...] looks up by index *label*; a sliced or shuffled Series no longer has
        # labels 0..n-1, so positional access must go through .iloc.
        return values.iloc[idx] if hasattr(values, "iloc") else values[idx]

    def __getitem__(self, idx):
        """Return a dict of ``input_ids``, ``attention_mask`` and a float ``labels`` tensor."""
        text = str(self._item_at(self.texts, idx))
        label = self._item_at(self.labels, idx)
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        return {
            # The tokenizer returns tensors with a leading batch axis of 1; flatten removes it.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.float)
        }

# Load tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# num_labels=1 gives a single output logit per review.
bert_model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=1)
bert_model = bert_model.to(device)

# Prepare datasets
# Same test_size / random_state as the LSTM split, so both models are scored on the
# same held-out reviews.
bert_train_texts, bert_test_texts, bert_train_labels, bert_test_labels = train_test_split(
    df["text"], df["sentiment"], test_size=0.2, random_state=42
)
# reset_index makes index labels equal to positions (0..n-1), so integer lookups inside
# the dataset hit the intended row whether they go by label or by position.
train_dataset = ReviewDataset(
    bert_train_texts.reset_index(drop=True),
    bert_train_labels.reset_index(drop=True),
    tokenizer,
)
test_dataset = ReviewDataset(
    bert_test_texts.reset_index(drop=True),
    bert_test_labels.reset_index(drop=True),
    tokenizer,
)

bert_train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
bert_test_loader = DataLoader(test_dataset, batch_size=16)

# The training/evaluation calls later in this cell use the generic names, so they are
# bound to the BERT loaders as well. Note that this replaces the LSTM loaders of the same name.
train_loader = bert_train_loader
test_loader = bert_test_loader

# Train function
def train_bert(model, train_loader, val_loader, epochs=3, loss_history=None):
    """Fine-tune a single-logit sequence classifier with binary cross-entropy.

    Args:
        model: Module called as ``model(input_ids, attention_mask=...)`` whose
            output exposes ``.logits`` with one logit per example.
        train_loader: Sized iterable of dict batches with ``'input_ids'``,
            ``'attention_mask'`` and ``'labels'`` (0/1 targets).
        val_loader: Accepted for interface compatibility; not used.
        epochs: Number of passes over ``train_loader``.
        loss_history: Optional list; the mean training loss of every epoch is
            appended to it, in order.

    Returns:
        The same ``model`` object, trained in place.
    """
    # Train on whatever device the model already lives on.
    device = next(model.parameters()).device
    optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
    criterion = nn.BCEWithLogitsLoss()

    for epoch in range(epochs):
        model.train()
        total_loss = 0.0
        for batch in train_loader:
            optimizer.zero_grad()
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            # Labels are deliberately NOT passed to the model: with num_labels=1 the
            # Hugging Face head would compute an MSE regression loss, whereas evaluation
            # applies a sigmoid and a 0.5 threshold. BCE-with-logits matches that.
            outputs = model(input_ids, attention_mask=attention_mask)
            # reshape(-1) keeps logits and targets 1-D even for a batch of one.
            loss = criterion(outputs.logits.reshape(-1), labels.reshape(-1).float())
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
        # max(..., 1) avoids a ZeroDivisionError on an empty loader.
        epoch_loss = total_loss / max(len(train_loader), 1)
        if loss_history is not None:
            loss_history.append(epoch_loss)
        print(f'Epoch {epoch+1}, Loss: {epoch_loss}')

    return model

# Evaluate function
def evaluate(model, test_loader):
    """Evaluate a single-logit sequence classifier and print its metrics.

    Args:
        model: Module called as ``model(input_ids, attention_mask=...)`` whose
            output exposes ``.logits`` with one logit per example.
        test_loader: Sized iterable of dict batches with ``'input_ids'``,
            ``'attention_mask'`` and ``'labels'``.

    Returns:
        Dict with ``'accuracy'``, ``'precision'``, ``'recall'`` and ``'f1'``,
        plus ``'loss'`` (mean BCE-with-logits loss per batch) and the 1-D
        NumPy arrays ``'predictions'`` (0/1 ints) and ``'true_labels'``.
    """
    model.eval()
    predictions = []
    true_labels = []
    total_loss = 0.0
    criterion = nn.BCEWithLogitsLoss()

    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device).reshape(-1)

            outputs = model(input_ids, attention_mask=attention_mask)
            # reshape(-1) gives one logit per example as a 1-D vector; a bare squeeze()
            # would produce a 0-d tensor for a batch of one and break the loss call.
            logits = outputs.logits.reshape(-1)
            loss = criterion(logits, labels)
            total_loss += loss.item()

            predictions.extend(torch.sigmoid(logits).cpu().numpy())
            true_labels.extend(labels.cpu().numpy())

    # Threshold probabilities at 0.5 to obtain hard class predictions.
    predictions = (np.array(predictions) > 0.5).astype(int)
    true_labels = np.array(true_labels)
    accuracy = accuracy_score(true_labels, predictions)
    precision = precision_score(true_labels, predictions)
    recall = recall_score(true_labels, predictions)
    f1 = f1_score(true_labels, predictions)

    print(f'Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {f1:.4f}')

    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        # max(..., 1) avoids a ZeroDivisionError on an empty loader.
        'loss': total_loss / max(len(test_loader), 1),
        'predictions': predictions,
        'true_labels': true_labels
    }

# Train the model
print("Training BERT...")
bert_model = train_bert(bert_model, train_loader, test_loader, epochs=3)
print("BERT training complete!")

# Evaluate
bert_results = evaluate(bert_model, test_loader)
if 'predictions' not in bert_results or 'true_labels' not in bert_results:
    # Only scalar metrics came back; collect the per-sample outputs that the
    # confusion matrix below needs, keeping the metrics already computed.
    bert_results = {**evaluate_gpu(bert_model, test_loader, is_bert=True), **bert_results}

# Real LSTM metrics instead of placeholder zeros. The loader is rebuilt from X_test / y_test
# because `test_loader` refers to the BERT test set inside this cell.
lstm_results = evaluate_gpu(
    lstm_model, DataLoader(TensorDataset(X_test, y_test), batch_size=128)
)

# Results table
results_df = pd.DataFrame({
    'Model': ['LSTM', 'BERT'],
    'Accuracy': [lstm_results['accuracy'], bert_results['accuracy']],
    'Precision': [lstm_results['precision'], bert_results['precision']],
    'Recall': [lstm_results['recall'], bert_results['recall']],
    'F1-Score': [lstm_results['f1'], bert_results['f1']]
})
print(results_df)

# Comparison bar chart
metrics = ['Accuracy', 'Precision', 'Recall', 'F1-Score']
# Result-dict key for each display label. The keys are listed explicitly because
# 'F1-Score'.lower() is 'f1-score', while the result dicts store the value under 'f1'.
metric_keys = ['accuracy', 'precision', 'recall', 'f1']
x = np.arange(len(metrics))
width = 0.35

fig, ax = plt.subplots(figsize=(10,6))
ax.bar(x - width/2, [lstm_results[k] for k in metric_keys], width, label='LSTM')
ax.bar(x + width/2, [bert_results[k] for k in metric_keys], width, label='BERT')

ax.set_ylabel('Scores')
ax.set_title('LSTM vs BERT Performance')
ax.set_xticks(x)
ax.set_xticklabels(metrics)
ax.legend()
plt.tight_layout()
plt.savefig('model_comparison.png')
plt.show()

# Confusion matrix for BERT
from sklearn.metrics import confusion_matrix
import seaborn as sns
# Labels and predictions are taken from the results dict; they are not module-level names.
cm = confusion_matrix(bert_results['true_labels'], bert_results['predictions'])
sns.heatmap(cm, annot=True, fmt='d')
plt.title('BERT Confusion Matrix')
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.savefig('bert_confusion.png')
plt.show()
