In [None]:
# In-domain test
import pandas as pd
from transformers import BertTokenizer, BertModel
import torch
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Load the DVD review dataset.
df = pd.read_csv('/content/DVD11.csv')

# Keep only rows that have BOTH a usable review text and a label:
#   * a non-string 'review_body' (e.g. NaN from an empty CSV cell) cannot be
#     tokenized;
#   * a missing 'star_rating' would end up as NaN in `labels`, which is not a
#     valid class label for the classifier trained later on.
df = df[
    df['review_body'].apply(lambda x: isinstance(x, str))
    & df['star_rating'].notna()
]
texts = df['review_body'].tolist()
labels = df['star_rating'].tolist()  # the star rating is used as the class label

# Load pre-trained BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Function to get BERT embeddings
def get_bert_embeddings(text):
    """Return a mean-pooled BERT embedding for ``text`` as a NumPy array.

    The text is tokenized with the module-level ``tokenizer`` (truncated to at
    most 512 tokens), run through the module-level ``model`` without gradient
    tracking, and the last hidden state is averaged over the token axis.
    """
    encoded = tokenizer(
        text,
        return_tensors='pt',
        truncation=True,
        padding=True,
        max_length=512,
    )
    # Inference only: no autograd graph is needed for feature extraction.
    with torch.no_grad():
        hidden_states = model(**encoded).last_hidden_state
    pooled = hidden_states.mean(dim=1)
    return pooled.squeeze().numpy()

# Embed every review, one call per text
X = list(map(get_bert_embeddings, texts))

# Hold out 20% of the reviews for testing (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    labels,
    test_size=0.2,
    random_state=42,
)

# Fit a 100-tree Random Forest on the training embeddings
clf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Score the held-out reviews
y_pred = clf.predict(X_test)

# Report the fraction of test reviews whose rating was predicted exactly
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')


In [None]:
# After domain adaptation
import pandas as pd
from transformers import BertTokenizer, BertModel
import torch
from torch.utils.data import DataLoader, TensorDataset
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
import numpy as np

# Pick the compute device: CUDA when one is available, otherwise the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Pre-trained BERT tokenizer, and the encoder placed on the chosen device
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')
model = model.to(device)

# Function to get BERT embeddings
def get_bert_embeddings(texts):
    """Embed each text in ``texts`` with BERT and stack the results.

    Every text is encoded on its own (truncated to at most 512 tokens), passed
    through the module-level ``model`` on ``device`` without gradient tracking,
    and mean-pooled over the token axis. Returns a NumPy array built from the
    per-text vectors, in input order.
    """

    def _embed_one(text):
        encoded = tokenizer(
            text,
            return_tensors='pt',
            truncation=True,
            padding=True,
            max_length=512,
        ).to(device)
        with torch.no_grad():
            hidden_states = model(**encoded).last_hidden_state
        pooled = hidden_states.mean(dim=1).squeeze()
        # NumPy cannot read device memory, so bring the vector to the CPU first.
        return pooled.cpu().numpy()

    return np.array([_embed_one(text) for text in texts])

# Load book reviews dataset
books_df = pd.read_csv('/content/books_reviews.csv')

# Keep only rows that have BOTH a string review body and a star rating.
# Non-string bodies (e.g. NaN from empty CSV cells) cannot be tokenized, and a
# missing rating would put NaN into `books_labels`, which is not a valid class
# label for training or evaluation.
books_df = books_df[
    books_df['review_body'].apply(lambda x: isinstance(x, str))
    & books_df['star_rating'].notna()
]
books_texts = books_df['review_body'].tolist()
books_labels = books_df['star_rating'].tolist()

# Generate BERT embeddings for book reviews (one row per review)
books_X = get_bert_embeddings(books_texts)

# Split book data into train (80%) and test (20%) sets with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(books_X, books_labels, test_size=0.2, random_state=42)

# Define domain-invariant feature learning model
class DomainInvariantModel(torch.nn.Module):
    """Two independent linear heads applied to the same embedding input.

    ``sentiment_classifier`` maps an embedding to ``num_classes`` logits and
    ``domain_classifier`` maps the same embedding to 2 domain logits
    (book vs. electronics).
    """

    def __init__(self, bert_embedding_dim, num_classes):
        super().__init__()
        # Sentiment head: one logit per sentiment class
        self.sentiment_classifier = torch.nn.Linear(bert_embedding_dim, num_classes)
        # Domain head: 2 classes (book and electronics)
        self.domain_classifier = torch.nn.Linear(bert_embedding_dim, 2)

    def forward(self, x):
        """Return ``(sentiment_logits, domain_logits)`` computed from ``x``."""
        return self.sentiment_classifier(x), self.domain_classifier(x)

# CrossEntropyLoss expects class indices in [0, num_classes), but the labels
# are raw star ratings (presumably 1-5 -- TODO confirm), so a rating equal to
# num_classes would be out of range. Map every distinct rating in the book
# data to a contiguous index, and keep the table so that predicted indices
# can be converted back to ratings before scoring.
rating_classes = sorted(set(books_labels))
rating_to_index = {rating: index for index, rating in enumerate(rating_classes)}

# Initialize the model with one sentiment logit per distinct rating
domain_invariant_model = DomainInvariantModel(
    bert_embedding_dim=768,
    num_classes=len(rating_classes),
).to(device)

# Define loss function and optimizer
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(domain_invariant_model.parameters(), lr=1e-5)

# Build the loaders once instead of re-creating them on every epoch.
train_loader = DataLoader(
    TensorDataset(
        torch.tensor(X_train),
        torch.tensor([rating_to_index[rating] for rating in y_train], dtype=torch.long),
    ),
    batch_size=32,
    shuffle=True,
)
val_loader = DataLoader(
    TensorDataset(
        torch.tensor(X_test),
        torch.tensor([rating_to_index[rating] for rating in y_test], dtype=torch.long),
    ),
    batch_size=32,
    shuffle=False,
)

# Train on book reviews, validating on the held-out book split after each epoch
for epoch in range(3):
    # The validation pass below switches to eval mode; switch back every epoch.
    domain_invariant_model.train()
    for features, targets in train_loader:
        features, targets = features.to(device), targets.to(device)

        # Forward pass
        sentiment_logits, domain_logits = domain_invariant_model(features)
        sentiment_loss = criterion(sentiment_logits, targets)
        # All training samples are book reviews, so the domain target is 0.
        # NOTE(review): the domain head shares no parameters with the
        # sentiment head and only ever sees domain 0, so this term cannot
        # influence the sentiment predictions. Real domain adaptation would
        # need a shared feature layer and target-domain samples.
        domain_loss = criterion(domain_logits, torch.zeros_like(targets))
        total_loss = sentiment_loss + domain_loss

        # Backward pass
        optimizer.zero_grad()
        total_loss.backward()
        optimizer.step()

    # Evaluate on the held-out book reviews
    domain_invariant_model.eval()
    total_correct = 0
    with torch.no_grad():
        for features, targets in val_loader:
            features, targets = features.to(device), targets.to(device)

            sentiment_logits, _ = domain_invariant_model(features)
            predicted = sentiment_logits.argmax(dim=1)
            total_correct += (predicted == targets).sum().item()

    accuracy = total_correct / len(X_test)
    print(f'Epoch {epoch+1}, Val Accuracy: {accuracy:.4f}')

# Load electronics reviews dataset
electronics_df = pd.read_csv('/content/electronics_reviews.csv')

# Keep only rows with a string review body and a present star rating; a NaN
# rating is not a valid label for the metrics computed below.
electronics_df = electronics_df[
    electronics_df['review_body'].apply(lambda x: isinstance(x, str))
    & electronics_df['star_rating'].notna()
]
electronics_texts = electronics_df['review_body'].tolist()
electronics_labels = electronics_df['star_rating'].tolist()

# Generate BERT embeddings for electronics reviews
electronics_X = get_bert_embeddings(electronics_texts)

# Predict ratings for the electronics reviews with the book-trained model
domain_invariant_model.eval()
predictions = []
with torch.no_grad():
    for (features,) in DataLoader(TensorDataset(torch.tensor(electronics_X)), batch_size=32, shuffle=False):
        sentiment_logits, _ = domain_invariant_model(features.to(device))
        predicted = sentiment_logits.argmax(dim=1)
        # Convert class indices back to star ratings so they are comparable
        # with `electronics_labels`.
        predictions.extend(rating_classes[index] for index in predicted.cpu().tolist())

# Evaluate performance on out-of-domain (electronics reviews) data
electronics_accuracy = accuracy_score(electronics_labels, predictions)
electronics_f1 = f1_score(electronics_labels, predictions, average='weighted')
electronics_precision = precision_score(electronics_labels, predictions, average='weighted')
electronics_recall = recall_score(electronics_labels, predictions, average='weighted')

# Print all evaluation metrics
print("Domain Adaptation Performance (Books -> Electronics):")
print(f"Accuracy: {electronics_accuracy}")
print(f"F1 Score: {electronics_f1}")
print(f"Precision: {electronics_precision}")
print(f"Recall: {electronics_recall}")