In [None]:
import numpy as np
import pandas as pd

# Read the cleaned Amazon book-review CSV from the Kaggle input directory.
data = pd.read_csv(
    filepath_or_buffer='/kaggle/input/amzon-book-review-clean/clean_data_amazon_book_review.csv',
    index_col=False,
)

In [None]:
# Preview the first five rows of the loaded table.
data.head(n=5)

Unnamed: 0.1,Unnamed: 0,title,user_id,rating_review,rating,review_summary,review,authors,publisher,genre,ratings
0,0,The Church of Christ: A Biblical Ecclesiology ...,ARI272XF8TOL4,74/81,5.0,ecclesiological milestone,with the publication of everett ferguson's boo...,['everett ferguson'],wm. b. eerdmans publishing,['religion'],5.0
1,1,The Church of Christ: A Biblical Ecclesiology ...,A36TPZSH8LBT1,2/3,5.0,early christian development of the church,everett ferguson approaches the subject of ear...,['everett ferguson'],wm. b. eerdmans publishing,['religion'],5.0
2,2,The Church of Christ: A Biblical Ecclesiology ...,ANX3DDV12ZRRU,2/3,1.0,an excellent presentation of the beliefs of th...,this book is a continual resource. it is so bi...,['everett ferguson'],wm. b. eerdmans publishing,['religion'],5.0
3,3,The Church of Christ: A Biblical Ecclesiology ...,A2H2LORTA5EZY2,3/5,4.0,christ is lord,this is a very useful and thorough text book. ...,['everett ferguson'],wm. b. eerdmans publishing,['religion'],5.0
4,4,Voices from the Farm: Adventures in Community ...,A3W1KIKQ93S62,21/21,5.0,excellent perspective on communal living and h...,"ironically, i grew up in a small town close to...",['rupert fike'],book publishing company,['biography & autobiography'],1.0


In [None]:
# Training subset: the first 10,000 rows, selected by position.
train_df = data.iloc[:10000]

In [None]:
# Held-out subset: rows 25,000 (inclusive) to 40,000 (exclusive), selected by position.
test_df = data.iloc[25000:40000]

In [None]:
import torch
from transformers import BertModel, BertTokenizer
from torch.utils.data import DataLoader, TensorDataset

# Load the pre-trained BERT tokenizer and encoder used to embed text columns.
# The encoder has its own name (`bert_model`) because a later cell rebinds the
# global `model` to the Recommender network; sharing that name made
# encode_text fail whenever it was called after that cell had run.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased')
bert_model.eval()  # inference only: keep dropout disabled

def encode_text(text):
    """Embed text with BERT using attention-masked mean pooling.

    Args:
        text: A string, or a list of strings encoded together as one padded batch.

    Returns:
        A 2-D tensor with one row per input text (a single row for one string)
        and one column per hidden unit of the encoder.
    """
    inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True)
    with torch.no_grad():
        outputs = bert_model(**inputs)
    hidden = outputs.last_hidden_state
    # Average over real tokens only. For a single string the mask is all ones,
    # so this equals a plain mean; for a padded batch it keeps padding tokens
    # from diluting the embedding.
    mask = inputs['attention_mask'].unsqueeze(-1).to(hidden.dtype)
    token_counts = mask.sum(dim=1).clamp(min=1)
    return (hidden * mask).sum(dim=1) / token_counts

def encode_array(text_array):
    """Return the element-wise average of the embeddings of every text in text_array."""
    per_text = []
    for item in text_array:
        per_text.append(encode_text(item))
    stacked = torch.stack(per_text)
    return stacked.mean(dim=0)

def _parse_text_list(value):
    """Return value as a non-empty list of strings, or None if it is not one.

    Accepts real list/tuple objects as well as their string form
    (e.g. "['a', 'b']"), which is how list cells look after a CSV round trip.
    """
    import ast  # local import: only needed to parse CSV-serialised list cells

    if isinstance(value, str):
        stripped = value.strip()
        if not (stripped.startswith('[') and stripped.endswith(']')):
            return None
        try:
            value = ast.literal_eval(stripped)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            # Bracketed free text that is not a Python literal: treat as plain text.
            return None
    if isinstance(value, (list, tuple)) and value and all(isinstance(v, str) for v in value):
        return list(value)
    return None


def encode_dataset(df, columns):
    """Embed the requested columns of df, one embedding row per table row.

    Cells that hold a list of strings (either as a Python list or as its
    string representation) are embedded item by item and averaged with
    encode_array; every other cell is embedded directly with encode_text.
    The decision is made per cell rather than from the first row only.

    Args:
        df: Table containing the columns to encode.
        columns: Names of the columns to encode.

    Returns:
        Dict mapping each column name to the vertically stacked embeddings of
        that column.

    Raises:
        ValueError: If a requested column has no rows to encode.
    """
    embeddings = {}
    for column in columns:
        print(f"Encoding {column}...")
        rows = []
        for value in df[column]:
            items = _parse_text_list(value)
            if items is None:
                rows.append(encode_text(value))
            else:
                rows.append(encode_array(items))
        if not rows:
            raise ValueError(f"Column {column!r} has no rows to encode")
        embeddings[column] = torch.vstack(rows)
    return embeddings

# Text columns to embed; encode_dataset returns one stacked tensor per column.
encoded_columns = [
    'review',
    'review_summary',
    'authors',
    'publisher',
    'genre',
]
embeddings = encode_dataset(df=train_df, columns=encoded_columns)

In [None]:
# Width of one concatenated feature row, taking the 'genre' width as the width of every column.
len(encoded_columns) * embeddings['genre'].shape[1]

3840

In [None]:
import torch.nn as nn
import torch.optim as optim

class Recommender(nn.Module):
    """Feed-forward network mapping concatenated feature tensors to one output value."""

    def __init__(self, input_dim):
        super().__init__()
        # Attribute names are kept so saved state_dict keys stay the same.
        self.fc1 = nn.Linear(in_features=input_dim, out_features=128)
        self.fc2 = nn.Linear(in_features=128, out_features=64)
        self.fc3 = nn.Linear(in_features=64, out_features=1)

    def forward(self, *inputs):
        """Concatenate inputs along the last axis and run them through the three layers."""
        hidden = torch.cat(inputs, dim=-1)
        # ReLU follows the two hidden layers; the output layer stays linear.
        for layer in (self.fc1, self.fc2):
            hidden = torch.relu(layer(hidden))
        return self.fc3(hidden)

# The network consumes the per-column embeddings concatenated along the
# feature axis, so its input width is the sum of the individual embedding
# widths. Summing stays correct even if the columns do not share one width.
input_dim = sum(embeddings[col].size(1) for col in encoded_columns)
model = Recommender(input_dim)


In [None]:
# Combine embeddings for training
def get_combined_embeddings(df, embeddings, columns):
    """Concatenate the per-column embeddings of every row of df into one matrix.

    Args:
        df: Table (or any sized object) whose length is the number of rows to combine.
        embeddings: Mapping from column name to a 2-D tensor with one row per example.
        columns: Column names to concatenate, in order.

    Returns:
        Tensor with len(df) rows whose width is the sum of the selected
        embedding widths. An empty df yields a tensor with zero rows.

    Raises:
        IndexError: If an embedding tensor has fewer rows than df.
    """
    n_rows = len(df)
    parts = []
    for col in columns:
        tensor = embeddings[col]
        if tensor.size(0) < n_rows:
            raise IndexError(
                f"embeddings[{col!r}] has {tensor.size(0)} rows but df has {n_rows}"
            )
        parts.append(tensor[:n_rows])
    # One concatenation along the feature axis replaces the per-row Python loop.
    return torch.cat(parts, dim=1)

# Feature matrix (concatenated column embeddings) and rating targets as an (N, 1) column.
X = get_combined_embeddings(train_df, embeddings, encoded_columns)
y = torch.tensor(train_df['rating'].values, dtype=torch.float32).unsqueeze(1)

# Split data into training and validation sets
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Training parameters
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
num_epochs = 10
batch_size = 32

# Training loop with validation
for epoch in range(num_epochs):
    model.train()
    permutation = torch.randperm(X_train.size(0))  # fresh shuffle each epoch
    epoch_loss = 0.0  # running sum of per-sample squared errors

    for i in range(0, X_train.size(0), batch_size):
        optimizer.zero_grad()

        indices = permutation[i:i+batch_size]
        batch_x, batch_y = X_train[indices], y_train[indices]

        outputs = model(batch_x)
        loss = criterion(outputs, batch_y)

        loss.backward()
        optimizer.step()

        # criterion returns the batch mean; weight it by the batch size so that
        # dividing by the sample count below gives a true per-sample MSE that
        # is on the same scale as the validation loss.
        epoch_loss += loss.item() * batch_x.size(0)

    # Validation
    model.eval()
    with torch.no_grad():
        val_outputs = model(X_val)
        val_loss = criterion(val_outputs, y_val)

    train_loss = epoch_loss / X_train.size(0)
    print(f'Epoch {epoch+1}/{num_epochs}, Training Loss: {train_loss}, Validation Loss: {val_loss.item()}')

# Save the model
torch.save(model.state_dict(), 'recommender_model.pth')

# Example prediction
model.eval()
with torch.no_grad():
    example_idx = 0
    # One 1-D embedding per column; the model concatenates them itself.
    example_inputs = [embeddings[col][example_idx] for col in encoded_columns]
    prediction = model(*example_inputs)
    print(f'Predicted rating: {prediction.item()}, Actual rating: {y[example_idx].item()}')


In [None]:
# Split data into training and validation sets
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# NOTE: this cell reuses `model`, `criterion`, `optimizer`, `num_epochs` and
# `batch_size` from an earlier cell and does not re-create the model, so it
# continues training from the model's current weights.

# Training loop with validation
for epoch in range(num_epochs):
    model.train()
    permutation = torch.randperm(X_train.size(0))  # fresh shuffle each epoch
    epoch_loss = 0.0  # running sum of per-sample squared errors

    for i in range(0, X_train.size(0), batch_size):
        optimizer.zero_grad()

        indices = permutation[i:i+batch_size]
        batch_x, batch_y = X_train[indices], y_train[indices]

        outputs = model(batch_x)
        loss = criterion(outputs, batch_y)

        loss.backward()
        optimizer.step()

        # criterion returns the batch mean; weight it by the batch size so that
        # dividing by the sample count below gives a true per-sample MSE that
        # is on the same scale as the validation loss.
        epoch_loss += loss.item() * batch_x.size(0)

    # Validation
    model.eval()
    with torch.no_grad():
        val_outputs = model(X_val)
        val_loss = criterion(val_outputs, y_val)

    train_loss = epoch_loss / X_train.size(0)
    print(f'Epoch {epoch+1}/{num_epochs}, Training Loss: {train_loss}, Validation Loss: {val_loss.item()}')

# Example prediction
model.eval()
with torch.no_grad():
    example_idx = 0
    # One 1-D embedding per column; the model concatenates them itself.
    example_inputs = [embeddings[col][example_idx] for col in encoded_columns]
    prediction = model(*example_inputs)
    print(f'Predicted rating: {prediction.item()}, Actual rating: {y[example_idx].item()}')


In [None]:
# Combine embeddings for training
def get_combined_embeddings(df, embeddings, columns):
    """Concatenate the per-column embeddings of every row of df into one matrix.

    Args:
        df: Table (or any sized object) whose length is the number of rows to combine.
        embeddings: Mapping from column name to a 2-D tensor with one row per example.
        columns: Column names to concatenate, in order.

    Returns:
        Tensor with len(df) rows whose width is the sum of the selected
        embedding widths. An empty df yields a tensor with zero rows.

    Raises:
        IndexError: If an embedding tensor has fewer rows than df.
    """
    n_rows = len(df)
    parts = []
    for col in columns:
        tensor = embeddings[col]
        if tensor.size(0) < n_rows:
            raise IndexError(
                f"embeddings[{col!r}] has {tensor.size(0)} rows but df has {n_rows}"
            )
        parts.append(tensor[:n_rows])
    # One concatenation along the feature axis replaces the per-row Python loop.
    return torch.cat(parts, dim=1)

# Feature matrix (concatenated column embeddings) and rating targets as an (N, 1) column.
X = get_combined_embeddings(train_df, embeddings, encoded_columns)
y = torch.tensor(train_df['rating'].values, dtype=torch.float32).unsqueeze(1)

# Split data into training and validation sets
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, precision_score, recall_score, accuracy_score
import matplotlib.pyplot as plt

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Training parameters
# NOTE: only the optimizer is re-created here; `model` is not re-initialised in
# this cell, so training continues from whatever weights it currently holds.
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
num_epochs = 10
batch_size = 32

# Rounded predictions are clamped to the observed label range so that
# out-of-range values do not add classes that never occur in the labels
# (which would drag down the macro-averaged precision and recall).
rating_min = y.round().min().item()
rating_max = y.round().max().item()

# Lists to store metrics
train_losses, val_losses = [], []
rmse_scores, precisions, recalls, accuracies, hit_ratios = [], [], [], [], []

# Training loop with validation
for epoch in range(num_epochs):
    model.train()
    permutation = torch.randperm(X_train.size(0))  # fresh shuffle each epoch
    epoch_loss = 0.0  # running sum of per-sample squared errors

    for i in range(0, X_train.size(0), batch_size):
        optimizer.zero_grad()

        indices = permutation[i:i+batch_size]
        batch_x, batch_y = X_train[indices], y_train[indices]

        outputs = model(batch_x)
        loss = criterion(outputs, batch_y)

        loss.backward()
        optimizer.step()

        # criterion returns the batch mean; weight it by the batch size so that
        # dividing by the sample count below gives a true per-sample MSE that
        # is on the same scale as the validation loss.
        epoch_loss += loss.item() * batch_x.size(0)

    # Validation
    model.eval()
    with torch.no_grad():
        val_outputs = model(X_val)
        val_loss = criterion(val_outputs, y_val)

        # Calculate additional metrics
        # RMSE is the square root of the validation MSE computed just above;
        # this avoids sklearn's `squared=False` argument, which newer
        # scikit-learn releases no longer accept.
        rmse = torch.sqrt(val_loss).item()

        # Classification-style metrics treat each rounded rating as a class label.
        y_val_rounded = y_val.round()
        val_outputs_rounded = val_outputs.round().clamp(min=rating_min, max=rating_max)
        # sklearn expects 1-D label arrays.
        true_labels = y_val_rounded.squeeze(1).long().numpy()
        pred_labels = val_outputs_rounded.squeeze(1).long().numpy()
        precision = precision_score(true_labels, pred_labels, average='macro', zero_division=0)
        recall = recall_score(true_labels, pred_labels, average='macro', zero_division=0)
        accuracy = accuracy_score(true_labels, pred_labels)
        # Fraction of exact matches; as defined here it coincides with accuracy.
        hit_ratio = (val_outputs_rounded == y_val_rounded).float().mean().item()

    train_losses.append(epoch_loss / X_train.size(0))
    val_losses.append(val_loss.item())
    rmse_scores.append(rmse)
    precisions.append(precision)
    recalls.append(recall)
    accuracies.append(accuracy)
    hit_ratios.append(hit_ratio)

    print(f'Epoch {epoch+1}/{num_epochs}, Training Loss: {train_losses[-1]}, Validation Loss: {val_losses[-1]}, RMSE: {rmse:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, Accuracy: {accuracy:.4f}, Hit Ratio: {hit_ratio:.4f}')

# Save the model
torch.save(model.state_dict(), 'recommender_model.pth')



In [None]:
# One figure per entry: (title, y-axis label, [(legend label, per-epoch values), ...]).
# Order matches the display order: losses, RMSE, then the classification-style metrics.
epoch_axis = range(num_epochs)
figure_specs = [
    ('Training and Validation Loss', 'Loss',
     [('Training Loss', train_losses), ('Validation Loss', val_losses)]),
    ('Root Mean Squared Error', 'RMSE',
     [('RMSE', rmse_scores)]),
    ('Evaluation Metrics', 'Score',
     [('Precision', precisions), ('Recall', recalls),
      ('Accuracy', accuracies), ('Hit Ratio', hit_ratios)]),
]

for fig_title, y_label, curves in figure_specs:
    plt.figure(figsize=(10, 5))
    for curve_label, curve_values in curves:
        plt.plot(epoch_axis, curve_values, label=curve_label)
    plt.xlabel('Epochs')
    plt.ylabel(y_label)
    plt.title(fig_title)
    plt.legend()
    plt.show()

# Print final metrics (value recorded for the last epoch of each series)
final_metrics = [
    ('RMSE', rmse_scores),
    ('Precision', precisions),
    ('Recall', recalls),
    ('Accuracy', accuracies),
    ('Hit Ratio', hit_ratios),
]
for metric_name, metric_values in final_metrics:
    print(f'Final {metric_name}: {metric_values[-1]:.4f}')
