In [366]:
%run 'Data Cleaning.ipynb'

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000000 entries, 0 to 2999999
Data columns (total 10 columns):
 #   Column              Non-Null Count    Dtype  
---  ------              --------------    -----  
 0   Id                  3000000 non-null  object 
 1   Title               2999792 non-null  object 
 2   Price               481171 non-null   float64
 3   User_id             2438213 non-null  object 
 4   profileName         2438095 non-null  object 
 5   review/helpfulness  3000000 non-null  object 
 6   review/score        3000000 non-null  float64
 7   review/time         3000000 non-null  int64  
 8   review/summary      2999593 non-null  object 
 9   review/text         2999992 non-null  object 
dtypes: float64(2), int64(1), object(7)
memory usage: 228.9+ MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000000 entries, 0 to 2999999
Data columns (total 6 columns):
 #   Column        Non-Null Count    Dtype  
---  ------        --------------    -----  
 0   Id  

In [367]:
import torch
import torch.nn as nn

class CollabFiltModel(nn.Module):
    def __init__(self, num_users, num_items, emb_size=100):
        super().__init__()
        self.user_emb = nn.Embedding(num_users, emb_size)
        self.item_emb = nn.Embedding(num_items, emb_size)
    
    def forward(self, user, item):
        user_emb = self.user_emb(user)
        item_emb = self.item_emb(item)
        return (user_emb * item_emb).sum(1)


In [368]:
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import Dataset, DataLoader

user_encoder = LabelEncoder()
item_encoder = LabelEncoder()

cleaned_df['User_id'] = user_encoder.fit_transform(cleaned_df['User_id'])
cleaned_df['Title'] = item_encoder.fit_transform(cleaned_df['Title'])

class ReviewDataset(Dataset):
    def __init__(self, users, items, ratings):
        self.users = torch.tensor(users, dtype=torch.int64)
        self.items = torch.tensor(items, dtype=torch.int64)
        self.ratings = torch.tensor(ratings, dtype=torch.float32)
    
    def __getitem__(self, idx):
        return self.users[idx], self.items[idx], self.ratings[idx]
    
    def __len__(self):
        return len(self.ratings)

dataset = ReviewDataset(cleaned_df['User_id'].values,
                        cleaned_df['Title'].values,
                        cleaned_df['review/score'].values)

dataloader = DataLoader(dataset, batch_size=512, shuffle=True)


  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


In [369]:
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
model = CollabFiltModel(num_users=cleaned_df['User_id'].nunique(),
                        num_items=cleaned_df['Title'].nunique()).to(device)
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
epoch_loss_dict = {}
for epoch in range(10):
    for users, items, ratings in dataloader:
        users, items, ratings = users.to(device), items.to(device), ratings.to(device)
        
        optimizer.zero_grad()
        predictions = model(users, items)
        loss = criterion(predictions, ratings)
        loss.backward()
        optimizer.step()
    
    print(f'Epoch {epoch+1}, Loss: {loss.item()}')
    epoch_loss_dict[epoch] = loss.item()

Epoch 1, Loss: 83.57991790771484
Epoch 2, Loss: 54.123512268066406
Epoch 3, Loss: 23.50264549255371
Epoch 4, Loss: 10.028059005737305
Epoch 5, Loss: 7.289210319519043
Epoch 6, Loss: 4.068905830383301
Epoch 7, Loss: 2.630927085876465
Epoch 8, Loss: 2.0325589179992676
Epoch 9, Loss: 1.885364294052124
Epoch 10, Loss: 1.9344913959503174


In [370]:
def recommend_items_for_user(model, user_id, user_encoder, item_encoder, num_recommendations=5):
    # Set model to evaluation mode
    model.eval()
    
    # Encode the user ID
    encoded_user_id = user_encoder.transform([user_id])
    
    # Create a tensor for the user ID
    user_tensor = torch.tensor(encoded_user_id, dtype=torch.int64).to(device)
    
    # Predict scores for all items for this user
    all_items = torch.tensor(range(len(item_encoder.classes_)), dtype=torch.int64).to(device)
    user = user_tensor.repeat(len(all_items))
    predictions = model(user, all_items).detach().cpu().numpy()
    # Get the top N recommendations; argsort returns indices of sorted array
    top_indices = predictions.argsort()[-num_recommendations:][::-1]
    # Decode the recommended item IDs
    recommended_items = item_encoder.inverse_transform(top_indices)
    top_ratings = predictions[top_indices]
    print(top_ratings)
    return recommended_items

user_id_to_recommend = 'AVCGYZL8FQQTD'  # Example user ID
recommended_items = recommend_items_for_user(model, user_id_to_recommend, user_encoder, item_encoder)
print("Recommended Items:", recommended_items)


[39.62539  37.94596  36.63533  36.460766 36.39388 ]
Recommended Items: ['The war of the worlds, The time machine, and selected short stories'
 'Reel Time: A Novel' 'Have You Thought Of Leonard Peltier Lately?'
 'TWELVE MEN' 'Girl Scout Handbook']


In [371]:
from sklearn.model_selection import train_test_split

train_df, test_df = train_test_split(cleaned_df, test_size=0.2, random_state=42)


In [372]:
train_dataset = ReviewDataset(train_df['User_id'].values, train_df['Title'].values, train_df['review/score'].values)
train_dataloader = DataLoader(train_dataset, batch_size=512, shuffle=True)

test_dataset = ReviewDataset(test_df['User_id'].values, test_df['Title'].values, test_df['review/score'].values)
test_dataloader = DataLoader(test_dataset, batch_size=512, shuffle=False)


In [391]:
model.eval()
total_squared_error = 0.0
total_absolute_error = 0.0
total_count = 0

with torch.no_grad():
    for users, items, ratings in test_dataloader:
        users, items, ratings = users.to(device), items.to(device), ratings.to(device)
        predictions = model(users, items)
        
        # Squared error for RMSE
        squared_error = (predictions - ratings) ** 2
        total_squared_error += squared_error.sum().item()
        # Absolute error for MAE
        absolute_error = torch.abs(predictions - ratings)
        total_absolute_error += absolute_error.sum().item()
        
        total_count += ratings.size(0)

# Calculate mean squared error and RMSE
mse = total_squared_error / total_count
rmse = torch.sqrt(torch.tensor(mse))

# Calculate mean absolute error
mae = total_absolute_error / total_count

print(f'RMSE: {rmse}')
print(f'MAE: {mae}')


RuntimeError: Placeholder storage has not been allocated on MPS device!

In [374]:
# Save the model state dictionary
torch.save(model.state_dict(), 'collab_filt_model_state_dict.pth')

# Save the encoders
import joblib
joblib.dump(user_encoder, 'user_encoder.joblib')
joblib.dump(item_encoder, 'item_encoder.joblib')

cleaned_df.to_csv('cleaned_df')


['item_encoder.joblib']

In [429]:
import torch
import torch.nn as nn
import torch.nn.utils.prune as prune
from copy import deepcopy
import os

In [436]:
import scipy.sparse as sp

def save_sparse_matrix(tensor, path):
    # Convert PyTorch tensor to a NumPy array
    tensor_np = tensor.detach().cpu().numpy()
    
    # Convert the NumPy array to a SciPy sparse matrix (CSR format)
    sparse_matrix = sp.csr_matrix(tensor_np)
    
    # Save the sparse matrix to disk
    sp.save_npz(path, sparse_matrix)

# Assuming pruned_model is your pruned model
save_sparse_matrix(model.user_emb.weight, "user_emb_sparse.npz")
save_sparse_matrix(model.item_emb.weight, "item_emb_sparse.npz")