In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import torch
from torch_geometric.data import Data
import torch.nn.functional as F
from torch_geometric.nn import GCNConv
from torch_geometric.loader import DataLoader
from sklearn.metrics import mean_squared_error, mean_absolute_error

In [3]:
# Load the dataset
url = "ratings.csv"
df = pd.read_csv(url, names=["userId", "productId", "Rating", "timestamp"])

df.shape


(7824482, 4)

In [241]:
from sklearn.utils import resample
# Separate each class into different DataFrames
dfs = [df[df['Rating'] == rating] for rating in df['Rating'].unique()]

# Determine the size of the smallest class
min_size = min(len(df_rating) for df_rating in dfs)

# Undersample each class to the size of the smallest class
undersampled_dfs = [resample(df_rating, replace=False, n_samples=min_size, random_state=42) for df_rating in dfs]

# Combine the undersampled DataFrames
undersampled_df = pd.concat(undersampled_dfs)

# Shuffle the combined DataFrame
undersampled_df = undersampled_df.sample(frac=0.01, random_state=42).reset_index(drop=True)

In [242]:
undersampled_df

Unnamed: 0,userId,productId,Rating,timestamp
0,A15R3LOMPYTM8U,B004HXKVXC,5.0,1324252800
1,A3EJVZ5LCBP61X,B004URBZ4O,4.0,1384732800
2,A3QS2WEQ2QXYD6,B007JYRW60,2.0,1386115200
3,ALONWZT1KWJDU,B00CH643A8,1.0,1388707200
4,AW4T7ZDZLK5KF,B0079XKYP2,2.0,1350432000
...,...,...,...,...
22811,AP4R6QS7DKMEW,B004M8SU0I,4.0,1398988800
22812,A1ZZNOZAEZ164Q,B0088LYCZC,3.0,1354838400
22813,A35OQCTA24K4I9,B001FWXDZQ,1.0,1318464000
22814,A2N9QDTH0239RD,B00A81SXHI,1.0,1397260800


In [243]:

# Clean the data
undersampled_df.dropna(inplace=True)
undersampled_df.drop_duplicates(inplace=True)

# Encode user IDs and item IDs
user_encoder = LabelEncoder()
item_encoder = LabelEncoder()

undersampled_df['userId'] = user_encoder.fit_transform(undersampled_df['userId'])
undersampled_df['productId'] = item_encoder.fit_transform(undersampled_df['productId'])

# Split the data into training, validation, and test sets
train_df, test_df = train_test_split(undersampled_df, test_size=0.2, random_state=42)
train_df, val_df = train_test_split(train_df, test_size=0.2, random_state=42)

# shape of the datasets
train_df.shape, val_df.shape, test_df.shape

((14601, 4), (3651, 4), (4564, 4))

In [270]:
# Create edge index from user-item interactions
edge_index = torch.tensor([train_df['userId'].values, train_df['productId'].values], dtype=torch.long)

# Create edge attributes (ratings)
edge_attr = torch.tensor(train_df['Rating'].values, dtype=torch.float)

# Create the PyTorch Geometric data object
data = Data(edge_index=edge_index, edge_attr=edge_attr)


edge_index_val = torch.tensor([val_df['userId'].values, val_df['productId'].values], dtype=torch.long)
edge_attr_val = torch.tensor(val_df['Rating'].values, dtype=torch.float)
data_val = Data(edge_index=edge_index_val, edge_attr=edge_attr_val)

edge_index_test = torch.tensor([test_df['userId'].values, test_df['productId'].values], dtype=torch.long)
edge_attr_test = torch.tensor(test_df['Rating'].values, dtype=torch.float)
data_test = Data(edge_index=edge_index_test, edge_attr=edge_attr_test)

# Display the data object
data

Data(edge_index=[2, 14601], edge_attr=[14601])

In [245]:
import torch
import torch.nn as nn
num_users = undersampled_df['userId'].nunique()
num_items = undersampled_df['productId'].nunique()


# Assuming `num_users` and `num_items` are already defined
num_nodes = num_users + num_items

# Create an embedding layer for node features
embedding_dim = 64  # You can adjust this dimension as needed
node_embedding = nn.Embedding(num_nodes, embedding_dim)

# Create node indices (0 to num_nodes-1)
node_indices = torch.arange(num_nodes)

# Generate node features using the embedding layer
node_features = node_embedding(node_indices)

# Add node features to the data object
data.x = node_features

# Display the updated data object
print(data)


Data(edge_index=[2, 14601], edge_attr=[14601], x=[38882, 64])


In [246]:
class GCN(torch.nn.Module):
    def __init__(self, in_channels, hidden_channels, out_channels):
        super(GCN, self).__init__()
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, hidden_channels)
        self.fc = torch.nn.Linear(hidden_channels * 2, out_channels)

    def forward(self, data):
        x, edge_index = data.x, data.edge_index
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = self.conv2(x, edge_index)
        x = F.relu(x)
        
        # Apply the final linear layer on the concatenated edge features
        edge_pred = self.fc(torch.cat([x[edge_index[0]], x[edge_index[1]]], dim=1))
        return edge_pred.squeeze()
    
#Initialize the model




In [247]:
import torch
from torch_geometric.data import DataLoader

# Assuming `data_list` contains your `Data` objects for training
data_list = [data]  # Ensure `data` is correctly formatted

# Define batch size and create DataLoader
batch_size = 32
train_loader = DataLoader(data_list, batch_size=batch_size, shuffle=True)




In [248]:
# model, loss function, and optimizer
model = GCN(in_channels=node_features.size(1), hidden_channels=16, out_channels=1) 
criterion = torch.nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
print(model)


GCN(
  (conv1): GCNConv(64, 16)
  (conv2): GCNConv(16, 16)
  (fc): Linear(in_features=32, out_features=1, bias=True)
)


In [249]:
import torch
import torch.nn.functional as F

def train_model(model, train_loader, optimizer, criterion, epochs=200):
    model.train()
    for epoch in range(epochs):
        epoch_loss = 0.0
        for batch in train_loader:
            optimizer.zero_grad()
            
            # Forward pass
            output = model(batch).squeeze()  # Assuming output is a single value per edge
            
            # Assuming batch.edge_attr is your target (ratings)
            target = batch.edge_attr.view(-1)
            
            # Print shapes for debugging
            print(f"Output shape: {output.shape}")
            print(f"Target shape: {target.shape}")

            # Check if output and target have the same number of elements
            if output.shape[0] != target.shape[0]:
                raise ValueError("Output and target have different sizes along dimension 0.")
            
            # Compute loss
            loss = criterion(output, target)
            
            # Backward pass
            loss.backward(retain_graph=True)  # Retain the graph for multiple backward passes
            
            # Optimization step
            optimizer.step()
            
            # Accumulate loss
            epoch_loss += loss.item()

            # Clean up
            del output, loss, target

        # Print epoch loss
        print(f'Epoch [{epoch+1}/{epochs}], Loss: {epoch_loss / len(train_loader):.4f}')

# Assuming `model`, `train_loader`, `optimizer`, and `criterion` are defined elsewhere
train_model(model, train_loader, optimizer, criterion, epochs= 200)
  





Output shape: torch.Size([14601])
Target shape: torch.Size([14601])
Epoch [1/200], Loss: 11.8372
Output shape: torch.Size([14601])
Target shape: torch.Size([14601])
Epoch [2/200], Loss: 10.1815
Output shape: torch.Size([14601])
Target shape: torch.Size([14601])
Epoch [3/200], Loss: 8.6200
Output shape: torch.Size([14601])
Target shape: torch.Size([14601])
Epoch [4/200], Loss: 7.1376
Output shape: torch.Size([14601])
Target shape: torch.Size([14601])
Epoch [5/200], Loss: 5.7616
Output shape: torch.Size([14601])
Target shape: torch.Size([14601])
Epoch [6/200], Loss: 4.5747
Output shape: torch.Size([14601])
Target shape: torch.Size([14601])
Epoch [7/200], Loss: 3.7246
Output shape: torch.Size([14601])
Target shape: torch.Size([14601])
Epoch [8/200], Loss: 3.3883
Output shape: torch.Size([14601])
Target shape: torch.Size([14601])
Epoch [9/200], Loss: 3.6145
Output shape: torch.Size([14601])
Target shape: torch.Size([14601])
Epoch [10/200], Loss: 4.0482
Output shape: torch.Size([14601])
Tar

In [252]:
# Evaluate function

import torch
from sklearn.metrics import mean_squared_error, mean_absolute_error

def evaluate_model(model, loader, criterion):
    model.eval()
    total_mse = 0.0
    total_mae = 0.0
    total_samples = 0

    with torch.no_grad():
        for batch in loader:
            output = model(batch).squeeze()
            target = batch.edge_attr.view(-1)
            
            # Ensure that output and target have the same number of elements
            if output.numel() != target.numel():
                raise ValueError("Output and target do not have the same number of elements.")
            
            mse = mean_squared_error(output.cpu().numpy(), target.cpu().numpy())
            mae = mean_absolute_error(output.cpu().numpy(), target.cpu().numpy())
            
            batch_size = batch.edge_index.size(1)  # Use edge_index size for batch size
            total_samples += batch_size
            
            total_mse += mse * batch_size
            total_mae += mae * batch_size

    avg_mse = total_mse / total_samples
    avg_mae = total_mae / total_samples

    print(f'Evaluation Metrics:')
    print(f'Mean Squared Error (MSE): {avg_mse:.4f}')
    print(f'Mean Absolute Error (MAE): {avg_mae:.4f}')
   

In [256]:
# Convert validation and test data to PyTorch Geometric format
val_edge_index = torch.tensor([val_df['userId'].values, val_df['productId'].values], dtype=torch.long)
val_edge_attr = torch.tensor(val_df['Rating'].values, dtype=torch.float)

test_edge_index = torch.tensor([test_df['userId'].values, test_df['productId'].values], dtype=torch.long)
test_edge_attr = torch.tensor(test_df['Rating'].values, dtype=torch.float)


# Create data objects for validation and test sets
val_data = Data(edge_index=val_edge_index, edge_attr=val_edge_attr, x=node_features)
test_data = Data(edge_index=test_edge_index, edge_attr=test_edge_attr, x=node_features)

# Evaluate the model
model.eval()
with torch.no_grad():
    val_out = model(val_data)
    test_out = model(test_data)

val_rmse = mean_squared_error(val_edge_attr.numpy(), val_out.numpy(), squared=False)
val_mae = mean_absolute_error(val_edge_attr.numpy(), val_out.numpy())

test_rmse = mean_squared_error(test_edge_attr.numpy(), test_out.numpy(), squared=False)
test_mae = mean_absolute_error(test_edge_attr.numpy(), test_out.numpy())

print(f'Validation RMSE: {val_rmse}, Validation MAE: {val_mae}')
print(f'Test RMSE: {test_rmse}, Test MAE: {test_mae}')

Validation RMSE: 1.4966999292373657, Validation MAE: 1.2986104488372803
Test RMSE: 1.4792537689208984, Test MAE: 1.2730622291564941




In [None]:
def generate_recommendations(model, data, train_df, user_id, top_n=5):
    model.eval()
    
    with torch.no_grad():
        # Forward pass through the model to get embeddings
        embeddings = model(data)
        
        # Separate user and product embeddings
        user_embedding = embeddings[user_id]
        product_embeddings = embeddings[train_df['productId'].unique() + data.x.size(0) // 2]
        
        # Calculate cosine similarity scores
        similarity_scores = F.cosine_similarity(user_embedding.unsqueeze(0), product_embeddings, dim=1)
        
        # Get top N recommended product IDs based on similarity scores
        top_indices = similarity_scores.argsort(descending=True)[:top_n]
        recommended_product_ids = train_df['productId'].unique()[top_indices]
        
        return recommended_product_ids

# Example usage
user_id = 3  # Replace with the user ID for whom you want recommendations
top_n = 5
recommended_products = generate_recommendations(model, data, train_df, user_id, top_n)
print(f'Recommended Product IDs for User {user_id}: {recommended_products}')
