In [62]:
import pandas as pd

# Load user data
users = pd.read_csv('users.dat', sep='::', header=None, engine='python',
                    names=['user_id', 'gender', 'age', 'occupation', 'zip_code'])

# Load rating data
ratings = pd.read_csv('ratings.dat', sep='::', header=None, engine='python',
                      names=['user_id', 'movie_id', 'rating', 'timestamp'])

# Load movie data
movies = pd.read_csv('movies.dat', sep='::', header=None, engine='python',
                     names=['movie_id', 'title', 'genres'], encoding='latin-1')

# Display the first few rows of each dataframe
print("Users DataFrame:")
print(users.head())

print("\nRatings DataFrame:")
print(ratings.head())

print("\nMovies DataFrame:")
print(movies.head())

Users DataFrame:
   user_id gender  age  occupation zip_code
0        1      F    1          10    48067
1        2      M   56          16    70072
2        3      M   25          15    55117
3        4      M   45           7    02460
4        5      M   25          20    55455

Ratings DataFrame:
   user_id  movie_id  rating  timestamp
0        1      1193       5  978300760
1        1       661       3  978302109
2        1       914       3  978301968
3        1      3408       4  978300275
4        1      2355       5  978824291

Movies DataFrame:
   movie_id                               title                        genres
0         1                    Toy Story (1995)   Animation|Children's|Comedy
1         2                      Jumanji (1995)  Adventure|Children's|Fantasy
2         3             Grumpier Old Men (1995)                Comedy|Romance
3         4            Waiting to Exhale (1995)                  Comedy|Drama
4         5  Father of the Bride Part II (1995)   

In [63]:
import torch
from torch.utils.data import Dataset, DataLoader
# Assuming previous data loading steps

# Merge ratings with movies
data = pd.merge(ratings, movies, on='movie_id')

# Merge the result with users
data = pd.merge(data, users, on='user_id')

# Display the first few rows of the merged dataframe
print("Merged DataFrame:")
print(data.head())

# Encode user_id and movie_id as categorical data
data['user_id'] = data['user_id'].astype('category').cat.codes
data['movie_id'] = data['movie_id'].astype('category').cat.codes

# Convert the data into PyTorch tensors
class MovieLensDataset(Dataset):
    def __init__(self, data):
        self.user_ids = torch.tensor(data['user_id'].values, dtype=torch.long)
        self.movie_ids = torch.tensor(data['movie_id'].values, dtype=torch.long)
        self.ratings = torch.tensor(data['rating'].values, dtype=torch.float32)

    def __len__(self):
        return len(self.ratings)

    def __getitem__(self, idx):
        return self.user_ids[idx], self.movie_ids[idx], self.ratings[idx]

from sklearn.model_selection import train_test_split

# Split the data into training and test sets
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

train_dataset = MovieLensDataset(train_data)
test_dataset = MovieLensDataset(test_data)

from torch.utils.data import DataLoader

train_loader = DataLoader(dataset=train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=64, shuffle=False)

Merged DataFrame:
   user_id  movie_id  rating  timestamp  \
0        1      1193       5  978300760   
1        1       661       3  978302109   
2        1       914       3  978301968   
3        1      3408       4  978300275   
4        1      2355       5  978824291   

                                    title                        genres  \
0  One Flew Over the Cuckoo's Nest (1975)                         Drama   
1        James and the Giant Peach (1996)  Animation|Children's|Musical   
2                     My Fair Lady (1964)               Musical|Romance   
3                  Erin Brockovich (2000)                         Drama   
4                    Bug's Life, A (1998)   Animation|Children's|Comedy   

  gender  age  occupation zip_code  
0      F    1          10    48067  
1      F    1          10    48067  
2      F    1          10    48067  
3      F    1          10    48067  
4      F    1          10    48067  


In [46]:
#import torch.nn as nn

# class MatrixFactorization(nn.Module):
#     def __init__(self, num_users, num_items, num_factors):
#         super(MatrixFactorization, self).__init__()
#         self.user_factors = nn.Embedding(num_users, num_factors)
#         self.item_factors = nn.Embedding(num_items, num_factors)

#     def forward(self, user_ids, item_ids):
#         user_embedding = self.user_factors(user_ids)
#         item_embedding = self.item_factors(item_ids)
#         return (user_embedding * item_embedding).sum(1)

# num_users = data['user_id'].nunique()
# num_items = data['movie_id'].nunique()
# num_factors = 10

# model = MatrixFactorization(num_users, num_items, num_factors)

# class MatrixFactorizationWithBias(nn.Module):
#     def __init__(self, num_users, num_movies, num_factors):
#         super(MatrixFactorizationWithBias, self).__init__()
#         self.user_factors = nn.Embedding(num_users, num_factors)
#         self.movie_factors = nn.Embedding(num_movies, num_factors)
#         self.user_biases = nn.Embedding(num_users, 1)
#         self.movie_biases = nn.Embedding(num_movies, 1)
#         self.global_bias = nn.Parameter(torch.tensor([0.0]))

#     def forward(self, user_ids, movie_ids):
#         user_embedding = self.user_factors(user_ids)
#         movie_embedding = self.movie_factors(movie_ids)
#         user_bias = self.user_biases(user_ids).squeeze()
#         movie_bias = self.movie_biases(movie_ids).squeeze()
#         dot_product = (user_embedding * movie_embedding).sum(1)
#         return dot_product + user_bias + movie_bias + self.global_bias

# num_users = data['user_id'].nunique()
# num_items = data['movie_id'].nunique()
# num_factors = 10

# model = MatrixFactorizationWithBias(num_users, num_items, num_factors)

In [64]:
import torch.nn as nn

class NeuralCollaborativeFiltering(nn.Module):
    def __init__(self, num_users, num_movies, num_factors, hidden_units):
        super(NeuralCollaborativeFiltering, self).__init__()
        self.user_embedding = nn.Embedding(num_users, num_factors)
        self.movie_embedding = nn.Embedding(num_movies, num_factors)
        self.hidden_layers = nn.Sequential(
            nn.Linear(num_factors * 2, hidden_units),
            nn.ReLU(),
            nn.Linear(hidden_units, 1)
        )

    def forward(self, user_ids, movie_ids):
        user_embedding = self.user_embedding(user_ids)
        movie_embedding = self.movie_embedding(movie_ids)
        concatenated = torch.cat([user_embedding, movie_embedding], dim=1)
        output = self.hidden_layers(concatenated).squeeze()
        return output

In [65]:
import torch.optim as optim
import numpy as np

num_users = data['user_id'].nunique()
num_items = data['movie_id'].nunique()
num_factors = 10
hidden_units = 64
model = NeuralCollaborativeFiltering(num_users, num_items, num_factors, hidden_units)
optimizer = optim.Adam(model.parameters(), lr=0.01)
criterion = nn.MSELoss()
mae_criterion = nn.L1Loss()

num_epochs = 10
model.train()

for epoch in range(num_epochs):
    total_loss = 0
    for user_ids, movie_ids, ratings in train_loader:
        optimizer.zero_grad()
        predictions = model(user_ids, movie_ids)
        loss = criterion(predictions, ratings)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    print(f'Epoch {epoch+1}, Loss: {total_loss/len(train_loader)}')

# Evaluate the model
model.eval()
total_loss = 0
total_mae = 0
with torch.no_grad():
    for user_ids, movie_ids, ratings in test_loader:
        predictions = model(user_ids, movie_ids)
        mse_loss = criterion(predictions, ratings)
        mae_loss = mae_criterion(predictions, ratings)
        total_loss += mse_loss.item()
        total_mae += mae_loss.item()

rmse = np.sqrt(total_loss / len(test_loader))
mae = total_mae / len(test_loader)
print('\n')
print(f'Test RMSE: {rmse}')
print(f'Test MAE: {mae}')

Epoch 1, Loss: 0.9375603582647756
Epoch 2, Loss: 0.8529020523077239
Epoch 3, Loss: 0.8259112923505506
Epoch 4, Loss: 0.8108264342990864
Epoch 5, Loss: 0.7981639370545095
Epoch 6, Loss: 0.7873495619201493
Epoch 7, Loss: 0.7781775168723548
Epoch 8, Loss: 0.7708472349038651
Epoch 9, Loss: 0.7655142007480095
Epoch 10, Loss: 0.7610701152985262


Test RMSE: 0.9004214332266425
Test MAE: 0.7086951161853335


In [73]:
def recommend_for_user(model, user_id, num_recommendations=5):
    user_id_tensor = torch.tensor([user_id] * num_items)
    movie_id_tensor = torch.tensor(range(num_items))
    model.eval()
    with torch.no_grad():
        predictions = model(user_id_tensor, movie_id_tensor)
    top_movie_indices = torch.topk(predictions, num_recommendations).indices.numpy()

    # Get the recommended movie IDs
    recommended_movie_ids = movies['movie_id'].iloc[top_movie_indices].tolist()

    # Fetch movie details
    recommended_movies = movies[movies['movie_id'].isin(recommended_movie_ids)]

    return recommended_movies

# Example user ID
user_id = 100
recommendations = recommend_for_user(model, user_id)
print(f'Recommended movies for user {user_id}:\n{recommendations}')

Recommended movies for user 100:
      movie_id                                          title  \
628        633                         Denise Calls Up (1995)   
2423      2492                                20 Dates (1998)   
2557      2626                       Edge of Seventeen (1998)   
2931      3000  Princess Mononoke, The (Mononoke Hime) (1997)   
3669      3738                  Sugarland Express, The (1974)   

                          genres  
628                       Comedy  
2423                      Comedy  
2557        Comedy|Drama|Romance  
2931  Action|Adventure|Animation  
3669                       Drama  
