In [1]:
from google.colab import drive

In [2]:
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
import pandas as pd

In [5]:
movies=pd.read_csv('/content/drive/My Drive/movies.csv')

In [5]:
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
import numpy as np

# Function to split data evenly based on user IDs
def split_data_evenly(data, test_size=0.2):
    test_data = pd.DataFrame()
    train_data = pd.DataFrame()

    # Group data by user ID
    grouped = data.groupby('userId')

    for user_id, group in grouped:
        # Split each user's reviews evenly
        n = len(group)
        test_indices = np.random.choice(group.index, size=int(test_size * n), replace=False)

        # Assign reviews to test dataset
        test_data = pd.concat([test_data, group.loc[test_indices]])

        # Assign remaining reviews to training dataset
        train_data = pd.concat([train_data, group.drop(test_indices)])

    return train_data, test_data

# Load data
movies=pd.read_csv('/content/drive/My Drive/movies.csv')
ratings = pd.read_csv('/content/drive/My Drive/ratings.csv')

# Merge ratings with movie titles
merged_data = pd.merge(ratings, movies, on='movieId')

# Convert genres into numerical features using one-hot encoding
genres = merged_data['genres'].str.split('|')
mlb = MultiLabelBinarizer()
genre_features = pd.DataFrame(mlb.fit_transform(genres), columns=mlb.classes_, index=merged_data.index)

# Concatenate genre features with ratings data
merged_data = pd.concat([merged_data, genre_features], axis=1)

# Split the data into training and testing sets
train_data, test_data = split_data_evenly(merged_data)

# Check GPU availability
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Define PyTorch dataset
class RecommendationDataset(Dataset):
    def __init__(self, data):
        self.user_ids = torch.tensor(data['userId'].values, dtype=torch.long).to(device)
        self.movie_ids = torch.tensor(data['movieId'].values, dtype=torch.long).to(device)
        self.genre_features = torch.tensor(data.iloc[:, 6:].values, dtype=torch.float32).to(device)
        self.ratings = torch.tensor(data['rating'].values, dtype=torch.float32).to(device)

    def __len__(self):
        return len(self.user_ids)

    def __getitem__(self, idx):
        return self.user_ids[idx], self.movie_ids[idx], self.genre_features[idx], self.ratings[idx]

# Create datasets and data loaders
train_dataset = RecommendationDataset(train_data)
test_dataset = RecommendationDataset(test_data)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64)

# Define model
class RecommendationModel(nn.Module):
    def __init__(self, num_users, num_movies, num_genres):
        super(RecommendationModel, self).__init__()
        self.user_embedding = nn.Embedding(num_users, 50).to(device)
        self.movie_embedding = nn.Embedding(num_movies, 50).to(device)
        self.genre_linear = nn.Linear(num_genres, 50).to(device)
        self.concat_layer = nn.Linear(150, 128).to(device)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.2)  # Add dropout for regularization
        self.output_layer = nn.Linear(128, 1).to(device)
        self.sigmoid = nn.Sigmoid().to(device)


    def forward(self, user_ids, movie_ids, genre_features):
        user_embedded = self.user_embedding(user_ids)
        movie_embedded = self.movie_embedding(movie_ids)
        genre_linear = self.relu(self.genre_linear(genre_features))
        concatenated = torch.cat((user_embedded, movie_embedded, genre_linear), dim=1)
        out = self.relu(self.concat_layer(concatenated))
        out = self.output_layer(out)
        out = self.sigmoid(out)  # Apply sigmoid activation function
        out = out * 5.0
        return out.squeeze()

# Instantiate model and define loss function and optimizer
model = RecommendationModel(ratings['userId'].nunique()+1, movies['movieId'].max()+1, len(mlb.classes_))
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters())

# Training loop
for epoch in range(19):
    model.train()
    running_loss = 0.0
    for user_ids, movie_ids, genre_features, ratings in train_loader:

        optimizer.zero_grad()
        outputs = model(user_ids, movie_ids, genre_features)
        loss = criterion(outputs, ratings)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    print(f"Epoch {epoch+1}, Loss: {running_loss / len(train_loader)}")

# Evaluation
model.eval()
test_loss = 0.0
with torch.no_grad():
    for user_ids, movie_ids, genre_features, ratings in test_loader:
        outputs = model(user_ids, movie_ids, genre_features)
        test_loss += criterion(outputs, ratings).item()
print(f"Test Loss: {test_loss / len(test_loader)}")


Using device: cuda:0
Epoch 1, Loss: 0.916826940389185
Epoch 2, Loss: 0.7906666464773537
Epoch 3, Loss: 0.7240812297957607
Epoch 4, Loss: 0.6741119217882051
Epoch 5, Loss: 0.6316693996232522
Epoch 6, Loss: 0.5932184168479487
Epoch 7, Loss: 0.5581033927258812
Epoch 8, Loss: 0.5242069705943518
Epoch 9, Loss: 0.49219530488399765
Epoch 10, Loss: 0.46183034732843503
Epoch 11, Loss: 0.4338382529566371
Epoch 12, Loss: 0.40615494346505476
Epoch 13, Loss: 0.380345976817174
Epoch 14, Loss: 0.35780590648845406
Epoch 15, Loss: 0.3356614008168631
Epoch 16, Loss: 0.31608858425170183
Epoch 17, Loss: 0.2969270701154689
Epoch 18, Loss: 0.2801703065280107
Epoch 19, Loss: 0.26477459866459235
Test Loss: 0.9945524416099756


In [6]:
def get_top5_recommendations(model, user_Id, movie_Ids, genre_features):
    # Prepare input tensors
    user_Ids = torch.tensor(np.array([user_Id] * len(movie_Ids)),dtype=torch.long).to(device)
    movie_Ids = torch.tensor(movie_Ids,dtype=torch.long).to(device)
    genre_features = torch.tensor(genre_features.values,dtype=torch.float32).to(device)  # Assuming genre_features is a tensor


    print("t")

    # Forward pass to get predictions
    model.eval()
    with torch.no_grad():
        predictions = model(user_Ids, movie_Ids, genre_features).squeeze()

    print("h")

    # Sort predicted ratings in descending order and get top 5 indices
    top5_indices = predictions.argsort(descending=True)[:5]

    # Get top 5 movie IDs and predicted ratings
    top5_movie_ids = movie_Ids[top5_indices]
    top5_ratings = predictions[top5_indices]

    return top5_movie_ids, top5_ratings




In [7]:
# Extract movie IDs from the preprocessed dataset
movie_ids = movies['movieId']

# Extract genre features from the preprocessed dataset # Assuming genre features start from the 7th column

# Print shapes of movie IDs and genre features
print("Shape of movie IDs:", movie_ids.shape)
print(type(movie_ids))

genre_features=movies['genres'].str.split('|')

genre_features = pd.DataFrame(mlb.fit_transform(genre_features), columns=mlb.classes_, index=movies.index)
print("Shape of genre features:", genre_features.shape)
print(type(genre_features))


# Example usage
user_id = 1  # User ID for which you want recommendations
# Assuming movie_ids and genre_features are available
top5_movie_ids, top5_ratings = get_top5_recommendations(model, user_id, movie_ids, genre_features)

# Print top 5 recommendations
print("Top 5 Recommendations for User", user_id)
for movie_id, rating in zip(top5_movie_ids, top5_ratings):
    print("Movie ID:", movie_id.item(), "Rating:", rating.item())


Shape of movie IDs: (9742,)
<class 'pandas.core.series.Series'>
Shape of genre features: (9742, 20)
<class 'pandas.core.frame.DataFrame'>
t
h
Top 5 Recommendations for User 1
Movie ID: 5490 Rating: 4.987458229064941
Movie ID: 7121 Rating: 4.95017147064209
Movie ID: 136445 Rating: 4.946697235107422
Movie ID: 102084 Rating: 4.943048477172852
Movie ID: 3473 Rating: 4.943009853363037
