# Movie Recommendation System using Graph-based Methods
This notebook implements a complete movie recommendation system using graph-based approaches.

In [66]:
# Import required libraries
import networkx as nx
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
import torch.optim as optim
from datetime import datetime, timedelta
from collections import defaultdict
import random
from sklearn.preprocessing import StandardScaler
import os

In [67]:
# Global variables
GENRE_LIST = ['Action', 'Drama', 'Comedy', 'Romance', 'Thriller']
movie_data = {}
ACCUMULATION_STEPS = 4  # Define accumulation steps constant

## Graph Building and Feature Engineering

In [68]:
import networkx as nx
from datetime import datetime

def build_movie_graph(interactions):
    G = nx.Graph()
    for user_id, movie_id, rating, timestamp in interactions:
        # Add user node with type attribute
        user_node = f"user_{user_id}"
        G.add_node(user_node, type="user", last_active=timestamp)
        
        # Add movie node with type attribute
        movie_node = f"movie_{movie_id}"
        G.add_node(movie_node, type="movie")
        
        # Add or update edge
        if G.has_edge(user_node, movie_node):
            G[user_node][movie_node]['weight'] += 1
            G[user_node][movie_node]['last_interaction'] = timestamp
            G[user_node][movie_node]['ratings'].append(float(rating))
        else:
            G.add_edge(user_node, movie_node, 
                      weight=1, 
                      last_interaction=timestamp, 
                      ratings=[float(rating)])
    return G

def normalize_edge_weights(G):
    for edge in G.edges():
        ratings = G.edges[edge].get('ratings', [])
        if ratings:
            avg_rating = np.mean(ratings)
            G.edges[edge]['rating'] = avg_rating  # Store average rating
            G.edges[edge]['weight'] = avg_rating / 5.0  # Normalize to [0,1]
        else:
            G.edges[edge]['rating'] = 0
            G.edges[edge]['weight'] = 0



def engineer_user_features(G, user_data):
    user_features = {}
    for node, data in G.nodes(data=True):
        if data['type'] == 'user':
            user_id = node.split('_')[1]
            if user_id in user_data:  # Add check if user exists in user_data
                user_info = user_data[user_id]
                # Basic features
                features = [
                    float(user_info['age']),  # Convert to float
                    float(user_info['gender_encoded']),
                    float(user_info['location_encoded']),
                ]
                
                # Behavioral features
                # Get ratings directly from edges
                ratings = [G[node][edge].get('rating', 0) for edge in G[node]]
                avg_rating = np.mean(ratings) if ratings else 0
                rating_count = G.degree(node)
                features.extend([avg_rating, float(rating_count)])
                
                # Add genre preferences
                genre_preferences = calculate_genre_preferences(G, node)
                features.extend(genre_preferences)
                
                user_features[node] = np.array(features, dtype=np.float32)
    
    return user_features

def calculate_genre_preferences(G, user_node):
    genre_counts = {genre: 0 for genre in GENRE_LIST}
    total_ratings = 0
    
    # Iterate through neighboring movie nodes
    for neighbor in G[user_node]:
        if neighbor.startswith('movie_'):
            movie_id = neighbor.split('_')[1]
            if movie_id in movie_data:
                genres = movie_data[movie_id]['genres']
                rating = G[user_node][neighbor]['rating']
                for genre in genres:
                    if genre in genre_counts:
                        genre_counts[genre] += rating
                total_ratings += 1
    
    # Normalize genre preferences
    genre_preferences = []
    for genre in GENRE_LIST:
        if total_ratings > 0:
            genre_preferences.append(genre_counts[genre] / total_ratings)
        else:
            genre_preferences.append(0)
    
    return genre_preferences



## Loss Functions

In [69]:
def bce_loss(predictions, targets):
    return F.binary_cross_entropy_with_logits(predictions, targets)

def mse_loss(predictions, targets):
    return F.mse_loss(predictions, targets)

def bpr_loss(pos_scores, neg_scores):
    return -F.logsigmoid(pos_scores - neg_scores).mean()

def margin_ranking_loss(pos_scores, neg_scores, margin=0.5):
    return F.margin_ranking_loss(pos_scores, neg_scores, torch.ones_like(pos_scores), margin=margin)

def combined_loss(pred_ratings, true_ratings, pos_scores, neg_scores, alpha=0.5):
    # Convert inputs to floating point tensors if they aren't already
    pred_ratings = pred_ratings.float()
    true_ratings = true_ratings.float()
    pos_scores = pos_scores.float()
    neg_scores = neg_scores.float()
    
    rating_loss = mse_loss(pred_ratings, true_ratings)
    ranking_loss = bpr_loss(pos_scores, neg_scores)
    return alpha * rating_loss + (1 - alpha) * ranking_loss


def evaluate_model(model, val_graph):
    """
    Evaluate the model on validation graph and return validation loss
    
    Args:
        model: The GNN model
        val_graph: Validation graph (NetworkX graph)
        
    Returns:
        float: Average validation loss
    """
    model.eval()
    total_loss = 0
    batch_count = 0
    
    with torch.no_grad():
        for batch in generate_batches(val_graph, batch_size=32):
            user_nodes, pos_movie_nodes, neg_movie_nodes, ratings = batch
            
            # Get model predictions
            pred_ratings, pos_scores, neg_scores = model(
                val_graph, 
                user_nodes, 
                pos_movie_nodes, 
                neg_movie_nodes
            )
            
            # Calculate combined loss
            loss = combined_loss(
                pred_ratings,
                ratings,
                pos_scores,
                neg_scores
            )
            
            total_loss += loss.item()
            batch_count += 1
    
    # Return average loss
    return total_loss / max(1, batch_count)

## Training Functions

In [70]:
def generate_batches(graph, batch_size=2):
    for _ in range(3):  # Generate 3 dummy batches
        subgraph, user_nodes, pos_movies, neg_movies = create_mini_batch(
            graph, batch_size=batch_size, n_pos=2, n_neg=2
        )
        # Create ratings tensor with gradient computation enabled
        ratings = torch.rand(batch_size, 1, requires_grad=True)
        yield user_nodes, pos_movies, neg_movies, ratings

def train_model(model, train_graph, val_graph, optimizer, scheduler, epochs, patience=5):
    best_val_loss = float('inf')
    no_improve_count = 0
    
    for epoch in range(epochs):
        model.train()
        total_loss = 0
        optimizer.zero_grad()
        
        for batch_idx, batch in enumerate(generate_batches(train_graph)):
            user_nodes, pos_movie_nodes, neg_movie_nodes, ratings = batch
            pred_ratings, pos_scores, neg_scores = model(train_graph, user_nodes, pos_movie_nodes, neg_movie_nodes)
            
            loss = combined_loss(pred_ratings, ratings, pos_scores, neg_scores)
            loss = loss / ACCUMULATION_STEPS
            loss.backward()
            
            if (batch_idx + 1) % ACCUMULATION_STEPS == 0:
                torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
                optimizer.step()
                optimizer.zero_grad()
                
            total_loss += loss.item() * ACCUMULATION_STEPS
            
        avg_train_loss = total_loss / len(train_graph)
        val_loss = evaluate_model(model, val_graph)
        
        print(f"Epoch {epoch}, Train Loss: {avg_train_loss:.4f}, Val Loss: {val_loss:.4f}")
        scheduler.step(val_loss)
        
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            no_improve_count = 0
            torch.save(model.state_dict(), 'best_model.pth')
        else:
            no_improve_count += 1
            if no_improve_count >= patience:
                print(f"Early stopping after {epoch} epochs")
                break
                
    model.load_state_dict(torch.load('best_model.pth'))
    return model


def sample_neighbors(graph, nodes, n_neighbors, n_hops):
    sampled_nodes = set(nodes)
    for _ in range(n_hops):
        new_nodes = set()
        for node in sampled_nodes:
            neighbors = list(graph.neighbors(node))
            sampled = random.sample(neighbors, min(n_neighbors, len(neighbors)))
            new_nodes.update(sampled)
        sampled_nodes.update(new_nodes)
    return list(sampled_nodes)

def create_mini_batch(G, batch_size, n_pos=5, n_neg=5, n_neighbors=10, n_hops=2):
    # Get all user nodes
    all_user_nodes = [n for n in G.nodes() if n.startswith('user_')]
    if not all_user_nodes:
        raise ValueError("No user nodes found in graph")
    
    # Randomly select users
    user_nodes = random.sample(all_user_nodes, min(batch_size, len(all_user_nodes)))
    
    # Create subgraph
    subgraph = G.subgraph(user_nodes).copy()
    
    # Sample positive and negative movies
    pos_movies = []
    neg_movies = []
    all_movies = [n for n in G.nodes() if n.startswith('movie_')]
    
    for user in user_nodes:
        user_movies = [n for n in G[user] if n.startswith('movie_')]
        pos_sample = random.sample(user_movies, min(n_pos, len(user_movies))) if user_movies else []
        available_neg = list(set(all_movies) - set(user_movies))
        neg_sample = random.sample(available_neg, min(n_neg, len(available_neg))) if available_neg else []
        
        pos_movies.extend(pos_sample)
        neg_movies.extend(neg_sample)
    
    return subgraph, user_nodes, pos_movies, neg_movies


## Cold Start Solutions

In [71]:
def initialize_new_movie(movie_id, similar_movies, movie_embeddings):
    if not similar_movies:
        return np.zeros(next(iter(movie_embeddings.values())).shape)
    
    similar_embeddings = [movie_embeddings[m] for m in similar_movies if m in movie_embeddings]
    if not similar_embeddings:
        return np.zeros(next(iter(movie_embeddings.values())).shape)
    
    return np.mean(similar_embeddings, axis=0)

def create_initial_edges(user_id, preferences, movie_graph):
    for pref in preferences:
        similar_movies = [m for m in movie_graph.nodes if pref in movie_graph.nodes[m]['attributes']]
        for movie in similar_movies:
            movie_graph.add_edge(user_id, movie, weight=0.5)
    return movie_graph

## Main Execution

In [74]:
import networkx as nx
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import random
from datetime import datetime

# Sample Data Creation

def create_sample_data():
    # Sample interactions data
    interactions = [
        (1, 101, 4.5, datetime.now().timestamp()),
        (1, 102, 3.5, datetime.now().timestamp()),
        (2, 101, 5.0, datetime.now().timestamp()),
        (2, 103, 4.0, datetime.now().timestamp()),
        (3, 102, 3.0, datetime.now().timestamp()),
    ]
    
    # Sample user data
    user_data = {
        '1': {'age': 25, 'gender_encoded': 1, 'location_encoded': 2},
        '2': {'age': 30, 'gender_encoded': 0, 'location_encoded': 1},
        '3': {'age': 35, 'gender_encoded': 1, 'location_encoded': 3},
    }
    
    # Sample movie data
    global movie_data
    movie_data = {
        '101': {'genres': ['Action', 'Drama']},
        '102': {'genres': ['Comedy', 'Romance']},
        '103': {'genres': ['Action', 'Thriller']},
    }
    
    return interactions, user_data

# Simple Graph Neural Network Model for Testing
class SimpleGNN(nn.Module):
    def __init__(self, input_dim, hidden_dim):
        super(SimpleGNN, self).__init__()
        self.layer1 = nn.Linear(input_dim, hidden_dim)
        self.layer2 = nn.Linear(hidden_dim, 1)
        
    def forward(self, x):
        x = F.relu(self.layer1(x))
        return self.layer2(x)

# Test function
def test_recommendation_system():
    print("Starting recommendation system test...")
    
    # 1. Create sample data
    interactions, user_data = create_sample_data()
    print("\n1. Sample data created")
    
    # 2. Test graph building
    G = build_movie_graph(interactions)
    print("\n2. Graph built successfully")
    print(f"Number of nodes: {G.number_of_nodes()}")
    print(f"Number of edges: {G.number_of_edges()}")
    
    # 3. Test edge weight normalization
    normalize_edge_weights(G)
    print("\n3. Edge weights normalized")
    print("Sample edge weights:", next(iter(G.edges(data=True))))
    
    # 4. Test feature engineering
    user_features = engineer_user_features(G, user_data)
    print("\n4. User features engineered")
    print(f"Number of users with features: {len(user_features)}")
    if user_features:
        print(f"Feature vector size: {len(next(iter(user_features.values())))}")
    
    # 5. Test batch creation
    try:
        subgraph, user_nodes, pos_movies, neg_movies = create_mini_batch(
            G, batch_size=2, n_pos=2, n_neg=2, n_neighbors=2, n_hops=1
        )
        print("\n5. Mini-batch created")
        print(f"Subgraph nodes: {subgraph.number_of_nodes()}")
        print(f"Sample users: {user_nodes[:2]}")
    except Exception as e:
        print(f"\n5. Mini-batch creation failed: {str(e)}")
    
    # 6. Test model and loss functions
    if user_features:
        feature_size = len(next(iter(user_features.values())))
        model = SimpleGNN(input_dim=feature_size, hidden_dim=32)
        sample_predictions = torch.randn(5)
        sample_targets = torch.randn(5)
        sample_pos_scores = torch.randn(5)
        sample_neg_scores = torch.randn(5)
        
        loss = combined_loss(
            sample_predictions, 
            sample_targets,
            sample_pos_scores, 
            sample_neg_scores
        )
        print("\n6. Loss calculation tested")
        print(f"Combined loss value: {loss.item()}")
    
    # 7. Test cold start solutions
    new_movie_id = '104'
    similar_movies = ['101', '102']
    dummy_embeddings = {
        '101': np.array([0.1, 0.2, 0.3]),
        '102': np.array([0.2, 0.3, 0.4])
    }
    
    new_movie_embedding = initialize_new_movie(
        new_movie_id, 
        similar_movies,
        dummy_embeddings
    )
    print("\n7. Cold start solution tested")
    print(f"New movie embedding: {new_movie_embedding}")

if __name__ == "__main__":
    test_recommendation_system()

Starting recommendation system test...

1. Sample data created

2. Graph built successfully
Number of nodes: 6
Number of edges: 5

3. Edge weights normalized
Sample edge weights: ('user_1', 'movie_101', {'weight': 0.9, 'last_interaction': 1732915204.994279, 'ratings': [4.5], 'rating': 4.5})

4. User features engineered
Number of users with features: 3
Feature vector size: 10

5. Mini-batch created
Subgraph nodes: 2
Sample users: ['user_2', 'user_3']

6. Loss calculation tested
Combined loss value: 1.4735026359558105

7. Cold start solution tested
New movie embedding: [0.15 0.25 0.35]


In [80]:
def test_training():
    print("Testing model training...")
    
    # Create a simple GNN model for testing
    class SimpleGNN(nn.Module):
        def __init__(self, input_dim, hidden_dim):
            super(SimpleGNN, self).__init__()
            self.layer1 = nn.Linear(input_dim, hidden_dim)
            self.layer2 = nn.Linear(hidden_dim, 1)

        def forward(self, graph, user_nodes, pos_movies, neg_movies):
            # Simplified forward pass for testing
            batch_size = len(user_nodes)
            device = next(self.parameters()).device
            
            # Create tensors with proper shape and gradient tracking
            x = torch.randn(batch_size, self.layer1.in_features, device=device, requires_grad=True)
            
            # Generate predictions using the same input features
            pred_ratings = self.layer2(F.relu(self.layer1(x)))
            pos_scores = self.layer2(F.relu(self.layer1(x)))
            neg_scores = self.layer2(F.relu(self.layer1(x)))
            
            return pred_ratings, pos_scores, neg_scores

    # Create sample data
    interactions, user_data = create_sample_data()
    
    # Build and prepare graphs
    train_graph = build_movie_graph(interactions[:4])  # 80% for training
    val_graph = build_movie_graph(interactions[4:])    # 20% for validation
    
    # Initialize model and optimization components
    input_dim = 10  # Example dimension
    hidden_dim = 32
    model = SimpleGNN(input_dim, hidden_dim)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, mode='min', factor=0.5, patience=2, verbose=True
    )
    
    # Define batch generation function for testing
    def generate_batches(graph, batch_size=2):
        for _ in range(3):  # Generate 3 dummy batches
            subgraph, user_nodes, pos_movies, neg_movies = create_mini_batch(
                graph, batch_size=batch_size, n_pos=2, n_neg=2,
                n_neighbors=2, n_hops=1  # Add required parameters
            )
            ratings = torch.rand(batch_size, 1, requires_grad=True)  # Add requires_grad
            yield user_nodes, pos_movies, neg_movies, ratings
    
    # Test training
    print("\nStarting training...")
    try:
        trained_model = train_model(
            model=model,
            train_graph=train_graph,
            val_graph=val_graph,
            optimizer=optimizer,
            scheduler=scheduler,
            epochs=3,
            patience=2
        )
        print("Training completed successfully!")
        
        # Test model saving/loading
        if os.path.exists('best_model.pth'):
            print("Model checkpoint saved successfully")
        return trained_model, val_graph
            
    except Exception as e:
        print(f"Training failed with error: {str(e)}")

if __name__ == "__main__":
    trained_model, val_graph = test_training()

Testing model training...

Starting training...
Epoch 0, Train Loss: 0.2519, Val Loss: 0.5463
Epoch 1, Train Loss: 0.2960, Val Loss: 0.5100
Epoch 2, Train Loss: 0.3089, Val Loss: 0.4765
Training completed successfully!
Model checkpoint saved successfully


  return F.mse_loss(predictions, targets)
  model.load_state_dict(torch.load('best_model.pth'))


In [81]:
def test_model_predictions(trained_model, test_graph):
    print("Testing model predictions...")
    
    try:
        # Set model to evaluation mode
        trained_model.eval()
        
        # Generate test batch
        batch = next(generate_batches(test_graph, batch_size=2))
        user_nodes, pos_movies, neg_movies, actual_ratings = batch
        
        # Get predictions
        with torch.no_grad():
            pred_ratings, pos_scores, neg_scores = trained_model(
                test_graph, 
                user_nodes, 
                pos_movies, 
                neg_movies
            )
        
        # Print results
        print("\nTest Results:")
        print(f"Number of test users: {len(user_nodes)}")
        print(f"Predicted ratings shape: {pred_ratings.shape}")
        print(f"Sample predicted rating: {pred_ratings[0].item():.3f}")
        print(f"Sample actual rating: {actual_ratings[0].item():.3f}")
        print(f"Positive score: {pos_scores[0].item():.3f}")
        print(f"Negative score: {neg_scores[0].item():.3f}")
        
        # Calculate test loss
        test_loss = combined_loss(pred_ratings, actual_ratings, pos_scores, neg_scores)
        print(f"\nTest Loss: {test_loss.item():.4f}")
        
        return test_loss.item()
        
    except Exception as e:
        print(f"Testing failed with error: {str(e)}")
        return None

# Add this to the test_training() function after successful training:
if trained_model is not None:
    test_model_predictions(trained_model, val_graph)

Testing model predictions...

Test Results:
Number of test users: 1
Predicted ratings shape: torch.Size([1, 1])
Sample predicted rating: 0.105
Sample actual rating: 0.944
Positive score: 0.105
Negative score: 0.105

Test Loss: 0.5812


  return F.mse_loss(predictions, targets)
