# Amazon Product Recommendation System Using Multi-Modal Learning

## Data Science Project for Amazon Internship

**Author:** Soha Farhana  
**Date:** July 14, 2025  
**Project:** Multi-modal recommendation system combining text, image, and behavioral data

### Project Overview

This notebook implements a comprehensive recommendation system that leverages:
- **Product metadata** (text descriptions, categories)
- **Customer behavior data** (browsing history, purchase patterns)
- **Visual features** (product images)
- **Temporal patterns** (seasonal trends, recent interests)

### Business Value
- Improves conversion rates through better recommendations
- Addresses cold-start problems for new products/customers
- Provides explainable AI with feature importance
- Enables A/B testing against existing systems

### Technical Approach
- **Multi-modal fusion architecture** combining collaborative and content-based filtering
- **BERT embeddings** for text processing
- **CNN features** for image analysis
- **Temporal attention** for sequential patterns
- **Production-ready API** with Docker deployment

## 1. Project Setup and Data Loading

Setting up the development environment and generating a synthetic, Amazon-style sample dataset (products, users, and reviews).

In [None]:
# Install required libraries (run this if packages are not installed)
# !pip install torch torchvision transformers scikit-learn pandas numpy matplotlib seaborn plotly
# !pip install sentence-transformers faiss-cpu tqdm pyyaml mlflow flask

# Import essential libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Machine Learning libraries
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error

# NLP and Computer Vision
from transformers import AutoTokenizer, AutoModel
import torchvision.models as models
import torchvision.transforms as transforms

# Utilities
import warnings
import os
import sys
import json
import yaml
from datetime import datetime, timedelta
from typing import Dict, List, Tuple, Any, Optional
import logging

# Configure settings
# Silence every warning for the rest of the session so library notices do not
# clutter the cell output.
warnings.filterwarnings('ignore')
# NOTE(review): the 'seaborn-v0_8' style name presumably needs a recent
# matplotlib release — confirm against the pinned version.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# Setup logging
# Root logger at INFO level, plus a module-level logger for this notebook.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Set random seeds for reproducibility
# Seeds PyTorch and NumPy's global generator; Python's built-in `random`
# module is not seeded in this cell.
torch.manual_seed(42)
np.random.seed(42)

# Environment report: library version and whether a CUDA device is visible.
print("✅ All libraries imported successfully!")
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
print(f"Device: {'cuda' if torch.cuda.is_available() else 'cpu'}")

In [None]:
# Create comprehensive sample Amazon dataset
def create_sample_amazon_data(n_products=1000, n_users=5000, n_reviews=15000,
                              output_dir='../data', seed=42):
    """Create a synthetic Amazon-style product, user and review dataset.

    The three tables are generated with NumPy's global random generator and
    written as CSV files to ``<output_dir>/sample``. The ``raw`` and
    ``processed`` sub-directories are created as well, but left empty.

    Args:
        n_products: Number of product rows to generate.
        n_users: Number of user rows to generate.
        n_reviews: Number of review (interaction) rows to generate.
        output_dir: Root directory that receives the ``raw``, ``processed``
            and ``sample`` sub-directories.
        seed: Value passed to ``np.random.seed`` before any sampling.

    Returns:
        Tuple ``(products_df, users_df, reviews_df)`` of pandas DataFrames.

    Raises:
        ValueError: If a size is negative, or if reviews are requested while
            there are no products or no users to draw them from.
    """
    if min(n_products, n_users, n_reviews) < 0:
        raise ValueError("n_products, n_users and n_reviews must be non-negative")
    if n_reviews > 0 and (n_products == 0 or n_users == 0):
        raise ValueError("Generating reviews requires at least one product and one user")

    # Sample product categories
    categories = ['Electronics', 'Books', 'Home_and_Kitchen', 'Clothing', 'Sports', 'Toys', 'Beauty']
    brands = ['Amazon', 'Apple', 'Samsung', 'Sony', 'Nike', 'Adidas', 'Generic']

    np.random.seed(seed)

    # One reference timestamp so every generated date is relative to the same
    # instant instead of drifting with each datetime.now() call.
    now = datetime.now()

    # Generate product metadata
    products_data = []
    for i in range(n_products):
        category = np.random.choice(categories)
        brand = np.random.choice(brands)

        product = {
            'asin': f'B{str(i+1).zfill(6)}',
            'title': f'{brand} {category} Product {i+1}',
            'description': f'High-quality {category.lower()} product from {brand}. '
                           f'Features advanced technology and great design. '
                           f'Perfect for daily use and professional applications.',
            'price': np.round(np.random.lognormal(3, 1), 2),
            'brand': brand,
            'category': category,
            'image_url': f'https://example.com/images/{i+1}.jpg',
            # Clip to the 1-5 star scale: an unclipped N(4.0, 0.8) draw
            # regularly exceeds 5 stars.
            'average_rating': np.round(np.clip(np.random.normal(4.0, 0.8), 1.0, 5.0), 1),
            'review_count': np.random.poisson(50),
            'availability': np.random.choice(['In Stock', 'Limited', 'Out of Stock'], p=[0.8, 0.15, 0.05])
        }
        products_data.append(product)

    # Generate user data
    users_data = []
    for i in range(n_users):
        user = {
            'user_id': f'U{str(i+1).zfill(6)}',
            'age_group': np.random.choice(['18-25', '26-35', '36-45', '46-55', '55+']),
            'location': np.random.choice(['US', 'UK', 'Canada', 'Germany', 'Japan']),
            'prime_member': np.random.choice([True, False], p=[0.6, 0.4]),
            'signup_date': now - timedelta(days=int(np.random.randint(30, 1095)))
        }
        users_data.append(user)

    # Generate review/interaction data
    reviews_data = []
    for _ in range(n_reviews):
        user = users_data[np.random.randint(0, n_users)]
        product = products_data[np.random.randint(0, n_products)]
        product_category = product['category']

        # Location-based preferences nudge the expected rating upwards.
        rating_bias = 0
        if user['location'] == 'US' and product_category in ['Electronics', 'Books']:
            rating_bias = 0.3
        elif user['location'] == 'Japan' and product_category == 'Electronics':
            rating_bias = 0.5

        rating = np.clip(np.random.normal(product['average_rating'] + rating_bias, 0.8), 1, 5)

        review = {
            'user_id': user['user_id'],
            'asin': product['asin'],
            'rating': np.round(rating, 1),
            'review_text': f'This {product_category.lower()} product is {"excellent" if rating >= 4 else "okay"}. '
                           f'{"Highly recommend!" if rating >= 4.5 else "Worth considering."}',
            'helpful_votes': np.random.poisson(2),
            'verified_purchase': np.random.choice([True, False], p=[0.8, 0.2]),
            'review_date': now - timedelta(days=int(np.random.randint(1, 365)))
        }
        reviews_data.append(review)

    # Convert to DataFrames
    products_df = pd.DataFrame(products_data)
    users_df = pd.DataFrame(users_data)
    reviews_df = pd.DataFrame(reviews_data)

    # Create data directories
    for subdir in ('raw', 'processed', 'sample'):
        os.makedirs(os.path.join(output_dir, subdir), exist_ok=True)

    # Save datasets
    sample_dir = os.path.join(output_dir, 'sample')
    products_df.to_csv(os.path.join(sample_dir, 'products.csv'), index=False)
    users_df.to_csv(os.path.join(sample_dir, 'users.csv'), index=False)
    reviews_df.to_csv(os.path.join(sample_dir, 'reviews.csv'), index=False)

    print(f"✅ Created sample dataset with:")
    print(f"   📦 {len(products_df)} products")
    print(f"   👥 {len(users_df)} users")
    print(f"   ⭐ {len(reviews_df)} reviews")

    return products_df, users_df, reviews_df

# Create sample data
# Besides returning the three DataFrames, this call also writes them as CSV
# files under ../data/sample.
products_df, users_df, reviews_df = create_sample_amazon_data()

## 2. Data Preprocessing and Feature Engineering

Cleaning and preprocessing data for collaborative filtering and content-based recommendations.

In [None]:
# Data exploration and visualization
print("📊 Dataset Overview:")
print(f"Products shape: {products_df.shape}")
print(f"Users shape: {users_df.shape}")
print(f"Reviews shape: {reviews_df.shape}")

# Display basic statistics
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() would emit a stray "None" line.
print("\n📈 Basic Statistics:")
print("\nProducts DataFrame:")
products_df.info()
print(products_df.describe())

print("\nReviews DataFrame:")
reviews_df.info()
print(reviews_df.describe())

# Check for missing values
# isnull().sum().sum() collapses the per-column counts into one total per table.
print("\n🔍 Missing Values:")
print("Products missing values:", products_df.isnull().sum().sum())
print("Users missing values:", users_df.isnull().sum().sum())
print("Reviews missing values:", reviews_df.isnull().sum().sum())

In [None]:
# Create comprehensive visualizations
# A 2x3 Plotly grid: rating, category and price distributions on the first
# row; per-user counts, per-product counts and reviews per month on the second.
fig = make_subplots(
    rows=2, cols=3,
    subplot_titles=('Rating Distribution', 'Category Distribution', 'Price Distribution',
                   'Reviews per User', 'Reviews per Product', 'Temporal Patterns'),
    specs=[[{"type": "histogram"}, {"type": "bar"}, {"type": "histogram"}],
           [{"type": "histogram"}, {"type": "histogram"}, {"type": "scatter"}]]
)

# Rating distribution
fig.add_trace(
    go.Histogram(x=reviews_df['rating'], name='Ratings', nbinsx=10),
    row=1, col=1
)

# Category distribution
# value_counts() sorts by frequency, so index[0] is the most common category
# (used again in the "Key Insights" printout below).
category_counts = products_df['category'].value_counts()
fig.add_trace(
    go.Bar(x=category_counts.index, y=category_counts.values, name='Categories'),
    row=1, col=2
)

# Price distribution
fig.add_trace(
    go.Histogram(x=products_df['price'], name='Prices', nbinsx=30),
    row=1, col=3
)

# Reviews per user
# Only users that appear in reviews_df are counted; users without any review
# are not part of this histogram.
user_review_counts = reviews_df['user_id'].value_counts()
fig.add_trace(
    go.Histogram(x=user_review_counts.values, name='Reviews per User', nbinsx=20),
    row=2, col=1
)

# Reviews per product
product_review_counts = reviews_df['asin'].value_counts()
fig.add_trace(
    go.Histogram(x=product_review_counts.values, name='Reviews per Product', nbinsx=20),
    row=2, col=2
)

# Temporal patterns
# NOTE(review): this adds a 'review_month' column to reviews_df in place, so
# every later cell that uses reviews_df sees the extra column.
reviews_df['review_month'] = pd.to_datetime(reviews_df['review_date']).dt.to_period('M')
temporal_counts = reviews_df['review_month'].value_counts().sort_index()
fig.add_trace(
    go.Scatter(x=temporal_counts.index.astype(str), y=temporal_counts.values, 
               mode='lines+markers', name='Reviews over Time'),
    row=2, col=3
)

fig.update_layout(
    height=800, 
    title_text="Amazon Dataset Comprehensive Analysis",
    showlegend=False
)
fig.show()

# Additional analysis
print("\n📊 Key Insights:")
print(f"Average rating: {reviews_df['rating'].mean():.2f}")
print(f"Rating standard deviation: {reviews_df['rating'].std():.2f}")
print(f"Most popular category: {category_counts.index[0]} ({category_counts.iloc[0]} products)")
print(f"Average price: ${products_df['price'].mean():.2f}")
print(f"Price range: ${products_df['price'].min():.2f} - ${products_df['price'].max():.2f}")
print(f"Average reviews per user: {user_review_counts.mean():.1f}")
print(f"Average reviews per product: {product_review_counts.mean():.1f}")

# Sparsity analysis
# NOTE(review): len(reviews_df) counts every review row, so a user reviewing
# the same product twice is counted as two interactions here.
total_possible_interactions = len(users_df) * len(products_df)
actual_interactions = len(reviews_df)
sparsity = (1 - actual_interactions / total_possible_interactions) * 100
print(f"\n🔍 Data Sparsity: {sparsity:.2f}% (typical for recommendation systems)")

In [None]:
# Data preprocessing for collaborative filtering
class DataPreprocessor:
    """Filter sparse users/items and encode their IDs as integer indices.

    Attributes are plain configuration plus two ``LabelEncoder`` instances
    that are fitted by ``preprocess_interactions``.
    """

    def __init__(self, min_user_interactions=3, min_item_interactions=3):
        """Store the minimum interaction thresholds and create the encoders."""
        self.min_user_interactions = min_user_interactions
        self.min_item_interactions = min_item_interactions
        self.user_encoder = LabelEncoder()
        self.item_encoder = LabelEncoder()

    def preprocess_interactions(self, reviews_df):
        """Filter and encode user-item interactions.

        Users and items are counted on the unfiltered frame, and both filters
        are applied in a single pass, so a surviving user or item may end up
        with fewer rows than its threshold.

        Args:
            reviews_df: DataFrame with at least ``user_id`` and ``asin``.

        Returns:
            A filtered copy of ``reviews_df`` with added integer columns
            ``user_idx`` and ``item_idx``.
        """
        # Filter users and items with minimum interactions
        user_counts = reviews_df['user_id'].value_counts()
        item_counts = reviews_df['asin'].value_counts()

        valid_users = user_counts[user_counts >= self.min_user_interactions].index
        valid_items = item_counts[item_counts >= self.min_item_interactions].index

        filtered_reviews = reviews_df[
            (reviews_df['user_id'].isin(valid_users)) &
            (reviews_df['asin'].isin(valid_items))
        ].copy()

        # Encode users and items
        filtered_reviews['user_idx'] = self.user_encoder.fit_transform(filtered_reviews['user_id'])
        filtered_reviews['item_idx'] = self.item_encoder.fit_transform(filtered_reviews['asin'])

        print(f"✅ Filtered data: {len(filtered_reviews)} interactions")
        print(f"   Users: {len(valid_users)} → {filtered_reviews['user_idx'].nunique()}")
        print(f"   Items: {len(valid_items)} → {filtered_reviews['item_idx'].nunique()}")

        return filtered_reviews

    def create_interaction_matrix(self, filtered_reviews):
        """Create a dense user-item rating matrix.

        Args:
            filtered_reviews: DataFrame with ``user_idx``, ``item_idx`` and
                ``rating`` columns.

        Returns:
            ``numpy.ndarray`` of shape ``(max user_idx + 1, max item_idx + 1)``
            holding ratings, with 0 where no interaction exists. When a
            (user, item) pair occurs more than once, the last row wins. An
            empty input yields a ``(0, 0)`` matrix.
        """
        if len(filtered_reviews) == 0:
            interaction_matrix = np.zeros((0, 0))
            filled_pct = 0.0
        else:
            # Keep only the last rating per (user, item) pair so the vectorised
            # assignment below is deterministic.
            pairs = filtered_reviews.drop_duplicates(
                subset=['user_idx', 'item_idx'], keep='last'
            )
            user_idx = pairs['user_idx'].to_numpy(dtype=np.int64)
            item_idx = pairs['item_idx'].to_numpy(dtype=np.int64)
            ratings = pairs['rating'].to_numpy(dtype=float)

            # Size by the largest index rather than the number of distinct
            # indices, so a subset of the encoded data (for example a training
            # split) still fits.
            n_users = int(user_idx.max()) + 1
            n_items = int(item_idx.max()) + 1

            interaction_matrix = np.zeros((n_users, n_items))
            interaction_matrix[user_idx, item_idx] = ratings
            filled_pct = np.count_nonzero(interaction_matrix) / interaction_matrix.size * 100

        print(f"📊 Interaction matrix shape: {interaction_matrix.shape}")
        print(f"   Sparsity: {filled_pct:.2f}% filled")

        return interaction_matrix

# Apply preprocessing
# Uses the default thresholds (at least 3 interactions per user and per item).
preprocessor = DataPreprocessor()
filtered_reviews = preprocessor.preprocess_interactions(reviews_df)
interaction_matrix = preprocessor.create_interaction_matrix(filtered_reviews)

# Train-test split for recommendation evaluation
def create_train_test_split(filtered_reviews, test_ratio=0.2):
    """Create a per-user temporal train-test split.

    Each user's reviews are ordered by ``review_date``. For users with at
    least two reviews, the most recent ``max(1, int(n * test_ratio))`` reviews
    go to the test set and the rest to the training set. Users with a single
    review contribute to the training set only.

    Args:
        filtered_reviews: DataFrame with ``user_idx`` and ``review_date``
            columns.
        test_ratio: Fraction of each user's reviews held out for testing.

    Returns:
        Tuple ``(train_df, test_df)``. Both frames keep the columns of
        ``filtered_reviews`` even when they are empty.
    """
    # A stable sort keeps reviews that share a timestamp in input order, so the
    # split is reproducible.
    sorted_reviews = filtered_reviews.sort_values('review_date', kind='mergesort')

    train_parts = []
    test_parts = []

    # sort=False visits users in order of first appearance in the sorted frame;
    # one grouping pass replaces a boolean-mask scan per user.
    for _, user_reviews in sorted_reviews.groupby('user_idx', sort=False):
        n_reviews = len(user_reviews)
        if n_reviews >= 2:
            n_test = max(1, int(n_reviews * test_ratio))
            test_parts.append(user_reviews.tail(n_test))
            train_parts.append(user_reviews.head(n_reviews - n_test))
        else:
            train_parts.append(user_reviews)

    # Zero-row template that preserves the column layout for empty results.
    empty = sorted_reviews.iloc[0:0].reset_index(drop=True)

    train_df = pd.concat(train_parts, ignore_index=True) if train_parts else empty.copy()
    test_df = pd.concat(test_parts, ignore_index=True) if test_parts else empty.copy()

    print(f"📊 Train-Test Split:")
    print(f"   Train: {len(train_df)} interactions")
    print(f"   Test: {len(test_df)} interactions")

    return train_df, test_df

# Split the encoded interactions with the default 20% per-user hold-out.
train_df, test_df = create_train_test_split(filtered_reviews)

## 3. Text Feature Extraction with BERT

Implementing BERT-based embeddings for product descriptions and titles.

In [None]:
# BERT-based text feature extraction
class TextFeatureExtractor:
    """Turn product text into fixed-size embeddings with a pre-trained model."""

    def __init__(self, model_name='bert-base-uncased', max_length=128):
        """Load the tokenizer and model and put the model in eval mode.

        Args:
            model_name: Name passed to ``AutoTokenizer`` / ``AutoModel``.
            max_length: Maximum token length used when tokenizing.
        """
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name)
        self.max_length = max_length
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.model.to(self.device)
        self.model.eval()

    def prepare_text_data(self, products_df):
        """Combine and clean text features.

        Title, description and category are joined, lower-cased, and every
        run of characters other than ``a-z`` / ``0-9`` is replaced by a single
        space. Replacing instead of deleting keeps words separate, for example
        ``home_and_kitchen`` becomes ``home and kitchen``.

        Note:
            The cleaned text is also stored in ``products_df['combined_text']``,
            so the caller's DataFrame is modified in place.

        Args:
            products_df: DataFrame with ``title``, ``description`` and
                ``category`` columns; missing values are treated as empty.

        Returns:
            List of cleaned strings, one per row of ``products_df``.
        """
        # Combine title, description, and category
        combined = (
            products_df['title'].fillna('') + ' ' +
            products_df['description'].fillna('') + ' ' +
            products_df['category'].fillna('')
        )

        # Clean text
        combined = combined.str.lower()
        combined = combined.str.replace(r'[^a-z0-9]+', ' ', regex=True).str.strip()

        products_df['combined_text'] = combined

        return products_df['combined_text'].tolist()

    def extract_embeddings(self, texts, batch_size=32):
        """Extract one embedding per text from the first-token hidden state.

        Args:
            texts: List of strings.
            batch_size: Number of texts tokenized and encoded per forward pass.

        Returns:
            ``numpy.ndarray`` with one row per text. For an empty ``texts``
            list, a zero-row array whose width is the model's configured
            hidden size.
        """
        print(f"🤖 Extracting BERT embeddings for {len(texts)} products...")

        if len(texts) == 0:
            # np.vstack cannot stack an empty list; return a well-shaped empty
            # array instead.
            hidden_size = getattr(self.model.config, 'hidden_size', 768)
            embeddings = np.empty((0, hidden_size), dtype=np.float32)
            print(f"✅ Extracted embeddings shape: {embeddings.shape}")
            return embeddings

        embeddings = []

        for i in range(0, len(texts), batch_size):
            batch_texts = texts[i:i+batch_size]

            # Tokenize
            encoded = self.tokenizer(
                batch_texts,
                padding=True,
                truncation=True,
                max_length=self.max_length,
                return_tensors='pt'
            )

            # Move to device
            input_ids = encoded['input_ids'].to(self.device)
            attention_mask = encoded['attention_mask'].to(self.device)

            # Extract embeddings
            with torch.no_grad():
                outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)
                # Use [CLS] token embedding (position 0 of the last hidden state)
                batch_embeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy()
                embeddings.append(batch_embeddings)

            # Progress report every 10 batches
            if (i // batch_size + 1) % 10 == 0:
                print(f"   Processed {i + len(batch_texts)}/{len(texts)} texts")

        embeddings = np.vstack(embeddings)
        print(f"✅ Extracted embeddings shape: {embeddings.shape}")

        return embeddings

# Extract text features
# Loading the pre-trained model is the step most likely to fail (for example
# when it cannot be downloaded), so it sits inside the same try block as the
# extraction. On failure the notebook continues with simulated embeddings, and
# text_extractor / product_texts stay None.
text_extractor = None
product_texts = None

# Extract BERT embeddings (this might take a few minutes)
try:
    text_extractor = TextFeatureExtractor()
    product_texts = text_extractor.prepare_text_data(products_df)
    text_embeddings = text_extractor.extract_embeddings(product_texts)
except Exception as e:  # deliberate best-effort fallback for the demo
    print(f"⚠️ BERT extraction failed: {e}")
    print("🔄 Using simulated embeddings for demonstration")
    # Create simulated embeddings for demo
    text_embeddings = np.random.normal(0, 1, (len(products_df), 768))

print(f"📊 Text embeddings shape: {text_embeddings.shape}")

# Visualize text embeddings using PCA
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Apply PCA for visualization
# Standardise each embedding dimension first so no single dimension dominates
# the principal components because of its scale.
scaler = StandardScaler()
text_embeddings_scaled = scaler.fit_transform(text_embeddings)

pca = PCA(n_components=2, random_state=42)
text_embeddings_2d = pca.fit_transform(text_embeddings_scaled)

# Create visualization
fig = go.Figure()

# Color by category: one scatter trace per category
for category in products_df['category'].unique():
    mask = products_df['category'] == category
    fig.add_trace(go.Scatter(
        x=text_embeddings_2d[mask, 0],
        y=text_embeddings_2d[mask, 1],
        mode='markers',
        name=category,
        text=products_df[mask]['title'],
        hovertemplate='<b>%{text}</b><br>Category: ' + category
    ))

fig.update_layout(
    title='Product Text Embeddings (PCA Visualization)',
    xaxis_title=f'PC1 ({pca.explained_variance_ratio_[0]:.1%} variance)',
    yaxis_title=f'PC2 ({pca.explained_variance_ratio_[1]:.1%} variance)',
    height=600
)
fig.show()

print(f"💡 PCA explains {sum(pca.explained_variance_ratio_):.1%} of variance in first 2 components")

## 4. Image Feature Extraction with CNN

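Extracting visual features from product images using a pre-trained CNN (ResNet-50). In this notebook the image features are simulated per product category, because real image feature extraction is not implemented here.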

In [None]:
# CNN-based image feature extraction
class ImageFeatureExtractor:
    """Pre-trained ResNet-50 backbone plus a simulator for image features."""

    def __init__(self, model_name='resnet50'):
        """Load the backbone without its classification layer.

        Args:
            model_name: Backbone identifier; only ``'resnet50'`` is supported.

        Raises:
            ValueError: If ``model_name`` is not ``'resnet50'``.
        """
        if model_name != 'resnet50':
            # Without this check self.model would never be assigned and the
            # .to(device) call below would fail with an AttributeError.
            raise ValueError(
                f"Unsupported model_name: {model_name!r} (only 'resnet50' is available)"
            )

        # Load pre-trained ResNet model. Newer torchvision releases select
        # weights through an enum; older ones only know `pretrained=True`.
        weights_enum = getattr(models, 'ResNet50_Weights', None)
        if weights_enum is not None:
            backbone = models.resnet50(weights=weights_enum.IMAGENET1K_V1)
        else:
            backbone = models.resnet50(pretrained=True)

        # Remove the final classification layer
        self.model = nn.Sequential(*list(backbone.children())[:-1])

        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.model.to(self.device)
        self.model.eval()

        # Image preprocessing. ToTensor must come before Normalize, which
        # operates on tensors rather than PIL images.
        self.transform = transforms.Compose([
            transforms.Resize((224, 224)),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                 std=[0.229, 0.224, 0.225])
        ])

    def simulate_image_features(self, n_products, feature_dim=2048, categories=None):
        """Simulate image features based on product categories.

        Every known category gets a random base vector; each product's feature
        vector is its category's base vector plus small Gaussian noise.
        Products with an unknown or missing category use a zero base vector.

        Args:
            n_products: Number of feature rows to generate.
            feature_dim: Width of each feature vector.
            categories: Optional sequence of ``n_products`` category names.
                When omitted, the categories of the notebook-level
                ``products_df`` are used if that frame exists (first
                ``n_products`` rows), otherwise every product is treated as
                having no category.

        Returns:
            ``numpy.ndarray`` of shape ``(n_products, feature_dim)``.

        Raises:
            ValueError: If ``categories`` is given with a length other than
                ``n_products``.
        """
        print(f"🖼️  Simulating image features for {n_products} products...")

        # Create category-based features
        category_features = {
            'Electronics': np.random.normal(0.5, 0.3, feature_dim),
            'Books': np.random.normal(-0.3, 0.2, feature_dim),
            'Home_and_Kitchen': np.random.normal(0.2, 0.4, feature_dim),
            'Clothing': np.random.normal(-0.1, 0.3, feature_dim),
            'Sports': np.random.normal(0.3, 0.2, feature_dim),
            'Toys': np.random.normal(-0.2, 0.4, feature_dim),
            'Beauty': np.random.normal(0.1, 0.3, feature_dim)
        }

        if categories is None:
            try:
                # Backward-compatible default: the notebook's global frame.
                categories = products_df['category'].tolist()[:n_products]
            except NameError:
                categories = []
        else:
            categories = list(categories)
            if len(categories) != n_products:
                raise ValueError(
                    f"Expected {n_products} categories, got {len(categories)}"
                )

        # Pad with "no category" so exactly n_products rows are produced.
        categories = categories + [None] * (n_products - len(categories))

        zero_base = np.zeros(feature_dim)
        base_features = np.array(
            [category_features.get(category, zero_base) for category in categories]
        ).reshape(n_products, feature_dim)

        # Add product-specific noise
        image_features = base_features + np.random.normal(0, 0.1, (n_products, feature_dim))
        print(f"✅ Generated image features shape: {image_features.shape}")

        return image_features

    def extract_real_features(self, image_paths, batch_size=32):
        """Extract features from real images (if available).

        Real extraction is not implemented; this returns simulated features
        with one row per entry of ``image_paths``. ``batch_size`` is accepted
        but currently unused.
        """
        # This would be used if you have actual product images
        print("🔄 Real image feature extraction not implemented - using simulated features")
        return self.simulate_image_features(len(image_paths))

# Extract image features
image_extractor = ImageFeatureExtractor()

# Simulate image features (in real scenario, would process actual images)
image_features = image_extractor.simulate_image_features(len(products_df))

# Visualize image features using t-SNE
from sklearn.manifold import TSNE

print("🔄 Running t-SNE on image features...")
tsne = TSNE(n_components=2, random_state=42, perplexity=30)
image_features_2d = tsne.fit_transform(image_features[:500])  # Sample for speed

# The plot only covers the first 500 products, so masks and hover titles are
# taken from that same slice. A 500-row boolean Series cannot index the full
# products_df: pandas rejects boolean indexers whose index does not match.
sample_products = products_df.iloc[:500]

# Create visualization
fig = go.Figure()

for category in products_df['category'].unique():
    mask = (sample_products['category'] == category).to_numpy()  # Match sample size
    if mask.any():
        fig.add_trace(go.Scatter(
            x=image_features_2d[mask, 0],
            y=image_features_2d[mask, 1],
            mode='markers',
            name=category,
            text=sample_products.loc[mask, 'title'],
            hovertemplate='<b>%{text}</b><br>Category: ' + category
        ))

fig.update_layout(
    title='Product Image Features (t-SNE Visualization)',
    xaxis_title='t-SNE 1',
    yaxis_title='t-SNE 2',
    height=600
)
fig.show()

# Combine text and image features
# Rows of both arrays follow products_df row order, so they can be
# concatenated column-wise.
print("🔗 Combining text and image features...")
combined_features = np.concatenate([text_embeddings, image_features], axis=1)
print(f"✅ Combined features shape: {combined_features.shape}")

# Feature importance analysis
# Column labels for combined_features: text dimensions first, then image ones.
feature_names = ([f'text_dim_{i}' for i in range(text_embeddings.shape[1])] +
                [f'image_dim_{i}' for i in range(image_features.shape[1])])

print(f"📊 Feature Summary:")
print(f"   Text features: {text_embeddings.shape[1]} dimensions")
print(f"   Image features: {image_features.shape[1]} dimensions")
print(f"   Combined features: {combined_features.shape[1]} dimensions")

## 5. Collaborative Filtering Implementation

Building matrix factorization models for user-item interactions using PyTorch.

In [None]:
# Collaborative Filtering with Matrix Factorization
class CollaborativeFilteringModel(nn.Module):
    """Biased matrix factorisation: dot product of embeddings plus bias terms."""

    def __init__(self, n_users, n_items, embedding_dim=64, dropout=0.1):
        """Build the embedding tables, bias tables and dropout layer.

        Args:
            n_users: Number of rows in the user tables.
            n_items: Number of rows in the item tables.
            embedding_dim: Width of the user/item embedding vectors.
            dropout: Dropout probability applied to the looked-up embeddings.
        """
        super().__init__()

        # Latent factors for users and items
        self.user_embedding = nn.Embedding(n_users, embedding_dim)
        self.item_embedding = nn.Embedding(n_items, embedding_dim)

        # Per-user, per-item and global offsets
        self.user_bias = nn.Embedding(n_users, 1)
        self.item_bias = nn.Embedding(n_items, 1)
        self.global_bias = nn.Parameter(torch.zeros(1))

        self.dropout = nn.Dropout(dropout)

        # Small-variance factors, zero biases
        for factor_table in (self.user_embedding, self.item_embedding):
            nn.init.normal_(factor_table.weight, 0, 0.1)
        for bias_table in (self.user_bias, self.item_bias):
            nn.init.zeros_(bias_table.weight)

    def forward(self, user_ids, item_ids):
        """Score user-item pairs.

        Args:
            user_ids: Index tensor into the user tables.
            item_ids: Index tensor into the item tables.

        Returns:
            Tuple ``(prediction, user_emb, item_emb)`` where the embeddings
            are the dropout-processed vectors used for the prediction.
        """
        user_vectors = self.dropout(self.user_embedding(user_ids))
        item_vectors = self.dropout(self.item_embedding(item_ids))

        user_offset = self.user_bias(user_ids).squeeze()
        item_offset = self.item_bias(item_ids).squeeze()

        # Dot product of the factor vectors, then the three bias terms
        dot_product = torch.sum(user_vectors * item_vectors, dim=1)
        score = dot_product + user_offset + item_offset + self.global_bias

        return score, user_vectors, item_vectors

# Dataset for collaborative filtering
class CFDataset(Dataset):
    """Tensor-backed dataset of (user index, item index, rating) triples."""

    def __init__(self, user_ids, item_ids, ratings):
        """Convert the three parallel sequences to tensors.

        User and item indices become long tensors, ratings a float tensor.
        """
        self.user_ids = torch.LongTensor(user_ids)
        self.item_ids = torch.LongTensor(item_ids)
        self.ratings = torch.FloatTensor(ratings)

    def __len__(self):
        """Return the number of interactions."""
        n_interactions = len(self.user_ids)
        return n_interactions

    def __getitem__(self, idx):
        """Return the ``(user_id, item_id, rating)`` tensors at ``idx``."""
        user = self.user_ids[idx]
        item = self.item_ids[idx]
        rating = self.ratings[idx]
        return user, item, rating

# Prepare collaborative filtering data
# Table sizes come from the full filtered set (train and test together), so
# every encoded index that appears in either split has an embedding row.
n_users = filtered_reviews['user_idx'].nunique()
n_items = filtered_reviews['item_idx'].nunique()

print(f"🤝 Collaborative Filtering Setup:")
print(f"   Users: {n_users}")
print(f"   Items: {n_items}")
print(f"   Interactions: {len(filtered_reviews)}")

# Create train dataset
train_dataset = CFDataset(
    train_df['user_idx'].values,
    train_df['item_idx'].values,
    train_df['rating'].values
)

# Create test dataset
# Stays None when the split produced no test rows.
test_dataset = CFDataset(
    test_df['user_idx'].values,
    test_df['item_idx'].values,
    test_df['rating'].values
) if len(test_df) > 0 else None

# Data loaders
# Only the training loader shuffles; the test loader keeps row order.
batch_size = 1024
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False) if test_dataset else None

# Initialize model
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
cf_model = CollaborativeFilteringModel(n_users, n_items, embedding_dim=64)
cf_model.to(device)

# Training setup
# Adam with weight decay; the loss is mean squared error on the ratings.
optimizer = torch.optim.Adam(cf_model.parameters(), lr=0.001, weight_decay=1e-5)
criterion = nn.MSELoss()

print(f"✅ CF Model initialized with {sum(p.numel() for p in cf_model.parameters())} parameters")

# Training function
def train_cf_model(model, train_loader, optimizer, criterion, epochs=20, device=None):
    """Train a collaborative filtering model and return the per-epoch loss.

    Args:
        model: Module whose forward pass takes ``(user_ids, item_ids)`` and
            returns ``(predictions, user_emb, item_emb)``.
        train_loader: Iterable of ``(user_ids, item_ids, ratings)`` batches.
        optimizer: Optimizer over ``model``'s parameters.
        criterion: Loss function applied to ``(predictions, ratings)``.
        epochs: Number of passes over ``train_loader``.
        device: Device the batches are moved to. Defaults to the device of the
            model's first parameter (CPU for a parameter-free model), so the
            function does not depend on a global ``device`` variable.

    Returns:
        List with one average loss per epoch. Each value is the criterion loss
        plus the embedding-norm penalty, averaged over the epoch's batches.

    Raises:
        ValueError: If ``train_loader`` yields no batches.
    """
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device('cpu')

    model.train()
    train_losses = []

    print("🚀 Training Collaborative Filtering Model...")

    for epoch in range(epochs):
        epoch_loss = 0.0
        batch_count = 0

        for user_ids, item_ids, ratings in train_loader:
            user_ids = user_ids.to(device)
            item_ids = item_ids.to(device)
            ratings = ratings.to(device)

            optimizer.zero_grad()

            predictions, user_emb, item_emb = model(user_ids, item_ids)
            loss = criterion(predictions, ratings)

            # L2 regularization on the embeddings used in this batch
            l2_reg = torch.norm(user_emb, p=2) + torch.norm(item_emb, p=2)
            loss = loss + 0.01 * l2_reg

            loss.backward()
            optimizer.step()

            epoch_loss += loss.item()
            batch_count += 1

        if batch_count == 0:
            # Averaging over zero batches would otherwise surface as a
            # ZeroDivisionError.
            raise ValueError("train_loader produced no batches; cannot train on empty data")

        avg_loss = epoch_loss / batch_count
        train_losses.append(avg_loss)

        if (epoch + 1) % 5 == 0:
            print(f"   Epoch {epoch+1}/{epochs}, Loss: {avg_loss:.4f}")

    return train_losses

# Train the model
train_losses = train_cf_model(cf_model, train_loader, optimizer, criterion, epochs=20)

# Plot training progress
# One point per epoch, numbered from 1.
# NOTE(review): train_cf_model adds an L2 embedding penalty to the criterion
# loss before averaging, so the plotted values are MSE plus that penalty even
# though the axis title says "Loss (MSE)".
fig = go.Figure()
fig.add_trace(go.Scatter(
    x=list(range(1, len(train_losses)+1)),
    y=train_losses,
    mode='lines+markers',
    name='Training Loss'
))
fig.update_layout(
    title='Collaborative Filtering Training Progress',
    xaxis_title='Epoch',
    yaxis_title='Loss (MSE)',
    height=400
)
fig.show()

print(f"✅ Training completed. Final loss: {train_losses[-1]:.4f}")

## 6. Multi-Modal Fusion Architecture

Designing and implementing the neural network that combines text, image, and collaborative features.

In [None]:
# Multi-Modal Fusion Architecture
class AttentionLayer(nn.Module):
    """Additive attention scorer: ``U(tanh(W x))`` turned into weights."""

    def __init__(self, input_dim, attention_dim=128):
        """Create the two linear maps used for scoring.

        Args:
            input_dim: Size of the last dimension of the input.
            attention_dim: Width of the hidden scoring layer.
        """
        super().__init__()
        self.attention_dim = attention_dim
        self.W = nn.Linear(input_dim, attention_dim)
        self.U = nn.Linear(attention_dim, 1)
        self.tanh = nn.Tanh()

    def forward(self, x):
        """Return attention weights for ``x``.

        Args:
            x: Either ``(batch_size, input_dim)`` or
                ``(batch_size, n_elements, input_dim)``.

        Returns:
            For 3-D input, weights of shape ``(batch_size, n_elements, 1)``
            that sum to 1 over the elements. For 2-D input, a sigmoid gate of
            shape ``(batch_size, 1)`` with values in (0, 1).
        """
        scores = self.U(self.tanh(self.W(x)))

        if scores.dim() < 3:
            # A single vector per sample gives one score per sample. Softmax
            # over that size-1 axis would always return exactly 1.0, making the
            # layer a no-op, so a sigmoid gate is used instead.
            return torch.sigmoid(scores)

        # Normalise across the elements (dim=1) of each sample.
        attention_weights = F.softmax(scores, dim=1)
        return attention_weights

class MultiModalRecommendationModel(nn.Module):
    """Rating predictor that fuses user/item embeddings with text and image features."""

    def __init__(self, n_users, n_items, text_dim=768, image_dim=2048,
                 embedding_dim=128, hidden_dims=(256, 128, 64), dropout=0.3):
        """Build the embedding tables, projections, attention and fusion layers.

        Args:
            n_users: Number of rows in the user tables.
            n_items: Number of rows in the item tables.
            text_dim: Width of the incoming text feature vectors.
            image_dim: Width of the incoming image feature vectors.
            embedding_dim: Shared width of all internal embeddings.
            hidden_dims: Sizes of the hidden layers of the fusion network
                (any sequence; a tuple default avoids a shared mutable list).
            dropout: Dropout probability used throughout.
        """
        super().__init__()

        # Collaborative filtering components
        self.user_embedding = nn.Embedding(n_users, embedding_dim)
        self.item_embedding = nn.Embedding(n_items, embedding_dim)
        self.user_bias = nn.Embedding(n_users, 1)
        self.item_bias = nn.Embedding(n_items, 1)
        self.global_bias = nn.Parameter(torch.zeros(1))

        # Text feature projection
        self.text_projection = nn.Sequential(
            nn.Linear(text_dim, embedding_dim),
            nn.ReLU(),
            nn.Dropout(dropout)
        )

        # Image feature projection
        self.image_projection = nn.Sequential(
            nn.Linear(image_dim, embedding_dim),
            nn.ReLU(),
            nn.Dropout(dropout)
        )

        # Attention mechanisms
        # NOTE: user_attention and item_attention are registered but not used
        # by forward().
        self.user_attention = AttentionLayer(embedding_dim)
        self.item_attention = AttentionLayer(embedding_dim)
        # Scores each modality embedding separately; forward() feeds it a
        # (batch, 2, embedding_dim) stack of text and image embeddings.
        self.content_attention = AttentionLayer(embedding_dim)

        # Fusion network
        fusion_input_dim = embedding_dim * 4  # user + item + text + image

        fusion_layers = []
        prev_dim = fusion_input_dim

        for hidden_dim in hidden_dims:
            fusion_layers.extend([
                nn.Linear(prev_dim, hidden_dim),
                nn.ReLU(),
                nn.Dropout(dropout),
                nn.BatchNorm1d(hidden_dim)
            ])
            prev_dim = hidden_dim

        fusion_layers.append(nn.Linear(prev_dim, 1))
        self.fusion_network = nn.Sequential(*fusion_layers)

        # Content-based fallback for cold-start
        self.content_network = nn.Sequential(
            nn.Linear(embedding_dim * 2, 64),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(64, 1)
        )

        # Initialize embeddings
        nn.init.normal_(self.user_embedding.weight, 0, 0.1)
        nn.init.normal_(self.item_embedding.weight, 0, 0.1)
        nn.init.zeros_(self.user_bias.weight)
        nn.init.zeros_(self.item_bias.weight)

    def forward(self, user_ids, item_ids, text_features=None, image_features=None):
        """Predict ratings for a batch of user-item pairs.

        Args:
            user_ids: 1-D index tensor into the user tables.
            item_ids: 1-D index tensor into the item tables.
            text_features: Optional ``(batch, text_dim)`` tensor; a zero
                embedding is used when omitted.
            image_features: Optional ``(batch, image_dim)`` tensor; a zero
                embedding is used when omitted.

        Returns:
            Dict with ``prediction`` and ``content_prediction`` (both of shape
            ``(batch,)``) and the four intermediate embeddings.
        """
        batch_size = user_ids.size(0)

        # Collaborative filtering embeddings
        user_emb = self.user_embedding(user_ids)
        item_emb = self.item_embedding(item_ids)
        # squeeze(-1) removes only the trailing size-1 axis, so a batch of one
        # keeps its batch dimension.
        user_bias = self.user_bias(user_ids).squeeze(-1)
        item_bias = self.item_bias(item_ids).squeeze(-1)

        # Process content features
        if text_features is not None:
            text_emb = self.text_projection(text_features)
        else:
            text_emb = torch.zeros(batch_size, user_emb.size(1), device=user_emb.device)

        if image_features is not None:
            image_emb = self.image_projection(image_features)
        else:
            image_emb = torch.zeros(batch_size, user_emb.size(1), device=user_emb.device)

        # Apply attention to content features
        # Stack the modalities on a new axis so the attention softmax runs
        # across them. Scoring a single concatenated vector would produce one
        # score per sample, and softmax over that size-1 axis is always 1.0.
        content_features = torch.stack([text_emb, image_emb], dim=1)
        content_attention_weights = self.content_attention(content_features)
        attended_content = content_attention_weights * content_features

        # Split back to text and image
        attended_text = attended_content[:, 0, :]
        attended_image = attended_content[:, 1, :]

        # Fusion
        fusion_input = torch.cat([user_emb, item_emb, attended_text, attended_image], dim=1)
        main_prediction = self.fusion_network(fusion_input).squeeze(-1)

        # Content-based prediction for cold-start
        content_input = torch.cat([text_emb, image_emb], dim=1)
        content_prediction = self.content_network(content_input).squeeze(-1)

        # Final prediction combines collaborative filtering bias with neural network
        final_prediction = main_prediction + user_bias + item_bias + self.global_bias

        return {
            'prediction': final_prediction,
            'content_prediction': content_prediction,
            'user_embedding': user_emb,
            'item_embedding': item_emb,
            'text_embedding': text_emb,
            'image_embedding': image_emb
        }


# Prepare multi-modal dataset
class MultiModalDataset(Dataset):
    """Interactions plus per-item text and image features.

    The feature tables must be ordered by encoded item index: row ``k`` holds
    the features of the item whose ``item_idx`` is ``k``.
    """

    def __init__(self, user_ids, item_ids, ratings, text_features, image_features):
        """Convert the inputs to tensors and validate their sizes.

        Raises:
            ValueError: If the interaction arrays differ in length, if the two
                feature tables have different row counts, or if an item index
                has no row in the feature tables.
        """
        self.user_ids = torch.as_tensor(user_ids, dtype=torch.long)
        self.item_ids = torch.as_tensor(item_ids, dtype=torch.long)
        self.ratings = torch.as_tensor(ratings, dtype=torch.float32)
        self.text_features = torch.as_tensor(text_features, dtype=torch.float32)
        self.image_features = torch.as_tensor(image_features, dtype=torch.float32)

        if not (len(self.user_ids) == len(self.item_ids) == len(self.ratings)):
            raise ValueError("user_ids, item_ids and ratings must have the same length")

        n_feature_rows = self.text_features.size(0)
        if self.image_features.size(0) != n_feature_rows:
            raise ValueError("text_features and image_features must have the same number of rows")
        if len(self.item_ids) > 0 and int(self.item_ids.max()) >= n_feature_rows:
            raise ValueError("item_ids contains an index without a row in the feature tables")

    def __len__(self):
        """Return the number of interactions."""
        return len(self.user_ids)

    def __getitem__(self, idx):
        """Return ``(user_id, item_id, rating, text_features, image_features)``."""
        item_id = self.item_ids[idx]
        return (
            self.user_ids[idx],
            item_id,
            self.ratings[idx],
            self.text_features[item_id],
            self.image_features[item_id]
        )


# Create item feature lookup
# text_embeddings / image_features are in products_df row order, whereas
# item_idx k refers to preprocessor.item_encoder.classes_[k]. Re-order the
# feature rows so that row k belongs to encoded item k.
# get_indexer requires the ASINs in products_df to be unique.
valid_items = preprocessor.item_encoder.classes_
product_rows = pd.Index(products_df['asin']).get_indexer(valid_items)
if (product_rows < 0).any():
    raise ValueError("Some encoded items have no matching row in products_df")
item_text_features = text_embeddings[product_rows]
item_image_features = image_features[product_rows]

# Create multi-modal datasets
mm_train_dataset = MultiModalDataset(
    train_df['user_idx'].values,
    train_df['item_idx'].values,
    train_df['rating'].values,
    item_text_features,
    item_image_features
)

mm_test_dataset = MultiModalDataset(
    test_df['user_idx'].values,
    test_df['item_idx'].values,
    test_df['rating'].values,
    item_text_features,
    item_image_features
) if len(test_df) > 0 else None

# Data loaders
# NOTE(review): the fusion network contains BatchNorm1d, which rejects a
# training batch of a single sample; consider drop_last=True if the training
# set size modulo 256 can be 1.
mm_train_loader = DataLoader(mm_train_dataset, batch_size=256, shuffle=True)
mm_test_loader = DataLoader(mm_test_dataset, batch_size=256, shuffle=False) if mm_test_dataset else None

# Initialize multi-modal model
mm_model = MultiModalRecommendationModel(
    n_users=n_users,
    n_items=n_items,
    text_dim=text_embeddings.shape[1],
    image_dim=image_features.shape[1],
    embedding_dim=128,
    hidden_dims=[256, 128, 64],
    dropout=0.3
)
mm_model.to(device)

print(f"🚀 Multi-Modal Model initialized:")
print(f"   Parameters: {sum(p.numel() for p in mm_model.parameters()):,}")
print(f"   Text features: {text_embeddings.shape[1]} dims")
print(f"   Image features: {image_features.shape[1]} dims")
print(f"   Total content features: {text_embeddings.shape[1] + image_features.shape[1]} dims")