In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import sys
import logging
from datetime import datetime
import pickle
import warnings
warnings.filterwarnings('ignore')

# Add src to path
sys.path.append('../')

In [2]:
from src.models.collaborative_filtering.matrix_factorization import MatrixFactorizationRecommender
from src.models.content_based.tfidf_recommender import TFIDFRecommender

In [3]:
# Importing new hybrid models
from src.models.hybrid.weighted_ensemble import WeightedHybridRecommender
from src.models.hybrid.switching_hybrid import SwitchingHybridRecommender
from src.evaluation.metrics import RecommendationMetrics


In [4]:
# Setup logging: configure the root logger at INFO level (the INFO messages
# emitted by the imported recommender modules then show up in cell output)
# and create a logger for this notebook.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

In [5]:
# Project directories, all relative to the notebook's working directory.
# Nothing is loaded here; later cells read from / write to these locations.
project_root = Path('..')
processed_dir = project_root / 'data' / 'processed'   # cleaned ratings CSV
features_dir = project_root / 'data' / 'features'     # TF-IDF matrix and movie mapping
models_dir = project_root / 'models'                  # pickled hybrid models are written here

In [6]:
# Read the cleaned ratings and the enriched movie metadata from disk.
ratings_df = pd.read_csv(processed_dir / 'ratings_cleaned.csv')
enriched_movies = pd.read_csv(Path('../data/enriched') / 'movies_enriched.csv')

# Summarise the size of each table: rating rows, movie rows, distinct users.
overview_lines = [
    "Dataset Overview:",
    f"Ratings: {len(ratings_df):,}",
    f"Movies: {len(enriched_movies):,}",
    f"Users: {ratings_df['userId'].nunique():,}",
]
print("\n".join(overview_lines))

Dataset Overview:
Ratings: 100,836
Movies: 5,922
Users: 610


In [7]:
# Shuffle the ratings once with a fixed seed, then cut the shuffled frame into
# 70% train, 15% validation, and the remaining rows as test.
n_ratings = len(ratings_df)
train_size = int(0.7 * n_ratings)
val_size = int(0.15 * n_ratings)
val_end = train_size + val_size

shuffled_ratings = ratings_df.sample(frac=1, random_state=42).reset_index(drop=True)

train_df = shuffled_ratings.iloc[:train_size]
val_df = shuffled_ratings.iloc[train_size:val_end]
test_df = shuffled_ratings.iloc[val_end:]

# Report the row count and share of each split.
print("\nData Splits:")
for split_label, split_df in (("Training", train_df), ("Validation", val_df), ("Test", test_df)):
    print(f"{split_label}: {len(split_df):,} ({len(split_df)/n_ratings*100:.1f}%)")


Data Splits:
Training: 70,585 (70.0%)
Validation: 15,125 (15.0%)
Test: 15,126 (15.0%)


In [8]:
# 1. Load the saved SVD++ model, or train and save a new one if no file exists.
# The model file is resolved under the project-relative `models_dir`, like the
# other paths in this notebook, instead of a machine-specific absolute path.
try:
    svd_model = MatrixFactorizationRecommender()
    svd_model_path = models_dir / 'svd_plus_plus_model.pkl'

    if svd_model_path.exists():
        # NOTE(review): presumably load_model unpickles this .pkl file; only
        # load model files that were produced by this project.
        print(f"Loading SVD++ model from {svd_model_path}")
        svd_model.load_model(str(svd_model_path))

        # Verify the model loaded correctly: the wrapper must expose a
        # non-None `model` attribute.
        if getattr(svd_model, 'model', None) is not None:
            print("SVD++ model loaded successfully")
            print(f"Model type: {type(svd_model.model)}")
        else:
            print("SVD++ model loaded but internal model is None")
            svd_model = None
    else:
        print("SVD++ model file not found, training new model...")
        svd_model.fit(train_df)
        # Make sure the target directory exists before writing the model file.
        svd_model_path.parent.mkdir(parents=True, exist_ok=True)
        svd_model.save_model(str(svd_model_path))
        print(" SVD++ model trained and saved")

except Exception as e:
    # Best effort: report the failure in full and continue without SVD++;
    # later cells check `svd_model` for None.
    print(f"Error with SVD++ model: {e}")
    print("Detailed error:")
    import traceback
    traceback.print_exc()
    svd_model = None


INFO:src.models.base_recommender:Model loaded from E:\FilmFusion\models\svd_plus_plus_model.pkl


Loading SVD++ model from E:\FilmFusion\models\svd_plus_plus_model.pkl
SVD++ model loaded successfully
Model type: <class 'surprise.prediction_algorithms.matrix_factorization.SVDpp'>


In [9]:
# 2. Load the precomputed TF-IDF features and fit the content-based model
try:
    from scipy import sparse
    import json

    # Load TF-IDF components
    tfidf_matrix = sparse.load_npz(features_dir / 'tfidf_matrix.npz')
    tfidf_movie_mapping = pd.read_csv(features_dir / 'tfidf_movie_mapping.csv')

    # Attach title/year/genre metadata to each mapped movie. The left join
    # keeps mapped movies that have no match in enriched_movies.
    # NOTE(review): presumably the mapping rows line up with the rows of
    # tfidf_matrix; duplicate movieIds in enriched_movies would add rows to
    # the merge result and break that alignment — confirm.
    tfidf_movies_df = tfidf_movie_mapping.merge(
        enriched_movies[['movieId', 'tmdb_title', 'year', 'genres_tmdb']],
        on='movieId',
        how='left'
    )

    # Initialize and train TF-IDF model on the training split only
    tfidf_model = TFIDFRecommender(similarity_threshold=0.1)
    tfidf_model.fit(
        ratings_df=train_df,
        tfidf_matrix=tfidf_matrix,
        movies_df=tfidf_movies_df
    )

    print("TF-IDF content model loaded and trained successfully")

except Exception as e:
    # Best effort: report the failure with its traceback (same style as the
    # SVD++ cell) and continue without the content model.
    print(f"Error with TF-IDF model: {e}")
    print("Detailed error:")
    import traceback
    traceback.print_exc()
    tfidf_model = None


INFO:src.models.base_recommender:Training TF-IDF Content-Based Recommender...
INFO:src.models.base_recommender:Computing cosine similarity matrix...
INFO:src.models.base_recommender:✅ TF-IDF Content-Based Recommender training completed
INFO:src.models.base_recommender:Similarity matrix shape: (5918, 5918)


TF-IDF content model loaded and trained successfully


In [10]:

# Smoke-test each component model by requesting 5 recommendations for one user.
print("\nModel Verification:")
test_user_id = 1

# Test SVD++ model safely; a model that cannot recommend is disabled (set to
# None) so later cells skip it.
if svd_model is not None:
    try:
        svd_recs = svd_model.recommend(test_user_id, 5)
        print(f"SVD++ recommendations for user {test_user_id}: {len(svd_recs)} items")
        if svd_recs:
            # Each recommendation is indexed as (movie id, score).
            print(f"   Sample rec: Movie {svd_recs[0][0]} (score: {svd_recs[0][1]:.3f})")
    except Exception as e:
        print(f"SVD++ test failed: {e}")
        svd_model = None
else:
    print("SVD++ model is None - cannot test")

# Test TF-IDF model. Compare against None explicitly (as for SVD++) so the
# check does not depend on the model object's truthiness. A failure here is
# only reported; the model is kept.
if tfidf_model is not None:
    try:
        tfidf_recs = tfidf_model.recommend(test_user_id, 5)
        print(f"TF-IDF recommendations for user {test_user_id}: {len(tfidf_recs)} items")
    except Exception as e:
        print(f"TF-IDF test failed: {e}")
else:
    print("TF-IDF model is None - cannot test")



Model Verification:
SVD++ recommendations for user 1: 5 items
   Sample rec: Movie 246 (score: 5.000)
TF-IDF recommendations for user 1: 5 items


In [11]:
# Initialize Weighted Hybrid Recommender

# Collect the component models that are available. Compare against None
# explicitly so a model object that happens to be falsy is not dropped.
component_models = {}
if svd_model is not None:
    component_models['svd'] = svd_model
if tfidf_model is not None:
    component_models['tfidf'] = tfidf_model

print(f"Available models for ensemble: {list(component_models.keys())}")

if len(component_models) >= 2:
    # Initialize weighted hybrid
    weighted_hybrid = WeightedHybridRecommender(
        models=component_models,
        optimization_method='grid_search'
    )

    # Train the hybrid model (optimize weights).
    # The recorded log shows fit() retraining each component model before the
    # weight search, so this cell is slow (about 8,600 s in the recorded run).
    print("Training weighted hybrid recommender...")
    start_time = datetime.now()

    # Pass the required TF-IDF parameters
    weighted_hybrid.fit(
        train_df=train_df,
        validation_df=val_df,
        tfidf_matrix=tfidf_matrix,  # Pass the loaded TF-IDF matrix
        movies_df=tfidf_movies_df   # Pass the movies dataframe
    )

    training_time = (datetime.now() - start_time).total_seconds()
    print(f"Weighted hybrid training completed in {training_time:.1f} seconds")

    # Display optimized weights.
    # NOTE(review): the recorded run printed weights that do not sum to 1 and
    # scores on very different scales across users — worth checking the score
    # normalisation inside WeightedHybridRecommender.
    print("\nOptimized Weights:")
    for model_name, weight in weighted_hybrid.weights.items():
        print(f"  {model_name}: {weight:.3f}")

    # Build the movieId -> title lookup once. Rows without a title are dropped
    # so the "Movie <id>" fallback is shown instead of "nan".
    title_by_movie_id = (
        enriched_movies.dropna(subset=['tmdb_title'])
        .drop_duplicates(subset='movieId')
        .set_index('movieId')['tmdb_title']
        .to_dict()
    )

    # Test hybrid recommendations
    print("\nSample Hybrid Recommendations:")
    test_users = [1, 42, 100]

    for user_id in test_users:
        try:
            hybrid_recs = weighted_hybrid.recommend(user_id, 5)
            print(f"\nUser {user_id}:")
            for i, (movie_id, score) in enumerate(hybrid_recs, 1):
                title = title_by_movie_id.get(movie_id, f"Movie {movie_id}")
                print(f"  {i}. {title} (score: {score:.3f})")
        except Exception as e:
            # Best effort: report the failure and move on to the next user.
            print(f"  Error for user {user_id}: {e}")

else:
    print("Need at least 2 models for hybrid approach")
    weighted_hybrid = None


INFO:src.models.hybrid.weighted_ensemble:Training individual models...
INFO:src.models.hybrid.weighted_ensemble:Training svd model...
INFO:src.models.base_recommender:Training SVD++ Matrix Factorization...


🔗 WEIGHTED HYBRID RECOMMENDER
Available models for ensemble: ['svd', 'tfidf']
Training weighted hybrid recommender...


INFO:src.models.base_recommender:Created mappings: 610 users, 8493 items
INFO:src.models.base_recommender:✅ SVD++ Matrix Factorization training completed
INFO:src.models.hybrid.weighted_ensemble:svd model trained successfully
INFO:src.models.hybrid.weighted_ensemble:Training tfidf model...
INFO:src.models.base_recommender:Training TF-IDF Content-Based Recommender...
INFO:src.models.base_recommender:Computing cosine similarity matrix...
INFO:src.models.base_recommender:✅ TF-IDF Content-Based Recommender training completed
INFO:src.models.base_recommender:Similarity matrix shape: (5918, 5918)
INFO:src.models.hybrid.weighted_ensemble:tfidf model trained successfully
INFO:src.models.hybrid.weighted_ensemble:Optimizing ensemble weights...
INFO:src.models.hybrid.weighted_ensemble:Testing 37 weight combinations
INFO:src.models.hybrid.weighted_ensemble:Optimal weights: {'svd': 0.2, 'tfidf': 0.5}, Score: 0.0893
INFO:src.models.hybrid.weighted_ensemble:Hybrid recommender training completed


Weighted hybrid training completed in 8614.0 seconds

Optimized Weights:
  svd: 0.200
  tfidf: 0.500

🎬 Sample Hybrid Recommendations:

User 1:
  1. Movie 318 (score: 12.984)
  2. Movie 296 (score: 12.270)
  3. Movie 260 (score: 11.999)
  4. Movie 858 (score: 11.628)
  5. Movie 356 (score: 11.364)

User 42:
  1. Movie 356 (score: 12.348)
  2. Movie 318 (score: 11.984)
  3. Movie 260 (score: 11.939)
  4. Movie 296 (score: 11.270)
  5. Movie 2571 (score: 11.179)

User 100:
  1. Movie 3275 (score: 0.912)
  2. Inside Job (score: 0.908)
  3. Movie 910 (score: 0.905)
  4. Movie 1237 (score: 0.905)
  5. Whiplash (score: 0.904)


In [12]:
# Initialize Switching Hybrid Recommender
print("\nSWITCHING HYBRID RECOMMENDER")
print("=" * 40)

if len(component_models) >= 2:
    # Initialize switching hybrid
    switching_hybrid = SwitchingHybridRecommender(
        models=component_models,
        switching_strategy='user_profile'
    )

    # Train the switching hybrid model.
    # NOTE(review): the weighted hybrid's fit() was given tfidf_matrix and
    # movies_df as required TF-IDF parameters; this call passes neither —
    # confirm SwitchingHybridRecommender.fit does not need them.
    print("Training switching hybrid recommender...")
    start_time = datetime.now()

    switching_hybrid.fit(train_df, val_df)

    training_time = (datetime.now() - start_time).total_seconds()
    print(f"Switching hybrid training completed in {training_time:.1f} seconds")

    # Build the movieId -> title lookup once. Rows without a title are dropped
    # so the "Movie <id>" fallback is shown instead of "nan".
    title_by_movie_id = (
        enriched_movies.dropna(subset=['tmdb_title'])
        .drop_duplicates(subset='movieId')
        .set_index('movieId')['tmdb_title']
        .to_dict()
    )

    # Test switching recommendations (test_users comes from the weighted
    # hybrid cell, which defines it under the same >= 2 models condition).
    print(f"\n🎬 Sample Switching Recommendations:")

    for user_id in test_users:
        try:
            switching_recs = switching_hybrid.recommend(user_id, 5)
            # Ask the recommender which component it selects for this user.
            selected_model = switching_hybrid._decide_model(user_id)

            print(f"\nUser {user_id} (selected model: {selected_model}):")
            for i, (movie_id, score) in enumerate(switching_recs, 1):
                title = title_by_movie_id.get(movie_id, f"Movie {movie_id}")
                print(f"  {i}. {title} (score: {score:.3f})")
        except Exception as e:
            # Best effort: report the failure and move on to the next user.
            print(f"  Error for user {user_id}: {e}")

else:
    print("Need at least 2 models for switching approach")
    switching_hybrid = None


INFO:src.models.hybrid.switching_hybrid:Training individual models...
INFO:src.models.hybrid.switching_hybrid:Training svd model...
INFO:src.models.base_recommender:Training SVD++ Matrix Factorization...



SWITCHING HYBRID RECOMMENDER
Training switching hybrid recommender...


INFO:src.models.base_recommender:Created mappings: 610 users, 8493 items


KeyboardInterrupt: 

In [None]:
# Comprehensive evaluation of hybrid models: section banner
print("\nHYBRID MODEL EVALUATION", "=" * 35, sep="\n")

# Notebook-level metrics helper used by evaluate_hybrid_model
metrics_calculator = RecommendationMetrics()

def evaluate_hybrid_model(model, model_name, test_df, sample_size=100,
                          k=10, relevance_threshold=4.0, metrics=None):
    """Evaluate a recommender's top-k quality on held-out ratings.

    For each of the first ``sample_size`` distinct users in ``test_df``, the
    items the user rated at or above ``relevance_threshold`` are treated as
    relevant and compared with the model's top-``k`` recommendations.
    Users without relevant items, or for whom the model returns no
    recommendations, are skipped. Users whose evaluation raises are counted
    as failures and reported instead of being silently ignored.

    Args:
        model: Object with ``recommend(user_id, n)`` returning an iterable of
            ``(item_id, score)`` pairs.
        model_name: Label used in the printed report.
        test_df: DataFrame with ``userId``, ``movieId`` and ``rating`` columns.
        sample_size: Maximum number of distinct users to evaluate.
        k: Cut-off used for the recommendations and the metrics.
        relevance_threshold: Minimum rating for an item to count as relevant.
        metrics: Object providing ``precision_at_k``, ``recall_at_k`` and
            ``f1_at_k``; defaults to the notebook-level ``metrics_calculator``.

    Returns:
        dict with ``precision_at_{k}``, ``recall_at_{k}``, ``f1_at_{k}``
        (means over successfully evaluated users, 0 if there are none),
        ``successful_users``, ``failed_users`` and ``total_users``.
    """
    if metrics is None:
        metrics = metrics_calculator

    print(f"\nEvaluating {model_name}...")

    # First `sample_size` distinct users, in order of appearance.
    test_users = test_df['userId'].unique()[:sample_size]

    # Relevant items per user, computed once instead of filtering per user.
    relevant_rows = test_df[test_df['rating'] >= relevance_threshold]
    relevant_by_user = {
        uid: set(movie_ids)
        for uid, movie_ids in relevant_rows.groupby('userId')['movieId']
    }

    precision_scores = []
    recall_scores = []
    f1_scores = []
    failed_users = 0
    first_error = None

    for user_id in test_users:
        relevant_items = relevant_by_user.get(user_id)
        if not relevant_items:
            # Nothing to score against; skip before the costly recommend call.
            continue

        try:
            recommendations = model.recommend(user_id, k)
            if not recommendations:
                continue

            recommended_items = [item_id for item_id, _ in recommendations]

            # Compute all three before appending so the lists stay aligned
            # if one of the metric calls raises.
            precision = metrics.precision_at_k(recommended_items, relevant_items, k)
            recall = metrics.recall_at_k(recommended_items, relevant_items, k)
            f1 = metrics.f1_at_k(recommended_items, relevant_items, k)
        except Exception as exc:
            # Keep evaluating the remaining users, but record the failure.
            failed_users += 1
            if first_error is None:
                first_error = f"{type(exc).__name__}: {exc}"
            logging.getLogger(__name__).debug(
                "Evaluation of %s failed for user %s", model_name, user_id, exc_info=True
            )
            continue

        precision_scores.append(precision)
        recall_scores.append(recall)
        f1_scores.append(f1)

    successful_users = len(precision_scores)

    # Calculate averages
    results = {
        f'precision_at_{k}': np.mean(precision_scores) if precision_scores else 0,
        f'recall_at_{k}': np.mean(recall_scores) if recall_scores else 0,
        f'f1_at_{k}': np.mean(f1_scores) if f1_scores else 0,
        'successful_users': successful_users,
        'failed_users': failed_users,
        'total_users': len(test_users)
    }

    print(f"Results for {model_name}:")
    print(f"  Precision@{k}: {results[f'precision_at_{k}']:.4f}")
    print(f"  Recall@{k}: {results[f'recall_at_{k}']:.4f}")
    print(f"  F1@{k}: {results[f'f1_at_{k}']:.4f}")
    print(f"  Success Rate: {results['successful_users']}/{results['total_users']}")
    if failed_users:
        print(f"  Failed users: {failed_users} (first error: {first_error})")

    return results



In [None]:
# Evaluate all models
evaluation_results = {}

# Individual models first, then the hybrids. A model that failed to load or
# train is None and is skipped. Compare against None explicitly so the check
# does not depend on the model object's truthiness.
models_to_evaluate = {
    'SVD++': svd_model,
    'TF-IDF': tfidf_model,
    'Weighted Hybrid': weighted_hybrid,
    'Switching Hybrid': switching_hybrid,
}
for model_label, candidate in models_to_evaluate.items():
    if candidate is not None:
        evaluation_results[model_label] = evaluate_hybrid_model(candidate, model_label, test_df)

# Compare results
if evaluation_results:
    print("\nMODEL COMPARISON:")
    print("=" * 50)

    # One row per model, one column per metric.
    results_df = pd.DataFrame(evaluation_results).T
    display(results_df.round(4))

    # Find best model by F1@10
    best_model = results_df['f1_at_10'].idxmax()
    best_f1 = results_df.loc[best_model, 'f1_at_10']

    print(f"\n Best Model: {best_model} (F1@10: {best_f1:.4f})")

    # Relative F1@10 improvement of the weighted hybrid over the SVD++ baseline
    if 'Weighted Hybrid' in evaluation_results and 'SVD++' in evaluation_results:
        baseline_f1 = evaluation_results['SVD++']['f1_at_10']
        hybrid_f1 = evaluation_results['Weighted Hybrid']['f1_at_10']
        if baseline_f1 > 0:
            improvement = (hybrid_f1 - baseline_f1) / baseline_f1 * 100
            print(f" Weighted Hybrid improvement over SVD++: {improvement:.1f}%")
        else:
            # A zero baseline would make the relative improvement undefined
            # (division by zero), so report that instead of crashing.
            print(" Weighted Hybrid improvement over SVD++: n/a (SVD++ F1@10 is 0)")


In [None]:
# Visualize evaluation results
if evaluation_results:
    print("\n PERFORMANCE VISUALIZATION")
    print("=" * 35)

    models = list(evaluation_results.keys())

    # One bar chart per metric: (result key, panel title, y-axis label, bar colour)
    metric_panels = [
        ('precision_at_10', 'Precision@10 Comparison', 'Precision', 'skyblue'),
        ('recall_at_10', 'Recall@10 Comparison', 'Recall', 'lightcoral'),
        ('f1_at_10', 'F1@10 Comparison', 'F1 Score', 'lightgreen'),
    ]

    # Create comparison plots
    fig, axes = plt.subplots(1, len(metric_panels), figsize=(15, 5))
    for ax, (metric_key, panel_title, y_label, bar_color) in zip(axes, metric_panels):
        metric_values = [evaluation_results[model][metric_key] for model in models]
        ax.bar(models, metric_values, color=bar_color, alpha=0.7)
        ax.set_title(panel_title)
        ax.set_ylabel(y_label)
        ax.tick_params(axis='x', rotation=45)

    plt.tight_layout()
    plt.show()

    # Save evaluation results
    results_save_path = Path('../results/hybrid_model_evaluation.json')
    results_save_path.parent.mkdir(parents=True, exist_ok=True)

    import json
    with open(results_save_path, 'w') as f:
        # default=str stringifies any value json cannot serialise natively.
        json.dump(evaluation_results, f, indent=2, default=str)

    print(f"Evaluation results saved to {results_save_path}")

# Save trained hybrid models. Compare against None explicitly and make sure
# the models directory exists before writing.
# NOTE(review): in the recorded run the switching-hybrid fit was interrupted
# (KeyboardInterrupt), which would pickle a partially trained object here —
# confirm training finished before relying on the saved file.
if weighted_hybrid is not None:
    models_dir.mkdir(parents=True, exist_ok=True)
    weighted_model_path = models_dir / 'weighted_hybrid_model.pkl'
    with open(weighted_model_path, 'wb') as f:
        pickle.dump(weighted_hybrid, f)
    print(f"Weighted hybrid model saved to {weighted_model_path}")

if switching_hybrid is not None:
    models_dir.mkdir(parents=True, exist_ok=True)
    switching_model_path = models_dir / 'switching_hybrid_model.pkl'
    with open(switching_model_path, 'wb') as f:
        pickle.dump(switching_hybrid, f)
    print(f"Switching hybrid model saved to {switching_model_path}")

print(f"\nPhase 4 Step 1 (Hybrid Models) Complete!")
