# Ghibli Food Recommendation System Analysis

This notebook demonstrates the machine learning recommendation system for the Ghibli Food Recipe application.

In [None]:
import sys
sys.path.append('../src')
sys.path.append('../utils')
sys.path.append('../config')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from models.recommendation_engine import GhibliFoodRecommendationEngine
from utils.data_fetcher import DataFetcher
import asyncio

# Set up plotting
# Apply matplotlib's 'ggplot' style sheet and seaborn's "husl" colour palette;
# both are global settings, so they affect every figure drawn after this cell.
plt.style.use('ggplot')
sns.set_palette("husl")
# IPython line magic (not plain Python): selects the inline backend so figures
# are rendered in the cell output.
%matplotlib inline

## 1. Data Loading and Exploration

In [None]:
# Initialize data fetcher
data_fetcher = DataFetcher()

# Load data
books_data, ratings_data = await data_fetcher.get_training_data(use_mock_data=True)

print(f"Loaded {len(books_data)} books and {len(ratings_data)} ratings")

In [None]:
# Wrap both raw record collections in DataFrames for the analysis cells below.
books_df, ratings_df = (pd.DataFrame(records) for records in (books_data, ratings_data))

# Print each frame's first rows under its heading (the second heading starts
# with a newline to leave a blank line between the two previews).
for heading, frame in (("Books DataFrame:", books_df), ("\nRatings DataFrame:", ratings_df)):
    print(heading)
    print(frame.head())

## 2. Data Visualization

In [None]:
# Genre distribution
# Draw the number of books per genre as a bar chart on an explicit Axes.
fig, ax = plt.subplots(figsize=(12, 6))
genre_counts = books_df['genre'].value_counts()
genre_counts.plot(kind='bar', ax=ax)
ax.set_title('Distribution of Book Genres')
ax.set_xlabel('Genre')
ax.set_ylabel('Count')
# Rotate the x tick labels by 45 degrees.
plt.setp(ax.get_xticklabels(), rotation=45)
fig.tight_layout()
plt.show()

In [None]:
# Rating distribution
# Use one unit-wide bin per rating value, with edges at the half-points
# (0.5, 1.5, ..., 5.5), so every bar is centred on its integer tick.
# An integer bin count (bins=5) would instead cut [min, max] into five equal
# slices (1.0-1.8, 1.8-2.6, ...), which drift off the 1-5 ticks and move
# whenever the lowest or highest rating is missing from the data.
# NOTE(review): assumes ratings lie on a 1-5 scale, as the x ticks below imply;
# values outside 0.5-5.5 would not be counted - confirm against the data source.
rating_bin_edges = np.arange(0.5, 6.0, 1.0)

plt.figure(figsize=(10, 6))
ratings_df['rating'].hist(bins=rating_bin_edges, alpha=0.7, edgecolor='black')
plt.title('Distribution of User Ratings')
plt.xlabel('Rating')
plt.ylabel('Frequency')
plt.xticks(range(1, 6))
plt.show()

## 3. Model Training

In [None]:
# Build the recommendation engine, pointing it at the "../models/" path.
MODEL_DIR = "../models/"
engine = GhibliFoodRecommendationEngine(MODEL_DIR)

# Each step pairs a training method with its input records and the message
# printed once that method returns. Order matters: content-based first,
# collaborative filtering second.
training_steps = (
    (engine.train_content_based_model, books_data, "Content-based model trained!"),
    (engine.train_collaborative_filtering_model, ratings_data, "Collaborative filtering model trained!"),
)
for train_step, training_records, done_message in training_steps:
    train_step(training_records)
    print(done_message)

## 4. Testing Recommendations

In [None]:
# Test content-based recommendations
# Query the engine with the first loaded book; the whole demo is skipped when
# no books were loaded.
if books_data:
    sample_book_id = books_data[0]['id']
    content_recs = engine.get_content_based_recommendations(
        sample_book_id,
        n_recommendations=5,
    )

    print(f"Content-based recommendations for book '{books_data[0]['title']}':")
    # Per recommendation: numbered headline, indented reason, then a blank line.
    for i, rec in enumerate(content_recs, start=1):
        print(
            f"{i}. {rec['title']} by {rec['author']} (Score: {rec['similarity_score']:.3f})",
            f"   Reason: {rec['reason']}",
            "",
            sep="\n",
        )

In [None]:
# Test collaborative filtering recommendations
# Query the engine with the user of the first rating record; the whole demo is
# skipped when no ratings were loaded.
if ratings_data:
    sample_user_id = ratings_data[0]['userId']
    collab_recs = engine.get_collaborative_recommendations(
        sample_user_id,
        n_recommendations=5,
    )

    print(f"Collaborative filtering recommendations for user {sample_user_id}:")
    # One numbered line per recommendation, with the predicted rating to 2 decimals.
    for i, rec in enumerate(collab_recs, start=1):
        print(f"{i}. {rec['title']} by {rec['author']} (Predicted Rating: {rec['predicted_rating']:.2f})")

## 5. Model Analysis

In [None]:
# Analyze content similarity matrix
# Draw a heatmap of the matrix's leading [:20, :20] slice on an explicit Axes;
# nothing is drawn when the engine holds no similarity matrix.
if engine.content_similarity_matrix is not None:
    fig, ax = plt.subplots(figsize=(10, 8))
    similarity_sample = engine.content_similarity_matrix[:20, :20]  # Sample for visualization
    sns.heatmap(similarity_sample, cmap='coolwarm', center=0, annot=False, ax=ax)
    ax.set_title('Content Similarity Matrix (Sample)')
    ax.set_xlabel('Books')
    ax.set_ylabel('Books')
    plt.show()

In [None]:
# Save the trained models
# Delegates persistence to the engine's own save_models() method.
# NOTE(review): presumably writes under the "../models/" path passed to the
# engine's constructor - confirm against the engine implementation.
engine.save_models()
# Printed only if save_models() returned without raising.
print("Models saved successfully!")

## 6. Performance Metrics

In [None]:
# Calculate basic performance metrics
# Prints dataset-level counts and rating statistics, then the sparsity of the
# user x book rating matrix.
print("Model Performance Summary:")
print(f"- Total books in dataset: {len(books_data)}")
print(f"- Total ratings: {len(ratings_data)}")

if ratings_df.empty:
    # An empty ratings frame has no rows to summarise and would give a zero
    # sparsity denominator, so say so instead of raising mid-summary.
    print("- No ratings available: rating statistics and sparsity were not computed")
else:
    # Compute the distinct counts once; they feed both the printout and sparsity.
    n_users = ratings_df['userId'].nunique()
    n_rated_books = ratings_df['bookId'].nunique()

    print(f"- Average rating: {ratings_df['rating'].mean():.2f}")
    print(f"- Rating standard deviation: {ratings_df['rating'].std():.2f}")
    print(f"- Unique users: {n_users}")
    print(f"- Unique books rated: {n_rated_books}")

    # Sparsity calculation
    # The denominator is every (user, rated book) cell. Books that never
    # received a rating are not part of this matrix.
    total_possible_ratings = n_users * n_rated_books
    # Count each (user, book) pair once: a user rating the same book more than
    # once still fills a single cell, and counting raw rows could push the
    # sparsity below 0%.
    actual_ratings = len(ratings_df[['userId', 'bookId']].drop_duplicates())
    sparsity = (1 - actual_ratings / total_possible_ratings) * 100
    print(f"- Data sparsity: {sparsity:.2f}%")