In [1]:
from google.cloud import bigquery
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import csr_matrix
import pickle
import json

# Initialize BigQuery client
client = bigquery.Client(project="students-group2")

In [2]:
# Load ratings data
query = """
SELECT userId, movieId, rating
FROM `master-ai-cloud.MoviePlatform.ratings`
"""

print("Loading ratings data...")
df_ratings = client.query(query).to_dataframe()
print(f"Loaded {len(df_ratings)} ratings")
print(df_ratings.head())

Loading ratings data...
Loaded 105339 ratings
   userId  movieId  rating
0       1      204     0.5
1       1      256     0.5
2       1      277     0.5
3       1      719     0.5
4       1    45950     0.5


In [3]:
# Load movie data for recommendations display
query_movies = """
SELECT movieId, title, genres
FROM `master-ai-cloud.MoviePlatform.movies`
"""

print("Loading movie metadata...")
df_movies = client.query(query_movies).to_dataframe()
print(f"Loaded {len(df_movies)} movies")
print(df_movies.head())

Loading movie metadata...
Loaded 10329 movies
   movieId                             title              genres
0   126929              Li'l Quinquin (    )  (no genres listed)
1   135460                      Pablo (2012)  (no genres listed)
2   138863  The Big Broadcast of 1936 (1935)  (no genres listed)
3   141305       Round Trip to Heaven (1992)  (no genres listed)
4   141472       The 50 Year Argument (2014)  (no genres listed)


In [4]:
# Create a pivot table: rows=users, columns=movies, values=ratings
print("Creating user-item matrix...")
user_item_matrix = df_ratings.pivot_table(
    index='userId',
    columns='movieId',
    values='rating'
).fillna(0)  # Fill missing values with 0

print(f"Matrix shape: {user_item_matrix.shape}")
print(f"Users: {user_item_matrix.shape[0]}, Movies: {user_item_matrix.shape[1]}")

Creating user-item matrix...
Matrix shape: (668, 10325)
Users: 668, Movies: 10325


In [5]:
# Calculate item-item (movie-movie) similarity using cosine similarity
print("Computing item-item similarity matrix...")
print("This may take a minute...")

# Transpose so movies are rows
item_matrix = user_item_matrix.T

# Calculate cosine similarity between movies
item_similarity = cosine_similarity(item_matrix)

# Convert to DataFrame for easier access
item_similarity_df = pd.DataFrame(
    item_similarity,
    index=item_matrix.index,
    columns=item_matrix.index
)

print(f"Similarity matrix shape: {item_similarity_df.shape}")
print("Done!")

Computing item-item similarity matrix...
This may take a minute...
Similarity matrix shape: (10325, 10325)
Done!


In [14]:
# 1. Explore user_item_matrix
print("=" * 60)
print("USER-ITEM MATRIX")
print("=" * 60)
print(f"Shape: {user_item_matrix.shape}")
print(f"\nSample (first 10 users, first 10 movies):")
print(user_item_matrix.iloc[:10, :10])

# 2. Explore item_similarity_df
print("\n" + "=" * 60)
print("ITEM SIMILARITY MATRIX")
print("=" * 60)
print(f"Shape: {item_similarity_df.shape}")
print(f"\nSample (first 10 movies × first 10 movies):")
print(item_similarity_df.iloc[:10, :10].round(3))

# 3. Find similar movies to a specific movie
toy_story_id = 1
print(f"\n" + "=" * 60)
print(f"MOVIES SIMILAR TO TOY STORY (ID={toy_story_id})")
print("=" * 60)
similar_movies = item_similarity_df[toy_story_id].sort_values(ascending=False)[1:11]  # Skip itself
for movie_id, similarity in similar_movies.items():
    movie_info = df_movies[df_movies['movieId'] == movie_id]
    if not movie_info.empty:
        print(f"{similarity:.3f} - {movie_info.iloc[0]['title']}")

USER-ITEM MATRIX
Shape: (668, 10325)

Sample (first 10 users, first 10 movies):
movieId   1    2    3    4    5    6    7    8    9    10
userId                                                   
1        0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0
2        5.0  0.0  2.0  0.0  3.0  0.0  0.0  0.0  0.0  0.0
3        0.0  0.0  0.0  0.0  3.0  0.0  3.0  0.0  0.0  0.0
4        0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0
5        4.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0
6        0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0
7        0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  4.0
8        5.0  0.0  4.0  0.0  3.0  0.0  0.0  0.0  0.0  0.0
9        0.0  0.0  3.0  0.0  0.0  4.0  0.0  0.0  0.0  3.0
10       0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0

ITEM SIMILARITY MATRIX
Shape: (10325, 10325)

Sample (first 10 movies × first 10 movies):
movieId     1      2      3      4      5      6      7      8      9      10
movieId                                                

In [17]:
def get_recommendations(user_ratings, n_recommendations=10):
    """
    Get movie recommendations based on user's ratings with confidence weighting
    
    Parameters:
    - user_ratings: dict of {movieId: rating}
    - n_recommendations: number of recommendations to return
    
    Returns:
    - DataFrame with recommended movies
    """
    # Calculate weighted scores for all movies
    scores = {}
    similarity_sums = {}
    
    for movie_id, rating in user_ratings.items():
        if movie_id not in item_similarity_df.index:
            continue
            
        # Get similar movies
        similar_movies = item_similarity_df[movie_id]
        
        for other_movie_id, similarity in similar_movies.items():
            # Skip if already rated
            if other_movie_id in user_ratings:
                continue
            
            # Weighted score
            if other_movie_id not in scores:
                scores[other_movie_id] = 0
                similarity_sums[other_movie_id] = 0
            
            scores[other_movie_id] += similarity * rating
            similarity_sums[other_movie_id] += similarity
    
    # Normalize scores and calculate confidence
    recommendations = {}
    confidences = {}
    for movie_id in scores:
        if similarity_sums[movie_id] > 0:
            recommendations[movie_id] = scores[movie_id] / similarity_sums[movie_id]
            confidences[movie_id] = similarity_sums[movie_id]
    
    # Sort by confidence-weighted score: predicted_rating × confidence
    sorted_recommendations = sorted(
        recommendations.items(),
        key=lambda x: x[1] * confidences[x[0]],
        reverse=True
    )[:n_recommendations]  # ← Also limit here!
    
    # Get movie details
    recommended_movie_ids = [movie_id for movie_id, _ in sorted_recommendations]
    recommended_movies = df_movies[df_movies['movieId'].isin(recommended_movie_ids)].copy()
    
    # Add scores (confidence-weighted)
    score_dict = dict(sorted_recommendations)
    recommended_movies['score'] = recommended_movies['movieId'].map(score_dict)
    recommended_movies = recommended_movies.sort_values('score', ascending=False)
    
    return recommended_movies[['movieId', 'title', 'genres', 'score']]

In [18]:
# Test with a sample user
print("Testing recommendation system...")

# Simulate a new user who rates a few movies
test_user_ratings = {
    1: 5.0,      # Toy Story
    50: 4.5,     # Usual Suspects
    260: 4.0,    # Star Wars
}

print("\nUser's ratings:")
for movie_id, rating in test_user_ratings.items():
    movie_info = df_movies[df_movies['movieId'] == movie_id]
    if not movie_info.empty:
        print(f"  - {movie_info.iloc[0]['title']}: {rating} stars")

print("\nRecommended movies:")
recommendations = get_recommendations(test_user_ratings, n_recommendations=10)
print(recommendations)

Testing recommendation system...

User's ratings:
  - Toy Story (1995): 5.0 stars
  - Usual Suspects, The (1995): 4.5 stars
  - Star Wars: Episode IV - A New Hope (1977): 4.0 stars

Recommended movies:
      movieId                                              title  \
516       480                               Jurassic Park (1993)   
2098     1270                          Back to the Future (1985)   
3695      296                                Pulp Fiction (1994)   
6067      593                   Silence of the Lambs, The (1991)   
59       1198  Raiders of the Lost Ark (Indiana Jones and the...   
60       1291          Indiana Jones and the Last Crusade (1989)   
1604     2571                                 Matrix, The (1999)   
1602     1240                             Terminator, The (1984)   
451      1210  Star Wars: Episode VI - Return of the Jedi (1983)   
450      1196  Star Wars: Episode V - The Empire Strikes Back...   

                                genres     score 

In [19]:
# Demonstrate how recommendations evolve as user rates more movies
print("=== DEMONSTRATING EVOLVING RECOMMENDATIONS ===\n")

# Start with 1 rating
user_ratings_step1 = {1: 5.0}  # Toy Story
print("Step 1: User rates 1 movie")
print("Ratings:", user_ratings_step1)
recs_1 = get_recommendations(user_ratings_step1, n_recommendations=5)
print("\nTop 5 Recommendations:")
print(recs_1[['title', 'score']].to_string(index=False))

# Add 2 more ratings
user_ratings_step2 = {1: 5.0, 50: 4.5, 260: 4.0}
print("\n" + "="*50)
print("Step 2: User rates 3 movies")
print("Ratings:", user_ratings_step2)
recs_2 = get_recommendations(user_ratings_step2, n_recommendations=5)
print("\nTop 5 Recommendations:")
print(recs_2[['title', 'score']].to_string(index=False))

# Add even more ratings
user_ratings_step3 = {1: 5.0, 50: 4.5, 260: 4.0, 296: 4.5, 356: 4.0}
print("\n" + "="*50)
print("Step 3: User rates 5 movies")
print("Ratings:", user_ratings_step3)
recs_3 = get_recommendations(user_ratings_step3, n_recommendations=5)
print("\nTop 5 Recommendations:")
print(recs_3[['title', 'score']].to_string(index=False))

=== DEMONSTRATING EVOLVING RECOMMENDATIONS ===

Step 1: User rates 1 movie
Ratings: {1: 5.0}

Top 5 Recommendations:
                                                                         title  score
Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark) (1981)    5.0
                                     Star Wars: Episode IV - A New Hope (1977)    5.0
                             Star Wars: Episode VI - Return of the Jedi (1983)    5.0
                                          Independence Day (a.k.a. ID4) (1996)    5.0
                                                     Back to the Future (1985)    5.0

Step 2: User rates 3 movies
Ratings: {1: 5.0, 50: 4.5, 260: 4.0}

Top 5 Recommendations:
                                                                         title    score
                                                     Back to the Future (1985) 4.492525
                                                           Pulp Fiction (1994) 4.491070
Raiders of th

In [20]:
import pickle
import os

# Create a models directory
os.makedirs('../models', exist_ok=True)

print("Saving model artifacts...")

# Save the item similarity matrix
with open('../models/item_similarity_2/item_similarity_with_confidence_weighting.pkl', 'wb') as f:
    pickle.dump(item_similarity_df, f)
print("✓ Saved item_similarity.pkl")

# Save the movie metadata
df_movies.to_pickle('../models/item_similarity_2/movies_metadata.pkl')
print("✓ Saved movies_metadata.pkl")

# Save movie ID to index mapping
movie_ids = item_similarity_df.index.tolist()
with open('../models/item_similarity_2/movie_ids.pkl', 'wb') as f:
    pickle.dump(movie_ids, f)
print("✓ Saved movie_ids.pkl")

print("\nAll model artifacts saved successfully!")

Saving model artifacts...
✓ Saved item_similarity.pkl
✓ Saved movies_metadata.pkl
✓ Saved movie_ids.pkl

All model artifacts saved successfully!


FileNotFoundError: [Errno 2] No such file or directory: '../models/item_similarity.pkl'