In [13]:
%pip install pandas
%pip install scikit-learn
%pip install numpy
%pip install streamlit


Note: you may need to restart the kernel to use updated packages.
Collecting scikit-learn
  Downloading scikit_learn-1.7.1-cp310-cp310-win_amd64.whl.metadata (11 kB)
Downloading scikit_learn-1.7.1-cp310-cp310-win_amd64.whl (8.9 MB)
   ---------------------------------------- 0.0/8.9 MB ? eta -:--:--
   --------------- ------------------------ 3.4/8.9 MB 22.5 MB/s eta 0:00:01
   ---------------------------- ----------- 6.3/8.9 MB 16.8 MB/s eta 0:00:01
   -------------------------------------- - 8.7/8.9 MB 14.5 MB/s eta 0:00:01
   ---------------------------------------- 8.9/8.9 MB 14.6 MB/s eta 0:00:00
Installing collected packages: scikit-learn
Successfully installed scikit-learn-1.7.1
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Collecting streamlit
  Downloading streamlit-1.47.0-py3-none-any.whl.metadata (9.0 kB)
Collecting altair<6,>=4.0 (from streamlit)
  Downloading altair-5.5.0-py3-none-any.whl

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import re # For cleaning movie titles

In [2]:
movies_df = pd.read_csv('../data/movies.csv')
ratings_df = pd.read_csv('../data/ratings.csv')

In [3]:
movies_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [5]:
print("Movies DataFrame Head:")
print(movies_df.head())
print("\nRatings DataFrame Head:")
print(ratings_df.head())

Movies DataFrame Head:
   movieId                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   

                                        genres  
0  Adventure|Animation|Children|Comedy|Fantasy  
1                   Adventure|Children|Fantasy  
2                               Comedy|Romance  
3                         Comedy|Drama|Romance  
4                                       Comedy  

Ratings DataFrame Head:
   userId  movieId  rating  timestamp
0       1        1     4.0  964982703
1       1        3     4.0  964981247
2       1        6     4.0  964982224
3       1       47     5.0  964983815
4       1       50     5.0  964982931


In [6]:
print("\nMovies Info:")
movies_df.info()
print("\nRatings Info:")
ratings_df.info()


Movies Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  9742 non-null   int64 
 1   title    9742 non-null   object
 2   genres   9742 non-null   object
dtypes: int64(1), object(2)
memory usage: 228.5+ KB

Ratings Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100836 non-null  int64  
 1   movieId    100836 non-null  int64  
 2   rating     100836 non-null  float64
 3   timestamp  100836 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 3.1 MB


In [7]:
def clean_title(title):
    title = re.sub("[^a-zA-Z0-9 ]", "", title) # Remove special characters
    return title

In [8]:
movies_df["clean_title"] = movies_df["title"].apply(clean_title)


In [9]:
movies_df["genres"] = movies_df["genres"].apply(lambda x: x.split("|"))

print("\nMovies DataFrame with Cleaned Title and List Genres:")
print(movies_df.head())


Movies DataFrame with Cleaned Title and List Genres:
   movieId                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   

                                              genres  \
0  [Adventure, Animation, Children, Comedy, Fantasy]   
1                     [Adventure, Children, Fantasy]   
2                                  [Comedy, Romance]   
3                           [Comedy, Drama, Romance]   
4                                           [Comedy]   

                        clean_title  
0                    Toy Story 1995  
1                      Jumanji 1995  
2             Grumpier Old Men 1995  
3            Waiting to Exhale 1995  
4  Father of the Bride Part II 1995  


In [10]:
movie_ratings = pd.merge(ratings_df, movies_df, on='movieId')
print("\nMerged Movie Ratings Head:")
print(movie_ratings.head())


Merged Movie Ratings Head:
   userId  movieId  rating  timestamp                        title  \
0       1        1     4.0  964982703             Toy Story (1995)   
1       1        3     4.0  964981247      Grumpier Old Men (1995)   
2       1        6     4.0  964982224                  Heat (1995)   
3       1       47     5.0  964983815  Seven (a.k.a. Se7en) (1995)   
4       1       50     5.0  964982931   Usual Suspects, The (1995)   

                                              genres              clean_title  
0  [Adventure, Animation, Children, Comedy, Fantasy]           Toy Story 1995  
1                                  [Comedy, Romance]    Grumpier Old Men 1995  
2                          [Action, Crime, Thriller]                Heat 1995  
3                                [Mystery, Thriller]     Seven aka Se7en 1995  
4                         [Crime, Mystery, Thriller]  Usual Suspects The 1995  


In [11]:
user_movie_matrix = movie_ratings.pivot_table(index='userId', columns='title', values='rating')
print("\nUser-Movie Matrix Head (Sparse):")
print(user_movie_matrix.head())


User-Movie Matrix Head (Sparse):
title   '71 (2014)  'Hellboy': The Seeds of Creation (2004)  \
userId                                                        
1              NaN                                      NaN   
2              NaN                                      NaN   
3              NaN                                      NaN   
4              NaN                                      NaN   
5              NaN                                      NaN   

title   'Round Midnight (1986)  'Salem's Lot (2004)  \
userId                                                
1                          NaN                  NaN   
2                          NaN                  NaN   
3                          NaN                  NaN   
4                          NaN                  NaN   
5                          NaN                  NaN   

title   'Til There Was You (1997)  'Tis the Season for Love (2015)  \
userId                                                              

In [12]:
user_movie_matrix_filled = user_movie_matrix.fillna(0)
print("\nUser-Movie Matrix Filled with 0s Head:")
print(user_movie_matrix_filled.head())


User-Movie Matrix Filled with 0s Head:
title   '71 (2014)  'Hellboy': The Seeds of Creation (2004)  \
userId                                                        
1              0.0                                      0.0   
2              0.0                                      0.0   
3              0.0                                      0.0   
4              0.0                                      0.0   
5              0.0                                      0.0   

title   'Round Midnight (1986)  'Salem's Lot (2004)  \
userId                                                
1                          0.0                  0.0   
2                          0.0                  0.0   
3                          0.0                  0.0   
4                          0.0                  0.0   
5                          0.0                  0.0   

title   'Til There Was You (1997)  'Tis the Season for Love (2015)  \
userId                                                        

In [13]:
movies_df['genres_str'] = movies_df['genres'].apply(lambda x: ' '.join(x))
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(movies_df['genres_str'])


In [14]:
print("\nTF-IDF Matrix Shape (Movies x Genres):", tfidf_matrix.shape)

# Calculate cosine similarity between movies based on genres
genre_cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)
print("\nGenre Cosine Similarity Matrix Shape:", genre_cosine_sim.shape)


TF-IDF Matrix Shape (Movies x Genres): (9742, 23)

Genre Cosine Similarity Matrix Shape: (9742, 9742)


In [15]:
movie_to_index = {movie: i for i, movie in enumerate(movies_df['title'])}
index_to_movie = {i: movie for i, movie in enumerate(movies_df['title'])}


In [16]:
item_user_matrix = user_movie_matrix_filled.T # Movies as rows, Users as columns
item_similarity = cosine_similarity(item_user_matrix)

In [17]:
cf_movie_titles = user_movie_matrix_filled.columns
cf_movie_to_index = {movie: i for i, movie in enumerate(cf_movie_titles)}
cf_index_to_movie = {i: movie for i, movie in enumerate(cf_movie_titles)}


In [20]:
def get_hybrid_recommendations(movie_title, movies_df, user_movie_matrix_filled,
                               genre_cosine_sim, item_similarity,
                               movie_to_index, index_to_movie,
                               cf_movie_to_index, cf_index_to_movie,
                               top_n=10, genre_weight=0.5):
    """
    Generates hybrid movie recommendations (collaborative + content-based).
    """
    if movie_title not in movies_df['title'].values:
        print(f"Movie '{movie_title}' not found in the dataset.")
        return []

    # Get the index of the input movie from movies_df
    movie_idx_genre = movie_to_index.get(movie_title)
    if movie_idx_genre is None: # Fallback for slight title variations if clean_title not used consistently
        matching_movies = movies_df[movies_df['clean_title'].str.contains(clean_title(movie_title), case=False, na=False)]
        if not matching_movies.empty:
            movie_idx_genre = movie_to_index.get(matching_movies.iloc[0]['title'])
        else:
            print(f"Could not find exact match or close match for '{movie_title}' in movies_df for genre.")
            return []


    # Content-based (Genre) recommendations
    genre_scores = genre_cosine_sim[movie_idx_genre]
    genre_recommendations = sorted(list(enumerate(genre_scores)), key=lambda x: x[1], reverse=True)
    # Exclude the input movie itself
    genre_recommendations = [rec for rec in genre_recommendations if index_to_movie[rec[0]] != movie_title]


    # Collaborative Filtering (Item-Item) recommendations
    # Ensure the movie exists in the CF matrix (not all movies might have ratings)
    if movie_title not in cf_movie_to_index:
        print(f"Movie '{movie_title}' not found in the collaborative filtering matrix. Relying on content-based only.")
        cf_recommendations = []
    else:
        movie_idx_cf = cf_movie_to_index[movie_title]
        cf_scores = item_similarity[movie_idx_cf]
        cf_recommendations = sorted(list(enumerate(cf_scores)), key=lambda x: x[1], reverse=True)
        # Exclude the input movie itself
        cf_recommendations = [rec for rec in cf_recommendations if cf_index_to_movie[rec[0]] != movie_title]


    # Combine and Hybridize
    # Create a dictionary to store combined scores
    combined_scores = {}

    # Add genre scores
    for idx, score in genre_recommendations:
        combined_scores[index_to_movie[idx]] = score * genre_weight

    # Add collaborative filtering scores
    for idx, score in cf_recommendations:
        movie = cf_index_to_movie[idx]
        # Add CF score, weighted by (1 - genre_weight)
        # If movie already has a genre score, add to it. Otherwise, initialize.
        combined_scores[movie] = combined_scores.get(movie, 0) + (score * (1 - genre_weight))

    # Sort combined scores
    sorted_recommendations = sorted(combined_scores.items(), key=lambda x: x[1], reverse=True)

    # Get top N recommendations
    top_recommendations = []
    seen_movies = set() # To ensure uniqueness
    for movie, score in sorted_recommendations:
        if movie not in seen_movies:
            top_recommendations.append(movie)
            seen_movies.add(movie)
        if len(top_recommendations) >= top_n:
            break
    return top_recommendations

# --- Test the Recommendation Function ---
print("\n--- Testing Recommendation Function ---")
example_movie = "Toy Story (1995)"
recommendations = get_hybrid_recommendations(
    example_movie,
    movies_df,
    user_movie_matrix_filled,
    genre_cosine_sim,
    item_similarity,
    movie_to_index,
    index_to_movie,
    cf_movie_to_index,
    cf_index_to_movie,
    top_n=5,
    genre_weight=0.5 # Adjust this weight to prioritize genre vs. collaborative filtering
)

print(f"\nRecommendations for '{example_movie}':")
for i, movie in enumerate(recommendations):
    print(f"{i+1}. {movie}")

example_movie_2 = "Jumanji (1995)"
recommendations_2 = get_hybrid_recommendations(
    example_movie_2,
    movies_df,
    user_movie_matrix_filled,
    genre_cosine_sim,
    item_similarity,
    movie_to_index,
    index_to_movie,
    cf_movie_to_index,
    cf_index_to_movie,
    top_n=5,
    genre_weight=0.7 # More emphasis on genre
)

print(f"\nRecommendations for '{example_movie_2}' (more genre emphasis):")
for i, movie in enumerate(recommendations_2):
    print(f"{i+1}. {movie}")


--- Testing Recommendation Function ---

Recommendations for 'Toy Story (1995)':
1. Toy Story 2 (1999)
2. Monsters, Inc. (2001)
3. Shrek (2001)
4. Antz (1998)
5. Finding Nemo (2003)

Recommendations for 'Jumanji (1995)' (more genre emphasis):
1. Harry Potter and the Sorcerer's Stone (a.k.a. Harry Potter and the Philosopher's Stone) (2001)
2. Chronicles of Narnia: The Lion, the Witch and the Wardrobe, The (2005)
3. NeverEnding Story, The (1984)
4. Indian in the Cupboard, The (1995)
5. Golden Compass, The (2007)
