In [None]:
import pandas as pd
from urllib.request import urlretrieve
from zipfile import ZipFile
import os
import sys
from shutil import rmtree

module_path = os.path.abspath(os.path.join('utils'))
if module_path not in sys.path:
    sys.path.append(module_path)

from data_loader import load_imdb_dataset

# Data Loading

## Load IMDB dataset

In [None]:
# Load the three IMDB tables through the project helper, then report the
# (rows, columns) shape of each one in the order they were returned.
name_basics, title_basics, title_ratings = load_imdb_dataset()

for imdb_table in (name_basics, title_basics, title_ratings):
    print(imdb_table.shape)

## Load MovieLens 1M dataset

In [None]:
# Download the MovieLens 1M archive and extract it under `data_path`.
data_path = 'data/'

# exist_ok=True keeps the cell safe to re-run when the folder already exists.
os.makedirs(data_path, exist_ok=True)

# Remove ml-1m if it exists in the data folder, so files from a previous
# extraction never mix with the fresh download.
if 'ml-1m' in os.listdir(data_path):
    rmtree(data_path + 'ml-1m')

# (source URL, local file name for the temporary archive)
dsURL = "http://files.grouplens.org/datasets/movielens/ml-1m.zip", "movielens.zip"
print(f"Downloading {dsURL[0]}...")
urlretrieve(dsURL[0], dsURL[1])

try:
    # The context manager closes the archive handle before we delete the file.
    with ZipFile(dsURL[1], "r") as archive:
        archive.extractall(data_path)
finally:
    # Remove the zip file even when extraction fails part-way through.
    os.remove(dsURL[1])

In [None]:
# The MovieLens .dat files use "::" as the field separator, which needs the
# python parsing engine, and they ship without a header row.
ml_dir = data_path + "ml-1m/"

users = pd.read_csv(
    ml_dir + "users.dat",
    names=["user_id", "sex", "age_group", "occupation", "zip_code"],
    sep="::",
    engine="python",
)

ratings = pd.read_csv(
    ml_dir + "ratings.dat",
    names=["user_id", "movie_id", "rating", "unix_timestamp"],
    sep="::",
    engine="python",
)

# movies.dat is read as latin-1 (the only one of the three files given an
# explicit encoding).
movies = pd.read_csv(
    ml_dir + "movies.dat",
    names=["movie_id", "title", "genres"],
    sep="::",
    engine="python",
    encoding="latin-1",
)

for table_label, table in (("users", users), ("ratings", ratings), ("movies", movies)):
    print(f"Size for {table_label}:", table.shape)

In [None]:
# Working copies used throughout the project; the frames loaded above stay
# untouched so preprocessing can be re-run from them.

ml_users, ml_ratings, ml_movies = (
    raw_table.copy() for raw_table in (users, ratings, movies)
)

# Data Preprocessing

For this project we use two datasets that are not linked to each other. We will preprocess them separately and then merge them together. What we do on the datasets:
- change column names
- remove columns that are not needed
- process titles to remove unwanted characters to make the merging easier
- genres are one-hot encoded

## Preprocess MovieLens 1M dataset

In [None]:
# Prefix the raw integer codes so they read as categorical labels, not numbers.
ml_users["user_id"] = ml_users["user_id"].apply(lambda x: f"user_{x}")
ml_users["age_group"] = ml_users["age_group"].apply(lambda x: f"group_{x}")
ml_users["occupation"] = ml_users["occupation"].apply(lambda x: f"occupation_{x}")

ml_movies["movie_id"] = ml_movies["movie_id"].apply(lambda x: f"movie_{x}")

# Titles end with " (YYYY)": the year is the four characters before the closing
# parenthesis, and the last seven characters (space + parenthesised year) are
# stripped from the title.
ml_movies["date"] = ml_movies["title"].str[-5:-1]
ml_movies["title"] = ml_movies["title"].str[:-7]

# Any remaining parenthesised text is kept as the original title.
# expand=False returns one value per row (NaN when there is no parenthesis).
# The previous `.to_string()` call rendered the whole extraction result as a
# single string and assigned that same string to every row.
ml_movies["original_title"] = ml_movies["title"].str.extract(r"\((.*)\)", expand=False)
ml_movies["title"] = ml_movies["title"].str.replace(r"\(.*\)", "", regex=True).str.strip()

# For all the movie titles that have ", The" or ", Les" at the end, move the
# article to the beginning (without the comma) and remove it from the end.
ml_movies["title"] = ml_movies["title"].apply(lambda x: "The " + x[:-5] if x.endswith(", The") else x)
ml_movies["title"] = ml_movies["title"].apply(lambda x: "Les " + x[:-5] if x.endswith(", Les") else x)

# Rename movies['title'] to movies['primary_title']
ml_movies.rename(columns={"title": "primary_title"}, inplace=True)

ml_ratings["movie_id"] = ml_ratings["movie_id"].apply(lambda x: f"movie_{x}")
ml_ratings["user_id"] = ml_ratings["user_id"].apply(lambda x: f"user_{x}")
ml_ratings["rating"] = ml_ratings["rating"].astype(float)

In [None]:
# Build the merge key for the primary title: remove spaces, lower-case, then
# drop every non-ASCII character left after NFKD decomposition.
ml_movies["primary_title_processed"] = (
    ml_movies["primary_title"]
    .apply(lambda title: title.replace(" ", "").lower())
    .str.normalize("NFKD")
    .str.encode("ascii", errors="ignore")
    .str.decode("utf-8")
)

# If original_title is NaN, we will use the primary_title
ml_movies["original_title"] = ml_movies["original_title"].fillna(ml_movies["primary_title"])

# Same normalisation for the original title.
ml_movies["original_title_processed"] = (
    ml_movies["original_title"]
    .apply(lambda title: title.replace(" ", "").lower())
    .str.normalize("NFKD")
    .str.encode("ascii", errors="ignore")
    .str.decode("utf-8")
)

In [None]:
# One-hot encode the pipe-separated genre string: one 0/1 column per genre.
# str.get_dummies returns its columns in sorted order, so the resulting layout
# is the same on every run (the previous list(set(...)) order depended on
# string hashing).
genre_flags = ml_movies["genres"].str.get_dummies(sep="|")
genres = genre_flags.columns.tolist()

# Swap the raw genre string for the indicator columns.
ml_movies = pd.concat([ml_movies.drop(columns=["genres"]), genre_flags], axis=1)

## Preprocess IMDB dataset

We only keep the movies, shorts and tvSeries. I personally do not mind being recommended a short or a tvSeries even though I am looking for a movie. I think it can be interesting to have a mix of different types of content.

In [None]:
# Working copies used throughout the project; the frames returned by the
# loader stay untouched.

imdb_name, imdb_title, imdb_rating = (
    raw_table.copy() for raw_table in (name_basics, title_basics, title_ratings)
)

In [None]:
# Print all different titleType values.
# The label previously said "genres", but the values printed are title types.
print("Different titleType values:", imdb_title["titleType"].unique())

# Print the number of titles for each titleType
print("Number of each titleType:")
print(imdb_title["titleType"].value_counts())

In [None]:
# Keep only movies, shorts and TV series, and report how many rows were dropped.
size_before = imdb_title.shape[0]

# .copy() detaches the filtered frame from the original, so the in-place
# renames, drops and column assignments in later cells operate on an
# independent frame instead of a slice (avoids SettingWithCopyWarning).
imdb_title = imdb_title[imdb_title["titleType"].isin(["movie", "short", "tvSeries"])].copy()

size_after = imdb_title.shape[0]
print(f"Number of rows removed: {size_before - size_after}")

In [None]:
# Align the IMDB column names with the MovieLens frame, then drop the columns
# isAdult, endYear, runtimeMinutes and titleType.
imdb_title = (
    imdb_title
    .rename(columns={"primaryTitle": "primary_title", "originalTitle": "original_title", "startYear": "date"})
    .drop(columns=["isAdult", "endYear", "runtimeMinutes", "titleType"])
)

In [None]:
# One-hot encode the comma-separated IMDB genre string: one 0/1 column per
# distinct genre token.

# Split every genre string once and reuse the result for all genres.
split_genres = imdb_title["genres"].str.split(",")

# str.split yields NaN for rows without a genre string; those cannot be
# iterated, so they are skipped while collecting the vocabulary. Sorting makes
# the column order the same on every run.
genres = sorted({genre for genre_list in split_genres.dropna() for genre in genre_list})

for genre in genres:
    # Rows without a genre list get 0 for every genre.
    imdb_title[genre] = split_genres.apply(
        lambda gs, genre=genre: int(isinstance(gs, list) and genre in gs)
    )

imdb_title.drop(columns=["genres"], inplace=True)

In [None]:
# The genre encoding above produced a column literally named "\N"; remove it.

imdb_title = imdb_title.drop(columns=["\\N"])

In [None]:
# Build the merge keys exactly as for MovieLens: stringify, remove spaces,
# lower-case, then drop every non-ASCII character left after NFKD decomposition.
for title_column in ("primary_title", "original_title"):
    compact_title = imdb_title[title_column].apply(
        lambda title: str(title).replace(" ", "").lower()
    )
    imdb_title[f"{title_column}_processed"] = (
        compact_title
        .str.normalize("NFKD")
        .str.encode("ascii", errors="ignore")
        .str.decode("utf-8")
    )

## Merge MovieLens 1M and IMDB datasets

We want to merge both datasets to **enrich** the MovieLens 1M dataset with the IMDB dataset. We will use the primary title of the movie as the join key. We will also use the year of the movie to differentiate between movies with the same title.

To do so we will :
- Merge the datasets on the primary title of the movie (processed)
- Process the unwanted columns

In [None]:
# Attach IMDB rows to MovieLens movies that share the processed title and year.
# NOTE(review): one MovieLens row may match several IMDB rows here, which would
# duplicate movie_id values downstream — confirm whether that is intended.
merged_movies = ml_movies.merge(
    imdb_title,
    how='left',
    on=['primary_title_processed', 'date'],
)

# Keep only the rows that found an IMDB match (tconst is not NaN)
merged_movies = merged_movies[merged_movies["tconst"].notna()]

In [None]:
def select_non_zero(col1, col2):
    """Merge two suffixed columns into one, preferring informative values.

    For each row the result is:
      * the value from ``col1`` when it is present and not 0;
      * otherwise the value from ``col2``, falling back to ``col1`` when
        ``col2`` is missing.

    The previous implementation relied on ``combine_first``, which only fills
    *missing* values: a 0 in ``col1`` therefore masked a non-zero ``col2``
    (e.g. a genre flag set on one side of the merge only was lost).

    Parameters
    ----------
    col1, col2 : pandas.Series
        Columns sharing the same index, named ``<base>_x`` / ``<base>_y``.

    Returns
    -------
    pandas.Series
        Combined values, named ``<base>`` (the name with the two-character
        suffix removed).
    """
    if col1.name.endswith('_x'):
        base_name = col1.name[:-2]
    else:
        base_name = col2.name[:-2]

    # Rows where the first column carries real information.
    first_is_informative = col1.notna() & (col1 != 0)

    # Elsewhere use col2, unless col2 is missing, in which case keep col1.
    fallback = col2.combine_first(col1)

    # Build the result from col1 itself instead of an int-initialised Series,
    # so text columns are not written into an integer container.
    result = col1.where(first_is_informative, fallback)
    result.name = base_name
    return result

# Columns present in both frames come out of the merge suffixed with _x / _y.
x_cols = list(filter(lambda name: name.endswith('_x'), merged_movies.columns))
y_cols = list(filter(lambda name: name.endswith('_y'), merged_movies.columns))

# Collapse each _x/_y pair into one un-suffixed column; an _x column without a
# partner simply loses its suffix.
for left_name in x_cols:
    stem = left_name[:-2]
    right_name = stem + '_y'

    if right_name not in y_cols:
        merged_movies = merged_movies.rename(columns={left_name: stem})
        continue

    merged_movies[stem] = select_non_zero(merged_movies[left_name], merged_movies[right_name])
    merged_movies = merged_movies.drop(columns=[left_name, right_name])

# Strip the suffix from any _y column that is still present.
for right_name in y_cols:
    if right_name not in merged_movies.columns:
        continue
    merged_movies = merged_movies.rename(columns={right_name: right_name[:-2]})

# Keep only the first occurrence of each column name.
is_repeated = merged_movies.columns.duplicated()
merged_movies = merged_movies.loc[:, ~is_repeated]

print("Merged and cleaned dataset shape:", merged_movies.shape)
print("Columns in final dataset:", merged_movies.columns.tolist())

In [None]:
# Reorder the columns for better readability: identifiers, titles, release
# year, then the genre indicator columns. Columns not listed are dropped.

new_order = (
    ['movie_id', 'tconst']
    + ['primary_title', 'original_title']
    + ['date']
    + [
        'Action', 'Adventure', 'Animation', 'Biography', "Children's", 'Comedy', 'Crime',
        'Documentary', 'Drama', 'Family', 'Fantasy', 'Film-Noir', 'Game-Show', 'History',
        'Horror', 'Music', 'Musical', 'Mystery', 'News', 'Reality-TV', 'Romance', 'Sci-Fi',
        'Short', 'Sport', 'Talk-Show', 'Thriller', 'War', 'Western', 'Adult',
    ]
)

merged_movies = merged_movies.loc[:, new_order]

In [None]:
# Float columns are converted to int; missing values become 0 first.
float_columns = [
    column for column in merged_movies.columns
    if merged_movies[column].dtype == float
]
for column in float_columns:
    merged_movies[column] = merged_movies[column].fillna(0).astype(int)

display(merged_movies.head())

This merging will allow us to have more information about the movies in the MovieLens 1M dataset. We only kept the columns that are useful for the recommendation system.

## Features extraction

For this exercise we will use the following features:
- genres
- actors and actresses
- average rating from IMDB
- user rating from MovieLens 1M

I decided to use the actors and actresses as features because, based on my girlfriend's and my own experience, we tend to like movies with the same actors and actresses. We sometimes watch a movie because of the cast and not the plot.

In [None]:
# People whose primaryProfession mentions "actor" or "actress" (case-insensitive);
# rows with a missing profession are excluded (na=False).
actor_data = imdb_name[imdb_name['primaryProfession'].str.contains('actor|actress', case=False, na=False)]

# Dictionary mapping tconst to a list of actors.
# Iterating the two needed columns directly avoids building a Series per row
# (iterrows), which is very slow on a table of this size. A plain dict is kept
# on purpose so that Series.map leaves unmatched titles as NaN.
movie_actors = {}
for actor_name, known_titles in zip(actor_data['primaryName'], actor_data['knownForTitles']):
    # A missing knownForTitles is not a string and cannot be split; skip it.
    if not isinstance(known_titles, str):
        continue
    for movie in known_titles.split(','):
        movie_actors.setdefault(movie, []).append(actor_name)

In [None]:
# Bring in the IMDB averageRating for every merged movie (NaN when the title
# has no rating row).
merged_movies = merged_movies.merge(
    imdb_rating[['tconst', 'averageRating']],
    how='left',
    on='tconst',
)

In [None]:
# Look up the cast list of each title, then keep at most five names as one
# comma-separated string; titles without a cast list get an empty string.
cast_by_movie = merged_movies['tconst'].map(movie_actors)
merged_movies['actors'] = cast_by_movie.map(
    lambda cast: ','.join(cast[:5]) if isinstance(cast, list) else ''
)

We create the features column by concatenating the genres, actors and actresses columns. We then use the TfidfVectorizer to transform the features into a matrix of TF-IDF weights (token frequencies down-weighted by how common each token is across all movies). We use **TF-IDF** to prepare the data for the recommendation system.

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

def get_movie_features(movie):
    """Build the space-separated feature text for one movie row.

    The text lists every genre whose indicator equals 1 (in the fixed order
    below), followed by the pieces of ``movie['actors']`` split on commas.

    Parameters
    ----------
    movie : mapping
        Row-like object indexable by genre name and by ``'actors'``.

    Returns
    -------
    str
        Genre names and actor names joined with single spaces.
    """
    genre_names = (
        'Action', 'Adventure', 'Animation', 'Biography', "Children's", 'Comedy', 'Crime',
        'Documentary', 'Drama', 'Family', 'Fantasy', 'Film-Noir', 'Game-Show', 'History',
        'Horror', 'Music', 'Musical', 'Mystery', 'News', 'Reality-TV', 'Romance', 'Sci-Fi',
        'Short', 'Sport', 'Talk-Show', 'Thriller', 'War', 'Western', 'Adult',
    )
    tokens = [name for name in genre_names if movie[name] == 1]
    # An empty actors string still contributes one empty token.
    tokens += movie['actors'].split(',')
    return ' '.join(tokens)

# One feature string per movie (row-wise apply).
merged_movies['features'] = merged_movies.apply(get_movie_features, axis=1)

# Vectorise the feature text; row i of tfidf_matrix corresponds to row i of
# merged_movies.
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(merged_movies['features'])

In [None]:
from scipy.sparse import csr_matrix

# Movies as rows, users as columns, ratings as values; a (movie, user) pair
# without a rating is stored as 0.
user_item_matrix = (
    ml_ratings
    .pivot(index='movie_id', columns='user_id', values='rating')
    .fillna(0)
)
# Sparse copy of the same values, used to fit the neighbour model.
csr_ratings = csr_matrix(user_item_matrix.to_numpy())

## Model and recommendation

I decided to recommend 3 movies because, based on my own experience, I still like to have a choice over what I will watch. Movies will be recommended based on features and user ratings, and then ordered by the average rating from IMDB.

In [None]:
from sklearn.neighbors import NearestNeighbors

# Brute-force nearest-neighbour search under cosine distance, using every
# available core; 20 neighbours unless a call asks for another number.
model = NearestNeighbors(
    n_neighbors=20,
    metric='cosine',
    algorithm='brute',
    n_jobs=-1,
)

The first approach is very naive. We will combine the content-based filtering and collaborative filtering.

- `get_movie_recommendations`: This function uses collaborative filtering to find similar movies based on user ratings.
- `make_recommendations`: This is the main function that combines content-based and collaborative filtering to generate recommendations. This is where we combine the features extracted from the IMDB dataset and the user ratings from the MovieLens 1M dataset.

In detail: The function `make_recommendations(movie1, movie2)` first retrieves the specified movies from the dataset. It then combines their TF-IDF vectors to create a unified feature vector. Using this combined vector, it identifies the top 20 similar movies through content-based recommendations. Concurrently, it obtains recommendations via collaborative filtering for each movie. These recommendations are merged, filtered, and sorted based on frequency and rating. Finally, the function outputs the top 3 recommended movie titles.

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
from collections import Counter

def get_movie_recommendations(movie_id, user_item_matrix, model, n_recommendations=5):
    """Return the movies whose rating vectors are closest to ``movie_id``.

    Parameters
    ----------
    movie_id : hashable
        Row label of the query movie in ``user_item_matrix``.
    user_item_matrix : pandas.DataFrame
        One row per movie, one column per user.
    model : object exposing ``kneighbors(X, n_neighbors=...)``
        Assumed to be already fitted on the rows of ``user_item_matrix``, in
        the same order — verify against the caller.
    n_recommendations : int, default 5
        Maximum number of neighbours to return.

    Returns
    -------
    list
        Up to ``n_recommendations`` movie ids, nearest first, never containing
        ``movie_id`` itself. Empty when ``movie_id`` is not in the matrix.
    """
    if movie_id not in user_item_matrix.index:
        return []

    movie_vector = user_item_matrix.loc[movie_id].values.reshape(1, -1)

    # One extra neighbour accounts for the query movie itself, but never ask
    # for more neighbours than there are rows: kneighbors raises in that case.
    n_neighbors = min(n_recommendations + 1, len(user_item_matrix))
    _, indices = model.kneighbors(movie_vector, n_neighbors=n_neighbors)

    neighbours = user_item_matrix.index[indices.flatten()]

    # Drop the query by identity rather than by position: when distances tie,
    # the query is not guaranteed to be the first neighbour returned.
    return [candidate for candidate in neighbours if candidate != movie_id][:n_recommendations]

In [None]:
def make_recommendations(movie1, movie2):
    """Recommend up to three titles similar to two input movies.

    Candidates come from two sources: the 20 movies whose TF-IDF features are
    closest to the summed features of the inputs, and the rating-based
    neighbours of each input. Candidates are ranked by how many times they
    were proposed, then by IMDB average rating.

    Parameters
    ----------
    movie1, movie2 : str
        Values of ``primary_title`` identifying the two input movies.

    Returns
    -------
    list of str
        Primary titles of the top recommendations (at most three).

    Raises
    ------
    ValueError
        If the two titles do not select exactly two rows of ``merged_movies``.
    """
    model.fit(csr_ratings)

    input_mask = merged_movies['primary_title'].isin([movie1, movie2])
    movies = merged_movies[input_mask]
    if len(movies) != 2:
        raise ValueError("One or both movies not found in the dataset")

    # tfidf_matrix is addressed by row position, so use positions rather than
    # index labels (the two only coincide for a default RangeIndex).
    positions = input_mask.to_numpy().nonzero()[0]
    input_positions = set(positions.tolist())
    input_ids = set(movies['movie_id'])

    combined_features = tfidf_matrix[positions[0]] + tfidf_matrix[positions[1]]
    similar_scores = cosine_similarity(combined_features, tfidf_matrix).flatten()

    # Top 20 by content similarity. The inputs are excluded explicitly: they
    # are not guaranteed to occupy the first two ranks of the ordering.
    ranked_positions = similar_scores.argsort()[::-1]
    content_based_indices = [pos for pos in ranked_positions if pos not in input_positions][:20]
    content_based_recommendations = merged_movies.iloc[content_based_indices]['movie_id'].tolist()

    # Rating-based neighbours of each input; an input is never recommended back.
    collaborative_recommendations = []
    for movie_id in movies['movie_id']:
        collaborative_recommendations.extend(
            candidate
            for candidate in get_movie_recommendations(movie_id, user_item_matrix, model)
            if candidate not in input_ids
        )

    # Combine content-based and collaborative recommendations
    all_recommendations = content_based_recommendations + collaborative_recommendations
    recommendation_counts = Counter(all_recommendations)

    # One lookup row per movie_id (first occurrence), built once instead of
    # rescanning the frame for every candidate.
    catalog = merged_movies.drop_duplicates('movie_id').set_index('movie_id')

    valid_recommendations = []
    for movie_id, count in recommendation_counts.items():
        if movie_id not in catalog.index:
            continue
        rating = catalog.at[movie_id, 'averageRating']
        # NaN never compares as greater or smaller, which makes the sort order
        # undefined; rank unrated movies last among equal counts instead.
        if pd.isna(rating):
            rating = float('-inf')
        valid_recommendations.append((movie_id, (count, rating)))

    sorted_recommendations = sorted(valid_recommendations, key=lambda rec: rec[1], reverse=True)

    # Return top 3 recommendations
    top_recommendations = sorted_recommendations[:3]
    return [catalog.at[rec[0], 'primary_title'] for rec in top_recommendations]

In [None]:
# Example: recommendations seeded with two well-known titles.
movie1, movie2 = 'Toy Story', 'Jumanji'

recommendations = make_recommendations(movie1, movie2)
print(f"Recommendations for {movie1} and {movie2}:")
for position, title in enumerate(recommendations, start=1):
    print(f"{position}. {title}")

## Analysis

The current method combines content-based and collaborative recommendations by simply adding them together and then ranking based on frequency and average rating. Let's break down the advantages and disadvantages of this approach.

**Advantages:**
- The model is simple and easy to understand.
- Easy to implement.
- It still uses both content-based and collaborative filtering.

**Disadvantages:**
- It does not directly balance the influence of content-based and collaborative filtering.
- It might favor movies that appear in both content-based and collaborative recommendations.

We might improve this model by adding **weights** to the content-based and collaborative recommendations.

The function `make_recommendations_improved(movie1, movie2, content_weight=0.2, collab_weight=0.4, movies_df=merged_movies)` first retrieves the specified movies from the dataset. It combines their TF-IDF vectors to create a unified feature vector and calculates content-based similarity scores. These scores are normalized for consistency. Concurrently, it computes collaborative filtering scores by recommending similar movies for each input movie, with scores normalized similarly. The function then combines the normalized content and collaborative scores using the provided weights. After excluding the input movies from the recommendations, it identifies the top 10 recommendations based on the combined scores. These recommendations are sorted by the combined score and average rating. Finally, the function outputs the top 3 recommended movie titles.

In [None]:
import numpy as np

def make_recommendations_improved(movie1, movie2, content_weight=0.2, collab_weight=0.4, movies_df=merged_movies):
    """Recommend three titles from a weighted blend of two similarity scores.

    Every movie in ``movies_df`` receives a content score (cosine similarity
    between its TF-IDF row and the summed TF-IDF rows of the two inputs) and a
    collaborative score (sum of reciprocal ranks among the rating-based
    neighbours of each input). Both are min-max normalised and combined as
    ``content_weight * content + collab_weight * collab``.

    Parameters
    ----------
    movie1, movie2 : str
        Values of ``primary_title`` identifying the two input movies.
    content_weight, collab_weight : float
        Weights of the two normalised score vectors.
    movies_df : pandas.DataFrame
        Movie table. Its rows are assumed to be in the same order as the rows
        of the module-level ``tfidf_matrix`` — confirm when passing a custom
        frame.

    Returns
    -------
    list of str
        Primary titles of the three best-scoring movies, inputs excluded.

    Raises
    ------
    ValueError
        If the two titles do not select exactly two rows of ``movies_df``.
    """
    def _min_max(scores):
        # A constant vector has no spread; return zeros instead of dividing
        # by zero and filling the scores with NaN.
        spread = scores.max() - scores.min()
        if spread == 0:
            return np.zeros_like(scores)
        return (scores - scores.min()) / spread

    model.fit(csr_ratings)

    input_mask = movies_df['primary_title'].isin([movie1, movie2]).to_numpy()
    movies = movies_df[input_mask]
    if len(movies) != 2:
        raise ValueError("One or both movies not found in the dataset")

    # Row positions, not index labels: the score arrays and tfidf_matrix are
    # addressed positionally.
    indices = np.flatnonzero(input_mask)
    combined_features = tfidf_matrix[indices[0]] + tfidf_matrix[indices[1]]
    content_scores = cosine_similarity(combined_features, tfidf_matrix).flatten()

    # Normalize content scores
    content_scores = _min_max(content_scores)

    # movie_id -> row positions, built once so each neighbour is located with
    # a dictionary lookup instead of a scan over the whole frame.
    positions_by_id = {}
    for position, known_id in enumerate(movies_df['movie_id']):
        positions_by_id.setdefault(known_id, []).append(position)

    # The neighbour search cannot return more movies than the rating matrix
    # holds (minus the query itself).
    max_neighbours = min(len(movies_df), len(user_item_matrix) - 1)

    # Get collaborative scores
    collab_scores = np.zeros(len(movies_df))
    for movie_id in movies['movie_id']:
        similar_movies = get_movie_recommendations(movie_id, user_item_matrix, model, n_recommendations=max_neighbours)
        for rank, similar_movie in enumerate(similar_movies):
            hits = positions_by_id.get(similar_movie)
            if hits:
                # Reciprocal rank: closer neighbours contribute more.
                collab_scores[hits] += 1 / (rank + 1)

    # Normalize collaborative scores
    collab_scores = _min_max(collab_scores)

    # Combine scores
    combined_scores = content_weight * content_scores + collab_weight * collab_scores

    # Remove input movies from recommendations
    combined_scores[indices] = -1

    # Get top recommendations
    top_indices = combined_scores.argsort()[::-1][:10]  # Get top 10 recommendations

    recommendations = []
    for idx in top_indices:
        movie = movies_df.iloc[idx]
        rating = movie['averageRating']
        # NaN breaks tuple comparison; rank unrated movies last among ties.
        if pd.isna(rating):
            rating = float('-inf')
        recommendations.append((movie['primary_title'], rating, combined_scores[idx]))

    # Sort by combined score and then by average rating
    recommendations.sort(key=lambda rec: (rec[2], rec[1]), reverse=True)

    return [rec[0] for rec in recommendations[:3]]  # Return top 3 recommendations

In [None]:
# Same two seed titles as before, now scored with the weighted variant.
movie1, movie2 = 'Toy Story', 'Jumanji'

recommendations = make_recommendations_improved(movie1, movie2)
print(f"Recommendations for {movie1} and {movie2}:")
for position, title in enumerate(recommendations, start=1):
    print(f"{position}. {title}")

This improved version:

- Uses both content-based and collaborative methods for all potential recommendations.
- Normalizes scores from both methods to ensure fair comparison.
- Allows adjusting the weight of content-based vs. collaborative methods.
- Provides a more nuanced ranking system that considers both similarity and user ratings.

## Misunderstanding of the task

I thought we had to predict movies based on 2 input movies, but we actually have to provide 2 users. The recommendation function has to be adapted to take 2 users.

New version of the function, `make_recommendations_for_users`:
- Retrieves the movies rated by each user
- Combines these movies to create a joint user profile
- Calculates an average feature vector for their combined movies
- Computes content-based similarity scores using this combined vector
- Determines collaborative filtering scores based on similar movies
- Merges content-based and collaborative scores with adjustable weights
- Excludes movies that either user has already rated
- Ranks potential recommendations based on the combined scores and average ratings

In [None]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def make_recommendations_for_users(user1_id, user2_id, content_weight=0.2, collab_weight=0.4, movies_df=merged_movies, n_recommendations=5):
    """Recommend movies for a pair of users from their combined rating history.

    The movies rated by either user form a joint profile. Every movie in
    ``movies_df`` receives a content score (cosine similarity to the mean
    TF-IDF row of the profile) and a collaborative score (sum of reciprocal
    ranks among the rating-based neighbours of each profile movie). Both are
    min-max normalised and combined as
    ``content_weight * content + collab_weight * collab``.

    Parameters
    ----------
    user1_id, user2_id : str
        Values of ``user_id`` in ``ml_ratings``.
    content_weight, collab_weight : float
        Weights of the two normalised score vectors.
    movies_df : pandas.DataFrame
        Movie table. Its rows are assumed to be in the same order as the rows
        of the module-level ``tfidf_matrix`` — confirm when passing a custom
        frame.
    n_recommendations : int, default 5
        Number of titles to return.

    Returns
    -------
    list of str
        Primary titles of the best-scoring movies that neither user has rated.

    Raises
    ------
    ValueError
        If the users have no ratings, or none of their rated movies is present
        in ``movies_df``.
    """
    def _min_max(scores):
        # A constant vector has no spread; return zeros instead of dividing
        # by zero and filling the scores with NaN.
        spread = scores.max() - scores.min()
        if spread == 0:
            return np.zeros_like(scores)
        return (scores - scores.min()) / spread

    model.fit(csr_ratings)

    # Get the movies rated by each user
    user1_movies = ml_ratings[ml_ratings['user_id'] == user1_id]['movie_id'].tolist()
    user2_movies = ml_ratings[ml_ratings['user_id'] == user2_id]['movie_id'].tolist()

    # Combine the movies from both users
    combined_movies = list(set(user1_movies + user2_movies))

    if len(combined_movies) == 0:
        raise ValueError("No movies found for these users")

    # Row positions (not index labels) of the already-rated movies: the score
    # arrays and tfidf_matrix are addressed positionally.
    watched_positions = np.flatnonzero(movies_df['movie_id'].isin(combined_movies).to_numpy())
    if len(watched_positions) == 0:
        # Without at least one profile row there is nothing to average.
        raise ValueError("None of the movies rated by these users is in the movie dataset")

    # Calculate the average feature vector for the combined movies
    combined_features = np.asarray(tfidf_matrix[watched_positions].mean(axis=0)).flatten()

    content_scores = cosine_similarity(combined_features.reshape(1, -1), tfidf_matrix).flatten()

    # Normalize content scores
    content_scores = _min_max(content_scores)

    # movie_id -> first row position, built once so each neighbour is located
    # with a dictionary lookup instead of a scan over the whole frame.
    first_position = {}
    for position, known_id in enumerate(movies_df['movie_id']):
        first_position.setdefault(known_id, position)

    # The neighbour search cannot return more movies than the rating matrix
    # holds (minus the query itself).
    max_neighbours = min(len(movies_df), len(user_item_matrix) - 1)

    # Get collaborative scores
    collab_scores = np.zeros(len(movies_df))
    for movie_id in combined_movies:
        similar_movies = get_movie_recommendations(movie_id, user_item_matrix, model, n_recommendations=max_neighbours)
        for rank, similar_movie in enumerate(similar_movies):
            position = first_position.get(similar_movie)
            if position is not None:
                # Reciprocal rank: closer neighbours contribute more.
                collab_scores[position] += 1 / (rank + 1)

    # Normalize collaborative scores
    collab_scores = _min_max(collab_scores)

    # Combine scores
    combined_scores = content_weight * content_scores + collab_weight * collab_scores

    # Remove already watched movies from recommendations: rank all rows, then
    # skip the watched ones so they can never be returned.
    watched = set(watched_positions.tolist())
    ranked_positions = [idx for idx in combined_scores.argsort()[::-1] if idx not in watched]

    # Keep a shortlist of at least 10 candidates, widened when the caller asks
    # for more than 10 (the fixed cut-off of 10 used to cap the result).
    top_indices = ranked_positions[:max(10, n_recommendations)]

    recommendations = []
    for idx in top_indices:
        movie = movies_df.iloc[idx]
        rating = movie['averageRating']
        # NaN breaks tuple comparison; rank unrated movies last among ties.
        if pd.isna(rating):
            rating = float('-inf')
        recommendations.append((movie['primary_title'], rating, combined_scores[idx]))

    # Sort by combined score and then by average rating
    recommendations.sort(key=lambda rec: (rec[2], rec[1]), reverse=True)

    return [rec[0] for rec in recommendations[:n_recommendations]]  # Return top n recommendations

In [None]:
# List every distinct user id that appears in the ratings table.
print("User IDs:", ml_ratings['user_id'].unique(), sep="\n")

In [None]:
# Example: joint recommendations for two users picked from the list above.
user1_id, user2_id = 'user_123', 'user_6'

recommendations = make_recommendations_for_users(user1_id, user2_id)
print(f"Recommendations for users {user1_id} and {user2_id}:")
for position, title in enumerate(recommendations, start=1):
    print(f"{position}. {title}")

In [None]:
# Check whether the users have already seen the recommended movies
user1_movies = ml_ratings.loc[ml_ratings['user_id'] == user1_id, 'movie_id'].tolist()
user2_movies = ml_ratings.loc[ml_ratings['user_id'] == user2_id, 'movie_id'].tolist()

for viewer_id, seen_movies in ((user1_id, user1_movies), (user2_id, user2_movies)):
    print(f"Movies seen by {viewer_id}:")
    print(seen_movies)

# Map the recommended titles back to movie ids so they can be compared with
# each user's history.
print("Recommended movies:")
print(recommendations)

recommendations_id = merged_movies.loc[
    merged_movies['primary_title'].isin(recommendations), 'movie_id'
].tolist()
print("Recommended movies IDs:")
print(recommendations_id)

# One set per user; both are expected to be empty.
recommended_ids = set(recommendations_id)
print("Recommended movies already seen by the users:")
for seen_movies in (user1_movies, user2_movies):
    print(recommended_ids.intersection(seen_movies))

# Conclusion

The recommendation system is based on a combination of content-based and collaborative filtering. It uses TF-IDF vectors to represent movie features and user ratings to identify similar movies. The model can be improved by adjusting the weights of content-based and collaborative recommendations. The final recommendations are based on the combined scores and average ratings.

The system is simple and easy to understand, providing a good starting point for movie recommendations. Further enhancements could involve more sophisticated algorithms and additional features to improve accuracy and personalization. The current model is quite slow and may not scale well to larger datasets. Optimizations such as vectorization and parallel processing could be explored to improve performance.