In [1]:
import pandas as pd
from urllib.request import urlretrieve
from zipfile import ZipFile
import os
import sys
from shutil import rmtree

module_path = os.path.abspath(os.path.join('utils'))
if module_path not in sys.path:
    sys.path.append(module_path)

from data_loader import load_imdb_dataset

## Load IMDB dataset

In [2]:
name_basics, title_basics, title_ratings = load_imdb_dataset()

print(name_basics.shape)
print(title_basics.shape)
print(title_ratings.shape)

../data/imdb/name.basics.tsv.gz
../data/imdb/title.basics.tsv.gz
../data/imdb/title.ratings.tsv.gz
Downloading name.basics...
Extracting name.basics...
Downloading title.basics...
Extracting title.basics...
Downloading title.ratings...
Extracting title.ratings...
(13614341, 6)
(10900927, 9)
(1452487, 3)


## Load MovieLens 1M dataset

In [3]:
data_path = 'data/'

if (not os.path.exists(data_path)):
    os.makedirs(data_path)

# Remove ml-1m if it exists in the data folder
if 'ml-1m' in os.listdir(data_path):
    rmtree(data_path + 'ml-1m')
    
dsURL = "http://files.grouplens.org/datasets/movielens/ml-1m.zip", "movielens.zip";
print(f"Downloading {dsURL[0]}...")
urlretrieve(dsURL[0], dsURL[1])

ZipFile(dsURL[1], "r").extractall(data_path)

# Remove the zip file
os.remove(dsURL[1])

Downloading http://files.grouplens.org/datasets/movielens/ml-1m.zip...


In [4]:
users = pd.read_csv(
    data_path + 'ml-1m/users.dat', 
    sep='::',
    names=["user_id", "sex", "age_group", "occupation", "zip_code"],
    engine="python"
)

ratings = pd.read_csv(
    data_path + "ml-1m/ratings.dat",
    sep="::",
    names=["user_id", "movie_id", "rating", "unix_timestamp"],
    engine="python",
)

movies = pd.read_csv(
    data_path + "ml-1m/movies.dat",
    sep="::",
    names=["movie_id", "title", "genres"],
    engine="python",
    encoding="latin-1",
)

print("Size for users:", users.shape)
print("Size for ratings:", ratings.shape)
print("Size for movies:", movies.shape)

Size for users: (6040, 5)
Size for ratings: (1000209, 4)
Size for movies: (3883, 3)


In [5]:
# Copy that we will use throughout the project

ml_users = users.copy()
ml_ratings = ratings.copy()
ml_movies = movies.copy()

## Preprocess MovieLens 1M dataset

In [6]:
ml_users["user_id"] = ml_users["user_id"].apply(lambda x: f"user_{x}")
ml_users["age_group"] = ml_users["age_group"].apply(lambda x: f"group_{x}")
ml_users["occupation"] = ml_users["occupation"].apply(lambda x: f"occupation_{x}")

ml_movies["movie_id"] = ml_movies["movie_id"].apply(lambda x: f"movie_{x}")
ml_movies["date"] = ml_movies["title"].apply(lambda x: x[-5:-1])
ml_movies["title"] = ml_movies["title"].apply(lambda x: x[:-7])
ml_movies["original_title"] = ml_movies["title"].str.extract(r"\((.*)\)").to_string()
ml_movies["title"] = ml_movies["title"].str.replace(r"\(.*\)", "", regex=True).str.strip()

# For all the movies title that have ", The" or ", Les" at the end, we will move it to the beginning without the comma. End remove it from the end.
ml_movies["title"] = ml_movies["title"].apply(lambda x: "The " + x[:-5] if x[-5:] == ", The" else x)
ml_movies["title"] = ml_movies["title"].apply(lambda x: "Les " + x[:-5] if x[-5:] == ", Les" else x)

# Rename movies['title'] to movies['primary_title']
ml_movies.rename(columns={"title": "primary_title"}, inplace=True)

ml_ratings["movie_id"] = ml_ratings["movie_id"].apply(lambda x: f"movie_{x}")
ml_ratings["user_id"] = ml_ratings["user_id"].apply(lambda x: f"user_{x}")
ml_ratings["rating"] = ml_ratings["rating"].apply(lambda x: float(x))

In [7]:
ml_movies["primary_title_processed"] = ml_movies["primary_title"].apply(lambda x: x.replace(" ", "").lower())
ml_movies["primary_title_processed"] = ml_movies["primary_title_processed"].str.normalize('NFKD').str.encode('ascii', errors='ignore').str.decode('utf-8')

# If original_title is NaN, we will use the primary_title
ml_movies["original_title"] = ml_movies["original_title"].fillna(ml_movies["primary_title"])
ml_movies["original_title_processed"] = ml_movies["original_title"].apply(lambda x: x.replace(" ", "").lower())
ml_movies["original_title_processed"] = ml_movies["original_title_processed"].str.normalize('NFKD').str.encode('ascii', errors='ignore').str.decode('utf-8')

In [8]:
genres = []
for genre in ml_movies["genres"].str.split("|"):
    genres.extend(genre)
    
genres = list(set(genres))

for genre in genres:
    ml_movies[genre] = ml_movies["genres"].apply(lambda gs: int(genre in gs.split("|")))
    
ml_movies.drop(columns=["genres"], inplace=True)

In [9]:
# Print the columns of the ml_movies dataframe

print(ml_movies.columns)

Index(['movie_id', 'primary_title', 'date', 'original_title',
       'primary_title_processed', 'original_title_processed', 'Fantasy',
       'Action', 'Children's', 'Western', 'Sci-Fi', 'Documentary', 'Romance',
       'Drama', 'Horror', 'Adventure', 'Thriller', 'War', 'Crime', 'Animation',
       'Film-Noir', 'Musical', 'Mystery', 'Comedy'],
      dtype='object')


## Preprocess IMDB dataset

In [10]:
# Copy that we will use throughout the project

imdb_name = name_basics.copy()
imdb_title = title_basics.copy()
imdb_rating = title_ratings.copy()

In [11]:
# Print all different titleType 
print("Different genres:", imdb_title["titleType"].unique())

# Print the number of each titleType
print("Number of each titleType:")
print(imdb_title["titleType"].value_counts())

Different genres: ['short' 'movie' 'tvShort' 'tvMovie' 'tvSeries' 'tvEpisode' 'tvMiniSeries'
 'tvSpecial' 'video' 'videoGame' 'tvPilot']
Number of each titleType:
titleType
tvEpisode       8350946
short           1003130
movie            685254
video            294371
tvSeries         265981
tvMovie          148177
tvMiniSeries      55280
tvSpecial         48252
videoGame         39190
tvShort           10345
tvPilot               1
Name: count, dtype: int64


In [12]:
size_before = imdb_title.shape[0]

imdb_title = imdb_title[imdb_title["titleType"].isin(["movie", "short", "tvSeries"])]

size_after = imdb_title.shape[0]
print(f"Number of rows removed: {size_before - size_after}")

Number of rows removed: 8946562


In [13]:
# Rename 
imdb_title.rename(columns={"primaryTitle": "primary_title", "originalTitle": "original_title", "startYear": "date"}, inplace=True)

# Drop columns isAdult, endYear, runtimeMinutes and titleType
imdb_title.drop(columns=["isAdult", "endYear", "runtimeMinutes", "titleType"], inplace=True)

In [14]:
genres = []
for genre in imdb_title["genres"].str.split(","):
    genres.extend(genre)
    
genres = list(set(genres))

for genre in genres:
    imdb_title[genre] = imdb_title["genres"].apply(lambda gs: int(genre in gs.split(",") if type(gs) == str else False))
    
imdb_title.drop(columns=["genres"], inplace=True)

In [15]:
# Drop the column "\N"

imdb_title.drop(columns=["\\N"], inplace=True)

In [16]:
imdb_title["primary_title_processed"] = imdb_title["primary_title"].apply(lambda x: str(x).replace(" ", "").lower())
imdb_title["primary_title_processed"] = imdb_title["primary_title_processed"].str.normalize('NFKD').str.encode('ascii', errors='ignore').str.decode('utf-8')

imdb_title["original_title_processed"] = imdb_title["original_title"].apply(lambda x: str(x).replace(" ", "").lower())
imdb_title["original_title_processed"] = imdb_title["original_title_processed"].str.normalize('NFKD').str.encode('ascii', errors='ignore').str.decode('utf-8')

## Merge MovieLens 1M and IMDB datasets
We want to merge both dataset to enrich the MovieLens 1M dataset with the IMDB dataset. We will merge on both the primary and original title of the movie. We will also use the year of the movie to filter out the movies to differentiate between movies with the same title.

To do so we will :
- Get all the columns of the IMDB dataset that are not in the MovieLens 1M dataset
- Merge the datasets on the primary title and original title of the movie (processed)
- Filter out the movies that have the same title but different years
- Drop the columns that are not useful

In [17]:
merged_movies = pd.merge(ml_movies, imdb_title, on=['primary_title_processed', 'date'], how='left')

# Remove the rows where tconst is NaN
merged_movies = merged_movies[~merged_movies["tconst"].isna()]

In [18]:
def select_non_zero(col1, col2):
    if col1.name.endswith('_x'):
        base_name = col1.name[:-2]
    else:
        base_name = col2.name[:-2]
    
    mask = (col1 != 0) | (col2 != 0)
    result = pd.Series(0, index=col1.index, name=base_name)
    result[mask] = col1[mask].combine_first(col2[mask])
    return result

# Identify columns with _x and _y suffixes
x_cols = [col for col in merged_movies.columns if col.endswith('_x')]
y_cols = [col for col in merged_movies.columns if col.endswith('_y')]

# Combine the columns and remove suffixes
for x_col in x_cols:
    base_name = x_col[:-2]
    y_col = base_name + '_y'
    
    if y_col in y_cols:
        merged_movies[base_name] = select_non_zero(merged_movies[x_col], merged_movies[y_col])
        merged_movies.drop(columns=[x_col, y_col], inplace=True)
    else:
        merged_movies.rename(columns={x_col: base_name}, inplace=True)

# Rename any remaining _y columns
for y_col in y_cols:
    if y_col in merged_movies.columns:
        base_name = y_col[:-2]
        merged_movies.rename(columns={y_col: base_name}, inplace=True)

# Remove any duplicate columns that might still exist
merged_movies = merged_movies.loc[:, ~merged_movies.columns.duplicated()]

print("Merged and cleaned dataset shape:", merged_movies.shape)
print("Columns in final dataset:", merged_movies.columns.tolist())

Merged and cleaned dataset shape: (3296, 36)
Columns in final dataset: ['movie_id', 'date', 'primary_title_processed', "Children's", 'tconst', 'Family', 'Music', 'Reality-TV', 'Sport', 'Short', 'Adult', 'History', 'News', 'Biography', 'Talk-Show', 'Game-Show', 'primary_title', 'original_title', 'original_title_processed', 'Fantasy', 'Action', 'Western', 'Sci-Fi', 'Documentary', 'Romance', 'Drama', 'Horror', 'Adventure', 'Thriller', 'War', 'Crime', 'Animation', 'Film-Noir', 'Musical', 'Mystery', 'Comedy']


In [19]:
new_order = [
    'movie_id', 'tconst', 
    
    'primary_title', 'original_title', 
    
    'date',
    
    'Action', 'Adventure', 'Animation', 'Biography', 'Children\'s', 'Comedy', 'Crime', 
    'Documentary', 'Drama', 'Family', 'Fantasy', 'Film-Noir', 'Game-Show', 'History', 
    'Horror', 'Music', 'Musical', 'Mystery', 'News', 'Reality-TV', 'Romance', 'Sci-Fi', 
    'Short', 'Sport', 'Talk-Show', 'Thriller', 'War', 'Western', 'Adult'
]

merged_movies = merged_movies[new_order]

In [20]:
# Convert all the float columns to int
for col in merged_movies.columns:
    if merged_movies[col].dtype == float:
        merged_movies[col] = merged_movies[col].fillna(0).astype(int)
        
display(merged_movies.head())

Unnamed: 0,movie_id,tconst,primary_title,original_title,date,Action,Adventure,Animation,Biography,Children's,...,Reality-TV,Romance,Sci-Fi,Short,Sport,Talk-Show,Thriller,War,Western,Adult
0,movie_1,tt0114709,Toy Story,...,1995,0,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
1,movie_2,tt0113497,Jumanji,...,1995,0,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,movie_3,tt0113228,Grumpier Old Men,...,1995,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
3,movie_4,tt0114885,Waiting to Exhale,...,1995,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,movie_5,tt0113041,Father of the Bride Part II,...,1995,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Features extraction

For this exercise we will use the following features:
- genres
- actors and actresses
- average rating from IMDB
- user rating from MovieLens 1M

In [21]:
actor_data = imdb_name[imdb_name['primaryProfession'].str.contains('actor|actress', case=False, na=False)]

# Dictionary mapping tconst to a list of actors
movie_actors = {}
for _, row in actor_data.iterrows():
    for movie in row['knownForTitles'].split(','):
        if movie in movie_actors:
            movie_actors[movie].append(row['primaryName'])
        else:
            movie_actors[movie] = [row['primaryName']]

In [22]:
merged_movies = pd.merge(merged_movies, imdb_rating[['tconst', 'averageRating']], on='tconst', how='left')

In [24]:
merged_movies['actors'] = merged_movies['tconst'].map(movie_actors)
merged_movies['actors'] = merged_movies['actors'].fillna('').apply(lambda x: ','.join(x[:5]) if isinstance(x, list) else '')

In [30]:
from sklearn.feature_extraction.text import TfidfVectorizer

def get_movie_features(movie):
    features = []
    for genre in ['Action', 'Adventure', 'Animation', 'Biography', "Children's", 'Comedy', 'Crime', 
                  'Documentary', 'Drama', 'Family', 'Fantasy', 'Film-Noir', 'Game-Show', 'History', 
                  'Horror', 'Music', 'Musical', 'Mystery', 'News', 'Reality-TV', 'Romance', 'Sci-Fi', 
                  'Short', 'Sport', 'Talk-Show', 'Thriller', 'War', 'Western', 'Adult']:
        if movie[genre] == 1:
            features.append(genre)
    features.extend(movie['actors'].split(','))
    return ' '.join(features)

merged_movies['features'] = merged_movies.apply(get_movie_features, axis=1)

tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(merged_movies['features'])

In [26]:
from scipy.sparse import csr_matrix

user_item_matrix = ml_ratings.pivot(index='movie_id', columns='user_id', values='rating').fillna(0)
csr_ratings = csr_matrix(user_item_matrix.values)

## Model and recommendation

In [28]:
from sklearn.neighbors import NearestNeighbors

model = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=20, n_jobs=-1)
model.fit(csr_ratings)

In [38]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors

# Function to get movie recommendations based on the ratings of users
def get_movie_recommendations(movie_id, user_item_matrix, model, n_recommendations=5):
    if movie_id not in user_item_matrix.index:
        return []
    movie_vector = user_item_matrix.loc[movie_id].values.reshape(1, -1)
    distances, indices = model.kneighbors(movie_vector, n_neighbors=n_recommendations+1)

    similar_movies = list(user_item_matrix.index[indices.flatten()])[1:]
    similar_movies_ratings = user_item_matrix.loc[similar_movies].mean(axis=1).sort_values(ascending=False)

    return similar_movies_ratings.index.tolist()[:n_recommendations]

def make_recommendations(movie1, movie2):
    index1 = merged_movies[merged_movies['primary_title'] == movie1].index[0]
    index2 = merged_movies[merged_movies['primary_title'] == movie2].index[0]
    
    combined_features = tfidf_matrix[index1] + tfidf_matrix[index2]
    similar_scores = cosine_similarity(combined_features, tfidf_matrix).flatten()
    
    similar_scores = list(enumerate(similar_scores))
    similar_scores = sorted(similar_scores, key=lambda x: x[1], reverse=True)
    similar_scores = [i for i in similar_scores if i[0] not in [index1, index2]][:20]
    
    content_based_recommendations = [merged_movies.iloc[i[0]]['movie_id'] for i in similar_scores]
    
    movie1_id = merged_movies.iloc[index1]['movie_id']
    movie2_id = merged_movies.iloc[index2]['movie_id']
    
    collaborative_recommendations = get_movie_recommendations(movie1_id, user_item_matrix, model)
    collaborative_recommendations.extend(get_movie_recommendations(movie2_id, user_item_matrix, model))
    
    # Combine the two recommendation lists
    all_recommendations = content_based_recommendations + collaborative_recommendations
    recommendations_counts = {movie: all_recommendations.count(movie) for movie in all_recommendations}
    
    # Filter out movies not in merged_movies and get their average ratings
    valid_recommendations = {}
    for movie_id, count in recommendations_counts.items():
        movie_data = merged_movies[merged_movies['movie_id'] == movie_id]
        if not movie_data.empty:
            avg_rating = movie_data['averageRating'].values[0]
            valid_recommendations[movie_id] = (count, avg_rating)
    
    # Sort the recommendations by the number of times they appear and then by average rating
    sorted_recommendations = sorted(valid_recommendations.items(), 
                                    key=lambda x: (x[1][0], x[1][1]), 
                                    reverse=True)
    
    top_recommendations = sorted_recommendations[:3]
    return [merged_movies[merged_movies['movie_id'] == rec[0]]['primary_title'].values[0] for rec in top_recommendations]

movie1 = 'Toy Story'
movie2 = 'Jumanji'

recommendations = make_recommendations(movie1, movie2)
print(f"Recommendations for {movie1} and {movie2}:")
for i, rec in enumerate(recommendations):
    print(f"{i+1}. {rec}")



Recommendations for Toy Story and Jumanji:
1. Back to the Future
2. The Iron Giant
3. Groundhog Day
