In [14]:
import pandas as pd
import numpy as np
import re

In [22]:
movies_df = pd.read_csv("./data/ml-latest-small/movies.csv")
ratings_df = pd.read_csv("./data/ml-latest-small/ratings.csv")
data_tags = pd.read_csv("./data/ml-latest-small/tags.csv")

In [16]:
movies_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [17]:
# Extract year from title and create a clean title column
def extract_year(title):
    match = re.search(r'\((\d{4})\)$', title)
    if match:
        return int(match.group(1))
    return None

def clean_title(title):
    return re.sub(r'\s*\(\d{4}\)$', '', title).strip()

# Apply the functions to the dataframe
movies_df['year'] = movies_df['title'].apply(extract_year)
movies_df['title_clean'] = movies_df['title'].apply(clean_title)    

In [20]:
# One-hot encode genres
# First, create a list of all genres
movies_df['genres_list'] = movies_df['genres'].str.split('|')
all_genres = []
for genres in movies_df['genres_list']:
    all_genres.extend(genres)
unique_genres = sorted(list(set(all_genres)))
unique_genres.remove('(no genres listed)') if '(no genres listed)' in unique_genres else None

# Create a column for each genre
for genre in unique_genres:
    movies_df[genre] = movies_df['genres'].apply(lambda x: 1 if genre in x else 0)

# Check the processed data
print("\nProcessed movies dataset:")
movies_df.head()


Processed movies dataset:


Unnamed: 0,movieId,title,genres,year,title_clean,genres_list,Action,Adventure,Animation,Children,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1995.0,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]",0,1,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,Jumanji (1995),Adventure|Children|Fantasy,1995.0,Jumanji,"[Adventure, Children, Fantasy]",0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
2,3,Grumpier Old Men (1995),Comedy|Romance,1995.0,Grumpier Old Men,"[Comedy, Romance]",0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,1995.0,Waiting to Exhale,"[Comedy, Drama, Romance]",0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
4,5,Father of the Bride Part II (1995),Comedy,1995.0,Father of the Bride Part II,[Comedy],0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [23]:
# Group ratings by movie to get average rating and count
movie_stats = ratings_df.groupby('movieId').agg(
    avg_rating=('rating', 'mean'),
    num_ratings=('rating', 'count')
).reset_index()
movie_stats

Unnamed: 0,movieId,avg_rating,num_ratings
0,1,3.920930,215
1,2,3.431818,110
2,3,3.259615,52
3,4,2.357143,7
4,5,3.071429,49
...,...,...,...
9719,193581,4.000000,1
9720,193583,3.500000,1
9721,193585,3.500000,1
9722,193587,3.500000,1


In [24]:
# Merge with movies dataframe
movies_df = pd.merge(movies_df, movie_stats, on='movieId', how='left')

In [25]:
# Fill NaN values for movies with no ratings
movies_df['avg_rating'] = movies_df['avg_rating'].fillna(0)
movies_df['num_ratings'] = movies_df['num_ratings'].fillna(0)

In [26]:
movies_df.head()

Unnamed: 0,movieId,title,genres,avg_rating,num_ratings
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,3.92093,215.0
1,2,Jumanji (1995),Adventure|Children|Fantasy,3.431818,110.0
2,3,Grumpier Old Men (1995),Comedy|Romance,3.259615,52.0
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,2.357143,7.0
4,5,Father of the Bride Part II (1995),Comedy,3.071429,49.0


In [28]:
import pandas as pd
import numpy as np
import re
import os

# Create processed_data directory if it doesn't exist
os.makedirs('processed_data', exist_ok=True)

# Load the datasets
# Adjust the file paths based on where you've saved the MovieLens data
movies_df = pd.read_csv('./data/ml-latest-small/movies.csv')
ratings_df = pd.read_csv('./data/ml-latest-small/ratings.csv')
links_df = pd.read_csv('./data/ml-latest-small/links.csv')
tags_df = pd.read_csv('./data/ml-latest-small/tags.csv')

print("Movies dataset shape:", movies_df.shape)
print("Ratings dataset shape:", ratings_df.shape)

# Step 1: Clean and preprocess the movies dataset
def extract_year(title):
    match = re.search(r'\((\d{4})\)$', title)
    if match:
        return int(match.group(1))
    return None

def clean_title(title):
    return re.sub(r'\s*\(\d{4}\)$', '', title).strip()

# Apply the functions to the dataframe
movies_df['year'] = movies_df['title'].apply(extract_year)
movies_df['title_clean'] = movies_df['title'].apply(clean_title)

# Step 2: Process genres
# One-hot encode genres
movies_df['genres_list'] = movies_df['genres'].str.split('|')
all_genres = []
for genres in movies_df['genres_list']:
    all_genres.extend(genres)
unique_genres = sorted(list(set(all_genres)))
if '(no genres listed)' in unique_genres:
    unique_genres.remove('(no genres listed)')

# Create a column for each genre
for genre in unique_genres:
    movies_df[genre] = movies_df['genres'].apply(lambda x: 1 if genre in x else 0)

Movies dataset shape: (9742, 3)
Ratings dataset shape: (100836, 4)


In [29]:
movies_df.head()

Unnamed: 0,movieId,title,genres,year,title_clean,genres_list,Action,Adventure,Animation,Children,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1995.0,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]",0,1,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,Jumanji (1995),Adventure|Children|Fantasy,1995.0,Jumanji,"[Adventure, Children, Fantasy]",0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
2,3,Grumpier Old Men (1995),Comedy|Romance,1995.0,Grumpier Old Men,"[Comedy, Romance]",0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,1995.0,Waiting to Exhale,"[Comedy, Drama, Romance]",0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
4,5,Father of the Bride Part II (1995),Comedy,1995.0,Father of the Bride Part II,[Comedy],0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [30]:
# Step 3: Process ratings separately and then merge
# Group ratings by movie to get average rating and count
movie_stats = ratings_df.groupby('movieId').agg(
    avg_rating=('rating', 'mean'),
    num_ratings=('rating', 'count')
).reset_index()

# Merge with movies dataframe
# Use left join to keep all movies even if they don't have ratings
movies_with_ratings = pd.merge(movies_df, movie_stats, on='movieId', how='left')

# Fill NaN values for movies with no ratings
movies_with_ratings['avg_rating'] = movies_with_ratings['avg_rating'].fillna(0)
movies_with_ratings['num_ratings'] = movies_with_ratings['num_ratings'].fillna(0)

In [31]:
movies_with_ratings.head()

Unnamed: 0,movieId,title,genres,year,title_clean,genres_list,Action,Adventure,Animation,Children,...,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,avg_rating,num_ratings
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1995.0,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]",0,1,1,1,...,0,0,0,0,0,0,0,0,3.92093,215.0
1,2,Jumanji (1995),Adventure|Children|Fantasy,1995.0,Jumanji,"[Adventure, Children, Fantasy]",0,1,0,1,...,0,0,0,0,0,0,0,0,3.431818,110.0
2,3,Grumpier Old Men (1995),Comedy|Romance,1995.0,Grumpier Old Men,"[Comedy, Romance]",0,0,0,0,...,0,0,0,1,0,0,0,0,3.259615,52.0
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,1995.0,Waiting to Exhale,"[Comedy, Drama, Romance]",0,0,0,0,...,0,0,0,1,0,0,0,0,2.357143,7.0
4,5,Father of the Bride Part II (1995),Comedy,1995.0,Father of the Bride Part II,[Comedy],0,0,0,0,...,0,0,0,0,0,0,0,0,3.071429,49.0


In [32]:
# Step 4: Create a popularity score (weighted rating)
# This gives preference to movies with more ratings
C = movies_with_ratings['num_ratings'].mean()  # minimum number of votes required
m = movies_with_ratings['avg_rating'].mean()   # mean vote across the whole report

movies_with_ratings['weighted_rating'] = ((movies_with_ratings['num_ratings'] / (movies_with_ratings['num_ratings'] + C)) * 
                               movies_with_ratings['avg_rating']) + ((C / (movies_with_ratings['num_ratings'] + C)) * m)

In [33]:
movies_with_ratings.head()

Unnamed: 0,movieId,title,genres,year,title_clean,genres_list,Action,Adventure,Animation,Children,...,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,avg_rating,num_ratings,weighted_rating
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1995.0,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]",0,1,1,1,...,0,0,0,0,0,0,0,3.92093,215.0,3.890408
1,2,Jumanji (1995),Adventure|Children|Fantasy,1995.0,Jumanji,"[Adventure, Children, Fantasy]",0,1,0,1,...,0,0,0,0,0,0,0,3.431818,110.0,3.416733
2,3,Grumpier Old Men (1995),Comedy|Romance,1995.0,Grumpier Old Men,"[Comedy, Romance]",0,0,0,0,...,0,0,1,0,0,0,0,3.259615,52.0,3.259085
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,1995.0,Waiting to Exhale,"[Comedy, Drama, Romance]",0,0,0,0,...,0,0,1,0,0,0,0,2.357143,7.0,2.893613
4,5,Father of the Bride Part II (1995),Comedy,1995.0,Father of the Bride Part II,[Comedy],0,0,0,0,...,0,0,0,0,0,0,0,3.071429,49.0,3.103691


In [34]:
# Step 5: Merge with links to get external IDs
movies_final = pd.merge(movies_with_ratings, links_df, on='movieId', how='left')

# Convert IDs to proper format
movies_final['imdbId'] = movies_final['imdbId'].fillna(0).astype(int).astype(str).apply(lambda x: f"tt{x.zfill(7)}")
movies_final['tmdbId'] = movies_final['tmdbId'].fillna(0).astype(int)

In [35]:
movies_final.head()

Unnamed: 0,movieId,title,genres,year,title_clean,genres_list,Action,Adventure,Animation,Children,...,Romance,Sci-Fi,Thriller,War,Western,avg_rating,num_ratings,weighted_rating,imdbId,tmdbId
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1995.0,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]",0,1,1,1,...,0,0,0,0,0,3.92093,215.0,3.890408,tt0114709,862
1,2,Jumanji (1995),Adventure|Children|Fantasy,1995.0,Jumanji,"[Adventure, Children, Fantasy]",0,1,0,1,...,0,0,0,0,0,3.431818,110.0,3.416733,tt0113497,8844
2,3,Grumpier Old Men (1995),Comedy|Romance,1995.0,Grumpier Old Men,"[Comedy, Romance]",0,0,0,0,...,1,0,0,0,0,3.259615,52.0,3.259085,tt0113228,15602
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,1995.0,Waiting to Exhale,"[Comedy, Drama, Romance]",0,0,0,0,...,1,0,0,0,0,2.357143,7.0,2.893613,tt0114885,31357
4,5,Father of the Bride Part II (1995),Comedy,1995.0,Father of the Bride Part II,[Comedy],0,0,0,0,...,0,0,0,0,0,3.071429,49.0,3.103691,tt0113041,11862


In [36]:
# Step 6: Save the preprocessed data
movies_final.to_csv('processed_data/movies_processed.csv', index=False)
ratings_df.to_csv('processed_data/ratings.csv', index=False)

In [38]:
ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [39]:
# Create a "lightweight" version with just essential columns for the web app
movies_light = movies_final[['movieId', 'title_clean', 'genres', 'year', 'avg_rating', 
                          'num_ratings', 'weighted_rating', 'imdbId', 'tmdbId']]
movies_light.to_csv('processed_data/movies_light.csv', index=False)

print("\nPreprocessed data saved to 'processed_data' directory.")

# Step 7: Create a content matrix for similarity calculation
from sklearn.preprocessing import MinMaxScaler

# Scale numerical features
scaler = MinMaxScaler()
if 'year' in movies_final.columns and movies_final['year'].notna().all():
    movies_final['year_scaled'] = scaler.fit_transform(movies_final[['year']])
else:
    # Handle case where year might be missing for some movies
    movies_final['year_scaled'] = scaler.fit_transform(movies_final[['year']].fillna(movies_final['year'].median()))
    
# Scale ratings
movies_final['rating_scaled'] = scaler.fit_transform(movies_final[['weighted_rating']])

# Prepare the final feature matrix (genres + scaled features)
feature_cols = unique_genres + ['year_scaled', 'rating_scaled']
feature_matrix = movies_final[feature_cols].fillna(0).values

# Save the feature matrix
np.save('processed_data/feature_matrix.npy', feature_matrix)
# Save the corresponding movie IDs
np.save('processed_data/movie_indices.npy', movies_final['movieId'].values)

print("\nFeature matrix created and saved.")


Preprocessed data saved to 'processed_data' directory.

Feature matrix created and saved.


In [3]:
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import numpy as np
import pickle

# Load the feature matrix
feature_matrix = np.load('processed_data/feature_matrix.npy')
movie_indices = np.load('processed_data/movie_indices.npy')
movies_df = pd.read_csv('processed_data/movies_processed.csv')

In [4]:
# Calculate the similarity matrix
similarity_matrix = cosine_similarity(feature_matrix)

In [6]:
# Create a function to get recommendations
def get_recommendations(movie_ids, similarity_matrix=similarity_matrix, 
                       movie_indices=movie_indices, movies_df=movies_df, top_n=10):
    """
    Get movie recommendations based on a list of movie IDs
    """
    # Find the indices of the input movies
    movie_idx_list = []
    for movie_id in movie_ids:
        try:
            idx = np.where(movie_indices == movie_id)[0][0]
            movie_idx_list.append(idx)
        except IndexError:
            continue
    
    if not movie_idx_list:
        return []
    
    # Get the average similarity scores across all input movies
    sim_scores = np.zeros(len(similarity_matrix))
    for idx in movie_idx_list:
        sim_scores += similarity_matrix[idx]
    
    sim_scores = sim_scores / len(movie_idx_list)
    
    # Get the indices of movies sorted by similarity score
    sim_scores_with_indices = list(enumerate(sim_scores))
    sim_scores_with_indices = sorted(sim_scores_with_indices, key=lambda x: x[1], reverse=True)
    
    # Filter out the input movies
    sim_scores_with_indices = [x for x in sim_scores_with_indices if x[0] not in movie_idx_list]
    
    # Get the top N most similar movies
    top_movies_indices = [i[0] for i in sim_scores_with_indices[:top_n]]
    
    # Get the movie IDs for the top movies
    top_movie_ids = [movie_indices[i] for i in top_movies_indices]
    
    # Return the top movies' information
    return movies_df[movies_df['movieId'].isin(top_movie_ids)][
        ['movieId', 'title_clean', 'genres', 'year', 'avg_rating', 'num_ratings']
    ].sort_values(by='avg_rating', ascending=False)

# Save the model components
with open('model/movie_recommender_model.pkl', 'wb') as f:
    pickle.dump({
        'similarity_matrix': similarity_matrix,
        'movie_indices': movie_indices,
        'get_recommendations': get_recommendations
    }, f)

print("Recommendation model built and saved.")

Recommendation model built and saved.
