In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

In [None]:
# Load dataset
df = pd.read_csv("TMDB_IMDB_movies.csv")

In [24]:
# test = pd.read_csv("TMDB_IMDB_movies.csv")
# test.shape

In [25]:
# Clean & Filter
df['popularity'] = pd.to_numeric(df['popularity'], errors='coerce') 
df.dropna(subset=['popularity'], inplace=True)  

# Filter movies based on your conditions
df = df[(df['popularity'] >= 10) & (df['vote_average'] >= 6.5) & (df['vote_count'] >= 100)] 

In [26]:
df.shape

(6672, 29)

In [27]:
df.head(1)

Unnamed: 0,id,title,vote_average,vote_count,status,release_date,revenue,runtime,adult,backdrop_path,...,genres,production_companies,production_countries,spoken_languages,keywords,directors,writers,averageRating,numVotes,cast
0,27205,Inception,8.364,34495,Released,2010-07-15,825532764,148,False,/8ZTVqvKDQ8emSGUEMjsS4yHAwrp.jpg,...,"Action, Science Fiction, Adventure","Legendary Pictures, Syncopy, Warner Bros. Pict...","United Kingdom, United States of America","English, French, Japanese, Swahili","rescue, mission, dream, airplane, paris, franc...",Christopher Nolan,Christopher Nolan,8.8,2644069,"Leonardo DiCaprio, Joseph Gordon-Levitt, Ken W..."


In [28]:
# Remove duplicates based on 'title'
df = df.drop_duplicates(subset='title', keep='first')  

# Fill NaN values in required columns
for feature in ['id', 'title', 'genres', 'directors', 'writers', 'cast', 'overview', 'keywords']:
    df[feature] = df[feature].fillna('')  

In [29]:
def format_and_limit_names(title_names):
    # Split names by commas
    title_list = title_names.split(',')
    
    # Format each name to camel case (remove spaces between first and last name)
    formatted_title_list = [name.strip().replace(' ', '') for name in title_list]
    
    # Join the names with a space
    return ' '.join(formatted_title_list)

df['formatted_title'] = df['title'].apply(format_and_limit_names)

In [41]:
df['formatted_title'] = df['formatted_title'].str.lower().str.strip()

In [45]:
# movie_title = "Pride & Prejudice"  # make sure it's lowercase and stripped
# movie_row = df[df['formatted_title'] == movie_title]

# if not movie_row.empty:
#     print("Title found:", movie_row['title'].values[0])
# else:
#     print("Movie not found.")

In [43]:
movie_title = "Pride & Prejudice"
movie_row = df[df['title'] == movie_title]
print("Title:", movie_row['formatted_title'].values[0])

Title: pride&prejudice


In [47]:
df.head(1)

Unnamed: 0,id,title,vote_average,vote_count,status,release_date,revenue,runtime,adult,backdrop_path,...,production_companies,production_countries,spoken_languages,keywords,directors,writers,averageRating,numVotes,cast,formatted_title
0,27205,Inception,8.364,34495,Released,2010-07-15,825532764,148,False,/8ZTVqvKDQ8emSGUEMjsS4yHAwrp.jpg,...,"Legendary Pictures, Syncopy, Warner Bros. Pict...","United Kingdom, United States of America","English, French, Japanese, Swahili","rescue, mission, dream, airplane, paris, franc...",Christopher Nolan,Christopher Nolan,8.8,2644069,"Leonardo DiCaprio, Joseph Gordon-Levitt, Ken W...",inception


In [102]:
def format_and_limit_names(cast_names, limit=2):
    # Split names by commas
    cast_list = cast_names.split(',')
    
    # Format each name to camel case (remove spaces between first and last name)
    formatted_cast_list = [name.strip().replace(' ', '') for name in cast_list]
    
    # Limit to the first 3 to 5 names (you can adjust this as needed)
    formatted_cast_list = formatted_cast_list[:limit]
    
    # Join the names with a space
    return ' '.join(formatted_cast_list)

# Apply the function to 'directors', 'writers', and 'cast' columns
df['formatted_directors'] = df['directors'].apply(format_and_limit_names).str.lower()
df['formatted_writers'] = df['writers'].apply(format_and_limit_names).str.lower()
df['formatted_cast'] = df['cast'].apply(format_and_limit_names).str.lower()


# Check the result for a specific movie
movie_title = "Snowpiercer"
movie_row = df[df['title'] == movie_title]
print("Directors:", movie_row['formatted_directors'].values[0])
print("Writers:", movie_row['formatted_writers'].values[0])
print("Cast:", movie_row['formatted_cast'].values[0])

Directors: bongjoonho
Writers: jacqueslob benjaminlegrand
Cast: chrisevans songkang-ho


In [None]:
# def format_and_limit_keywords(keywords, limit=5):
#     # Split keywords by commas
#     keyword_list = keywords.split(',')
    
#     # Strip any unnecessary spaces around each keyword
#     formatted_keyword_list = [keyword.strip().replace(' ', '') for keyword in keyword_list]
    
#     # Limit to the first 3 to 5 keywords
#     formatted_keyword_list = formatted_keyword_list[:limit]
    
#     # Join the keywords with a space
#     return ' '.join(formatted_keyword_list)

# # Apply the function to the 'keywords' column
# df['formatted_keywords'] = df['keywords'].apply(format_and_limit_keywords)

# # Check the result for a specific movie
# # movie_title = "Pride & Prejudice"
# # movie_row = df[df['title'] == movie_title]
# # # print("Keywords:", movie_row['formatted_keywords'].values[0])

In [51]:
def format_and_limit_genres(genres_string, limit=3):
    # Split genres by commas
    genres_list = genres_string.split(',')
    
    # Format each genre (remove spaces inside if you want compactness)
    formatted_genres_list = [genre.strip().replace(' ', '') for genre in genres_list]
    
    # Limit to first 'limit' genres
    formatted_genres_list = formatted_genres_list[:limit]
    
    # Join the genres with a space
    return ' '.join(formatted_genres_list)

df['formatted_genres'] = df['genres'].apply(format_and_limit_genres).str.lower()

# Check the result for a specific movie
# movie_title = "Pride & Prejudice"
# movie_row = df[df['title'] == movie_title]
# print("Genres:", movie_row['formatted_genres'].values[0])

In [53]:
for feature in ['keywords', 'overview']:
    df[feature] = df[feature].fillna('').str.lower().str.replace(r'[^\w\s]', '', regex=True)

In [55]:
df.head(1)

Unnamed: 0,id,title,vote_average,vote_count,status,release_date,revenue,runtime,adult,backdrop_path,...,directors,writers,averageRating,numVotes,cast,formatted_title,formatted_directors,formatted_writers,formatted_cast,formatted_genres
0,27205,Inception,8.364,34495,Released,2010-07-15,825532764,148,False,/8ZTVqvKDQ8emSGUEMjsS4yHAwrp.jpg,...,Christopher Nolan,Christopher Nolan,8.8,2644069,"Leonardo DiCaprio, Joseph Gordon-Levitt, Ken W...",inception,christophernolan,christophernolan,leonardodicaprio josephgordon-levitt,action sciencefiction adventure


In [57]:
def create_combined_features(row):
    return (
        str(row['formatted_title']) + " " +
        str(row['formatted_cast']) + " " +
        str(row['formatted_directors']) + " " +
        str(row['formatted_writers']) + " " +
        str(row['formatted_genres']) + " " +
        str(row['overview']) + " " +
        str(row['keywords'])
    )

df['combined_features'] = df.apply(create_combined_features, axis=1)

# Check the result for a specific movie
movie_title = "Pride & Prejudice"
movie_row = df[df['title'] == movie_title]
print("combined_features:", movie_row['combined_features'].values[0])

combined_features: pride&prejudice keiraknightley matthewmacfadyen joewright deborahmoggach janeausten drama romance a story of love and life among the landed english gentry during the georgian era mr bennet is a gentleman living in hertfordshire with his overbearing wife and five daughters but if he dies their house will be inherited by a distant cousin whom they have never met so the familys future happiness and security is dependent on the daughters making good marriages based on novel or book england bachelor family relationships suitor prejudice period drama pride 18th century opposites attract gentleman georgian or regency era 1790s sisters


In [59]:
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

nltk.download('stopwords')
nltk.download('wordnet')

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def clean_text(text):
    if pd.isna(text):
        return ''
    text = re.sub(r'[^a-zA-Z]', ' ', text.lower())
    words = text.split()
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
    return ' '.join(words)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\prana\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\prana\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [61]:
df['combined_features'] = df['combined_features'].apply(clean_text)

In [62]:
df.head(1)

Unnamed: 0,id,title,vote_average,vote_count,status,release_date,revenue,runtime,adult,backdrop_path,...,writers,averageRating,numVotes,cast,formatted_title,formatted_directors,formatted_writers,formatted_cast,formatted_genres,combined_features
0,27205,Inception,8.364,34495,Released,2010-07-15,825532764,148,False,/8ZTVqvKDQ8emSGUEMjsS4yHAwrp.jpg,...,Christopher Nolan,8.8,2644069,"Leonardo DiCaprio, Joseph Gordon-Levitt, Ken W...",inception,christophernolan,christophernolan,leonardodicaprio josephgordon-levitt,action sciencefiction adventure,inception leonardodicaprio josephgordon levitt...


In [63]:
# List of required columns
required_columns = ['id',
                    'title', 
                    'keywords',
                    'cast',
                    'directors',
                    'writers', 
                    'genres',
                    'combined_features']

# Create a new dataframe with only these columns
new_df = df[required_columns].copy()

# Rename the columns
new_df = new_df.rename(columns={
    'combined_features': 'tags'
})

# Display the first few rows to check
new_df.head(3)

Unnamed: 0,id,title,keywords,cast,directors,writers,genres,tags
0,27205,Inception,rescue mission dream airplane paris france vir...,"Leonardo DiCaprio, Joseph Gordon-Levitt, Ken W...",Christopher Nolan,Christopher Nolan,"Action, Science Fiction, Adventure",inception leonardodicaprio josephgordon levitt...
1,157336,Interstellar,rescue future spacecraft race against time art...,"Matthew McConaughey, Anne Hathaway, Michael Ca...",Christopher Nolan,"Jonathan Nolan, Christopher Nolan","Adventure, Drama, Science Fiction",interstellar matthewmcconaughey annehathaway c...
2,155,The Dark Knight,joker sadism chaos secret identity crime fight...,"Christian Bale, Heath Ledger, Aaron Eckhart, M...",Christopher Nolan,"Jonathan Nolan, Christopher Nolan, David S. Go...","Drama, Action, Crime, Thriller",thedarkknight christianbale heathledger christ...


In [67]:
new_df.to_csv('processed_movies_dataset.csv', index=False)

In [69]:
new_df.shape

(6508, 8)

In [71]:
new_df.head(1)

Unnamed: 0,id,title,keywords,cast,directors,writers,genres,tags
0,27205,Inception,rescue mission dream airplane paris france vir...,"Leonardo DiCaprio, Joseph Gordon-Levitt, Ken W...",Christopher Nolan,Christopher Nolan,"Action, Science Fiction, Adventure",inception leonardodicaprio josephgordon levitt...


Content-Based

In [74]:
# Vectorize with TF-IDF
tfidf = TfidfVectorizer(max_features=5000, stop_words='english')
tfidf_matrix = tfidf.fit_transform(new_df['tags'])  # 'tags' contains combined features

# Compute similarity matrix
sim_matrix = linear_kernel(tfidf_matrix, tfidf_matrix)

In [75]:
from difflib import SequenceMatcher

# Title Similarity (Updated to handle case-insensitivity)
def get_title_similar_movies(title, new_df, top_n=5):
    # Convert the input title to lowercase for comparison
    title = title.lower()
    
    # Apply case-insensitive comparison
    new_df['title_score'] = new_df['title'].apply(lambda x: SequenceMatcher(None, title, x.lower()).ratio())
    top_titles = new_df.sort_values(by='title_score', ascending=False)
    
    return top_titles[['title']].head(top_n).assign(reason='Similar Title')

# Genre Matching (Ensure no case sensitivity issue)
def get_movies_with_similar_genre(title, new_df, top_n=10):
    # Get the genre of the input title (case-insensitive)
    target_genres = new_df[new_df['title'].str.lower() == title.lower()]['genres'].values[0]
    genre_match = new_df[new_df['genres'] == target_genres]
    return genre_match[['title']].drop_duplicates().head(top_n).assign(reason='Same Genre')

# Same Director (Handle case insensitivity)
def get_movies_by_same_director(title, new_df, top_n=10):
    # Get the director name for the input title
    director = new_df[new_df['title'].str.lower() == title.lower()]['directors'].values[0]
    same_director = new_df[new_df['directors'] == director]
    return same_director[['title']].drop_duplicates().head(top_n).assign(reason='Same Director')

# Same Cast (Handle case insensitivity)
def get_movies_with_same_cast(title, new_df, top_n=10):
    # Get the cast for the input title
    target_cast = set(new_df[new_df['title'].str.lower() == title.lower()]['cast'].values[0].split(", "))
    
    def cast_overlap(cast):
        return len(target_cast.intersection(set(cast.split(", "))))
    
    new_df['cast_score'] = new_df['cast'].apply(cast_overlap)
    return new_df.sort_values(by='cast_score', ascending=False)[['title']].head(top_n).assign(reason='Similar Cast')

# Same Writer (Handle case insensitivity)
def get_movies_by_same_writer(title, new_df, top_n=10):
    writer = new_df[new_df['title'].str.lower() == title.lower()]['writers'].values[0]
    same_writer = new_df[new_df['writers'] == writer]
    return same_writer[['title']].drop_duplicates().head(top_n).assign(reason='Same Writer')

# Remove duplicates
def remove_duplicates(df, movie_title):
    df = df[df['title'] != movie_title]
    return df.drop_duplicates(subset='title')

In [76]:
indices = pd.Series(df.index, index=df['title'])

In [80]:
def get_tfidf_similar_movies(title, df, tfidf_matrix, indices, top_n=10):
    if title not in indices:
        return pd.DataFrame(columns=['title', 'reason'])  # empty if title not found
    
    idx = indices[title]
    cosine_sim = linear_kernel(tfidf_matrix[idx], tfidf_matrix).flatten()
    sim_scores = list(enumerate(cosine_sim))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)[1:top_n+1]
    movie_indices = [i[0] for i in sim_scores]
    
    return df.iloc[movie_indices][['title']].assign(reason='Same Genre')

In [96]:
def hybrid_recommend(title, new_df):
    part1 = get_movies_with_same_cast(title, new_df, top_n=10)  # Increase top_n here
    part2 = get_movies_by_same_director(title, new_df, top_n=10)
    part3 = get_movies_with_similar_genre(title, new_df, top_n=10)
    part4 = get_tfidf_similar_movies(title, df, tfidf_matrix, indices)
    part5 = get_movies_by_same_writer(title, new_df, top_n=10)
    part6 = get_title_similar_movies(title, new_df, top_n=5)

    # Combine all recommendations
    combined = pd.concat([part3, part4, part1, part2, part5, part6], ignore_index=True)
    #print(f"Total movies before cleaning: {len(combined)}")  # Debug: Check how many movies before cleaning

    # Clean duplicates
    clean = remove_duplicates(combined, title)
    #print(f"Total movies after cleaning: {len(clean)}")  # Debug: Check how many movies after cleaning

    # Return top 45 unique movies
    return clean.head(45)

In [98]:
recommendations = hybrid_recommend("Snowpiercer", df)
print(recommendations.to_string(index=False))

                                title        reason
                           Real Steel    Same Genre
                           Inside Job    Same Genre
                      Too Big to Fail    Same Genre
              The Wolf of Wall Street    Same Genre
                        Don't Look Up    Same Genre
                                 Vice    Same Genre
                             99 Homes    Same Genre
Anchorman: The Legend of Ron Burgundy    Same Genre
                        Step Brothers    Same Genre
                          Rising High    Same Genre
                The Resistance Banker    Same Genre
                           Hallam Foe  Similar Cast
                             The Host  Similar Cast
                               Gifted  Similar Cast
               Only Lovers Left Alive  Similar Cast
                   Death at a Funeral  Similar Cast
                       Broken Flowers  Similar Cast
     Perfume: The Story of a Murderer  Similar Cast
            

Collaborative Filtering

In [104]:
import numpy as np
from sklearn.neighbors import NearestNeighbors
from scipy.sparse import csr_matrix
from fuzzywuzzy import process
import pandas as pd
import re

In [106]:
# Step 1: Load your ratings data 
ratings_df = pd.read_csv('ratings.csv')

In [107]:
movies_df = pd.read_csv('movies.csv')  # The movies dataset

In [281]:
# Define the title cleaning function
def clean_movie_title(title):
    # Move articles like ", The" to the front and remove year
    match = re.match(r"(.+),\s(The|An|A)\s*\((\d{4})\)", title)
    if match:
        title_cleaned = f"{match.group(2)} {match.group(1)}"
    else:
        # Remove year in parentheses
        title_cleaned = re.sub(r"\s*\(\d{4}\)", "", title)

        # Handle "Title, The" format without year
        article_match = re.match(r"(.+),\s(The|An|A|La)$", title_cleaned)
        if article_match:
            title_cleaned = f"{article_match.group(2)} {article_match.group(1)}"

    return title_cleaned.strip()

# Apply the cleaning function
movies_df['title'] = movies_df['title'].apply(clean_movie_title)

# Optional: Remove duplicates based on cleaned titles
movies_df = movies_df.drop_duplicates(subset='title')

# Save back to the same CSV (or choose a new name like 'movies_cleaned.csv')
movies_df.to_csv('movies.csv', index=False)

print("🎉 Movie titles cleaned and saved to movies.csv!")

🎉 Movie titles cleaned and saved to movies.csv!


In [283]:
print(movies_df.shape)
movies_df.head(2)

(58139, 2)


Unnamed: 0,movieId,title
0,1,Toy Story
1,2,Jumanji


In [111]:
ratings_df = ratings_df.drop('timestamp', axis=1)

In [112]:
ratings_df.head(2)

Unnamed: 0,userId,movieId,rating
0,1,296,5.0
1,1,306,3.5


In [113]:
# Step 1: Filter the ratings dataset

# Keep only users who rated at least 100 movies
user_counts = ratings_df['userId'].value_counts()
active_users = user_counts[user_counts >= 100].index
ratings_df = ratings_df[ratings_df['userId'].isin(active_users)]

# Keep only movies that received at least 500 ratings
movie_counts = ratings_df['movieId'].value_counts()
popular_movies = movie_counts[movie_counts >= 500].index
ratings_df = ratings_df[ratings_df['movieId'].isin(popular_movies)]

In [114]:
ratings_df.head(1)

Unnamed: 0,userId,movieId,rating
70,2,1,3.5


In [115]:
ratings_df[ratings_df['userId'] == 2].shape[0]

181

In [116]:
# Step 2: Optionally, sample a fraction of the data (e.g., 20% of the data)
ratings_df = ratings_df.sample(frac=0.20, random_state=42)

In [117]:
ratings_df.shape

(3734130, 3)

In [118]:
ratings_df.head(1)

Unnamed: 0,userId,movieId,rating
11100743,72210,127202,4.0


In [119]:
# Filter the row with movieId 89745
movie_row = movies_df[movies_df['movieId'] == 8533]

# Display the row
print(movie_row)

      movieId         title
7721     8533  The Notebook


In [180]:
# #I noticed that a movie name which should have been The Avenger (2012) is written as Avengers,The (2012) 
# #hence I am manually changing it and storing it to the original dataset.

# # Find the row where movieId is 89745 and update the title
# movies_df.loc[movies_df['movieId'] == 89745, 'title'] = 'The Avengers (2012)'

# # Save the updated dataframe to the same CSV file
# movies_df.to_csv('movies.csv', index=False)

In [214]:
links_df=pd.read_csv('links.csv')

In [216]:
links_df.head(1)

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0


In [218]:
# Merge on 'movieId'
merged_df = pd.merge(ratings_df, movies_df, on='movieId')

In [219]:
movie_links_merged = pd.merge(merged_df, links_df, on='movieId', how='left')

In [137]:
movies_df.head(1)

Unnamed: 0,movieId,title
0,1,Toy Story


In [228]:
# # Creating a shallow copy of df
# df_copy = df.copy()

# df_copy.rename(columns={'id': 'tmdbId'}, inplace=True)

# # Merge on tmdbId from step1 and id from tmdb
# Hybrid_df = pd.merge(movie_links_merged, df_copy, on='tmdbId', how='left')

In [None]:
# Hybrid_df.head(5)

In [241]:
# final_hybrid_df = Hybrid_df[[
#     'userId', 'movieId', 'rating', 'title_y', 'tmdbId', 
#     'overview', 'genres', 'directors', 'cast', 'writers', 'production_companies', 'tagline',
#     'vote_average', 'vote_count', 'runtime', 'adult', 'release_date', 'poster_path'
# ]]

# # Optional: rename columns for clarity
# final_hybrid_df = final_hybrid_df.rename(columns={
#     'title_y': 'title'
# })

In [140]:
ratings_df.head(1)

Unnamed: 0,userId,movieId,rating
11100743,72210,127202,4.0


In [142]:
final_hybrid_df.head(1)

Unnamed: 0,userId,movieId,rating,title
0,72210,127202,4.0,Me and Earl and the Dying Girl


In [255]:
merged_df.sort_values(by=['userId', 'rating'], ascending=[True, True], inplace=True)

In [253]:
merged_df.head(5)

Unnamed: 0,userId,movieId,rating,title
0,72210,127202,4.0,Me and Earl and the Dying Girl
1,60065,6947,4.5,Master and Commander: The Far Side of the World
2,72263,68954,3.0,Up
3,6908,500,4.0,Mrs. Doubtfire
4,56132,2194,4.0,The Untouchables


In [285]:
# Remove duplicates based on all columns
merged_df = merged_df.drop_duplicates()

# If you want to remove duplicates based on specific columns (e.g., userId and movieId)
# merged_df_no_duplicates = merged_df.drop_duplicates(subset=['userId', 'movieId'])

# Preview the first few rows
merged_df.head()

Unnamed: 0,userId,movieId,rating,title
1964379,2,1923,0.5,There's Something About Mary
2503336,2,2720,0.5,Inspector Gadget
1176722,2,2797,1.0,Big
2964544,2,1035,1.0,The Sound of Music
606078,2,4571,1.5,Bill & Ted's Excellent Adventure


In [147]:
merged_df.shape

(3639606, 4)

In [259]:
from scipy.sparse import csr_matrix

movie_user_mat_sparse = csr_matrix((merged_df['ratiZng'], (merged_df['movieId'], merged_df['userId'])))

In [261]:
model_knn = NearestNeighbors(metric='cosine', algorithm='brute')
model_knn.fit(movie_user_mat_sparse)

In [263]:
#Step 5: Create a mapping of movie titles to indices
# Create a dictionary to map movie titles to their movieId
movie_to_idx = pd.Series(merged_df.movieId.values, index=merged_df.title).drop_duplicates()

In [265]:
#Step 6: Define a fuzzy matching function to get the closest movie title match
# Define fuzzy matching function
def fuzzy_matching(mapper, fav_movie):
    match = process.extractOne(fav_movie, mapper.index)
    if match:
        return mapper[match[0]]
    else:
        return None

In [267]:
# Function to clean up collaborative filtering output from merged_df
def clean_collaborative_output(merged_df):
    # Keep only title column and add a reason column
    merged_df = merged_df[['title']Z].copy()
    merged_df['reason'] = 'Collaborative Filtering'
    return merged_df

In [275]:
#Step 7: Make Recommendations
# Define the collaborative recommendation function
def get_collaborative_recommendations(model_knn, data, mapper, fav_movie, n_recommendations):
    idx = fuzzy_matching(mapper, fav_movie)
    if idx is None:Z
        return pd.DataFrame(columns=['title', 'reason'])

    distances, indices = model_knn.kneighbors(data[idx], n_neighbors=n_recommendations + 1)
    
    reverse_mapper = {v: k for k, v in mapper.items()}
    
    recommendations = []
    for i in range(1, len(distances.flatten())):  # skip the first item (itself)
        movie_id = indices.flatten()[i]
        movie_title = reverse_mapper.get(movie_id)
        if movie_title:
            cleaned_title = clean_movie_title(movie_title)
            recommendations.append({'title': cleaned_title, 'reason': 'Others also watched these'})

    # Convert recommendations to DataFrame and return
    return pd.DataFrame(recommendations)

In [279]:
# Sample test
query_movie = "Pride & Prejudice"  # Your query movie title
recommendations = get_collaborative_recommendations(model_knn, movie_user_mat_sparse, movie_to_idx, query_movie, 10)

# Set width for the columns for better alignment
title_width = 30 # Set the width for the movie titles column
reason_width = 20  # Set the width for the reason column

# Print headers with the appropriate alignment
print(f"{'title'.rjust(title_width)}        {'reason'.rjust(reason_width)}")

# Print each recommendation
for index, row in recommendations.iterrows():
    print(f"{row['title'].rjust(title_width)}   {row['reason'].rjust(reason_width)}")

                         title                      reason
               Sherlock Holmes   Others also watched these
Pirates of the Caribbean: The Curse of the Black Pearl   Others also watched these
Pirates of the Caribbean: At World's End   Others also watched these
                        Avatar   Others also watched these
                      Watchmen   Others also watched these
               The Dark Knight   Others also watched these


Forming the final dataset

In [None]:
print(Hybrid_df.columns)

In [None]:
final_hybrid_df.shape

In [None]:
final_hybrid_df = final_hybrid_df.sample(frac=0.30, random_state=42)

In [None]:
final_hybrid_df.to_csv('hybrid_dataset.csv', index=False)

Hybrid Recommendation

In [200]:
def hybrid_recommendation(title, content_df, tfidf_matrix, indices, model_knn, movie_user_mat_sparse, movie_to_idx, top_n=50):
    # --- Content-Based Parts ---
    part1 = get_movies_with_same_cast(title, content_df, top_n=10)
    part2 = get_movies_by_same_director(title, content_df, top_n=10)
    part3 = get_movies_with_similar_genre(title, content_df, top_n=10)
    part4 = get_tfidf_similar_movies(title, df, tfidf_matrix, indices, top_n=10)
    part5 = get_movies_by_same_writer(title, content_df, top_n=10)
    part6 = get_title_similar_movies(title, content_df, top_n=5)

    # Combine all content-based recommendations
    content_based_df = pd.concat([part1, part2, part3, part4, part5, part6], ignore_index=True)

    # --- Collaborative Part ---
    collab_df = get_collaborative_recommendations(model_knn, movie_user_mat_sparse, movie_to_idx, title, n_recommendations=10)

    # --- Combine both ---
    all_recs = pd.concat([content_based_df, collab_df], ignore_index=True)

    # --- Clean Up ---
    all_recs = all_recs[all_recs['title'].str.lower() != title.lower()]  # remove the queried title
    all_recs = all_recs.drop_duplicates(subset='title')  # remove duplicates
    final_recs = all_recs.head(top_n)  # take top N

    return final_recs

In [208]:
query = "Kung Fu PAnda"
final_recommendations = hybrid_recommendation(query, new_df, tfidf_matrix, indices, model_knn, movie_user_mat_sparse, movie_to_idx)

# Nicely print the recommendations
print(f"{'title'.ljust(40)}{'reason'}")
print("-" * 60)
for index, row in final_recommendations.iterrows():
    print(f"{row['title'].ljust(40)}{row['reason']}")

title                                   reason
------------------------------------------------------------
Kung Fu Panda Holiday                   Similar Cast
Kung Fu Panda 2                         Similar Cast
Kung Fu Panda: Secrets of the Scroll    Similar Cast
Kung Fu Panda 3                         Similar Cast
Kung Fu Panda: Secrets of the Furious FiveSimilar Cast
The Super Mario Bros. Movie             Similar Cast
The Disaster Artist                     Similar Cast
Tinker Bell and the Pirate Fairy        Similar Cast
Maleficent                              Similar Cast
LEGO DC Comics Super Heroes: Justice League vs. Bizarro LeagueSame Genre
Kung Fu Jungle                          Similar Title
Iron Man                                Others also watched these
Inception                               Others also watched these
Sherlock Holmes                         Others also watched these
The Incredibles                         Others also watched these
WALL·E                