In [1]:
import pandas as pd
import numpy as np
import scipy as sp # <-- The sister of Numpy, used in our code for numerical efficientcy. 
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
# Entity featurization and similarity computation
from sklearn.metrics.pairwise import cosine_similarity 
from sklearn.feature_extraction.text import TfidfVectorizer
import joblib

# Libraries used during sorting procedures.
import operator # <-- Convienient item retrieval during iteration 
import heapq # <-- Efficient sorting of large lists

np.random.seed(42)

In [2]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
genome_scores = pd.read_csv("genome_scores.csv")
genome_tags = pd.read_csv("genome_tags.csv")
imdb_data =  pd.read_csv("imdb_data.csv")
links = pd.read_csv("links.csv")
movies = pd.read_csv("movies.csv")
tags = pd.read_csv("tags.csv")
movie_csv = pd.read_csv("MovieGenre.csv", encoding='latin-1')

In [3]:
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
62418,209157,We (2018),Drama
62419,209159,Window of the Soul (2001),Documentary
62420,209163,Bad Poems (2018),Comedy|Drama
62421,209169,A Girl Thing (2001),(no genres listed)


In [4]:
imdb_data

Unnamed: 0,movieId,title_cast,director,runtime,budget,plot_keywords
0,1,Tom Hanks|Tim Allen|Don Rickles|Jim Varney|Wal...,John Lasseter,81.0,"$30,000,000",toy|rivalry|cowboy|cgi animation
1,2,Robin Williams|Jonathan Hyde|Kirsten Dunst|Bra...,Jonathan Hensleigh,104.0,"$65,000,000",board game|adventurer|fight|game
2,3,Walter Matthau|Jack Lemmon|Sophia Loren|Ann-Ma...,Mark Steven Johnson,101.0,"$25,000,000",boat|lake|neighbor|rivalry
3,4,Whitney Houston|Angela Bassett|Loretta Devine|...,Terry McMillan,124.0,"$16,000,000",black american|husband wife relationship|betra...
4,5,Steve Martin|Diane Keaton|Martin Short|Kimberl...,Albert Hackett,106.0,"$30,000,000",fatherhood|doberman|dog|mansion
...,...,...,...,...,...,...
27273,131254,Franz Dinda|Florian Lukas|Axel Stein|Kailas Ma...,Carsten Funke,85.0,,man wrapped in a towel|man wears a thong|male ...
27274,131256,Rick Kavanian|Axel Stein|Eva Habermann|Christo...,Matthias Dinter,83.0,"DEM5,800,000",ski|ski resort|ampersand in title|drink in title
27275,131258,Nam-gil Kim|Ye-jin Son|Hae-Jin Yoo|Kyeong-yeon...,Seong-il Cheon,130.0,,pirate|sword fight|korea|bandit
27276,131260,Martti Suosalo|Ilkka Koivula|Vexi Salmi|Riitta...,Timo Koivusalo,102.0,,friend|friendship|television show|restaurant


In [5]:
movie_csv

Unnamed: 0,imdbId,Imdb Link,Title,IMDB Score,Genre,Poster
0,114709,http://www.imdb.com/title/tt114709,Toy Story (1995),8.3,Animation|Adventure|Comedy,https://images-na.ssl-images-amazon.com/images...
1,113497,http://www.imdb.com/title/tt113497,Jumanji (1995),6.9,Action|Adventure|Family,https://images-na.ssl-images-amazon.com/images...
2,113228,http://www.imdb.com/title/tt113228,Grumpier Old Men (1995),6.6,Comedy|Romance,https://images-na.ssl-images-amazon.com/images...
3,114885,http://www.imdb.com/title/tt114885,Waiting to Exhale (1995),5.7,Comedy|Drama|Romance,https://images-na.ssl-images-amazon.com/images...
4,113041,http://www.imdb.com/title/tt113041,Father of the Bride Part II (1995),5.9,Comedy|Family|Romance,https://images-na.ssl-images-amazon.com/images...
...,...,...,...,...,...,...
40103,83168,http://www.imdb.com/title/tt83168,Tanya's Island (1980),4.3,Drama,https://images-na.ssl-images-amazon.com/images...
40104,82875,http://www.imdb.com/title/tt82875,Pacific Banana (1981),4.7,Comedy,https://images-na.ssl-images-amazon.com/images...
40105,815258,http://www.imdb.com/title/tt815258,Werewolf in a Womens Prison (2006),4.5,Horror,https://images-na.ssl-images-amazon.com/images...
40106,79142,http://www.imdb.com/title/tt79142,Xiao zi ming da (1979),6.5,Action|Comedy,https://images-na.ssl-images-amazon.com/images...


In [6]:

movie_merge = pd.merge(movies, imdb_data, on= ['movieId','movieId'], how = 'inner')

In [7]:
movie_merge

Unnamed: 0,movieId,title,genres,title_cast,director,runtime,budget,plot_keywords
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Tom Hanks|Tim Allen|Don Rickles|Jim Varney|Wal...,John Lasseter,81.0,"$30,000,000",toy|rivalry|cowboy|cgi animation
1,2,Jumanji (1995),Adventure|Children|Fantasy,Robin Williams|Jonathan Hyde|Kirsten Dunst|Bra...,Jonathan Hensleigh,104.0,"$65,000,000",board game|adventurer|fight|game
2,3,Grumpier Old Men (1995),Comedy|Romance,Walter Matthau|Jack Lemmon|Sophia Loren|Ann-Ma...,Mark Steven Johnson,101.0,"$25,000,000",boat|lake|neighbor|rivalry
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Whitney Houston|Angela Bassett|Loretta Devine|...,Terry McMillan,124.0,"$16,000,000",black american|husband wife relationship|betra...
4,5,Father of the Bride Part II (1995),Comedy,Steve Martin|Diane Keaton|Martin Short|Kimberl...,Albert Hackett,106.0,"$30,000,000",fatherhood|doberman|dog|mansion
...,...,...,...,...,...,...,...,...
24861,131254,Kein Bund für's Leben (2007),Comedy,Franz Dinda|Florian Lukas|Axel Stein|Kailas Ma...,Carsten Funke,85.0,,man wrapped in a towel|man wears a thong|male ...
24862,131256,"Feuer, Eis & Dosenbier (2002)",Comedy,Rick Kavanian|Axel Stein|Eva Habermann|Christo...,Matthias Dinter,83.0,"DEM5,800,000",ski|ski resort|ampersand in title|drink in title
24863,131258,The Pirates (2014),Adventure,Nam-gil Kim|Ye-jin Son|Hae-Jin Yoo|Kyeong-yeon...,Seong-il Cheon,130.0,,pirate|sword fight|korea|bandit
24864,131260,Rentun Ruusu (2001),(no genres listed),Martti Suosalo|Ilkka Koivula|Vexi Salmi|Riitta...,Timo Koivusalo,102.0,,friend|friendship|television show|restaurant


In [8]:
# Rename the 'title' column to 'Title' in the smaller dataset
movie_merge.rename(columns={'title': 'Title'}, inplace=True)

# Merge the datasets on the 'Title' column
merged_data = pd.merge(movie_merge, movie_csv[['Title', 'IMDB Score','Poster', 'Imdb Link']], on='Title', how='left')
merged_data


Unnamed: 0,movieId,Title,genres,title_cast,director,runtime,budget,plot_keywords,IMDB Score,Poster,Imdb Link
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Tom Hanks|Tim Allen|Don Rickles|Jim Varney|Wal...,John Lasseter,81.0,"$30,000,000",toy|rivalry|cowboy|cgi animation,8.3,https://images-na.ssl-images-amazon.com/images...,http://www.imdb.com/title/tt114709
1,2,Jumanji (1995),Adventure|Children|Fantasy,Robin Williams|Jonathan Hyde|Kirsten Dunst|Bra...,Jonathan Hensleigh,104.0,"$65,000,000",board game|adventurer|fight|game,6.9,https://images-na.ssl-images-amazon.com/images...,http://www.imdb.com/title/tt113497
2,3,Grumpier Old Men (1995),Comedy|Romance,Walter Matthau|Jack Lemmon|Sophia Loren|Ann-Ma...,Mark Steven Johnson,101.0,"$25,000,000",boat|lake|neighbor|rivalry,6.6,https://images-na.ssl-images-amazon.com/images...,http://www.imdb.com/title/tt113228
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Whitney Houston|Angela Bassett|Loretta Devine|...,Terry McMillan,124.0,"$16,000,000",black american|husband wife relationship|betra...,5.7,https://images-na.ssl-images-amazon.com/images...,http://www.imdb.com/title/tt114885
4,5,Father of the Bride Part II (1995),Comedy,Steve Martin|Diane Keaton|Martin Short|Kimberl...,Albert Hackett,106.0,"$30,000,000",fatherhood|doberman|dog|mansion,5.9,https://images-na.ssl-images-amazon.com/images...,http://www.imdb.com/title/tt113041
...,...,...,...,...,...,...,...,...,...,...,...
25196,131254,Kein Bund für's Leben (2007),Comedy,Franz Dinda|Florian Lukas|Axel Stein|Kailas Ma...,Carsten Funke,85.0,,man wrapped in a towel|man wears a thong|male ...,,,
25197,131256,"Feuer, Eis & Dosenbier (2002)",Comedy,Rick Kavanian|Axel Stein|Eva Habermann|Christo...,Matthias Dinter,83.0,"DEM5,800,000",ski|ski resort|ampersand in title|drink in title,3.3,https://images-na.ssl-images-amazon.com/images...,http://www.imdb.com/title/tt277703
25198,131258,The Pirates (2014),Adventure,Nam-gil Kim|Ye-jin Son|Hae-Jin Yoo|Kyeong-yeon...,Seong-il Cheon,130.0,,pirate|sword fight|korea|bandit,,,
25199,131260,Rentun Ruusu (2001),(no genres listed),Martti Suosalo|Ilkka Koivula|Vexi Salmi|Riitta...,Timo Koivusalo,102.0,,friend|friendship|television show|restaurant,,,


In [9]:
merged_data.rename(columns={'Title': 'title'}, inplace=True)


In [10]:
#movie_merge = pd.merge(train,merged_data, on= ['movieId','movieId'], how = 'inner')


In [11]:
movie_merge

Unnamed: 0,movieId,Title,genres,title_cast,director,runtime,budget,plot_keywords
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Tom Hanks|Tim Allen|Don Rickles|Jim Varney|Wal...,John Lasseter,81.0,"$30,000,000",toy|rivalry|cowboy|cgi animation
1,2,Jumanji (1995),Adventure|Children|Fantasy,Robin Williams|Jonathan Hyde|Kirsten Dunst|Bra...,Jonathan Hensleigh,104.0,"$65,000,000",board game|adventurer|fight|game
2,3,Grumpier Old Men (1995),Comedy|Romance,Walter Matthau|Jack Lemmon|Sophia Loren|Ann-Ma...,Mark Steven Johnson,101.0,"$25,000,000",boat|lake|neighbor|rivalry
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Whitney Houston|Angela Bassett|Loretta Devine|...,Terry McMillan,124.0,"$16,000,000",black american|husband wife relationship|betra...
4,5,Father of the Bride Part II (1995),Comedy,Steve Martin|Diane Keaton|Martin Short|Kimberl...,Albert Hackett,106.0,"$30,000,000",fatherhood|doberman|dog|mansion
...,...,...,...,...,...,...,...,...
24861,131254,Kein Bund für's Leben (2007),Comedy,Franz Dinda|Florian Lukas|Axel Stein|Kailas Ma...,Carsten Funke,85.0,,man wrapped in a towel|man wears a thong|male ...
24862,131256,"Feuer, Eis & Dosenbier (2002)",Comedy,Rick Kavanian|Axel Stein|Eva Habermann|Christo...,Matthias Dinter,83.0,"DEM5,800,000",ski|ski resort|ampersand in title|drink in title
24863,131258,The Pirates (2014),Adventure,Nam-gil Kim|Ye-jin Son|Hae-Jin Yoo|Kyeong-yeon...,Seong-il Cheon,130.0,,pirate|sword fight|korea|bandit
24864,131260,Rentun Ruusu (2001),(no genres listed),Martti Suosalo|Ilkka Koivula|Vexi Salmi|Riitta...,Timo Koivusalo,102.0,,friend|friendship|television show|restaurant


In [12]:
movie_merge.drop( columns= 'budget', inplace= True)

In [13]:
movie_merge.dropna(inplace=True)

In [14]:
movie_merge.info()

<class 'pandas.core.frame.DataFrame'>
Index: 12632 entries, 0 to 24865
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   movieId        12632 non-null  int64  
 1   Title          12632 non-null  object 
 2   genres         12632 non-null  object 
 3   title_cast     12632 non-null  object 
 4   director       12632 non-null  object 
 5   runtime        12632 non-null  float64
 6   plot_keywords  12632 non-null  object 
dtypes: float64(1), int64(1), object(5)
memory usage: 789.5+ KB


In [15]:
movie_merge.isnull().sum()

movieId          0
Title            0
genres           0
title_cast       0
director         0
runtime          0
plot_keywords    0
dtype: int64

In [16]:
movie_merge.shape

(12632, 7)

In [17]:
# Combine relevant text features into a single string for each movie
movie_merge['combined_features'] = movie_merge['director'] + ' ' + movie_merge['genres'] + ' ' + movie_merge['plot_keywords'].fillna('') + ' ' + movie_merge['title_cast'].fillna('')


In [18]:
grouped_movie_merge = movie_merge

In [19]:
# grouped_movie_merge = movie_merge.groupby(["combined_features","genres","title"])["rating"].mean().reset_index()
# grouped_movie_merge

In [20]:
#number_of_ratings = movie_merge.groupby("title")['rating'].count().reset_index()

In [21]:
#grouped_movie_merge = pd.merge(grouped_movie_merge, number_of_ratings, on= ['title','title'], how = 'inner')

In [22]:
grouped_movie_merge

Unnamed: 0,movieId,Title,genres,title_cast,director,runtime,plot_keywords,combined_features
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Tom Hanks|Tim Allen|Don Rickles|Jim Varney|Wal...,John Lasseter,81.0,toy|rivalry|cowboy|cgi animation,John Lasseter Adventure|Animation|Children|Com...
1,2,Jumanji (1995),Adventure|Children|Fantasy,Robin Williams|Jonathan Hyde|Kirsten Dunst|Bra...,Jonathan Hensleigh,104.0,board game|adventurer|fight|game,Jonathan Hensleigh Adventure|Children|Fantasy ...
2,3,Grumpier Old Men (1995),Comedy|Romance,Walter Matthau|Jack Lemmon|Sophia Loren|Ann-Ma...,Mark Steven Johnson,101.0,boat|lake|neighbor|rivalry,Mark Steven Johnson Comedy|Romance boat|lake|n...
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Whitney Houston|Angela Bassett|Loretta Devine|...,Terry McMillan,124.0,black american|husband wife relationship|betra...,Terry McMillan Comedy|Drama|Romance black amer...
4,5,Father of the Bride Part II (1995),Comedy,Steve Martin|Diane Keaton|Martin Short|Kimberl...,Albert Hackett,106.0,fatherhood|doberman|dog|mansion,Albert Hackett Comedy fatherhood|doberman|dog|...
...,...,...,...,...,...,...,...,...
24861,131254,Kein Bund für's Leben (2007),Comedy,Franz Dinda|Florian Lukas|Axel Stein|Kailas Ma...,Carsten Funke,85.0,man wrapped in a towel|man wears a thong|male ...,Carsten Funke Comedy man wrapped in a towel|ma...
24862,131256,"Feuer, Eis & Dosenbier (2002)",Comedy,Rick Kavanian|Axel Stein|Eva Habermann|Christo...,Matthias Dinter,83.0,ski|ski resort|ampersand in title|drink in title,Matthias Dinter Comedy ski|ski resort|ampersan...
24863,131258,The Pirates (2014),Adventure,Nam-gil Kim|Ye-jin Son|Hae-Jin Yoo|Kyeong-yeon...,Seong-il Cheon,130.0,pirate|sword fight|korea|bandit,Seong-il Cheon Adventure pirate|sword fight|ko...
24864,131260,Rentun Ruusu (2001),(no genres listed),Martti Suosalo|Ilkka Koivula|Vexi Salmi|Riitta...,Timo Koivusalo,102.0,friend|friendship|television show|restaurant,Timo Koivusalo (no genres listed) friend|frien...


In [23]:
grouped_movie_merge

Unnamed: 0,movieId,Title,genres,title_cast,director,runtime,plot_keywords,combined_features
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Tom Hanks|Tim Allen|Don Rickles|Jim Varney|Wal...,John Lasseter,81.0,toy|rivalry|cowboy|cgi animation,John Lasseter Adventure|Animation|Children|Com...
1,2,Jumanji (1995),Adventure|Children|Fantasy,Robin Williams|Jonathan Hyde|Kirsten Dunst|Bra...,Jonathan Hensleigh,104.0,board game|adventurer|fight|game,Jonathan Hensleigh Adventure|Children|Fantasy ...
2,3,Grumpier Old Men (1995),Comedy|Romance,Walter Matthau|Jack Lemmon|Sophia Loren|Ann-Ma...,Mark Steven Johnson,101.0,boat|lake|neighbor|rivalry,Mark Steven Johnson Comedy|Romance boat|lake|n...
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Whitney Houston|Angela Bassett|Loretta Devine|...,Terry McMillan,124.0,black american|husband wife relationship|betra...,Terry McMillan Comedy|Drama|Romance black amer...
4,5,Father of the Bride Part II (1995),Comedy,Steve Martin|Diane Keaton|Martin Short|Kimberl...,Albert Hackett,106.0,fatherhood|doberman|dog|mansion,Albert Hackett Comedy fatherhood|doberman|dog|...
...,...,...,...,...,...,...,...,...
24861,131254,Kein Bund für's Leben (2007),Comedy,Franz Dinda|Florian Lukas|Axel Stein|Kailas Ma...,Carsten Funke,85.0,man wrapped in a towel|man wears a thong|male ...,Carsten Funke Comedy man wrapped in a towel|ma...
24862,131256,"Feuer, Eis & Dosenbier (2002)",Comedy,Rick Kavanian|Axel Stein|Eva Habermann|Christo...,Matthias Dinter,83.0,ski|ski resort|ampersand in title|drink in title,Matthias Dinter Comedy ski|ski resort|ampersan...
24863,131258,The Pirates (2014),Adventure,Nam-gil Kim|Ye-jin Son|Hae-Jin Yoo|Kyeong-yeon...,Seong-il Cheon,130.0,pirate|sword fight|korea|bandit,Seong-il Cheon Adventure pirate|sword fight|ko...
24864,131260,Rentun Ruusu (2001),(no genres listed),Martti Suosalo|Ilkka Koivula|Vexi Salmi|Riitta...,Timo Koivusalo,102.0,friend|friendship|television show|restaurant,Timo Koivusalo (no genres listed) friend|frien...


In [24]:
grouped_movie_merge.to_csv(f'grouped_movies_merge.csv', index=False)

In [25]:
# Create a TfidfVectorizer object
tfidf = TfidfVectorizer(stop_words='english')

# Fit and transform the combined features
tfidf_matrix = tfidf.fit_transform(grouped_movie_merge['combined_features'])

# Display the shape of the Tfidf matrix
print(tfidf_matrix.shape)


(12632, 71782)


In [26]:
# Compute the cosine similarity matrix
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Display the shape of the cosine similarity matrix
print(cosine_sim.shape)

(12632, 12632)


In [27]:
# # Create a reverse mapping of movie titles to indices
# indices = pd.Series(grouped_movie_merge.index, index=grouped_movie_merge['genres']).drop_duplicates()

# def recommend_movies(title, cosine_sim=cosine_sim):
#     # Get the index of the movie that matches the title
#     idx = indices[title]
    
#     # Get the pairwise similarity scores of all movies with that movie
#     sim_scores = list(enumerate(cosine_sim[idx]))
    
#     # Sort the movies based on the similarity scores
#     sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    
#     # Get the scores of the 10 most similar movies
#     sim_scores = sim_scores[1:11]
    
#     # Get the movie indices
#     movie_indices = [i[0] for i in sim_scores]
    
#     # Return the top 10 most similar movies
#     return grouped_movie_merge[''].iloc[movie_indices]

# # Test the recommendation function
# print(recommend_movies('Documentary'))

In [28]:
# Create a reverse mapping of movie titles to indices
indices = pd.Series(grouped_movie_merge.index, index=grouped_movie_merge['Title']).drop_duplicates()

# Save the components to disk
joblib.dump(grouped_movie_merge, 'data.pkl')
joblib.dump(tfidf, 'vectorizer.pkl')
joblib.dump(cosine_sim, 'cosine_sim.pkl')
joblib.dump(indices, 'indices.pkl')

print("Model components saved!")

Model components saved!


In [29]:
data = grouped_movie_merge
feature = data["genres"].tolist()

#Create an instance of TfidfVectorizer
tfidf = TfidfVectorizer(stop_words="english")

#Fit and transform the vectorizer on our corpus
tfidf_matrix = tfidf.fit_transform(feature)

#Compute the cosine similarity matrix
similarity = cosine_similarity(tfidf_matrix)

In [30]:
# import pandas as pd
# from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.metrics.pairwise import cosine_similarity

# # Assuming grouped_movie_merge is your DataFrame
# data = grouped_movie_merge.reset_index(drop=True)

# # Combine features for the vectorizer
# data['combined_features'] = data.apply(lambda row: ' '.join([str(row['genres']), str(row['title_cast']), str(row['plot_keywords']), str(row['director'])]), axis=1)

# feature = data["combined_features"].tolist()

# # Create an instance of TfidfVectorizer
# tfidf = TfidfVectorizer(stop_words="english")

# # Fit and transform the vectorizer on our corpus
# tfidf_matrix = tfidf.fit_transform(feature)

# # Compute the cosine similarity matrix
# similarity = cosine_similarity(tfidf_matrix)

# # Create a reverse mapping of movie titles to indices
# indices = pd.Series(data.index, index=data['Title']).drop_duplicates()

# def recommend_movies_by_genre(genre, similarity=similarity):
#     # Filter the movies that belong to the specified genre
#     genre_movies = data[data['genres'].str.contains(genre, case=False, na=False)]
    
#     if genre_movies.empty:
#         return "Genre not found in the dataset."
    
#     genre_indices = genre_movies.index.tolist()
    
#     # Compute the mean similarity scores for movies in the genre
#     mean_similarity_scores = similarity[genre_indices].mean(axis=0)
    
#     # Sort the movies based on the mean similarity scores
#     sim_scores = sorted(list(enumerate(mean_similarity_scores)), key=lambda x: x[1], reverse=True)
    
#     # Get the scores of the 10 most similar movies
#     sim_scores = sim_scores[:10]
    
#     # Get the movie indices
#     movie_indices = [i[0] for i in sim_scores]
    
#     # Return the top 10 most similar movies
#     return data['Title'].iloc[movie_indices]

# # Test the recommendation function
# print(recommend_movies_by_genre("Comedy"))

In [31]:
# Rename the 'title' column to 'Title' in the smaller dataset
movie_merge.rename(columns={'title': 'Title'}, inplace=True)

# Merge the datasets on the 'Title' column
merged_data = pd.merge(movie_merge, movie_csv[['Title', 'IMDB Score','Poster', 'Imdb Link']], on='Title', how='left')
merged_data
movie_merge = merged_data
movie_merge.columns

Index(['movieId', 'Title', 'genres', 'title_cast', 'director', 'runtime',
       'plot_keywords', 'combined_features', 'IMDB Score', 'Poster',
       'Imdb Link'],
      dtype='object')

Now create a model that can be able to detect people with similar taste of movie, recommend what the other watched to the other user and vise versa

In [33]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt
from PIL import Image
import requests
from io import BytesIO

# Assume grouped_movie_merge and movie_merge are already defined DataFrames

# Merge with the larger dataset to get poster, IMDb link, and IMDb score
grouped_movie_merge = pd.merge(grouped_movie_merge, movie_merge[['Title','title_cast','plot_keywords', 'IMDB Score', 'Poster', 'Imdb Link','director']], on='title', how='left')

# Preprocess the data
grouped_movie_merge.fillna('', inplace=True)  # Fill NaNs with empty strings if any

# Extract features
feature = grouped_movie_merge["genres"].tolist()

# Create an instance of TfidfVectorizer
tfidf = TfidfVectorizer(stop_words="english")

# Fit and transform the vectorizer on our corpus
tfidf_matrix = tfidf.fit_transform(feature)

# Compute the cosine similarity matrix
similarity = cosine_similarity(tfidf_matrix)

# Create a reverse mapping of movie titles to indices
indices = pd.Series(grouped_movie_merge.index, index=grouped_movie_merge['Title']).drop_duplicates()

def fetch_movie_details(row):
    image_url = row['Poster']
    director = row['director']
    cast = row['title_cast']
    plot_keywords = row['plot_keywords']
    imdb_score = row['IMDB Score']
    imdb_link = row['Imdb Link']
    
    return image_url, director, cast, plot_keywords, imdb_score, imdb_link

def recommend_movies_by_genre(genre, similarity=similarity):
    # Filter the movies that belong to the specified genre
    genre_movies = grouped_movie_merge[grouped_movie_merge['genres'].str.contains(genre, case=False, na=False)]
    
    if genre_movies.empty:
        return "Genre not found in the dataset."
    
    genre_indices = genre_movies.index
    
    # Map genre_indices to the similarity matrix
    genre_indices_mapped = [indices[idx] for idx in genre_indices]
    
    # Compute the mean similarity scores for movies in the genre
    mean_similarity_scores = similarity[genre_indices_mapped].mean(axis=0)
    
    # Sort the movies based on the mean similarity scores
    sim_scores = sorted(list(enumerate(mean_similarity_scores)), key=lambda x: x[1], reverse=True)
    
    # Get the scores of the 10 most similar movies
    sim_scores = sim_scores[:10]
    
    # Get the movie indices
    movie_indices = [i[0] for i in sim_scores]
    
    # Get the top 10 most similar movies
    recommended_titles = grouped_movie_merge.iloc[movie_indices]
    
    # Fetch and display images and details
    for _, row in recommended_titles.iterrows():
        image_url, director, cast, plot_keywords, imdb_score, imdb_link = fetch_movie_details(row)
        
        if image_url:
            response = requests.get(image_url)
            img = Image.open(BytesIO(response.content))
            
            plt.figure(figsize=(5, 7))
            plt.imshow(img)
            plt.axis('off')
            plt.title(row['Title'])
            
            details = (
                f"IMDB Score: {imdb_score}\n"
                f"Director: {director}\n"
                f"Cast: {cast}\n"
                f"Plot: {plot_keywords}\n"
                f"IMDB Link: {imdb_link}"
            )
            plt.gca().text(0.5, -0.1, details, fontsize=12, ha='center', wrap=True, transform=plt.gca().transAxes)
            
            plt.show()
        else:
            print(f"No image found for {row['Title']}")
    
    return recommended_titles

# Test the recommendation function
print(recommend_movies_by_genre("Documentary"))


  grouped_movie_merge.fillna('', inplace=True)  # Fill NaNs with empty strings if any


KeyError: 'Poster'