In [1]:
import pandas as pd
import numpy as np

In [2]:
movies = pd.read_csv("data/movies.csv")
ratings = pd.read_csv("data/ratings.csv")

movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [3]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [4]:
print("Movies shape:", movies.shape)
print("Ratings shape:", ratings.shape)

Movies shape: (9742, 3)
Ratings shape: (100836, 4)


In [5]:
data = pd.merge(ratings, movies, on="movieId")
data.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,1,3,4.0,964981247,Grumpier Old Men (1995),Comedy|Romance
2,1,6,4.0,964982224,Heat (1995),Action|Crime|Thriller
3,1,47,5.0,964983815,Seven (a.k.a. Se7en) (1995),Mystery|Thriller
4,1,50,5.0,964982931,"Usual Suspects, The (1995)",Crime|Mystery|Thriller


In [6]:
data.shape

(100836, 6)

In [7]:
movie_matrix = data.pivot_table(
    index="userId",
    columns="title",
    values="rating"
)

movie_matrix.head()

title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,4.0,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,


In [8]:
movie_matrix_filled = movie_matrix.fillna(0)
movie_matrix_filled.head()

title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
from sklearn.metrics.pairwise import cosine_similarity

movie_similarity = cosine_similarity(movie_matrix_filled.T)

In [10]:
movie_similarity_df = pd.DataFrame(
    movie_similarity,
    index=movie_matrix_filled.columns,
    columns=movie_matrix_filled.columns
)

movie_similarity_df.head()

title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
'71 (2014),1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.141653,0.0,...,0.0,0.342055,0.543305,0.707107,0.0,0.0,0.139431,0.327327,0.0,0.0
'Hellboy': The Seeds of Creation (2004),0.0,1.0,0.707107,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'Round Midnight (1986),0.0,0.707107,1.0,0.0,0.0,0.0,0.176777,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'Salem's Lot (2004),0.0,0.0,0.0,1.0,0.857493,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'Til There Was You (1997),0.0,0.0,0.0,0.857493,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
def recommend_movies(movie_title, num_recommendations=5):
    
    if movie_title not in movie_similarity_df.index:
        return "Movie not found in dataset."
    
    similar_scores = movie_similarity_df[movie_title].sort_values(ascending=False)
    
    # Remove the movie itself
    similar_scores = similar_scores.iloc[1:]
    
    return similar_scores.head(num_recommendations)

In [12]:
recommend_movies("Heat (1995)")

title
Rock, The (1996)                                                  0.522755
Twelve Monkeys (a.k.a. 12 Monkeys) (1995)                         0.510677
Léon: The Professional (a.k.a. The Professional) (Léon) (1994)    0.497140
Casino (1995)                                                     0.492802
Fargo (1996)                                                      0.485918
Name: Heat (1995), dtype: float64

In [14]:
recommend_movies("Toy Story (1995)")
recommend_movies("Seven (a.k.a. Se7en) (1995)")
recommend_movies("The Usual Suspects (1995)")

'Movie not found in dataset.'

In [15]:
movie_matrix.columns[:20]

Index([''71 (2014)', ''Hellboy': The Seeds of Creation (2004)',
       ''Round Midnight (1986)', ''Salem's Lot (2004)',
       ''Til There Was You (1997)', ''Tis the Season for Love (2015)',
       ''burbs, The (1989)', ''night Mother (1986)',
       '(500) Days of Summer (2009)', '*batteries not included (1987)',
       '...All the Marbles (1981)', '...And Justice for All (1979)',
       '00 Schneider - Jagd auf Nihil Baxter (1994)', '1-900 (06) (1994)',
       '10 (1979)', '10 Cent Pistol (2015)', '10 Cloverfield Lane (2016)',
       '10 Items or Less (2006)', '10 Things I Hate About You (1999)',
       '10 Years (2011)'],
      dtype='object', name='title')

In [16]:
[m for m in movie_matrix.columns if "Seven" in m]

['187 (One Eight Seven) (1997)',
 'Magnificent Seven, The (1960)',
 'Ryuzo and the Seven Henchmen (2015)',
 'Seven (a.k.a. Se7en) (1995)',
 'Seven Brides for Seven Brothers (1954)',
 'Seven Days in May (1964)',
 'Seven Pounds (2008)',
 'Seven Psychopaths (2012)',
 'Seven Samurai (Shichinin no samurai) (1954)',
 'Seven Sisters (2017)',
 'Seven Up! (1964)',
 'Seven Year Itch, The (1955)',
 'Seven Years in Tibet (1997)',
 'Seven-Per-Cent Solution, The (1976)',
 'Seventh Seal, The (Sjunde inseglet, Det) (1957)',
 'Seventh Sign, The (1988)',
 'Seventh Son (2014)',
 'Sinbad: Legend of the Seven Seas (2003)',
 'Six Days Seven Nights (1998)',
 'Snow White and the Seven Dwarfs (1937)',
 'The Edge of Seventeen (2016)',
 'The Magnificent Seven (2016)']

In [17]:
[m for m in movie_matrix.columns if "Suspects" in m]

['Usual Suspects, The (1995)']

In [18]:
recommend_movies("Toy Story (1995)")
recommend_movies("Seven (a.k.a. Se7en) (1995)")
recommend_movies("Usual Suspects, The (1995)")

title
Pulp Fiction (1994)                 0.672616
Shawshank Redemption, The (1994)    0.631787
Reservoir Dogs (1992)               0.597412
Goodfellas (1990)                   0.580006
Seven (a.k.a. Se7en) (1995)         0.578066
Name: Usual Suspects, The (1995), dtype: float64

In [19]:
movies["genres"] = movies["genres"].str.replace("|", " ")

In [20]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(stop_words="english")
genre_matrix = tfidf.fit_transform(movies["genres"])

In [21]:
genre_similarity = cosine_similarity(genre_matrix, genre_matrix)

In [22]:
genre_similarity_df = pd.DataFrame(
    genre_similarity,
    index=movies["title"],
    columns=movies["title"]
)

In [23]:
def recommend_by_genre(movie_title, num_recommendations=5):
    
    if movie_title not in genre_similarity_df.index:
        return "Movie not found."
    
    scores = genre_similarity_df[movie_title].sort_values(ascending=False)
    scores = scores.iloc[1:]
    
    return scores.head(num_recommendations)

In [24]:
recommend_by_genre("Toy Story (1995)")

title
Shrek the Third (2007)              1.0
Wild, The (2006)                    1.0
Monsters, Inc. (2001)               1.0
Emperor's New Groove, The (2000)    1.0
Antz (1998)                         1.0
Name: Toy Story (1995), dtype: float64