# Content Based Recommender

* Create TF-IDF Matrix
* Create Cosine-Similarity Matrix
* Making Recommendations Based on Similarities
* Wrapping the Process in Functions

# Import Necessary Libraries

In [1]:
import numpy as np
import pandas as pd
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)
pd.set_option("display.width", 500)
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Import Dataset

In [2]:
df = pd.read_csv("movies_metadata.csv", low_memory=False)
df.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,popularity,poster_path,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",21.946943,/rhIRbceoE9lR4veEXuwCC2wARtG.jpg,"[{'name': 'Pixar Animation Studios', 'id': 3}]","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,17.015539,/vzmL6fP7aPKNKPRTFnZmiUfciyV.jpg,"[{'name': 'TriStar Pictures', 'id': 559}, {'na...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,11.7129,/6ksm1sjKMFLbO7UY2i6G1ju9SML.jpg,"[{'name': 'Warner Bros.', 'id': 6194}, {'name'...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",3.859495,/16XOMpEaLWkrcPqSQqhTmeJuqQl.jpg,[{'name': 'Twentieth Century Fox Film Corporat...,"[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,8.387519,/e64sOI48hQXyru7naBFyssKFxVd.jpg,"[{'name': 'Sandollar Productions', 'id': 5842}...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


In [3]:
df["overview"].head()

0    Led by Woody, Andy's toys live happily in his ...
1    When siblings Judy and Peter discover an encha...
2    A family wedding reignites the ancient feud be...
3    Cheated on, mistreated and stepped on, the wom...
4    Just when George Banks has recovered from his ...
Name: overview, dtype: object

# Data Preprocessing

In [4]:
df["overview"].isnull().sum()

954

In [5]:
df["overview"] = df["overview"].fillna('')

In [6]:
df["overview"].isnull().sum()

0

In [7]:
# stop_words


In [8]:
# Build the vectorizer before it is displayed and fitted; without this
# assignment a fresh kernel raises NameError on `tfidf`.
# stop_words="english" matches the default used by calculate_cosine_sim
# later in this notebook.
tfidf = TfidfVectorizer(stop_words="english")
tfidf

# Create TF-IDF Matrix

In [9]:
tfidf_matrix = tfidf.fit_transform(df["overview"])

In [10]:
tfidf_matrix.shape

(45466, 75827)

In [11]:
tfidf.get_feature_names_out()[2000:2100]

array(['agua', 'aguacates', 'aguila', 'aguilera', 'aguirre', 'agumon',
       'agus', 'agustín', 'agutter', 'ah', 'ahab', 'aharon', 'ahasuerus',
       'ahead', 'ahearn', 'ahem', 'ahh', 'ahkmenrah', 'ahlawat', 'ahlo',
       'ahmad', 'ahmat', 'ahmed', 'ahmedabad', 'ahmet', 'ahmi', 'ahn',
       'aho', 'ahonen', 'ahora', 'ahoy', 'ahuja', 'ahí', 'ai', 'aiads',
       'aiba', 'aibelli', 'aibileen', 'aibou', 'aid', 'aida', 'aidan',
       'aide', 'aided', 'aiden', 'aides', 'aiding', 'aids', 'aiello',
       'aigin', 'aiiiyyyo', 'aika', 'aikaa', 'aikau', 'aikawa', 'aikens',
       'aikido', 'aiko', 'aile', 'aileen', 'ailes', 'ailesini', 'ailey',
       'ailing', 'ailment', 'ailments', 'ails', 'ailur', 'aim', 'aimable',
       'aiman', 'aime', 'aimed', 'aimee', 'aimer', 'aimes', 'aimie',
       'aiming', 'aimless', 'aimlessly', 'aimlessness', 'aimo', 'aims',
       'aimé', 'aimée', 'ain', 'ainar', 'ainara', 'ainda', 'ainsley',
       'ainsworth', 'ainu', 'ainun', 'aiqing', 'air', 'aira', 'ai

# Create Cosine-Similarity Matrix

In [12]:
cosine_sim = cosine_similarity(tfidf_matrix)

In [13]:
cosine_sim.shape

(45466, 45466)

In [14]:
cosine_sim[1]

array([0.01504121, 1.        , 0.04681953, ..., 0.        , 0.02198641,
       0.00929411])

# Making Recommendations Based on Similarities

In [15]:
indices = pd.Series(df.index, index=df["title"])

In [16]:
indices[0:5]

title
Toy Story                      0
Jumanji                        1
Grumpier Old Men               2
Waiting to Exhale              3
Father of the Bride Part II    4
dtype: int64

In [18]:
indices.index.value_counts()[0:5]

Cinderella              11
Hamlet                   9
Alice in Wonderland      9
Beauty and the Beast     8
Les Misérables           8
Name: title, dtype: int64

In [21]:
indices["Hamlet"]

title
Hamlet     1360
Hamlet     1832
Hamlet     3476
Hamlet     3601
Hamlet    13313
Hamlet    16104
Hamlet    18061
Hamlet    18908
Hamlet    21239
dtype: int64

In [22]:
indices = indices[~indices.index.duplicated(keep='last')]

In [23]:
indices.index.value_counts()[0:5]

Toy Story                   1
Russell Madness             1
Attack of the Sabretooth    1
The Millennials             1
X/Y                         1
Name: title, dtype: int64

In [24]:
indices["Hamlet"]

21239

In [25]:
movie_index = indices["Hamlet"]

In [26]:
movie_index

21239

In [28]:
cosine_sim[movie_index]

array([0.        , 0.        , 0.        , ..., 0.        , 0.01029205,
       0.        ])

In [29]:
similarity_scores = pd.DataFrame(cosine_sim[movie_index], columns = ["score"])

In [30]:
similarity_scores.head()

Unnamed: 0,score
0,0.0
1,0.0
2,0.0
3,0.0
4,0.016172


In [31]:
# Remove the query movie by its own row label before ranking instead of
# assuming it sorts first: a tied score (e.g. a duplicate overview) or an
# empty overview could otherwise leave the movie in its own recommendations.
movies_index = similarity_scores.drop(movie_index).sort_values(by="score", ascending=False).head(10).index

In [32]:
movies_index

Int64Index([24642, 27360, 1360, 3601, 31270, 29122, 45120, 34031, 33772, 44173], dtype='int64')

In [33]:
df["title"].iloc[movies_index]

24642                             Illusion of Blood
27360                          The Ghost of Yotsuya
1360                                         Hamlet
3601                                         Hamlet
31270                                Buried Secrets
29122    The Angel of Vengeance - The Female Hamlet
45120                             Water, Wind, Dust
34031                               Lonesome Ghosts
33772                                   Coming Soon
44173                            Ghost Recon: Alpha
Name: title, dtype: object

In [34]:
# Alice in Wonderland

movie_index = indices["Alice in Wonderland"]

In [35]:
movie_index

39530

In [36]:
similarity_scores = pd.DataFrame(cosine_sim[movie_index], columns = ["score"])

In [37]:
# Remove the query movie by its own row label before ranking instead of
# assuming it sorts first: a tied score (e.g. a duplicate overview) or an
# empty overview could otherwise leave the movie in its own recommendations.
movies_index = similarity_scores.drop(movie_index).sort_values(by="score", ascending=False).head(10).index

In [38]:
df["title"].iloc[movies_index]

9665                  Alice in Wonderland
21779    Alice's Adventures in Wonderland
1003                  Alice in Wonderland
8918                                Alice
11887                 Alice in Wonderland
44713             Faust et Méphistophélès
14892                 Alice in Wonderland
30735     Alice Through the Looking Glass
38975                       The Red Siren
29531                The Velveteen Rabbit
Name: title, dtype: object

# Wrapping the Process in Functions

In [3]:
def calculate_cosine_sim(dataframe, description_var, stop_words="english"):
    """Compute pairwise cosine similarity between the rows of a text column.

    The column is vectorized with TF-IDF and the resulting matrix is passed
    to ``cosine_similarity``, so entry ``[i, j]`` compares row ``i`` of
    ``dataframe`` with row ``j`` (by position).

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame that contains the text column. It is left unmodified.
    description_var : str
        Name of the text column to vectorize (e.g. ``"overview"``).
    stop_words : default "english"
        Forwarded unchanged to ``TfidfVectorizer(stop_words=...)``.

    Returns
    -------
    The value returned by ``cosine_similarity(tfidf_matrix)``: a square
    matrix with one row and one column per row of ``dataframe``.

    Notes
    -----
    The result grows quadratically with the number of rows; for the movies
    table used in this notebook it has shape (45466, 45466).
    """
    # Fill missing descriptions on a local Series rather than assigning back
    # into the caller's frame, so calling this function has no side effects.
    descriptions = dataframe[description_var].fillna('')
    tfidf = TfidfVectorizer(stop_words=stop_words)
    tfidf_matrix = tfidf.fit_transform(descriptions)
    return cosine_similarity(tfidf_matrix)

In [4]:
def content_based_recommender(dataframe, title, cosine_sim, top_n=10):
    """Return the titles most similar to ``title`` according to ``cosine_sim``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame with a ``"title"`` column. Row ``i`` (by position) must
        correspond to row/column ``i`` of ``cosine_sim``.
    title : str
        Title to look up. When several rows share the title, the last one is
        used (``duplicated(keep="last")``).
    cosine_sim : 2-D array
        Pairwise similarity matrix; ``cosine_sim[i]`` must be the 1-D score
        row for the movie at position ``i``.
    top_n : int, default 10
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.Series
        Titles of the recommended rows, highest score first, indexed by the
        original index labels of ``dataframe``. The queried row itself is
        never part of the result.

    Raises
    ------
    KeyError
        If ``title`` does not occur in ``dataframe["title"]``.
    """
    # Map each title to its row POSITION (not its index label): both
    # `cosine_sim[...]` and `.iloc[...]` below are positional, so using
    # labels would only be correct for a default 0..n-1 index.
    positions = pd.Series(range(len(dataframe)), index=dataframe["title"])
    positions = positions[~positions.index.duplicated(keep="last")]

    if title not in positions.index:
        raise KeyError(f"title not found in dataframe: {title!r}")
    movie_pos = int(positions.loc[title])

    scores = pd.Series(cosine_sim[movie_pos])
    # Exclude the queried movie explicitly. Skipping the first sorted entry
    # is only right when the movie ranks strictly first, which fails for
    # tied scores (duplicate descriptions) or an all-zero score row.
    scores = scores.drop(movie_pos)

    top_positions = scores.sort_values(ascending=False).head(top_n).index
    return dataframe["title"].iloc[top_positions]

In [5]:
cosine_sim = calculate_cosine_sim(df, "overview")

In [6]:
content_based_recommender(df, "Alice in Wonderland", cosine_sim)

9665                  Alice in Wonderland
21779    Alice's Adventures in Wonderland
1003                  Alice in Wonderland
8918                                Alice
11887                 Alice in Wonderland
44713             Faust et Méphistophélès
14892                 Alice in Wonderland
30735     Alice Through the Looking Glass
38975                       The Red Siren
29531                The Velveteen Rabbit
Name: title, dtype: object