In [34]:
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity


In [35]:
movies = pd.read_csv('popular_10000_movies_tmdb.csv')
movies.head()

Unnamed: 0,id,title,release_date,genres,original_language,vote_average,vote_count,popularity,overview,budget,production_companies,revenue,runtime,tagline
0,758323,The Pope's Exorcist,2023-04-05,"['Horror', 'Mystery', 'Thriller']",English,7.4,619,5089.969,"Father Gabriele Amorth, Chief Exorcist of the ...",18000000,"['Screen Gems', '2.0 Entertainment', 'Jesus & ...",65675816,103,Inspired by the actual files of Father Gabriel...
1,640146,Ant-Man and the Wasp: Quantumania,2023-02-15,"['Action', 'Adventure', 'Science Fiction']",English,6.6,2294,4665.438,Super-Hero partners Scott Lang and Hope van Dy...,200000000,"['Marvel Studios', 'Kevin Feige Productions']",464566092,125,Witness the beginning of a new dynasty.
2,502356,The Super Mario Bros. Movie,2023-04-05,"['Animation', 'Adventure', 'Family', 'Fantasy'...",English,7.5,1861,3935.55,"While working underground to fix a water main,...",100000000,"['Universal Pictures', 'Illumination', 'Ninten...",1121048165,92,
3,868759,Ghosted,2023-04-18,"['Action', 'Comedy', 'Romance']",English,7.2,652,2791.532,Salt-of-the-earth Cole falls head over heels f...,0,"['Skydance Media', 'Apple Studios']",0,120,Finding that special someone can be a real adv...
4,594767,Shazam! Fury of the Gods,2023-03-15,"['Action', 'Comedy', 'Fantasy', 'Adventure']",English,6.8,1510,2702.593,"Billy Batson and his foster siblings, who tran...",125000000,"['New Line Cinema', 'The Safran Company', 'DC ...",133437105,130,Oh. My. Gods.


In [36]:
movies_df = movies[['id','title','overview', 'genres', 'tagline', 'production_companies', 'original_language']]
movies_df.head()

Unnamed: 0,id,title,overview,genres,tagline,production_companies,original_language
0,758323,The Pope's Exorcist,"Father Gabriele Amorth, Chief Exorcist of the ...","['Horror', 'Mystery', 'Thriller']",Inspired by the actual files of Father Gabriel...,"['Screen Gems', '2.0 Entertainment', 'Jesus & ...",English
1,640146,Ant-Man and the Wasp: Quantumania,Super-Hero partners Scott Lang and Hope van Dy...,"['Action', 'Adventure', 'Science Fiction']",Witness the beginning of a new dynasty.,"['Marvel Studios', 'Kevin Feige Productions']",English
2,502356,The Super Mario Bros. Movie,"While working underground to fix a water main,...","['Animation', 'Adventure', 'Family', 'Fantasy'...",,"['Universal Pictures', 'Illumination', 'Ninten...",English
3,868759,Ghosted,Salt-of-the-earth Cole falls head over heels f...,"['Action', 'Comedy', 'Romance']",Finding that special someone can be a real adv...,"['Skydance Media', 'Apple Studios']",English
4,594767,Shazam! Fury of the Gods,"Billy Batson and his foster siblings, who tran...","['Action', 'Comedy', 'Fantasy', 'Adventure']",Oh. My. Gods.,"['New Line Cinema', 'The Safran Company', 'DC ...",English


In [37]:
movies_df.shape

(10000, 7)

In [38]:
movies_df.isnull().sum()

id                         0
title                      0
overview                  77
genres                     0
tagline                 2759
production_companies       0
original_language          0
dtype: int64

In [39]:
movies_df = movies_df.dropna(subset=['overview']).reset_index(drop=True)

In [40]:
movies_df.duplicated().sum()

0

In [41]:
def combine_columns_to_list(df, cols):
     return (
        df[cols]
        .fillna('')                     
        .astype(str)                    
        .agg(' '.join, axis=1)          
        .apply(lambda text: re.findall(r'\b\w+\b', text.lower()))  # tokenize into words
    )
movies_df['tags_list'] = combine_columns_to_list(movies_df, ['overview', 'genres', 'tagline', 'production_companies', 'original_language'])

In [42]:
new_movies = movies_df[['id','title','tags_list']]
new_movies.head()

Unnamed: 0,id,title,tags_list
0,758323,The Pope's Exorcist,"[father, gabriele, amorth, chief, exorcist, of..."
1,640146,Ant-Man and the Wasp: Quantumania,"[super, hero, partners, scott, lang, and, hope..."
2,502356,The Super Mario Bros. Movie,"[while, working, underground, to, fix, a, wate..."
3,868759,Ghosted,"[salt, of, the, earth, cole, falls, head, over..."
4,594767,Shazam! Fury of the Gods,"[billy, batson, and, his, foster, siblings, wh..."


In [43]:
new_movies['tags_list'] = new_movies['tags_list'].apply(lambda x: ' '.join(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_movies['tags_list'] = new_movies['tags_list'].apply(lambda x: ' '.join(x))


In [44]:
new_movies.shape

(9923, 3)

In [45]:
new_movies.iloc[0]['tags_list']

'father gabriele amorth chief exorcist of the vatican investigates a young boy s terrifying possession and ends up uncovering a centuries old conspiracy the vatican has desperately tried to keep hidden horror mystery thriller inspired by the actual files of father gabriele amorth chief exorcist of the vatican screen gems 2 0 entertainment jesus mary worldwide katz loyola productions ffilme ro english'

In [46]:
cv = CountVectorizer(max_features=10000,stop_words='english')
vector = cv.fit_transform(new_movies['tags_list']).toarray()

In [47]:
similarity = cosine_similarity(vector)

In [48]:
def recommend(movie):
    movie = movie.lower().strip()
    matches = new_movies[new_movies['title'].str.lower() == movie]
    if not matches.empty:
        index1 = matches.index[0]
        distances = sorted(list(enumerate(similarity[index1])),
                           reverse=True, key=lambda x: x[1])
        for i in distances[1:6]:
            print(new_movies.iloc[i[0]].title)
    else:
        matches1 = new_movies[new_movies['tags_list'].str.contains(rf"\b{movie}\b", case=False, na=False)]
        if matches1.empty:
            print(f"No match found for '{movie}' in title or tags_list.")
            return
        index2 = matches1.index[0]
        distances = sorted(list(enumerate(similarity[index2])),
                           reverse=True, key=lambda x: x[1])
        for i in distances[1:6]:
            print(new_movies.iloc[i[0]].title)

In [53]:
recommend('Spider-Man')

Spider-Man 3
Spider-Man 2
The Amazing Spider-Man
Spider-Man
The Amazing Spider-Man 2
