In [1]:
import numpy as np 
import pandas as pd

In [2]:
movies=pd.read_csv('/kaggle/input/tmdb-movies-dataset-2023-930k-movies/TMDB_movie_dataset_v11.csv')

In [3]:
print(movies.head())

       id            title  vote_average  vote_count    status release_date  \
0   27205        Inception         8.364       34495  Released   2010-07-15   
1  157336     Interstellar         8.417       32571  Released   2014-11-05   
2     155  The Dark Knight         8.512       30619  Released   2008-07-16   
3   19995           Avatar         7.573       29815  Released   2009-12-15   
4   24428     The Avengers         7.710       29166  Released   2012-04-25   

      revenue  runtime  adult                     backdrop_path  ...  \
0   825532764      148  False  /8ZTVqvKDQ8emSGUEMjsS4yHAwrp.jpg  ...   
1   701729206      169  False  /pbrkL804c8yAv3zBZR4QPEafpAR.jpg  ...   
2  1004558444      152  False  /nMKdUUepR0i5zn0y1T4CsSB5chy.jpg  ...   
3  2923706026      162  False  /vL5LR6WdxWPjLPFRLe133jXWsh5.jpg  ...   
4  1518815515      143  False  /9BBTo63ANSmhC4e6r62OJFuK2GL.jpg  ...   

    original_title                                           overview  \
0        Inception 

In [4]:
# Keep only rows that have every text field the recommender relies on.
# A single pass over the frame replaces three separate in-place passes; a row is
# dropped when ANY of the listed columns is missing, which selects the same rows.
# Rows with a missing release_date are kept at this stage.
movies.dropna(subset=['title', 'overview', 'genres'], inplace=True)

In [5]:
print(movies.isna().sum())

id                           0
title                        0
vote_average                 0
vote_count                   0
status                       0
release_date             26783
revenue                      0
runtime                      0
adult                        0
backdrop_path           336438
budget                       0
homepage                466558
imdb_id                 158083
original_language            0
original_title               0
overview                     0
popularity                   0
poster_path             105824
tagline                 435753
genres                       0
production_companies    218485
production_countries    134322
spoken_languages        125534
keywords                332835
dtype: int64


In [6]:
# Parse release_date into datetime64; errors='coerce' turns values that cannot be
# parsed into NaT instead of raising, so missing/invalid dates stay missing.
movies['release_date'] = pd.to_datetime(movies['release_date'], errors='coerce')

In [7]:
# Release year taken from the parsed date. The column is float-valued (e.g. 2010.0)
# because rows without a date yield NaN.
movies['year'] = movies['release_date'].dt.year

In [8]:
movies.head(1)

Unnamed: 0,id,title,vote_average,vote_count,status,release_date,revenue,runtime,adult,backdrop_path,...,overview,popularity,poster_path,tagline,genres,production_companies,production_countries,spoken_languages,keywords,year
0,27205,Inception,8.364,34495,Released,2010-07-15,825532764,148,False,/8ZTVqvKDQ8emSGUEMjsS4yHAwrp.jpg,...,"Cobb, a skilled thief who commits corporate es...",83.952,/oYuLEt3zVCKq57qu2F8dT7NIa6f.jpg,Your mind is the scene of the crime.,"Action, Science Fiction, Adventure","Legendary Pictures, Syncopy, Warner Bros. Pict...","United Kingdom, United States of America","English, French, Japanese, Swahili","rescue, mission, dream, airplane, paris, franc...",2010.0


In [9]:
# Restrict to releases from 2023 onwards; rows whose year is NaN fail the comparison
# and are therefore excluded. reset_index(drop=True) renumbers rows 0..n-1 so that
# index labels coincide with row positions in the similarity matrices built later.
latest_movies = movies[movies['year'] >= 2023].reset_index(drop=True)

In [10]:
latest_movies.shape

(32918, 25)

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

In [12]:

# TfidfVectorizer is already imported in the import cell just above, so it is not
# imported a second time here.

# Vectoriser over the plot overviews; common English stop words are ignored.
tfidf = TfidfVectorizer(stop_words='english')

# Defensive only: rows with a missing overview were already dropped during cleaning.
latest_movies['overview'] = latest_movies['overview'].fillna('')

# Sparse document-term matrix: one row per movie, one column per vocabulary term.
tfidf_matrix = tfidf.fit_transform(latest_movies['overview'])

tfidf_matrix.shape

(32918, 57107)

In [13]:
# Pairwise similarity between every pair of overviews. TfidfVectorizer L2-normalises
# rows by default, so the plain dot product computed by linear_kernel equals the
# cosine similarity.
# NOTE(review): the result is a dense n x n array; with the 32918 rows shown above that
# is roughly 8.7 GB assuming float64 -- confirm the kernel has enough memory.
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [14]:
cosine_sim

array([[1.        , 0.01932927, 0.        , ..., 0.0060826 , 0.02296331,
        0.        ],
       [0.01932927, 1.        , 0.        , ..., 0.0109984 , 0.0155332 ,
        0.        ],
       [0.        , 0.        , 1.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.0060826 , 0.0109984 , 0.        , ..., 1.        , 0.00874695,
        0.00611335],
       [0.02296331, 0.0155332 , 0.        , ..., 0.00874695, 1.        ,
        0.00600496],
       [0.        , 0.        , 0.        , ..., 0.00611335, 0.00600496,
        1.        ]])

In [15]:
# Title -> row-position lookup used by get_recommendations.
# Titles are deliberately NOT de-duplicated: a title shared by several movies maps to
# several positions, and get_recommendations returns one list per match.
# No .drop_duplicates() here: on this Series it would compare the values (the row
# positions, which are already unique), not the titles, and so would drop nothing.
indices = pd.Series(latest_movies.index, index=latest_movies['title'])

In [16]:
indices

title
The Super Mario Bros. Movie                0
Barbie                                     1
Guardians of the Galaxy Vol. 3             2
John Wick: Chapter 4                       3
Spider-Man: Across the Spider-Verse        4
                                       ...  
Ardhangini                             32913
Negative Numbers                       32914
Невидимый мой                          32915
Moon Beams                             32916
Teresa                                 32917
Length: 32918, dtype: int64

In [17]:
def get_recommendations(title, cosine_sim=cosine_sim, top_n=10):
    """Return the titles most similar to ``title``.

    The lookup table ``indices`` and the frame ``latest_movies`` are read from
    module scope. The default ``cosine_sim`` is bound when this function is
    defined, so re-run this cell after recomputing that matrix.

    Parameters
    ----------
    title : str
        Exact movie title to look up in ``indices``.
    cosine_sim : 2-D array
        Square similarity matrix whose row/column ``i`` corresponds to row ``i``
        of ``latest_movies``.
    top_n : int, default 10
        Maximum number of recommendations per matching movie.

    Returns
    -------
    list[list[str]] or str
        One list of recommended titles per movie that carries ``title`` (several
        movies can share a title). If the title is unknown, a human-readable
        "not found" message string is returned instead.
    """
    matches = indices.get(title)
    if matches is None:
        return f"Movie title '{title}' not found in the dataset."

    # A title shared by several movies yields a Series of positions;
    # a unique title yields a single scalar position.
    if isinstance(matches, pd.Series):
        positions = matches.tolist()
    else:
        positions = [matches]

    limit = max(top_n, 0)
    all_recommendations = []
    for idx in positions:
        scores = np.asarray(cosine_sim[idx]).ravel()

        # Descending order by score. The stable sort keeps tied scores in ascending
        # row order, matching what sorted(..., reverse=True) over (row, score) gives.
        ranked = np.argsort(-scores, kind='stable')

        # Exclude the query movie by position rather than by dropping whatever ranks
        # first: another row can tie with or outrank it (identical overview, or an
        # all-zero vector whose self-similarity is 0), in which case slicing off the
        # first entry would keep the movie itself among its own recommendations.
        ranked = ranked[ranked != idx][:limit]

        all_recommendations.append(latest_movies['title'].iloc[ranked].tolist())

    return all_recommendations



In [18]:
latest_movies

Unnamed: 0,id,title,vote_average,vote_count,status,release_date,revenue,runtime,adult,backdrop_path,...,overview,popularity,poster_path,tagline,genres,production_companies,production_countries,spoken_languages,keywords,year
0,502356,The Super Mario Bros. Movie,7.775,6743,Released,2023-04-05,1355725263,93,False,/9n2tJBplPbgR2ca05hS5CKXwP2c.jpg,...,"While working underground to fix a water main,...",410.411,/qNBAXBIQlnOThrVvA6mA2B5ggV6.jpg,Not all heroes wear capes. Some wear overalls.,"Animation, Family, Adventure, Fantasy, Comedy","Universal Pictures, Illumination, Nintendo","Japan, United States of America",English,"gorilla, plumber, anthropomorphism, magic mush...",2023.0
1,346698,Barbie,7.279,5074,Released,2023-07-19,1428545028,114,False,/ctMserH8g2SeOAnCw5gFjdQF8mo.jpg,...,Barbie and Ken are having the time of their li...,1069.340,/iuFNMS8U5cb6xfzi51Dbkovj7vM.jpg,She's everything. He's just Ken.,"Comedy, Adventure, Fantasy","LuckyChap Entertainment, Heyday Films, NB/GG P...","United Kingdom, United States of America",English,"feminism, satire, patriarchy, based on toy, fe...",2023.0
2,447365,Guardians of the Galaxy Vol. 3,8.023,4941,Released,2023-05-03,845430302,150,False,/5YZbUmjbMa3ClvSW1Wj3D6XGolb.jpg,...,"Peter Quill, still reeling from the loss of Ga...",318.516,/r2J02Z2OpNTctfOSN1Ydgii51I3.jpg,Once more with feeling.,"Science Fiction, Adventure, Action","Marvel Studios, Kevin Feige Productions",United States of America,English,"hero, superhero, mad scientist, based on comic...",2023.0
3,603692,John Wick: Chapter 4,7.814,4639,Released,2023-03-22,426978565,170,False,/i3OTGmLNOZIo4SRQLVfLjeWegB6.jpg,...,"With the price on his head ever increasing, Jo...",482.538,/vZloFAK7NmvMGKE7VkF5UHaz0I.jpg,"No way back, one way out.","Action, Thriller, Crime","Thunder Road, 87Eleven, Summit Entertainment, ...","Germany, United States of America","Arabic, Cantonese, English, French, German, Ja...","new york city, martial arts, hitman, sequel, o...",2023.0
4,569094,Spider-Man: Across the Spider-Verse,8.442,4394,Released,2023-05-31,683241751,140,False,/4HodYYKEIsGOdinkGi2Ucz6X9i0.jpg,...,"After reuniting with Gwen Stacy, Brooklyn’s fu...",512.336,/8Vt6mWEReuy4Of61Lnj5Xj704m8.jpg,It's how you wear the mask that matters.,"Animation, Action, Adventure","Columbia Pictures, Sony Pictures Animation, Lo...",United States of America,"English, Hindi, Italian, Spanish","new york city, sacrifice, superhero, comic boo...",2023.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32913,716373,Ardhangini,0.000,0,Released,2023-06-02,0,135,False,,...,"After Suman suffers an accident, the two women...",1.714,/hmglpFovmhStKBdV32HCJ5W08Q0.jpg,,"Romance, Family, Drama",Surinder Films,India,Bengali,,2023.0
32914,701393,Negative Numbers,0.000,0,Released,2023-10-04,0,93,False,/muXS6AlYcaRFz44ABIl9hp5eRJF.jpg,...,"""Negative Numbers"" is the true story of a juve...",8.221,/8XiIEGWb0AM7cVjgi5vZQ2PAU22.jpg,BASED ON REAL EVENTS,Drama,"Magnet Films, Alief, 1991 Productions, Wide Ma...","France, Georgia, Italy",Georgian,,2023.0
32915,707960,Невидимый мой,0.000,0,In Production,2023-10-06,0,102,False,,...,Single mother Maria (Daria Ekamasova) lives wi...,4.866,,,Drama,,Russia,Russian,,2023.0
32916,708415,Moon Beams,0.000,0,Post Production,2023-11-03,0,0,False,/hdri42Dt3xK3tu5CS8KNS6MYbVt.jpg,...,"While on the run after killing a cop, drug dea...",1.913,/cHuK99oSKvzYNof9DxKG4ZUDk3t.jpg,People change overnight.,"Crime, Thriller, Romance",Coffee Whale Films,,English,,2023.0


In [19]:
idx = indices['John Wick: Chapter 4']
print("Movie:", idx)


Movie: 3


In [20]:
get_recommendations("Barbie")

[['Barbie',
  'Barbie 2',
  'My First Barbie: Happy DreamDay',
  "Barbie 2: Ken's Revenge",
  'JUST KEN',
  'Barbie: The Perfect Woman?',
  'Love, In Color',
  'Anti-Human Rhapsody',
  'Bad Barbie',
  'Springtime in Amsterdam'],
 ['Barbie 2',
  'My First Barbie: Happy DreamDay',
  "Barbie 2: Ken's Revenge",
  'Barbie',
  'Die Hart 2',
  'Permanent Derealizer',
  'How Do I Look?',
  'In My Own Time',
  'JUST KEN',
  'Barbie: The Perfect Woman?']]

In [21]:
latest_movies[['genres','production_companies']].head(3)

Unnamed: 0,genres,production_companies
0,"Animation, Family, Adventure, Fantasy, Comedy","Universal Pictures, Illumination, Nintendo"
1,"Comedy, Adventure, Fantasy","LuckyChap Entertainment, Heyday Films, NB/GG P..."
2,"Science Fiction, Adventure, Action","Marvel Studios, Kevin Feige Productions"


In [22]:
latest_movies.columns

Index(['id', 'title', 'vote_average', 'vote_count', 'status', 'release_date',
       'revenue', 'runtime', 'adult', 'backdrop_path', 'budget', 'homepage',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'tagline', 'genres',
       'production_companies', 'production_countries', 'spoken_languages',
       'keywords', 'year'],
      dtype='object')

In [23]:

def clean_data(x):
    """Normalise text by removing spaces and lower-casing it.

    A list is normalised element by element, a string is normalised directly,
    and any other value (for example NaN or None) becomes the empty string.
    """
    def _squash(text):
        # "Science Fiction" -> "sciencefiction"
        return text.replace(" ", "").lower()

    if isinstance(x, list):
        return list(map(_squash, x))
    if isinstance(x, str):
        return _squash(x)
    return ''

In [24]:
# Columns whose values are normalised (spaces removed, lower-cased) so that each
# multi-word company or genre name collapses into a single token.
features = ['production_companies', 'genres']
for column in features:
    latest_movies[column] = latest_movies[column].apply(clean_data)

In [25]:
latest_movies[['production_companies','genres']].head()

Unnamed: 0,production_companies,genres
0,"universalpictures,illumination,nintendo","animation,family,adventure,fantasy,comedy"
1,"luckychapentertainment,heydayfilms,nb/ggpictur...","comedy,adventure,fantasy"
2,"marvelstudios,kevinfeigeproductions","sciencefiction,adventure,action"
3,"thunderroad,87eleven,summitentertainment,studi...","action,thriller,crime"
4,"columbiapictures,sonypicturesanimation,lordmil...","animation,action,adventure"


In [26]:

# One text "soup" per movie: cleaned production companies, cleaned genres and the
# overview, separated by single spaces. Each part is NaN-filled before joining, so
# the concatenation itself can never be NaN and needs no further fillna.
latest_movies['soup'] = (
    latest_movies['production_companies'].fillna('')
    + ' '
    + latest_movies['genres'].fillna('')
    + ' '
    + latest_movies['overview'].fillna('')
)


In [27]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words term counts over the combined "soup" text, ignoring common English
# stop words. One row per movie in latest_movies.
count = CountVectorizer(stop_words='english')
count_matrix = count.fit_transform(latest_movies['soup'])

In [28]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between the count vectors. Presumably cosine_similarity
# is used here (rather than linear_kernel) because raw counts are not length-normalised.
# NOTE(review): this is a second dense n x n matrix kept alongside cosine_sim -- confirm
# the kernel has enough memory for both.
cosine_sim2 = cosine_similarity(count_matrix, count_matrix)

In [29]:
# Title -> row-position lookup, built the same way as `indices`.
# No .drop_duplicates() here: it would compare the row positions (already unique),
# not the titles, and so would drop nothing.
# NOTE(review): get_recommendations looks titles up in the module-level `indices`,
# not in this series.
indices2 = pd.Series(latest_movies.index, index=latest_movies['title'])

In [31]:
get_recommendations('Spider-Man: Across the Spider-Verse', cosine_sim2)

[['Enter The Spider-Verse: A Spider-Man Special',
  'Spider-Man: RAF',
  'Spider-Man: Date Night',
  'Spider-Man & The Spider-Verse Revolution (To Be Continued)',
  'Shin Spider-Man',
  'Lemon man vs the Multiverse',
  'Harry’s Brainrot(The Complete Trilogy)',
  'Spin & Ella',
  'Spider-Man: Lotus',
  'Homemade Into the Spider-Verse']]