Movie Recommending System

In [1]:
import numpy as np
import pandas as pd

# 1. Load and Explore the Dataset

In [2]:
movies=pd.read_csv('/kaggle/input/tmdb-movies-dataset-2023-930k-movies/TMDB_movie_dataset_v11.csv')

In [3]:
print(movies.head())

       id            title  vote_average  vote_count    status release_date  \
0   27205        Inception         8.364       34495  Released   2010-07-15   
1  157336     Interstellar         8.417       32571  Released   2014-11-05   
2     155  The Dark Knight         8.512       30619  Released   2008-07-16   
3   19995           Avatar         7.573       29815  Released   2009-12-15   
4   24428     The Avengers         7.710       29166  Released   2012-04-25   

      revenue  runtime  adult                     backdrop_path  ...  \
0   825532764      148  False  /8ZTVqvKDQ8emSGUEMjsS4yHAwrp.jpg  ...   
1   701729206      169  False  /pbrkL804c8yAv3zBZR4QPEafpAR.jpg  ...   
2  1004558444      152  False  /nMKdUUepR0i5zn0y1T4CsSB5chy.jpg  ...   
3  2923706026      162  False  /vL5LR6WdxWPjLPFRLe133jXWsh5.jpg  ...   
4  1518815515      143  False  /9BBTo63ANSmhC4e6r62OJFuK2GL.jpg  ...   

    original_title                                           overview  \
0        Inception 

In [4]:
print(movies.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1083011 entries, 0 to 1083010
Data columns (total 24 columns):
 #   Column                Non-Null Count    Dtype  
---  ------                --------------    -----  
 0   id                    1083011 non-null  int64  
 1   title                 1082998 non-null  object 
 2   vote_average          1083011 non-null  float64
 3   vote_count            1083011 non-null  int64  
 4   status                1083011 non-null  object 
 5   release_date          924633 non-null   object 
 6   revenue               1083011 non-null  int64  
 7   runtime               1083011 non-null  int64  
 8   adult                 1083011 non-null  bool   
 9   backdrop_path         293684 non-null   object 
 10  budget                1083011 non-null  int64  
 11  homepage              116237 non-null   object 
 12  imdb_id               591051 non-null   object 
 13  original_language     1083011 non-null  object 
 14  original_title        1082998 non-

In [5]:
print(movies.shape)

(1083011, 24)


In [6]:
print(movies.isnull().sum())

id                           0
title                       13
vote_average                 0
vote_count                   0
status                       0
release_date            158378
revenue                      0
runtime                      0
adult                        0
backdrop_path           789327
budget                       0
homepage                966774
imdb_id                 491960
original_language            0
original_title              13
overview                208558
popularity                   0
poster_path             326117
tagline                 929402
genres                  420922
production_companies    584683
production_countries    465316
spoken_languages        448775
keywords                782010
dtype: int64


# 2. Data Preprocessing

In [7]:
movies.dropna(subset=['title'], inplace=True)
#movies.dropna(subset=['release_date'], inplace=True)
movies.dropna(subset=['overview'], inplace=True)
movies.dropna(subset=['genres'], inplace=True)

In [8]:
print(movies.isna().sum())

id                           0
title                        0
vote_average                 0
vote_count                   0
status                       0
release_date             26783
revenue                      0
runtime                      0
adult                        0
backdrop_path           336438
budget                       0
homepage                466558
imdb_id                 158083
original_language            0
original_title               0
overview                     0
popularity                   0
poster_path             105824
tagline                 435753
genres                       0
production_companies    218485
production_countries    134322
spoken_languages        125534
keywords                332835
dtype: int64


In [9]:
print(movies.shape)

(555232, 24)


In [10]:
movies.head(1)

Unnamed: 0,id,title,vote_average,vote_count,status,release_date,revenue,runtime,adult,backdrop_path,...,original_title,overview,popularity,poster_path,tagline,genres,production_companies,production_countries,spoken_languages,keywords
0,27205,Inception,8.364,34495,Released,2010-07-15,825532764,148,False,/8ZTVqvKDQ8emSGUEMjsS4yHAwrp.jpg,...,Inception,"Cobb, a skilled thief who commits corporate es...",83.952,/oYuLEt3zVCKq57qu2F8dT7NIa6f.jpg,Your mind is the scene of the crime.,"Action, Science Fiction, Adventure","Legendary Pictures, Syncopy, Warner Bros. Pict...","United Kingdom, United States of America","English, French, Japanese, Swahili","rescue, mission, dream, airplane, paris, franc..."


# Trending Movies

In [11]:
v_mean=movies['vote_average'].mean()
v_mean

2.841283160912916

In [12]:
v_90=movies['vote_count'].quantile(0.999)
v_90

6554.307000000263

In [13]:
trending=movies.copy().loc[movies['vote_count']>=v_90]
trending.shape

(556, 24)

In [14]:
def weighted_IMDB_rating(x, v_mean=v_mean, v_90=v_90):
    v = x['vote_count']
    R = x['vote_average']
    weighted_score = (v / (v + v_mean) * R) + (v_mean / (v_mean + v) * v_90)
    return weighted_score

In [15]:
trending['scores']=trending.apply(weighted_IMDB_rating,axis=1)

In [16]:
trending.head(2)

Unnamed: 0,id,title,vote_average,vote_count,status,release_date,revenue,runtime,adult,backdrop_path,...,overview,popularity,poster_path,tagline,genres,production_companies,production_countries,spoken_languages,keywords,scores
0,27205,Inception,8.364,34495,Released,2010-07-15,825532764,148,False,/8ZTVqvKDQ8emSGUEMjsS4yHAwrp.jpg,...,"Cobb, a skilled thief who commits corporate es...",83.952,/oYuLEt3zVCKq57qu2F8dT7NIa6f.jpg,Your mind is the scene of the crime.,"Action, Science Fiction, Adventure","Legendary Pictures, Syncopy, Warner Bros. Pict...","United Kingdom, United States of America","English, French, Japanese, Swahili","rescue, mission, dream, airplane, paris, franc...",8.903132
1,157336,Interstellar,8.417,32571,Released,2014-11-05,701729206,169,False,/pbrkL804c8yAv3zBZR4QPEafpAR.jpg,...,The adventures of a group of explorers who mak...,140.241,/gEU2QniE6E77NI6lCU6MxlNBvIx.jpg,Mankind was born on Earth. It was never meant ...,"Adventure, Drama, Science Fiction","Legendary Pictures, Syncopy, Lynda Obst Produc...","United Kingdom, United States of America",English,"rescue, future, spacecraft, race against time,...",8.987971


In [17]:
trending=trending.sort_values('scores',ascending=False)
trending[['title','scores']].head(10)

Unnamed: 0,title,scores
532,City of God,11.20044
436,12 Angry Men,10.96772
482,Princess Mononoke,10.91017
423,"The Good, the Bad and the Ugly",10.871626
452,Apocalypse Now,10.765908
444,Top Gun: Maverick,10.723845
492,My Neighbor Totoro,10.68281
504,Ford v Ferrari,10.676711
447,The Help,10.66848
415,Oldboy,10.660695
