# Content Based Movie Recommendation System

### Import Dependencies

In [3]:
import pandas as pd
import numpy as np

### Load the Datasets

In [4]:
movies = pd.read_csv('tmdb_5000_movies.csv')
credits = pd.read_csv('tmdb_5000_credits.csv')

In [5]:
movies.head(1)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800


In [6]:
credits.head(1)

Unnamed: 0,movie_id,title,cast,crew
0,19995,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


In [7]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4803 entries, 0 to 4802
Data columns (total 20 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   budget                4803 non-null   int64  
 1   genres                4803 non-null   object 
 2   homepage              1712 non-null   object 
 3   id                    4803 non-null   int64  
 4   keywords              4803 non-null   object 
 5   original_language     4803 non-null   object 
 6   original_title        4803 non-null   object 
 7   overview              4800 non-null   object 
 8   popularity            4803 non-null   float64
 9   production_companies  4803 non-null   object 
 10  production_countries  4803 non-null   object 
 11  release_date          4802 non-null   object 
 12  revenue               4803 non-null   int64  
 13  runtime               4801 non-null   float64
 14  spoken_languages      4803 non-null   object 
 15  status               

In [8]:
movies.shape

(4803, 20)

In [9]:
credits.shape

(4803, 4)

### Merge movies dataset with credits dataset

In [10]:
movies = movies.merge(credits, on='title', how='left')
movies.head(1)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,...,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,movie_id,cast,crew
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...",...,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,19995,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


### Crop the movies dataset

In [11]:
movies = movies[['movie_id','title','overview', 'genres', 'keywords', 'cast', 'crew']]
movies.head(1)

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...","[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


In [12]:
# View null values
movies.isnull().sum()

movie_id    0
title       0
overview    3
genres      0
keywords    0
cast        0
crew        0
dtype: int64

In [13]:
# Drop null values
movies.dropna(inplace=True)
movies.isnull().sum()

movie_id    0
title       0
overview    0
genres      0
keywords    0
cast        0
crew        0
dtype: int64

In [14]:
# Remove duplicates
movies.duplicated().sum()

np.int64(0)

In [15]:
### Extract only the names of genres and keywords

In [16]:
movies['keywords'][0]

'[{"id": 1463, "name": "culture clash"}, {"id": 2964, "name": "future"}, {"id": 3386, "name": "space war"}, {"id": 3388, "name": "space colony"}, {"id": 3679, "name": "society"}, {"id": 3801, "name": "space travel"}, {"id": 9685, "name": "futuristic"}, {"id": 9840, "name": "romance"}, {"id": 9882, "name": "space"}, {"id": 9951, "name": "alien"}, {"id": 10148, "name": "tribe"}, {"id": 10158, "name": "alien planet"}, {"id": 10987, "name": "cgi"}, {"id": 11399, "name": "marine"}, {"id": 13065, "name": "soldier"}, {"id": 14643, "name": "battle"}, {"id": 14720, "name": "love affair"}, {"id": 165431, "name": "anti war"}, {"id": 193554, "name": "power relations"}, {"id": 206690, "name": "mind and soul"}, {"id": 209714, "name": "3d"}]'

In [17]:
movies['genres'][0]

'[{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}, {"id": 14, "name": "Fantasy"}, {"id": 878, "name": "Science Fiction"}]'

In [18]:
import ast
def convert(obj):
    L=[]
    for i in ast.literal_eval(obj):
        L.append(i['name'])
    return L

In [19]:
movies['keywords'] = movies['keywords'].apply(convert)
movies.head(1)

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[culture clash, future, space war, space colon...","[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


In [20]:
movies['genres'] = movies['genres'].apply(convert)
movies.head(1)

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...","[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


In [21]:
# for cast we take only the first three items from the dictionary
def convert1(text):
    L=[]
    counter = 0
    for i in ast.literal_eval(text):
        if counter<3:
            L.append(i['name'])
            counter +=1
        else:
            break
    return L

In [22]:
movies['cast'] = movies['cast'].apply(convert1)
movies.head(2)

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...","[Sam Worthington, Zoe Saldana, Sigourney Weaver]","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...","[Adventure, Fantasy, Action]","[ocean, drug abuse, exotic island, east india ...","[Johnny Depp, Orlando Bloom, Keira Knightley]","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."


In [23]:
def fetch_director(text):
    L=[]
    for i in ast.literal_eval(text):
        if i.get('job') == 'Director': 
            L.append(i.get('name'))
            break
    return L

In [24]:
movies['crew'][0]

'[{"credit_id": "52fe48009251416c750aca23", "department": "Editing", "gender": 0, "id": 1721, "job": "Editor", "name": "Stephen E. Rivkin"}, {"credit_id": "539c47ecc3a36810e3001f87", "department": "Art", "gender": 2, "id": 496, "job": "Production Design", "name": "Rick Carter"}, {"credit_id": "54491c89c3a3680fb4001cf7", "department": "Sound", "gender": 0, "id": 900, "job": "Sound Designer", "name": "Christopher Boyes"}, {"credit_id": "54491cb70e0a267480001bd0", "department": "Sound", "gender": 0, "id": 900, "job": "Supervising Sound Editor", "name": "Christopher Boyes"}, {"credit_id": "539c4a4cc3a36810c9002101", "department": "Production", "gender": 1, "id": 1262, "job": "Casting", "name": "Mali Finn"}, {"credit_id": "5544ee3b925141499f0008fc", "department": "Sound", "gender": 2, "id": 1729, "job": "Original Music Composer", "name": "James Horner"}, {"credit_id": "52fe48009251416c750ac9c3", "department": "Directing", "gender": 2, "id": 2710, "job": "Director", "name": "James Cameron"},

In [25]:
movies['crew'] = movies['crew'].apply(fetch_director)
movies.head(1)

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...","[Sam Worthington, Zoe Saldana, Sigourney Weaver]",[James Cameron]


In [26]:
#Convert overview column into a list form.
movies['overview']=movies['overview'].apply(lambda x:x.split())

In [27]:
movies['genres']=movies['genres'].apply(lambda x:[i.replace(" ","") for i in x])

In [28]:
movies['keywords']=movies['keywords'].apply(lambda x:[i.replace(" ","") for i in x])

In [29]:
movies['cast']=movies['cast'].apply(lambda x:[i.replace(" ","") for i in x])

In [30]:
movies['crew']=movies['crew'].apply(lambda x:[i.replace(" ","") for i in x])

In [31]:
### merging all the columns into one column named tags

In [32]:
movies['tags'] = movies['overview']+movies['genres']+movies['keywords']+ movies['crew'] + movies['cast']

In [33]:
new_movies = movies.drop(columns = ['overview','genres','keywords','cast','crew'])

In [34]:
new_movies

Unnamed: 0,movie_id,title,tags
0,19995,Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin..."
1,285,Pirates of the Caribbean: At World's End,"[Captain, Barbossa,, long, believed, to, be, d..."
2,206647,Spectre,"[A, cryptic, message, from, Bond’s, past, send..."
3,49026,The Dark Knight Rises,"[Following, the, death, of, District, Attorney..."
4,49529,John Carter,"[John, Carter, is, a, war-weary,, former, mili..."
...,...,...,...
4804,9367,El Mariachi,"[El, Mariachi, just, wants, to, play, his, gui..."
4805,72766,Newlyweds,"[A, newlywed, couple's, honeymoon, is, upended..."
4806,231617,"Signed, Sealed, Delivered","[""Signed,, Sealed,, Delivered"", introduces, a,..."
4807,126186,Shanghai Calling,"[When, ambitious, New, York, attorney, Sam, is..."


In [35]:
#feri convert list to string
new_movies['tags'] = new_movies['tags'].apply(lambda x: " ".join(x))
new_movies.head()

Unnamed: 0,movie_id,title,tags
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di..."
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha..."
2,206647,Spectre,A cryptic message from Bond’s past sends him o...
3,49026,The Dark Knight Rises,Following the death of District Attorney Harve...
4,49529,John Carter,"John Carter is a war-weary, former military ca..."


In [36]:
new_movies['tags'][1]

"Captain Barbossa, long believed to be dead, has come back to life and is headed to the edge of the Earth with Will Turner and Elizabeth Swann. But nothing is quite as it seems. Adventure Fantasy Action ocean drugabuse exoticisland eastindiatradingcompany loveofone'slife traitor shipwreck strongwoman ship alliance calypso afterlife fighter pirate swashbuckler aftercreditsstinger GoreVerbinski JohnnyDepp OrlandoBloom KeiraKnightley"

In [37]:
new_movies.head(5)

Unnamed: 0,movie_id,title,tags
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di..."
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha..."
2,206647,Spectre,A cryptic message from Bond’s past sends him o...
3,49026,The Dark Knight Rises,Following the death of District Attorney Harve...
4,49529,John Carter,"John Carter is a war-weary, former military ca..."


## Vectorization

In [38]:
new_movies['tags'] = new_movies['tags'].apply(lambda x: str(x))

In [39]:
print(movies['tags'].head())
print(type(movies['tags'].iloc[0]))

0    [In, the, 22nd, century,, a, paraplegic, Marin...
1    [Captain, Barbossa,, long, believed, to, be, d...
2    [A, cryptic, message, from, Bond’s, past, send...
3    [Following, the, death, of, District, Attorney...
4    [John, Carter, is, a, war-weary,, former, mili...
Name: tags, dtype: object
<class 'list'>


In [40]:
# The target is to fetch 6000 most commonly occuring words
# And create a vector for each of the movies in the dataset and find the nearest vector of each movie
# Nearer the vector, More similar are the movies
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features =6000, stop_words='english')
# The value it returns is a scipy sparse matrix so, we must convert this matrix into a numpy array
vectors = cv.fit_transform(new_movies['tags']).toarray()
len(cv.get_feature_names_out())

6000

In [41]:
vectors.shape

(4806, 6000)

In [42]:
cv.get_feature_names_out()[1:100]

array(['007', '10', '100', '11', '12', '13', '14', '15', '16', '17',
       '17th', '18', '18th', '18thcentury', '19', '1930s', '1940s',
       '1944', '1950', '1950s', '1960s', '1970s', '1971', '1974', '1976',
       '1980', '1980s', '1985', '1990s', '1999', '19th', '19thcentury',
       '20', '200', '2003', '2009', '20th', '21st', '23', '24', '25',
       '30', '300', '3d', '40', '50', '500', '60', '60s', '70', '70s',
       'aaron', 'aaroneckhart', 'abandoned', 'abducted', 'abigailbreslin',
       'abilities', 'ability', 'able', 'aboard', 'abuse', 'abusive',
       'academic', 'academy', 'accept', 'accepted', 'accepts', 'access',
       'accident', 'accidental', 'accidentally', 'accompanied',
       'accomplish', 'account', 'accountant', 'accused', 'ace', 'achieve',
       'act', 'acting', 'action', 'actionhero', 'actions', 'activist',
       'activities', 'activity', 'actor', 'actors', 'actress', 'acts',
       'actual', 'actually', 'adam', 'adammckay', 'adams', 'adamsandler',
    

In [43]:
# Now we work on finding the similarity
from sklearn.metrics.pairwise import cosine_similarity

In [44]:
# Here we can see that there are certain words that are mentioned separately, even though they are the same like
# action | actions or actual |actually or dance | dances |dancing
# These kind of words corresponds to the same meaning hence we need to stem these kind of words for our system to perform efficiently
!pip install nltk




[notice] A new release of pip is available: 25.3 -> 26.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [45]:
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

def stem(text):
    y=[]
    for i in text.split():
        y.append(ps.stem(i))
    return " ".join(y)

In [46]:
new_movies['tags'].apply(stem)

0       in the 22nd century, a parapleg marin is dispa...
1       captain barbossa, long believ to be dead, ha c...
2       a cryptic messag from bond’ past send him on a...
3       follow the death of district attorney harvey d...
4       john carter is a war-weary, former militari ca...
                              ...                        
4804    el mariachi just want to play hi guitar and ca...
4805    a newlyw couple' honeymoon is upend by the arr...
4806    "signed, sealed, delivered" introduc a dedic q...
4807    when ambiti new york attorney sam is sent to s...
4808    ever sinc the second grade when he first saw h...
Name: tags, Length: 4806, dtype: object

In [47]:
cv.get_feature_names_out()[0:100]

array(['000', '007', '10', '100', '11', '12', '13', '14', '15', '16',
       '17', '17th', '18', '18th', '18thcentury', '19', '1930s', '1940s',
       '1944', '1950', '1950s', '1960s', '1970s', '1971', '1974', '1976',
       '1980', '1980s', '1985', '1990s', '1999', '19th', '19thcentury',
       '20', '200', '2003', '2009', '20th', '21st', '23', '24', '25',
       '30', '300', '3d', '40', '50', '500', '60', '60s', '70', '70s',
       'aaron', 'aaroneckhart', 'abandoned', 'abducted', 'abigailbreslin',
       'abilities', 'ability', 'able', 'aboard', 'abuse', 'abusive',
       'academic', 'academy', 'accept', 'accepted', 'accepts', 'access',
       'accident', 'accidental', 'accidentally', 'accompanied',
       'accomplish', 'account', 'accountant', 'accused', 'ace', 'achieve',
       'act', 'acting', 'action', 'actionhero', 'actions', 'activist',
       'activities', 'activity', 'actor', 'actors', 'actress', 'acts',
       'actual', 'actually', 'adam', 'adammckay', 'adams', 'adamsandler

In [48]:
# Finding out the similarities between the vectors generated by the CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [49]:
similarity= cosine_similarity(vectors)

In [50]:
similarity

array([[1.        , 0.08385255, 0.05504819, ..., 0.02306328, 0.02469324,
        0.        ],
       [0.08385255, 1.        , 0.06154575, ..., 0.02578553, 0.        ,
        0.        ],
       [0.05504819, 0.06154575, 1.        , ..., 0.02539184, 0.        ,
        0.        ],
       ...,
       [0.02306328, 0.02578553, 0.02539184, ..., 1.        , 0.06834085,
        0.04671418],
       [0.02469324, 0.        , 0.        , ..., 0.06834085, 1.        ,
        0.05001563],
       [0.        , 0.        , 0.        , ..., 0.04671418, 0.05001563,
        1.        ]], shape=(4806, 4806))

In [51]:
# As we can see, the first movie has the highest similarity with the first movie (cuz Obviously), and we can see similarities with other movies further 
new_movies[new_movies['title'] == 'Interstellar'].index[0]

np.int64(95)

In [52]:
similarity[0]

array([1.        , 0.08385255, 0.05504819, ..., 0.02306328, 0.02469324,
       0.        ], shape=(4806,))

In [53]:
sorted(similarity[0])

[np.float64(0.0),
 np.float64(0.0),
 np.float64(0.0),
 np.float64(0.0),
 np.float64(0.0),
 np.float64(0.0),
 np.float64(0.0),
 np.float64(0.0),
 np.float64(0.0),
 np.float64(0.0),
 np.float64(0.0),
 np.float64(0.0),
 np.float64(0.0),
 np.float64(0.0),
 np.float64(0.0),
 np.float64(0.0),
 np.float64(0.0),
 np.float64(0.0),
 np.float64(0.0),
 np.float64(0.0),
 np.float64(0.0),
 np.float64(0.0),
 np.float64(0.0),
 np.float64(0.0),
 np.float64(0.0),
 np.float64(0.0),
 np.float64(0.0),
 np.float64(0.0),
 np.float64(0.0),
 np.float64(0.0),
 np.float64(0.0),
 np.float64(0.0),
 np.float64(0.0),
 np.float64(0.0),
 np.float64(0.0),
 np.float64(0.0),
 np.float64(0.0),
 np.float64(0.0),
 np.float64(0.0),
 np.float64(0.0),
 np.float64(0.0),
 np.float64(0.0),
 np.float64(0.0),
 np.float64(0.0),
 np.float64(0.0),
 np.float64(0.0),
 np.float64(0.0),
 np.float64(0.0),
 np.float64(0.0),
 np.float64(0.0),
 np.float64(0.0),
 np.float64(0.0),
 np.float64(0.0),
 np.float64(0.0),
 np.float64(0.0),
 np.float6

In [54]:
# After sorting, we cannot seem to identify 

In [55]:
enumerate(similarity[0])

<enumerate at 0x1e1e884d530>

In [56]:
list(enumerate(similarity[0]))

[(0, np.float64(1.0000000000000002)),
 (1, np.float64(0.08385254915624211)),
 (2, np.float64(0.055048188256318034)),
 (3, np.float64(0.03603749850782236)),
 (4, np.float64(0.1649915822768611)),
 (5, np.float64(0.10963225241337866)),
 (6, np.float64(0.01863389981249825)),
 (7, np.float64(0.1649915822768611)),
 (8, np.float64(0.056796183424706485)),
 (9, np.float64(0.07150969419341943)),
 (10, np.float64(0.10397504898200727)),
 (11, np.float64(0.075955452531275)),
 (12, np.float64(0.08808303292720551)),
 (13, np.float64(0.04343722427630694)),
 (14, np.float64(0.10540925533894598)),
 (15, np.float64(0.04303314829119352)),
 (16, np.float64(0.07694837640638656)),
 (17, np.float64(0.13944333775567924)),
 (18, np.float64(0.10471347707292389)),
 (19, np.float64(0.07905694150420949)),
 (20, np.float64(0.05370861555295746)),
 (21, np.float64(0.08385254915624211)),
 (22, np.float64(0.0632455532033676)),
 (23, np.float64(0.08964214570007953)),
 (24, np.float64(0.051987524491003634)),
 (25, np.floa

In [59]:
def recommend(movie):
    index = new_movies[new_movies['title'] == movie].index[0]
    distances = similarity[index]
    movies_list = sorted(list(enumerate(distances)), reverse = True, key = lambda x:x[1])[1:6]

    for i in movies_list:
        print(new_movies.iloc[i[0]].title)

In [74]:
recommend('Inception')

Duplex
The Helix... Loaded
Chicago Overcoat
Star Trek II: The Wrath of Khan
Timecop
