In [1]:
import numpy as np;
import pandas as pd;
import matplotlib.pyplot as plt
 

credits = pd.read_csv("tmdb_5000_credits.csv")
movies = pd.read_csv("tmdb_5000_movies.csv")

movies.head()

movies = movies.merge(credits, on="title")

movies.head()

#__important colms:
#genre
#id
#Keywords
#title
#overview
#release date
#cast-top3 only
#crew-director only

movies = movies[["genres","id","keywords","overview","release_date","title","cast","crew"]]
movies.head()

movies.isnull().sum()


genres          0
id              0
keywords        0
overview        3
release_date    1
title           0
cast            0
crew            0
dtype: int64

In [2]:
movies.dropna(inplace=True)



In [3]:
movies.duplicated().sum()

0

In [4]:
import ast
ast.literal_eval('[{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}, {"id": 14, "name": "Fantasy"}, {"id": 878, "name": "Science Fiction"}]')

def convert(obj):
    L=[]
    for i in ast.literal_eval(obj):
        L.append(i['name'])
    return L

movies['genres'] = movies['genres'].apply(convert)
movies['keywords'] = movies['keywords'].apply(convert)



def convert3(obj):
    L=[]
    counter=0
    for i in ast.literal_eval(obj):
        if counter!=3:
            L.append(i['name'])
            counter+=1
        else:
            break
    return L



def fetch_director(text):
    L = []
    for i in ast.literal_eval(text):
        if i['job'] == 'Director':
            L.append(i['name'])
    return L 



movies['cast'] = movies['cast'].apply(lambda x:x[0:3])
movies['overview'] = movies['overview'].apply(lambda x: x.split())
movies['crew'] = movies['crew'].apply(fetch_director)

def collapse(L):
    L1 = []
    for i in L:
        L1.append(i.replace(" ",""))
    return L1

movies['cast'] = movies['cast'].apply(collapse)
movies['crew'] = movies['crew'].apply(collapse)
movies['genres'] = movies['genres'].apply(collapse)
movies['keywords'] = movies['keywords'].apply(collapse)

movies['tags'] = movies['overview'] + movies['genres'] + movies['keywords'] + movies['cast'] + movies['crew']

new = movies.drop(columns=['overview','genres','keywords','cast','crew'])
new['tags'] = new['tags'].apply(lambda x: " ".join(x)) #converting content of tags to string

new.head()
       

Unnamed: 0,id,release_date,title,tags
0,19995,2009-12-10,Avatar,"In the 22nd century, a paraplegic Marine is di..."
1,285,2007-05-19,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha..."
2,206647,2015-10-26,Spectre,A cryptic message from Bond’s past sends him o...
3,49026,2012-07-16,The Dark Knight Rises,Following the death of District Attorney Harve...
4,49529,2012-03-07,John Carter,"John Carter is a war-weary, former military ca..."


In [5]:
#to convert word into its original stem word to avoid redundant values
import nltk

from nltk.stem.porter import PorterStemmer

ps = PorterStemmer()

def stem(text):
    y=[]
    for i in text.split():
        y.append(ps.stem(i))
    
    return " ".join(y)

new['tags'] = new['tags'].apply(stem)

In [6]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=5000,stop_words='english')
    
vector = cv.fit_transform(new['tags']).toarray()
vector.shape  #(x,y) where x:movies, y:common words

(4805, 5000)

In [7]:
from sklearn.metrics.pairwise import cosine_similarity
similarity = cosine_similarity(vector)
similarity  #gives angular distance between vectors of one movie with all movies(4805), hence distance of 4805 movies with all 4805 others

#contains array of arrays in which each internal array contains distance of each movie with all movies, therefore gives 1 when it compares with itself, thus diagonal of the matrix contains all '1s'


array([[1.        , 0.08858079, 0.08858079, ..., 0.04559608, 0.        ,
        0.        ],
       [0.08858079, 1.        , 0.06451613, ..., 0.02490677, 0.        ,
        0.0277137 ],
       [0.08858079, 0.06451613, 1.        , ..., 0.02490677, 0.        ,
        0.        ],
       ...,
       [0.04559608, 0.02490677, 0.02490677, ..., 1.        , 0.03962144,
        0.04279605],
       [0.        , 0.        , 0.        , ..., 0.03962144, 1.        ,
        0.08817334],
       [0.        , 0.0277137 , 0.        , ..., 0.04279605, 0.08817334,
        1.        ]])

In [8]:
def recommend(movie):
    index = new[new['title'] == movie].index[0]
    distances = sorted(list(enumerate(similarity[index])),reverse=True,key = lambda x: x[1])
    for i in distances[1:6]:
        print(new.iloc[i[0]].title)

In [9]:
recommend('Batman Begins')

The Dark Knight
Batman
Batman
10th & Wolf
Synecdoche, New York


In [10]:
pip install pickle-mixin


Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.2 -> 24.3.1
[notice] To update, run: C:\Users\Hp\AppData\Local\Programs\Python\Python312\python.exe -m pip install --upgrade pip


In [11]:
import pickle

pickle.dump(similarity,open('similarity.pkl','wb'))

In [12]:
new.rename(columns = {'id':'movie_id'}, inplace = True)
new

Unnamed: 0,movie_id,release_date,title,tags
0,19995,2009-12-10,Avatar,"in the 22nd century, a parapleg marin is dispa..."
1,285,2007-05-19,Pirates of the Caribbean: At World's End,"captain barbossa, long believ to be dead, ha c..."
2,206647,2015-10-26,Spectre,a cryptic messag from bond’ past send him on a...
3,49026,2012-07-16,The Dark Knight Rises,follow the death of district attorney harvey d...
4,49529,2012-03-07,John Carter,"john carter is a war-weary, former militari ca..."
...,...,...,...,...
4804,9367,1992-09-04,El Mariachi,el mariachi just want to play hi guitar and ca...
4805,72766,2011-12-26,Newlyweds,a newlyw couple' honeymoon is upend by the arr...
4806,231617,2013-10-13,"Signed, Sealed, Delivered","""signed, sealed, delivered"" introduc a dedic q..."
4807,126186,2012-05-03,Shanghai Calling,when ambiti new york attorney sam is sent to s...


In [13]:
pickle.dump(new.to_dict(), open('movies_dict.pkl','wb'))

In [15]:
new[new['title'] == movie].index[0]


NameError: name 'movie' is not defined