In [44]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import ast
import warnings
import pickle as pkl
from collections import Counter
from nltk.stem.porter import PorterStemmer

In [2]:
warnings.filterwarnings('ignore')

In [3]:
movies = pd.read_csv('tmdb_5000_movies.csv')
credits = pd.read_csv('tmdb_5000_credits.csv')

In [4]:
movie = movies.copy(deep = True)
credit = credits.copy(deep = True)

In [5]:
movie.head(1)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800


In [6]:
credit.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4803 entries, 0 to 4802
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   movie_id  4803 non-null   int64 
 1   title     4803 non-null   object
 2   cast      4803 non-null   object
 3   crew      4803 non-null   object
dtypes: int64(1), object(3)
memory usage: 150.2+ KB


In [7]:
df = movie.merge(credit, on = 'title')

#### For creating RS using content-based filtering. So, we need to create tags for each movie. We will keep those features which can help us creating tags.

##### budget is not something which decides whether I'm gonna like the movie or not(remove)
##### genre is something which describes my interests(keep)
##### homepage(remove)
##### id(keep)
##### keywords(keep)
##### original_language(remove) : highly imbalanced data
##### title, overview(keep)
##### Popularity(remove)
##### production_company(remove)
##### release_date(remove) : but can be used
##### cast(keep) : someone may like all the movies done by some actor
##### crew(keep)

In [8]:
movie = df[['movie_id', 'title', 'overview', 'genres', 'keywords', 'cast', 'crew']]

##### We need : movie_id, title and tags
##### tags : combining the data from overview, gneres, keywords, cast, crew

In [9]:
movie.isnull().sum()

movie_id    0
title       0
overview    3
genres      0
keywords    0
cast        0
crew        0
dtype: int64

In [10]:
movie.dropna(inplace = True)

In [11]:
movie.duplicated().sum()

0

In [12]:
movie.iloc[0].genres

'[{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}, {"id": 14, "name": "Fantasy"}, {"id": 878, "name": "Science Fiction"}]'

##### we need the content present in the "name" key
##### but there is a problem, the list is in the form of a string
##### to conunter this we can use : ast.literal_eval()

In [13]:
def convert(obj):
    L = []
    for i in ast.literal_eval(obj):
        L.append(i['name'])
    return L

In [14]:
movie['genres'] = movie['genres'].apply(convert)

In [15]:
movie['keywords'] = movie['keywords'].apply(convert)

In [16]:
def convert_cast(obj):
    L = []
    cnt = 0
    for i in ast.literal_eval(obj):
        if cnt == 3:
            break
        else:
            L.append(i['name'])
            cnt += 1
    return L

In [17]:
movie['cast'] = movie['cast'].apply(convert_cast)

In [18]:
def crew(obj):
    L = []
    for i in ast.literal_eval(obj):
        if i['job'] == 'Director':
            L.append(i['name'])
    return L

In [19]:
movie['crew'] = movie['crew'].apply(crew)

In [20]:
movie.overview[0]

'In the 22nd century, a paraplegic Marine is dispatched to the moon Pandora on a unique mission, but becomes torn between following orders and protecting an alien civilization.'

In [21]:
movie['overview'] = movie['overview'].apply(lambda x : x.split())

In [22]:
movie['genres'] = movie['genres'].apply(lambda x : [i.replace(" ", "") for i in x])
movie['keywords'] = movie['keywords'].apply(lambda x : [i.replace(" ", "") for i in x])
movie['cast'] = movie['cast'].apply(lambda x : [i.replace(" ", "") for i in x])
movie['crew'] = movie['crew'].apply(lambda x : [i.replace(" ", "") for i in x])

In [23]:
movie['tags'] = movie.overview + movie.cast + movie.crew + movie.genres

In [24]:
df = movie[['movie_id', 'title', 'tags']]

In [25]:
df['tags'].apply(lambda x : " ".join(x))

0       In the 22nd century, a paraplegic Marine is di...
1       Captain Barbossa, long believed to be dead, ha...
2       A cryptic message from Bondâ€™s past sends him o...
3       Following the death of District Attorney Harve...
4       John Carter is a war-weary, former military ca...
                              ...                        
4804    El Mariachi just wants to play his guitar and ...
4805    A newlywed couple's honeymoon is upended by th...
4806    "Signed, Sealed, Delivered" introduces a dedic...
4807    When ambitious New York attorney Sam is sent t...
4808    Ever since the second grade when he first saw ...
Name: tags, Length: 4806, dtype: object

In [26]:
df['tags'] = df['tags'].apply(lambda x : " ".join(x))

In [27]:
df['tags'][0]

'In the 22nd century, a paraplegic Marine is dispatched to the moon Pandora on a unique mission, but becomes torn between following orders and protecting an alien civilization. SamWorthington ZoeSaldana SigourneyWeaver JamesCameron Action Adventure Fantasy ScienceFiction'

##### There are some data containing capital letters(we want them to be small, recommeded to do so)

In [28]:
df['tags'].apply(lambda x : x.lower())[0]

'in the 22nd century, a paraplegic marine is dispatched to the moon pandora on a unique mission, but becomes torn between following orders and protecting an alien civilization. samworthington zoesaldana sigourneyweaver jamescameron action adventure fantasy sciencefiction'

In [29]:
df['tags'] = df['tags'].apply(lambda x : x.lower())

##### Now we are going to convert the 'tags' column to Vector with the help of CountVectorizer

In [30]:
cvec = CountVectorizer(max_features = 5000, stop_words = 'english')

In [31]:
cvec.fit_transform(df['tags']).toarray()[0]

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [32]:
vectors = cvec.fit_transform(df['tags']).toarray()

In [33]:
Counter(vectors[0])

Counter({0: 4980, 1: 20})

###### as we can see, the vectors we obtained, forms a highly sparse matrix

In [34]:
cvec.get_feature_names_out()

array(['000', '007', '10', ..., 'zone', 'zoo', 'zooeydeschanel'],
      dtype=object)

In [35]:
ps = PorterStemmer()

### Steming
##### There are some words which have similar meaning but CountVectorizer() consider them as two distinct words and created different features for same word.

In [36]:
def stemm(text):
    y = []
    for i in text.split():
        y.append(ps.stem(i))
    return " ".join(y)

In [37]:
df['tags'].apply(stemm)[0]

'in the 22nd century, a parapleg marin is dispatch to the moon pandora on a uniqu mission, but becom torn between follow order and protect an alien civilization. samworthington zoesaldana sigourneyweav jamescameron action adventur fantasi sciencefict'

In [38]:
df['tags'] = df['tags'].apply(stemm)

In [39]:
vectors[0].shape

(5000,)

In [41]:
similar = cosine_similarity(vectors)

#### we want the movies which are similar to the given movie.
#### first we take out the index of the movie which user have seen and then we find the movies similar to this movie using the "similar" Matrix. But there is a problem, we want the similarities to be in sorted order and making them sorted(we will lose the indexing). To keep the things in correct way "enumerate()" : making them (index, similarity), and will apply sorting on the second element of the tuple using "key = lambda X: X[1]"
### We will recommend 10 movies similar to given Movie(by user)

In [42]:
def recommend(movie):
    idx = df[df['title'] == movie].index[0]
    distances = similar[idx]
    movies_list = sorted(list(enumerate(distances)), reverse = True, key = lambda x : x[1])[1:10]
    for i in movies_list:
        print(df.iloc[i[0]].title)

In [43]:
recommend('Batman Begins')

The Dark Knight Rises
Amidst the Devil's Wings
The Dark Knight
Batman
Dead Man Down
Batman
Batman & Robin
Gangster's Paradise: Jerusalema
Raising Helen


In [48]:
pkl.dump(df.to_dict(), open('Movies.pkl', 'wb'))

In [49]:
pkl.dump(similar, open('similarity.pkl', 'wb'))