In [24]:
pip install pandas

Note: you may need to restart the kernel to use updated packages.


In [25]:
import pandas as pd
import numpy as np
     

In [26]:
df = pd.read_csv('top10K-TMDB-movies.csv')

EDA- Exploration Data Analysis
The Dataset contains the following things:

- ID: Movie ID number on the website.
- title: Movie name
- genre: Movie genre (crime, adventure, etc.)
- original_language: Original language in which the movie is released
- overview: Summary of the movie
- popularity: Movie Popularity
- release_date: Movie release date
- vote_average: Movie vote average
- vote_count: Movie vote count

In [27]:
df.head()

Unnamed: 0,id,title,genre,original_language,overview,popularity,release_date,vote_average,vote_count
0,278,The Shawshank Redemption,"Drama,Crime",en,Framed in the 1940s for the double murder of h...,94.075,1994-09-23,8.7,21862
1,19404,Dilwale Dulhania Le Jayenge,"Comedy,Drama,Romance",hi,"Raj is a rich, carefree, happy-go-lucky second...",25.408,1995-10-19,8.7,3731
2,238,The Godfather,"Drama,Crime",en,"Spanning the years 1945 to 1955, a chronicle o...",90.585,1972-03-14,8.7,16280
3,424,Schindler's List,"Drama,History,War",en,The true story of how businessman Oskar Schind...,44.761,1993-12-15,8.6,12959
4,240,The Godfather: Part II,"Drama,Crime",en,In the continuing saga of the Corleone crime f...,57.749,1974-12-20,8.6,9811


In [28]:
df.shape

(10000, 9)

In [29]:
df.describe()

Unnamed: 0,id,popularity,vote_average,vote_count
count,10000.0,10000.0,10000.0,10000.0
mean,161243.505,34.697267,6.62115,1547.3094
std,211422.046043,211.684175,0.766231,2648.295789
min,5.0,0.6,4.6,200.0
25%,10127.75,9.15475,6.1,315.0
50%,30002.5,13.6375,6.6,583.5
75%,310133.5,25.65125,7.2,1460.0
max,934761.0,10436.917,8.7,31917.0


Checking information about the datset

In [30]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 10000 non-null  int64  
 1   title              10000 non-null  object 
 2   genre              9997 non-null   object 
 3   original_language  10000 non-null  object 
 4   overview           9987 non-null   object 
 5   popularity         10000 non-null  float64
 6   release_date       10000 non-null  object 
 7   vote_average       10000 non-null  float64
 8   vote_count         10000 non-null  int64  
dtypes: float64(2), int64(2), object(5)
memory usage: 703.3+ KB


In [31]:
df.isnull().sum()

id                    0
title                 0
genre                 3
original_language     0
overview             13
popularity            0
release_date          0
vote_average          0
vote_count            0
dtype: int64

In [32]:
df.columns

Index(['id', 'title', 'genre', 'original_language', 'overview', 'popularity',
       'release_date', 'vote_average', 'vote_count'],
      dtype='object')

We need some feature only for recommendation system.

In [33]:
df = df[['id','title','overview','genre']]

In [34]:
df.head()

Unnamed: 0,id,title,overview,genre
0,278,The Shawshank Redemption,Framed in the 1940s for the double murder of h...,"Drama,Crime"
1,19404,Dilwale Dulhania Le Jayenge,"Raj is a rich, carefree, happy-go-lucky second...","Comedy,Drama,Romance"
2,238,The Godfather,"Spanning the years 1945 to 1955, a chronicle o...","Drama,Crime"
3,424,Schindler's List,The true story of how businessman Oskar Schind...,"Drama,History,War"
4,240,The Godfather: Part II,In the continuing saga of the Corleone crime f...,"Drama,Crime"


Combining features

In [35]:
df['tags'] = df['overview'] + df['genre']
df = df.drop(columns = ['overview','genre'])

In [36]:
df.head()

Unnamed: 0,id,title,tags
0,278,The Shawshank Redemption,Framed in the 1940s for the double murder of h...
1,19404,Dilwale Dulhania Le Jayenge,"Raj is a rich, carefree, happy-go-lucky second..."
2,238,The Godfather,"Spanning the years 1945 to 1955, a chronicle o..."
3,424,Schindler's List,The true story of how businessman Oskar Schind...
4,240,The Godfather: Part II,In the continuing saga of the Corleone crime f...


In [37]:
df.isnull().sum()

id        0
title     0
tags     15
dtype: int64

In [38]:
df.dropna(inplace = True)

NLP-Natural Language Processing

In [39]:
pip install scikit-learn

Note: you may need to restart the kernel to use updated packages.


In [40]:
from sklearn.feature_extraction.text import CountVectorizer

In [41]:
df['tags']

0       Framed in the 1940s for the double murder of h...
1       Raj is a rich, carefree, happy-go-lucky second...
2       Spanning the years 1945 to 1955, a chronicle o...
3       The true story of how businessman Oskar Schind...
4       In the continuing saga of the Corleone crime f...
                              ...                        
9995    The story follows the adventures of Aang, a yo...
9996    The sharks take bite out of the East Coast whe...
9997    During World War II, a brave, patriotic Americ...
9998    A man named Farmer sets out to rescue his kidn...
9999    Seeking justice for his partner’s murder by an...
Name: tags, Length: 9985, dtype: object

In [42]:
cv = CountVectorizer(max_features = 5000, stop_words = 'english')
cv.fit_transform(df['tags']).toarray().shape

(9985, 5000)

In [43]:
vector = cv.fit_transform(df['tags']).toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

Using Cosine Similarity

In [45]:

from sklearn.metrics.pairwise import cosine_similarity

In [46]:
similarity = cosine_similarity(vector)

In [49]:
similarity

array([[1.        , 0.06454972, 0.14731391, ..., 0.07905694, 0.11572751,
        0.07537784],
       [0.06454972, 1.        , 0.09128709, ..., 0.        , 0.03984095,
        0.        ],
       [0.14731391, 0.09128709, 1.        , ..., 0.0248452 , 0.0727393 ,
        0.10660036],
       ...,
       [0.07905694, 0.        , 0.0248452 , ..., 1.        , 0.03253   ,
        0.03178209],
       [0.11572751, 0.03984095, 0.0727393 , ..., 0.03253   , 1.        ,
        0.04652421],
       [0.07537784, 0.        , 0.10660036, ..., 0.03178209, 0.04652421,
        1.        ]])

In [50]:
df[df['title'] =="Schindler's List"].index[0]

np.int64(3)

In [51]:

distance = sorted(list(enumerate(similarity[3])),reverse = True, key = lambda x : x[1])[:5]
distance

[(3, np.float64(0.9999999999999997)),
 (3351, np.float64(0.6488856845230502)),
 (2031, np.float64(0.5726562866781999)),
 (2697, np.float64(0.533113989983183)),
 (3665, np.float64(0.5298129428260175))]

Enumerate and List Conversion:

enumerate(similarity[3]): This creates an enumerated list of the elements in similarity[3]. Each element in this enumerated list is a tuple where the first element is the index and the second element is the value from similarity[3].
list(enumerate(similarity[3])): This converts the enumerated object into a list of tuples.
Sorting:

sorted(..., reverse=True, key=lambda x: x[1]): This sorts the list of tuples in descending order based on the second element of each tuple (x[1]). The lambda x: x[1] part specifies that the sort key is the second element of each tuple.
Selecting Top 5:

[:5]: After sorting, this selects the first 5 elements from the sorted list, which correspond to the 5 highest values in similarity[3].

In [52]:

def recommand(movie):
  movie_index = df[df["title"] == movie].index[0]
  distances = similarity[movie_index]
  movies_list = sorted(list(enumerate(distances)),reverse = True, key=lambda x : x[1])[1:6]

  for i in movies_list:
    print(df.iloc[i[0]].title)
     

In [53]:
recommand('Iron Man')

Iron Man 3
Star Wars: Episode III - Revenge of the Sith
Guardians of the Galaxy Vol. 2
Avengers: Age of Ultron
Iron Man 2


In [54]:
recommand('The Godfather')

The Godfather: Part II
Blood Ties
Bomb City
Joker
Felon


In [55]:
df.head(2)
     

Unnamed: 0,id,title,tags
0,278,The Shawshank Redemption,Framed in the 1940s for the double murder of h...
1,19404,Dilwale Dulhania Le Jayenge,"Raj is a rich, carefree, happy-go-lucky second..."


Saving the similarity model

In [56]:
import pickle

In [57]:
pickle.dump(df, open('movies.pkl','wb'))

In [58]:
pickle.dump(similarity, open('similarity.pkl','wb'))
     

In [59]:
pickle.load(open('movies.pkl','rb'))

Unnamed: 0,id,title,tags
0,278,The Shawshank Redemption,Framed in the 1940s for the double murder of h...
1,19404,Dilwale Dulhania Le Jayenge,"Raj is a rich, carefree, happy-go-lucky second..."
2,238,The Godfather,"Spanning the years 1945 to 1955, a chronicle o..."
3,424,Schindler's List,The true story of how businessman Oskar Schind...
4,240,The Godfather: Part II,In the continuing saga of the Corleone crime f...
...,...,...,...
9995,10196,The Last Airbender,"The story follows the adventures of Aang, a yo..."
9996,331446,Sharknado 3: Oh Hell No!,The sharks take bite out of the East Coast whe...
9997,13995,Captain America,"During World War II, a brave, patriotic Americ..."
9998,2312,In the Name of the King: A Dungeon Siege Tale,A man named Farmer sets out to rescue his kidn...


In [65]:
movie_index = df[df['title'] == 'The Godfather: Part II']['id']

In [66]:
movie_index

4    240
Name: id, dtype: int64