## Content-Based Recommendation System for Movies

### 1. Import libraries

In [20]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import sigmoid_kernel

### 2. Import dataset

In [2]:
# Load the TMDB credits and movies tables from the local `dataset` folder.
# Forward slashes are used as the path separator because they resolve on
# Windows, Linux and macOS alike; backslash-separated paths only resolve on Windows.
credit_df = pd.read_csv("dataset/tmdb_5000_credits.csv")
movies_df = pd.read_csv("dataset/tmdb_5000_movies.csv")

In [3]:
# Preview the first two rows of the credits table.
credit_df.head(2)

Unnamed: 0,movie_id,title,cast,crew
0,19995,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."


In [4]:
# Preview the first row of the movies table.
movies_df.head(1)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800


### 3. Prepare the dataset

In [5]:
# The credits table names its key 'movie_id'; rename it to 'id' so both
# tables share the join column.
credit_df.rename(columns={'movie_id': 'id'}, inplace=True)

# Join movies with their credits on 'id', then discard the listed columns
# ('title_x' / 'title_y' are the suffixed copies of the 'title' column that
# exists in both inputs).
dropped_columns = ['homepage', 'title_x', 'title_y', 'status', 'production_countries']
df_merge = pd.merge(movies_df, credit_df, on='id').drop(columns=dropped_columns)
df_merge.head(2)

Unnamed: 0,budget,genres,id,keywords,original_language,original_title,overview,popularity,production_companies,release_date,revenue,runtime,spoken_languages,tagline,vote_average,vote_count,cast,crew
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Enter the World of Pandora.,7.2,11800,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...",2007-05-19,961000000,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]","At the end of the world, the adventure begins.",6.9,4500,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."


In [6]:
# Summarise the merged frame: column dtypes and non-null counts.
df_merge.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4803 entries, 0 to 4802
Data columns (total 18 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   budget                4803 non-null   int64  
 1   genres                4803 non-null   object 
 2   id                    4803 non-null   int64  
 3   keywords              4803 non-null   object 
 4   original_language     4803 non-null   object 
 5   original_title        4803 non-null   object 
 6   overview              4800 non-null   object 
 7   popularity            4803 non-null   float64
 8   production_companies  4803 non-null   object 
 9   release_date          4802 non-null   object 
 10  revenue               4803 non-null   int64  
 11  runtime               4801 non-null   float64
 12  spoken_languages      4803 non-null   object 
 13  tagline               3959 non-null   object 
 14  vote_average          4803 non-null   float64
 15  vote_count           

### 4. Making Content Based Recommendation System

In [7]:
# Show the plot summaries of the first two movies.
df_merge['overview'].head(2)

0    In the 22nd century, a paraplegic Marine is di...
1    Captain Barbossa, long believed to be dead, ha...
Name: overview, dtype: object

#### Use TF-IDF to transform the `overview` text into numeric feature vectors

In [9]:
# Replace missing overviews with empty strings so every row holds text
# before it is handed to the vectorizer.
df_merge['overview'] = df_merge['overview'].fillna('')

# Word-level TF-IDF over unigrams, bigrams and trigrams. English stop words
# are removed, accents are stripped, and min_df=3 ignores terms that occur
# in fewer than three overviews.
tfidf = TfidfVectorizer(
    analyzer='word',
    token_pattern=r'\w{1,}',
    ngram_range=(1, 3),
    stop_words='english',
    strip_accents='unicode',
    min_df=3,
    max_features=None,
)

In [12]:
# Fit the vectorizer on the 'overview' column and transform it in one step;
# the result has one row per movie and one column per retained term.
tfidf_mat = tfidf.fit_transform(df_merge['overview'])

In [17]:
# (number of movies, number of TF-IDF terms)
tfidf_mat.shape

(4803, 10417)

#### Compute pairwise similarity scores

In [21]:
# Pairwise sigmoid kernel of every overview against every other overview;
# entry [i, j] is the similarity score between movie i and movie j.
sig = sigmoid_kernel(X=tfidf_mat, Y=tfidf_mat)

In [37]:
# Shape of the similarity matrix and the scores of the first movie against all movies.
sig.shape, sig[0]

((4803, 4803),
 array([0.76163447, 0.76159416, 0.76159416, ..., 0.76159416, 0.76159416,
        0.76159416]))

In [72]:
# Reverse mapping from movie title to its row label in df_merge.
# NOTE(review): nothing here enforces unique titles; a repeated title would
# map to several rows -- confirm against the data.
indices = pd.Series(data=df_merge.index, index=df_merge['original_title'])
indices

original_title
Avatar                                         0
Pirates of the Caribbean: At World's End       1
Spectre                                        2
The Dark Knight Rises                          3
John Carter                                    4
                                            ... 
El Mariachi                                 4798
Newlyweds                                   4799
Signed, Sealed, Delivered                   4800
Shanghai Calling                            4801
My Date with Drew                           4802
Length: 4803, dtype: int64

In [70]:
def get_rec(title, idx_series, sigmoid_kernel, n_tops=10):
    """Return the ``n_tops`` movies most similar to ``title``.

    Parameters
    ----------
    title : str
        Movie title to look up in the index of ``idx_series``.
    idx_series : pandas.Series
        Series indexed by movie title whose values are row numbers of
        ``sigmoid_kernel``. Entry ``k`` of the series must describe row ``k``
        of the matrix, because results are selected by position.
    sigmoid_kernel : 2-D array-like
        Square matrix of pairwise similarity scores (higher = more similar).
        Inside this function the name refers to the matrix, not to the
        scikit-learn function of the same name.
    n_tops : int, default 10
        Maximum number of recommendations; values below zero are treated as 0.

    Returns
    -------
    pandas.Series
        The entries of ``idx_series`` (title -> row number) for the
        recommended movies, most similar first. The queried movie itself is
        never included.

    Raises
    ------
    KeyError
        If ``title`` is not present in ``idx_series``.
    """
    # Selecting with a list always yields a Series, even when the title is
    # unique; when a title occurs several times the first occurrence is used.
    matches = idx_series.loc[[title]]
    idx = int(matches.iloc[0])

    # Similarity of the queried movie against every movie.
    scores = np.asarray(sigmoid_kernel[idx]).ravel()

    # Descending order; the stable sort keeps tied scores in row order.
    order = np.argsort(-scores, kind='stable')

    # Drop the queried movie by its row number instead of assuming it is
    # ranked first -- tied scores (e.g. a constant row) can rank another
    # movie ahead of it.
    order = order[order != idx][:max(n_tops, 0)]

    # Explicit positional selection: integer keys on a label-indexed Series
    # are not reliably treated as positions by plain [] indexing.
    return idx_series.iloc[order]

In [89]:
# Pick one movie at random to use as a query title.
# NOTE(review): the draw is unseeded, so a re-run will generally select a
# different movie than the one shown in the saved output.
indices.iloc[[np.random.choice(len(indices))]]

original_title
Indiana Jones and the Last Crusade    1006
dtype: int64

In [90]:
# Recommend movies for the sampled title using the default number of results.
get_rec(
    title='Indiana Jones and the Last Crusade',
    idx_series=indices,
    sigmoid_kernel=sig,
)

original_title
Raiders of the Lost Ark                 2085
The Secret Life of Walter Mitty          349
Bucky Larson: Born to Be a Star         3079
The Cry of the Owl                      2909
Tusk                                    3939
Austin Powers in Goldmember              673
Elsa & Fred                             3114
Mean Creek                              4528
Indiana Jones and the Temple of Doom    1697
Two Girls and a Guy                     4364
dtype: int64