#***Movie Recommender System with Content Based Filtering and Collaborative Filtering***

## **Importing Libraries and Datasets**

In [51]:
# from google.colab import drive
# drive.mount('/content/drive')

In [52]:
# !unzip '/content/drive/MyDrive/Colab Notebooks/tmdb_5000_credits.csv.zip' -d /content/data
# !unzip '/content/drive/MyDrive/Colab Notebooks/tmdb_5000_movies.csv.zip' -d /content/data

In [53]:
import numpy as np
import pandas as pd

from sklearn import preprocessing
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from ast import literal_eval
import pickle
from nltk.stem.porter import PorterStemmer

In [54]:
credits = pd.read_csv('tmdb_5000_credits.csv')
movies = pd.read_csv('tmdb_5000_movies.csv')

## Exploring Datasets

In [55]:
credits.head()

Unnamed: 0,movie_id,title,cast,crew
0,19995,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,206647,Spectre,"[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,49026,The Dark Knight Rises,"[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."
4,49529,John Carter,"[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."


In [56]:
movies.head()

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2007-05-19,961000000,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500
2,245000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.sonypictures.com/movies/spectre/,206647,"[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...",en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,"[{""name"": ""Columbia Pictures"", ""id"": 5}, {""nam...","[{""iso_3166_1"": ""GB"", ""name"": ""United Kingdom""...",2015-10-26,880674609,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466
3,250000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...",http://www.thedarkknightrises.com/,49026,"[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...",en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.31295,"[{""name"": ""Legendary Pictures"", ""id"": 923}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2012-07-16,1084939099,165.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,The Legend Ends,The Dark Knight Rises,7.6,9106
4,260000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://movies.disney.com/john-carter,49529,"[{""id"": 818, ""name"": ""based on novel""}, {""id"":...",en,John Carter,"John Carter is a war-weary, former military ca...",43.926995,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}]","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2012-03-07,284139100,132.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"Lost in our world, found in another.",John Carter,6.1,2124


***now lets see which columns are likely to be considers for our recommendation system***

#**Content Based Filtering**

In [57]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4803 entries, 0 to 4802
Data columns (total 20 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   budget                4803 non-null   int64  
 1   genres                4803 non-null   object 
 2   homepage              1712 non-null   object 
 3   id                    4803 non-null   int64  
 4   keywords              4803 non-null   object 
 5   original_language     4803 non-null   object 
 6   original_title        4803 non-null   object 
 7   overview              4800 non-null   object 
 8   popularity            4803 non-null   float64
 9   production_companies  4803 non-null   object 
 10  production_countries  4803 non-null   object 
 11  release_date          4802 non-null   object 
 12  revenue               4803 non-null   int64  
 13  runtime               4801 non-null   float64
 14  spoken_languages      4803 non-null   object 
 15  status               

>### **in movies dataset we will be considering following columns**
>### genres, id, keywords, original_language, original_title, overview,production_companies



In [58]:
credits.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4803 entries, 0 to 4802
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   movie_id  4803 non-null   int64 
 1   title     4803 non-null   object
 2   cast      4803 non-null   object
 3   crew      4803 non-null   object
dtypes: int64(1), object(3)
memory usage: 150.2+ KB


In this 2nd dataset all of the features are important , as you may like a movie because of actor or a perticular person this can affect our system

**now lets create a single dataset with all these variables so that it will be easy to proprocess**

In [59]:
movies.rename(columns={'id':'movie_id'},inplace=True)

In [60]:
content = movies[["genres", "movie_id", "keywords", "overview", "production_companies"]]

In [61]:
content = content.merge(credits, on= 'movie_id')

### **Data Preprocessing**

In [62]:
content.isnull().sum()
print(f"current shape: {content.shape}")

current shape: (4803, 8)


In [63]:
content = content.dropna()
print(f"shape after drop: {content.shape}")

shape after drop: (4800, 8)


> ## now in data preprocessing we will tranform content dataset such that it will have only 3 columns ["movie_id", "title", "tag"]



> ## we will be creating tags with following method
>### tag =  genre name + keyword name + production company name + overview + first 3 actor namein cast + first name in crew (director)



In [64]:
def convert_genres(genre):
  ans = []
  for x in literal_eval(genre):
    ans.append(x['name'])

  return ans

In [65]:
content.genres = content.genres.apply(convert_genres)

In [66]:
def convert_keywords(keywords):
  ans = []
  for x in literal_eval(keywords):
    ans.append(x['name'])

  return ans

In [67]:
content.keywords = content.keywords.apply(convert_keywords)

In [68]:
def convert_production_companies(pc):
  ans = []
  for x in literal_eval(pc):
    ans.append(x['name'])

  return ans

In [69]:
content.production_companies = content.production_companies.apply(convert_production_companies)

In [70]:
def convert_cast(casts):
  ans = []
  cnt = 0
  for x in literal_eval(casts):
    if cnt != 3:
      ans.append(x['name'])
      cnt += 1
    else:
      break

  return ans

In [71]:
content.cast = content.cast.apply(convert_cast)

In [72]:
def convert_crew(crews):
  ans = []
  for x in literal_eval(crews):
    if(x['job'] == 'Director'):
      ans.append(x['name'])
    else:
      break

  return ans

In [73]:
content.crew = content.crew.apply(convert_crew)

In [74]:
def collapse(rec):
  ans = []
  for x in rec:
    ans.append(x.replace(" ",""))

  return ans

In [75]:
content.genres = content.genres.apply(collapse)
content.keywords = content.keywords.apply(collapse)
content.cast = content.cast.apply(collapse)
content.crew = content.crew.apply(collapse)

In [76]:
content.overview = content.overview.apply(lambda x: x.split())

In [77]:
content['tag'] = content.overview + content.genres + content.keywords + content.cast + content.production_companies + content.crew

In [78]:
content = content.drop(columns= ["overview","crew","cast","keywords","production_companies","genres"])

In [79]:
new_df = content.copy()

In [80]:
new_df.tag = new_df.tag.apply(lambda x: " ".join(x))

In [81]:
new_df.tag = new_df.tag.apply(lambda x: x.lower())

In [82]:
ps = PorterStemmer()

def stem_string(s):
  ans = []
  for x in s.split():
    ans.append(ps.stem(x))

  return " ".join(ans)

In [83]:
new_df.tag = new_df.tag.apply(stem_string)

In [84]:
cv = CountVectorizer(max_features=5000, stop_words='english')

In [85]:
vector = cv.fit_transform(new_df.tag).toarray()

In [86]:
cv.get_feature_names_out()[40:100]

array(['2003', '2009', '20th', '21', '21st', '23', '24', '25', '2929',
       '30', '300', '3d', '40', '50', '500', '60', '70', '80', 'a24',
       'aaron', 'aaroneckhart', 'ab', 'abandon', 'abduct',
       'abigailbreslin', 'abil', 'abl', 'aboard', 'abov', 'absolut',
       'abu', 'abus', 'academ', 'academi', 'accept', 'access', 'accid',
       'accident', 'acclaim', 'accompani', 'accomplish', 'account',
       'accus', 'ace', 'achiev', 'acr', 'act', 'action', 'actionhero',
       'activ', 'activist', 'activities', 'actor', 'actress', 'actual',
       'ad', 'adam', 'adamsandl', 'adapt', 'add'], dtype=object)

## **Training finding nearest N matches**

In [87]:
similarity = cosine_similarity(vector)
similarity

array([[1.        , 0.10803395, 0.07204382, ..., 0.07418865, 0.        ,
        0.08170074],
       [0.10803395, 1.        , 0.07938842, ..., 0.04087596, 0.        ,
        0.06752237],
       [0.07204382, 0.07938842, 1.        , ..., 0.04543109, 0.        ,
        0.        ],
       ...,
       [0.07418865, 0.04087596, 0.04543109, ..., 1.        , 0.05676567,
        0.05796087],
       [0.        , 0.        , 0.        , ..., 0.05676567, 1.        ,
        0.08335142],
       [0.08170074, 0.06752237, 0.        , ..., 0.05796087, 0.08335142,
        1.        ]])

In [88]:
def recommend_movies(title):
    index = new_df[new_df['title'] == title].index[0]
    distances = sorted(list(enumerate(similarity[index])),reverse=True,key = lambda x: x[1])
    for i in distances[1:6]:
        print(new_df.iloc[i[0]].title)


In [89]:
recommend_movies('Iron Man')

Iron Man 3
Iron Man 2
Avengers: Age of Ultron
Captain America: Civil War
The Avengers


In [92]:
pickle.dump(new_df, open('movies.pkl','wb'))

In [93]:
pickle.dump(similarity, open('similarity.pkl','wb'))