## Content Based Filter

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Reading movie file
movies = pd.read_csv(r"C:\Users\Shivani Dussa\Downloads\movies.csv",sep = ',',encoding = 'latin-1',usecols =['title','genres'])
movies.head()

Unnamed: 0,title,genres
0,Toy Story,Adventure|Animation|Children|Comedy|Fantasy
1,Jumanji,Adventure|Children|Fantasy
2,Grumpier Old Men,Comedy|Romance
3,Waiting to Exhale,Comedy|Drama|Romance
4,Father of the Bride Part II,Comedy


In [2]:
m = pd.read_csv(r"C:\Users\Shivani Dussa\Downloads\movies.csv")

In [3]:
m.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji,Adventure|Children|Fantasy
2,3,Grumpier Old Men,Comedy|Romance
3,4,Waiting to Exhale,Comedy|Drama|Romance
4,5,Father of the Bride Part II,Comedy


In [4]:
m.shape

(9742, 3)

In [4]:
movies.shape

(9742, 2)

In [5]:
# Break up the big genres string into a strinf array
movies['genres'] = movies['genres'].str.split('|')
movies.head(2)

Unnamed: 0,title,genres
0,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]"
1,Jumanji,"[Adventure, Children, Fantasy]"


In [6]:
# convert genres to string value
movies['genres'] = movies['genres'].fillna("").astype("str")
movies.head(4)

Unnamed: 0,title,genres
0,Toy Story,"['Adventure', 'Animation', 'Children', 'Comedy..."
1,Jumanji,"['Adventure', 'Children', 'Fantasy']"
2,Grumpier Old Men,"['Comedy', 'Romance']"
3,Waiting to Exhale,"['Comedy', 'Drama', 'Romance']"


## Recommendation Based on Genre

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfv = TfidfVectorizer(analyzer = 'word',ngram_range = (1,2),min_df = 0,stop_words = 'english')
tfv_matrix = tfv.fit_transform(movies['genres'])
tfv_matrix.shape

(9742, 177)

In [12]:
tfv

TfidfVectorizer(min_df=0, ngram_range=(1, 2), stop_words='english')

In [16]:
tfv_matrix

<9742x177 sparse matrix of type '<class 'numpy.float64'>'
	with 36628 stored elements in Compressed Sparse Row format>

In [17]:
print(tfv_matrix)

  (0, 63)	0.4051430286389587
  (0, 47)	0.3681884973089335
  (0, 34)	0.38369482677526473
  (0, 18)	0.4008862821540716
  (0, 108)	0.30254034715329503
  (0, 59)	0.16761357728391116
  (0, 46)	0.3162303113127544
  (0, 33)	0.32335863498874723
  (0, 17)	0.26110809240797916
  (1, 51)	0.5795995638728872
  (1, 19)	0.5337814180965866
  (1, 108)	0.36554429536140276
  (1, 46)	0.382085190978399
  (1, 17)	0.31548378439611124
  (2, 68)	0.7695974416123483
  (2, 160)	0.5242383036039113
  (2, 59)	0.36454626441402677
  (3, 103)	0.5645649298589199
  (3, 62)	0.5417511322516687
  (3, 96)	0.2904365851652309
  (3, 160)	0.4522400920963429
  (3, 59)	0.31447995130958456
  (4, 59)	1.0
  (5, 84)	0.604518892749723
  (5, 5)	0.5454388121871825
  :	:
  (9733, 38)	0.835677806885533
  (9733, 96)	0.23714974930952545
  (9733, 33)	0.495381266784903
  (9734, 62)	0.7846149876753742
  (9734, 96)	0.42063760299449465
  (9734, 59)	0.4554594691761476
  (9735, 33)	1.0
  (9736, 86)	1.0
  (9737, 2)	0.5335755137706529
  (9737, 35)	0.4

In [18]:
from sklearn.metrics.pairwise import cosine_similarity
cos_s = cosine_similarity(tfv_matrix,tfv_matrix)

In [19]:
cos_s.shape

(9742, 9742)

In [23]:
print(cos_s)

[[1.         0.31379419 0.0611029  ... 0.         0.16123168 0.16761358]
 [0.31379419 1.         0.         ... 0.         0.         0.        ]
 [0.0611029  0.         1.         ... 0.         0.         0.36454626]
 ...
 [0.         0.         0.         ... 1.         0.         0.        ]
 [0.16123168 0.         0.         ... 0.         1.         0.        ]
 [0.16761358 0.         0.36454626 ... 0.         0.         1.        ]]


In [24]:
movies.head(2)

Unnamed: 0,title,genres
0,Toy Story,"['Adventure', 'Animation', 'Children', 'Comedy..."
1,Jumanji,"['Adventure', 'Children', 'Fantasy']"


In [39]:
# Build a one dimensional array with movie titles
titles = movies['title']

# bulding a indices "series", reversing the index & title ....these will help to get an index, given a movie title
indices = pd.Series(movies.index,index = movies['title'])

In [40]:
print(indices.shape)
indices.head()

(9742,)


title
Toy Story                       0
Jumanji                         1
Grumpier Old Men                2
Waiting to Exhale               3
Father of the Bride Part II     4
dtype: int64

In [31]:
print(titles.shape)
titles.head()

(9742,)


0                      Toy Story 
1                        Jumanji 
2               Grumpier Old Men 
3              Waiting to Exhale 
4    Father of the Bride Part II 
Name: title, dtype: object

In [38]:
titles[1] ,titles[0],titles[1999],titles[2001],titles[9741]

('Jumanji ',
 'Toy Story ',
 'Thing from Another World, The ',
 'War of the Worlds, The ',
 'Andrew Dice Clay: Dice Rules ')

In [49]:
indices['Jumanji '],indices['Toy Story '],indices[ 'War of the Worlds, The ']

(1, 0, 2001)

In [65]:
ix = indices['War of the Worlds, The ']
similar_scores = cos_s[ix]
similar_scrs = list(enumerate(similar_scores))
similar_s = sorted(similar_scrs,key = lambda x:x[1], reverse = True)


In [62]:
print(len(similar_scores))
similar_scores

9742


array([0.        , 0.        , 0.        , ..., 0.18506379, 0.09700799,
       0.        ])

In [64]:
print(len(similar_scrs))
similar_scrs[0:5]

9742


[(0, 0.0), (1, 0.0), (2, 0.0), (3, 0.05374929591041445), (4, 0.0)]

In [67]:
print(len(similar_s))
similar_s[0:5]

9742


[(439, 1.0), (1905, 1.0), (2001, 1.0), (3926, 1.0), (8322, 1.0)]

In [103]:
idx = indices['War of the Worlds, The ']
sim_scores = cos_s[idx]
print(sim_scores)
sim_scores = list(enumerate(sim_scores))
print(sim_scores[0:5])
sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
print(sim_scores[0:5])

[0.         0.         0.         ... 0.18506379 0.09700799 0.        ]
[(0, 0.0), (1, 0.0), (2, 0.0), (3, 0.05374929591041445), (4, 0.0)]
[(439, 1.0), (1905, 1.0), (2001, 1.0), (3926, 1.0), (8322, 1.0)]


In [74]:
movie_indices = [i[0] for i in similar_s]
movie_indices[0:5]

[439, 1905, 2001, 3926, 8322]

In [79]:
titles[movie_indices]

439                                         No Escape 
1905                               Planet of the Apes 
2001                           War of the Worlds, The 
3926                                       Rollerball 
8322                                      Snowpiercer 
                             ...                      
9730    Hommage Ã  Zgougou (et salut Ã  Sabine Mamou) 
9735                 Love Live! The School Idol Movie 
9736                Jon Stewart Has Left the Building 
9738                            No Game No Life: Zero 
9741                     Andrew Dice Clay: Dice Rules 
Name: title, Length: 9742, dtype: object

In [111]:
#  **Functions that get movie recommendations based on the cosine similarity score of movie genres**
def genre_recom(title):
    ix = indices[title]
    sim_scores= list(enumerate(cos_s[ix]))
    sim_scores = sorted(sim_scores,key = lambda x:x[1], reverse = True)
    sim_scores = sim_scores[1:21]
    movie_indices = [i[0] for i in sim_scores]
    return titles.iloc[movie_indices]


In [112]:
genre_recom('Dark Knight ').head(21)

8387                          Need for Speed 
8149      Grandmaster, The (Yi dai zong shi) 
123                                Apollo 13 
8026                              Life of Pi 
8396                                    Noah 
38                           Dead Presidents 
341                              Bad Company 
347             Faster Pussycat! Kill! Kill! 
430                        Menace II Society 
568                          Substitute, The 
665                          Nothing to Lose 
1645                       Untouchables, The 
1696                           Monument Ave. 
2563                              Death Wish 
2574                        Band of the Hand 
3037                              Foxy Brown 
3124    Harley Davidson and the Marlboro Man 
3167                                Scarface 
3217                               Swordfish 
3301                           Above the Law 
Name: title, dtype: object

In [113]:
genre_recom('1984 (Nineteen Eighty-Four) ').head(21)

999                          Forbidden Planet 
1187                                  Contact 
1485                               Metropolis 
1578              1984 (Nineteen Eighty-Four) 
2267                               Last Night 
2762         Brother from Another Planet, The 
2982                              Unbreakable 
3455                           Altered States 
3849                           Silent Running 
4241               Man Who Fell to Earth, The 
4316                           Fahrenheit 451 
4510                     Handmaid's Tale, The 
4665    Dernier Combat, Le (Last Battle, The) 
5023            Babylon 5: The River of Souls 
5024                    Babylon 5: Thirdspace 
5116                                   Charly 
5347                                   Primer 
5511                           Day After, The 
6562                                Electroma 
6618                      Man from Earth, The 
Name: title, dtype: object

In [114]:
genre_recom('Jumanji ').head(21)

53                           Indian in the Cupboard, The 
109                           NeverEnding Story III, The 
767                             Escape to Witch Mountain 
1514                  Darby O'Gill and the Little People 
1556                                        Return to Oz 
1617                              NeverEnding Story, The 
1618         NeverEnding Story II: The Next Chapter, The 
1799                              Santa Claus: The Movie 
3574    Harry Potter and the Sorcerer's Stone (a.k.a. ...
6075    Chronicles of Narnia: The Lion, the Witch and ...
6389                                Bridge to Terabithia 
6629                                 Golden Compass, The 
6655                Water Horse: Legend of the Deep, The 
6751           Chronicles of Narnia: Prince Caspian, The 
7426                                 Alice in Wonderland 
7478    Chronicles of Narnia: The Voyage of the Dawn T...
8230                      Percy Jackson: Sea of Monsters 
8641          

## Recommendation Based on Title

In [88]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfvtitle = TfidfVectorizer(analyzer = 'word',ngram_range = (1,2),min_df = 0,stop_words = 'english')
tfvtitle_matrix = tfv.fit_transform(movies['title'])
tfvtitle_matrix.shape

(9742, 20558)

In [89]:
from sklearn.metrics.pairwise import cosine_similarity
cos_sm = cosine_similarity(tfvtitle_matrix,tfvtitle_matrix)
cos_sm[:4,:4]

array([[1., 0., 0., 0.],
       [0., 1., 0., 0.],
       [0., 0., 1., 0.],
       [0., 0., 0., 1.]])

In [90]:
# Build a one dimensional array with movies
titles = movies['title']
indices = pd.Series(movies.index,index = movies['title'])

In [95]:
#titles.head()
#indices.head()

In [123]:
# Function that get movie recomendations based on the cosine similary scores of movie genre
def title_recom(title):
    idx = indices[title]
    similar_scrs = list(enumerate(cos_sm[idx]))
    similar_scrs = sorted(similar_scrs,key = lambda x:x[1], reverse = True)
    similar_scrs = similar_scrs[1:21]
    movie_indices = [i[0] for i in similar_scrs]
    return titles.iloc[movie_indices]

In [124]:
title_recom('Dark Knight ').head(20)

7768                     Dark Knight Rises, The 
8032    Batman: The Dark Knight Returns, Part 1 
8080    Batman: The Dark Knight Returns, Part 2 
140                                First Knight 
2417                         Cry in the Dark, A 
5778                          Alone in the Dark 
7375                             Knight and Day 
3576                               Black Knight 
3190                           Knight's Tale, A 
6858                       Alone in the Dark II 
4242                                  Dark Blue 
5060                                  Dark Days 
1305                                  Dark City 
5483                                  Dark Star 
6815                      Batman: Gotham Knight 
5934                                 Dark Water 
4749                        Shot in the Dark, A 
7877                               Dark Shadows 
8766                            The Dark Valley 
6690                      Taxi to the Dark Side 
Name: title, dtype: 

In [125]:
title_recom('Jumanji ')

9636    Jumanji: Welcome to the Jungle 
0                            Toy Story 
2                     Grumpier Old Men 
3                    Waiting to Exhale 
4          Father of the Bride Part II 
5                                 Heat 
6                              Sabrina 
7                         Tom and Huck 
8                         Sudden Death 
9                            GoldenEye 
10             American President, The 
11         Dracula: Dead and Loving It 
12                               Balto 
13                               Nixon 
14                    Cutthroat Island 
15                              Casino 
16               Sense and Sensibility 
17                          Four Rooms 
18      Ace Ventura: When Nature Calls 
19                         Money Train 
Name: title, dtype: object