In [2]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

In [7]:
# Load the catalogue from disk, decoding the file as UTF-8.
entertainment = pd.read_csv("Entertainment.csv", encoding="utf8")

# List the available columns, then display the genre strings used as features.
print(entertainment.columns)
entertainment["Category"]

Index(['Id', 'Titles', 'Category', 'Reviews'], dtype='object')


0                  Drama, Romance, School, Supernatural
1     Action, Adventure, Drama, Fantasy, Magic, Mili...
2     Action, Comedy, Historical, Parody, Samurai, S...
3                                      Sci-Fi, Thriller
4     Action, Comedy, Historical, Parody, Samurai, S...
5                Comedy, Drama, School, Shounen, Sports
6               Action, Adventure, Shounen, Super Power
7                        Drama, Military, Sci-Fi, Space
8     Action, Comedy, Historical, Parody, Samurai, S...
9     Action, Comedy, Historical, Parody, Samurai, S...
10    Drama, Fantasy, Romance, Slice of Life, Supern...
11                               Drama, School, Shounen
12    Action, Comedy, Historical, Parody, Samurai, S...
13    Action, Drama, Mecha, Military, Sci-Fi, Super ...
14               Comedy, Drama, School, Shounen, Sports
15                       Adventure, Drama, Supernatural
16               Drama, Music, Romance, School, Shounen
17    Adventure, Fantasy, Historical, Mystery, S

In [12]:
# Count the missing values in the Category column before vectorizing it.
entertainment["Category"].isna().sum()
# The count is 0, so there are no nulls and no imputation is needed.

0

In [13]:
# Build a TF-IDF vectorizer that discards English stop words while tokenizing.
tfidf = TfidfVectorizer(stop_words="english")

# Learn the vocabulary and IDF weights from the raw Category strings and encode
# every row as a TF-IDF vector (one matrix row per title, one column per term).
tfidf_matrix = tfidf.fit_transform(entertainment["Category"])
tfidf_matrix.shape

(51, 34)

In [17]:
# Next step: a pairwise similarity score between titles, using cosine similarity.
# cosine(x, y) = (x . y^T) / (||x|| * ||y||)
#
# TfidfVectorizer L2-normalizes each row by default, so the denominator is 1 and
# the plain dot product returned by linear_kernel already is the cosine similarity.

# With a single argument linear_kernel compares the matrix against itself.
cosine_sim_matrix = linear_kernel(tfidf_matrix)
cosine_sim_matrix

array([[1.        , 0.09421367, 0.        , ..., 0.12767481, 0.16772551,
        0.31295101],
       [0.09421367, 1.        , 0.16662513, ..., 0.22332745, 0.        ,
        0.        ],
       [0.        , 0.16662513, 1.        , ..., 0.13383076, 0.        ,
        0.        ],
       ...,
       [0.12767481, 0.22332745, 0.13383076, ..., 1.        , 0.47083158,
        0.17020003],
       [0.16772551, 0.        , 0.        , ..., 0.47083158, 1.        ,
        0.64107498],
       [0.31295101, 0.        , 0.        , ..., 0.17020003, 0.64107498,
        1.        ]])

In [23]:
# Map every title to its row label in `entertainment`, so that a title can be
# translated into the matching row of cosine_sim_matrix.
entertainment_index = pd.Series(entertainment.index, index = entertainment['Titles'])

# Series.drop_duplicates() compares the *values* (the row labels), so it never
# removes a repeated title. De-duplicate on the index instead, keeping the first
# occurrence, so that looking up a title always yields a single scalar.
entertainment_index = entertainment_index[~entertainment_index.index.duplicated(keep="first")]

# Sanity check: look up the row label of one known title.
ent_id = entertainment_index["Heat (1995)"]
ent_id

5

In [33]:
#Creating a custom function for getting recommendation.

def get_recommendations(Name, topN=10):
    """Print the titles whose Category text is most similar to ``Name``.

    Similarity scores are read from the module-level ``cosine_sim_matrix``; the
    row to read is found by looking ``Name`` up in ``entertainment_index``.

    Parameters
    ----------
    Name : str
        Title to look up; must be a label of ``entertainment_index``.
    topN : int, default 10
        Number of neighbours requested. ``topN + 1`` rows are printed because
        the queried title is scored against itself as well and is therefore
        normally part of the listing.

    Returns
    -------
    None
        The result table (columns ``index``, ``Titles``, ``Score``) is printed.

    Raises
    ------
    KeyError
        If ``Name`` is not present in ``entertainment_index``.
    ValueError
        If ``topN`` is negative.
    """
    if topN < 0:
        raise ValueError("topN must be non-negative")

    ent_id = entertainment_index[Name]
    # A title that occurs more than once yields a Series of row ids instead of a
    # scalar, which would select several matrix rows below; use the first match.
    if isinstance(ent_id, pd.Series):
        ent_id = ent_id.iloc[0]

    # Pair every row position with its similarity to the query, best first.
    # sorted() is stable, so tied scores keep their original row order.
    cosine_scores = sorted(
        enumerate(cosine_sim_matrix[ent_id]),
        key=lambda pair: pair[1],
        reverse=True,
    )
    cosine_scores_N = cosine_scores[: topN + 1]

    # Split the (position, score) pairs into parallel lists.
    ent_idx = [position for position, _ in cosine_scores_N]
    ent_scores = [score for _, score in cosine_scores_N]

    # NOTE(review): matrix positions are used as row labels of `entertainment`,
    # which assumes its default 0..n-1 index - confirm if the frame is re-indexed.
    ent_similar_show = pd.DataFrame(
        {
            "index": ent_idx,
            "Titles": entertainment.loc[ent_idx, "Titles"].to_numpy(),
            "Score": ent_scores,
        }
    )

    print(ent_similar_show)
  

In [34]:
get_recommendations("Heat (1995)", topN = 2)

   index                       Titles  Score
0      5                  Heat (1995)    1.0
1     14      Cutthroat Island (1995)    1.0
2     43  Seven (a.k.a. Se7en) (1995)    1.0
