In [32]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

df = pd.read_csv("https://query.data.world/s/uikepcpffyo2nhig52xxeevdialfl7")

df = df[['Title','Genre','Director','Actors']]
df['Total'] = df['Genre'] + " " + df['Director'] + " " + df['Actors']
df['ID'] = df.index

df = df[['ID', 'Title', 'Total']]

   ID  ...                                              Total
0   0  ...  Crime, Drama Frank Darabont Tim Robbins, Morga...
1   1  ...  Crime, Drama Francis Ford Coppola Marlon Brand...
2   2  ...  Crime, Drama Francis Ford Coppola Al Pacino, R...
3   3  ...  Action, Crime, Drama Christopher Nolan Christi...
4   4  ...  Crime, Drama Sidney Lumet Martin Balsam, John ...

[5 rows x 3 columns]
{'The Shawshank Redemption': 0, 'The Godfather': 1, 'The Godfather: Part II': 2, 'The Dark Knight': 3, '12 Angry Men': 4, "Schindler's List": 5, 'The Lord of the Rings: The Return of the King': 6, 'Pulp Fiction': 7, 'Fight Club': 8, 'The Lord of the Rings: The Fellowship of the Ring': 9, 'Forrest Gump': 10, 'Star Wars: Episode V - The Empire Strikes Back': 11, 'Inception': 12, 'The Lord of the Rings: The Two Towers': 13, "One Flew Over the Cuckoo's Nest": 14, 'Goodfellas': 15, 'The Matrix': 16, 'Star Wars: Episode IV - A New Hope': 17, 'Se7en': 18, "It's a Wonderful Life": 19, 'The Silence of the La

In [14]:
tfidf = TfidfVectorizer()

tfidf_matrix = tfidf.fit_transform(df['Total'])

cosine_similarities = cosine_similarity(tfidf_matrix, tfidf_matrix)

print(cosine_similarities)

[[1.         0.03147892 0.03073635 ... 0.00734021 0.00676205 0.00614639]
 [0.03147892 1.         0.45728632 ... 0.00720954 0.00664167 0.00603697]
 [0.03073635 0.45728632 1.         ... 0.00703947 0.006485   0.00589456]
 ...
 [0.00734021 0.00720954 0.00703947 ... 1.         0.00649063 0.00589967]
 [0.00676205 0.00664167 0.006485   ... 0.00649063 1.         0.00543498]
 [0.00614639 0.00603697 0.00589456 ... 0.00589967 0.00543498 1.        ]]


In [34]:
results = {} # dictionary created to store the result in a dictionary format (ID : (Score,item_id))

for idx, row in df.iterrows(): # iterates through all the rows
    # the below code 'similar_indice' stores similar ids based on cosine similarity. sorts them in ascending order. [:-5:-1] is then used so that the indices with most similarity are got. 0 means no similarity and 1 means perfect similarity
    similar_indices = cosine_similarities[idx].argsort()[ : -5 : -1] # stores 5 most similar books, you can change it as per your needs
    similar_items = [(cosine_similarities[idx][i], df['ID'][i]) for i in similar_indices]
    results[row['ID']] = similar_items[1 : ]
    
# below code 'function item(id)' returns a row matching the id along with Book Title. Initially it is a dataframe, then we convert it to a list
def item(id):
    return df.loc[df['ID'] == id]['Title'].tolist()[0]

def recommend(id, num):
    if (num == 0):
        print("Unable to recommend any book as you have not chosen the number of book to be recommended")
    elif (num == 1):
        print("Recommending " + str(num) + " book similar to " + item(id))
    else :
        print("Recommending " + str(num) + " books similar to " + item(id))
        
    print("----------------------------------------------------------")
    recs = results[id][ : num]

    for rec in recs:
        print("You may also like to read: " + item(rec[1]) + " (score:" + str(rec[0]) + ")")

# the first argument in the below function to be passed is the id of the book, second argument is the number of books you want to be recommended
recommend(3,5)

Recommending 5 books similar to The Dark Knight
----------------------------------------------------------
You may also like to read: Batman Begins (score:0.48206693382120697)
You may also like to read: The Prestige (score:0.44799557075377566)
You may also like to read: The Dark Knight Rises (score:0.34547266248648856)
