In [82]:
import numpy as np
import pandas as pd
import nltk
import json
import re
import csv
from tqdm import tqdm
from sklearn.neighbors import NearestNeighbors
from sklearn.feature_extraction.text import TfidfVectorizer

In [83]:
# df = pd.read_csv('booksummaries.txt', sep='\t', names=['wikiId', 'freeBaseId', 'title', 'author', 'date', 'genre', 'summary'])

data = []

with open("booksummaries.txt", 'r') as f:
    reader = csv.reader(f, dialect='excel-tab')
    for row in tqdm(reader):
        data.append(row)

16559it [00:00, 17200.01it/s]


In [84]:
book_index = []
book_id = []
book_author = []
book_name = []
summary = []
genre = []
for i in tqdm(data):
    book_id.append(i[0])
    book_name.append(i[2])
    book_author.append(i[3])
    genre.append(i[5])
    summary.append(i[6])

df = pd.DataFrame({'ID': book_id, 'BookTitle': book_name, 'Author': book_author,
                       'Genre': genre, 'Summary': summary})
df.head()

100%|██████████| 16559/16559 [00:00<00:00, 885378.03it/s]


Unnamed: 0,ID,BookTitle,Author,Genre,Summary
0,620,Animal Farm,George Orwell,"{""/m/016lj8"": ""Roman \u00e0 clef"", ""/m/06nbt"":...","Old Major, the old boar on the Manor Farm, ca..."
1,843,A Clockwork Orange,Anthony Burgess,"{""/m/06n90"": ""Science Fiction"", ""/m/0l67h"": ""N...","Alex, a teenager living in near-future Englan..."
2,986,The Plague,Albert Camus,"{""/m/02m4t"": ""Existentialism"", ""/m/02xlf"": ""Fi...",The text of The Plague is divided into five p...
3,1756,An Enquiry Concerning Human Understanding,David Hume,,The argument of the Enquiry proceeds by a ser...
4,2080,A Fire Upon the Deep,Vernor Vinge,"{""/m/03lrw"": ""Hard science fiction"", ""/m/06n90...",The novel posits that space around the Milky ...


In [105]:
df.isna().sum()

df = df.drop(df[df['Genre'] == ''].index)
df = df.drop(df[df['Summary'] == ''].index)


genres_cleaned = []
for i in df['Genre']:
    genres_cleaned.append(" ".join(list(json.loads(i).values())))
df['Genres'] = genres_cleaned

In [106]:
def clean_summary(text):
    text = re.sub("\'", "", text)
    text = re.sub("[^a-zA-Z]"," ",text)
    text = ' '.join(text.split())
    text = text.lower()
    return text

df['clean_summary'] = df['Summary'].apply(lambda x: clean_summary(x))


In [107]:
df.head(10)

Unnamed: 0,ID,BookTitle,Author,Genre,Summary,Genres,clean_summary
0,620,Animal Farm,George Orwell,"{""/m/016lj8"": ""Roman \u00e0 clef"", ""/m/06nbt"":...","Old Major, the old boar on the Manor Farm, ca...",Roman à clef Satire Children's literature Spec...,old major the old boar on the manor farm calls...
1,843,A Clockwork Orange,Anthony Burgess,"{""/m/06n90"": ""Science Fiction"", ""/m/0l67h"": ""N...","Alex, a teenager living in near-future Englan...",Science Fiction Novella Speculative fiction Ut...,alex a teenager living in near future england ...
2,986,The Plague,Albert Camus,"{""/m/02m4t"": ""Existentialism"", ""/m/02xlf"": ""Fi...",The text of The Plague is divided into five p...,Existentialism Fiction Absurdist fiction Novel,the text of the plague is divided into five pa...
4,2080,A Fire Upon the Deep,Vernor Vinge,"{""/m/03lrw"": ""Hard science fiction"", ""/m/06n90...",The novel posits that space around the Milky ...,Hard science fiction Science Fiction Speculati...,the novel posits that space around the milky w...
5,2152,All Quiet on the Western Front,Erich Maria Remarque,"{""/m/098tmk"": ""War novel"", ""/m/016lj8"": ""Roman...","The book tells the story of Paul Bäumer, a Ge...",War novel Roman à clef,the book tells the story of paul b umer a germ...
6,2890,A Wizard of Earthsea,Ursula K. Le Guin,"{""/m/0dwly"": ""Children's literature"", ""/m/01hm...","Ged is a young boy on Gont, one of the larger...",Children's literature Fantasy Speculative fict...,ged is a young boy on gont one of the larger i...
8,4081,Blade Runner 3: Replicant Night,K. W. Jeter,"{""/m/06n90"": ""Science Fiction"", ""/m/014dfn"": ""...","Living on Mars, Deckard is acting as a consul...",Science Fiction Speculative fiction,living on mars deckard is acting as a consulta...
9,4082,Blade Runner 2: The Edge of Human,K. W. Jeter,"{""/m/06n90"": ""Science Fiction"", ""/m/014dfn"": ""...",Beginning several months after the events in ...,Science Fiction Speculative fiction,beginning several months after the events in b...
15,4386,Book of Job,,"{""/m/02mdj1"": ""Religious text""}",The book of Job tells the story of an extreme...,Religious text,the book of job tells the story of an extremel...
20,6020,Crash,J. G. Ballard,"{""/m/014dfn"": ""Speculative fiction"", ""/m/02xlf...",The story is told through the eyes of narrato...,Speculative fiction Fiction Novel,the story is told through the eyes of narrator...


In [113]:
df['genres_clean_summary'] = df['Genres'] + " " + df['clean_summary']

In [114]:
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(df['genres_clean_summary'])

In [115]:
knn = NearestNeighbors(n_neighbors=5, algorithm='brute', metric='cosine')
knn.fit(X)

In [116]:
def recommend_books(book_title):
    book_index = df[df['BookTitle'] == book_title].index[0]
    distances, indices = knn.kneighbors(X[book_index], n_neighbors=6)

    print('Recommendations for', book_title)
    print('-----------------------------------')
    for i in range(1, len(distances.flatten())):
        recommended_book_title = df.loc[indices.flatten()[i], 'BookTitle']
        recommended_book_distance = distances.flatten()[i]
        print('{0}: {1}, with distance of {2}'.format(i, recommended_book_title, recommended_book_distance))

In [126]:
recommend_books("Harry Potter and the Order of the Phoenix")

Recommendations for Harry Potter and the Order of the Phoenix
-----------------------------------
1: The Falling Woman, with distance of 0.8844418249890911
2: Dust to Dust, with distance of 0.8856460067099035
3: The Strange World of Planet X, with distance of 0.8939669185922765
4: Journey by Moonlight, with distance of 0.8950291508670876
5: Deathstalker Destiny, with distance of 0.9014127384690566
