In [2]:
import pandas as pd
import numpy as np
import spacy
from spacy_fastlang import LanguageDetector
nlp = spacy.load('en_core_web_sm',disable=["ner"])
nlp.add_pipe("language_detector",  config={"threshold": 0.50, "default_language": "en"})
import os
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from sklearn.metrics.pairwise import linear_kernel 
from sklearn.preprocessing import StandardScaler
from scipy import spatial



In [4]:
data = pd.read_csv(os.path.join("..","data","raw","book_data.csv"), usecols = ['book_authors', 'book_desc',
       'book_pages', 'book_rating', 'book_rating_count', 'book_review_count',
       'book_title', 'genres',"book_format"])
data = data.dropna()
data = data[data["book_review_count"] >= 50]
data["book_pages"] = data.book_pages.str.replace(" pages", "")
data["book_pages"] = data.book_pages.str.replace(" page", "")
data["book_desc"] = data.book_desc.str.replace("\r", "")
data["book_desc"] = data.book_desc.str.replace("\n", "")
scaler = StandardScaler()
data[['book_pages', 'book_rating', 'book_rating_count', 'book_review_count']] = scaler.fit_transform(data[['book_pages', 'book_rating', 'book_rating_count', 'book_review_count']])
book_format = ['Hardcover', 'Paperback']
data = data[data.book_format.isin(book_format)]

In [5]:
data.head(60)

Unnamed: 0,book_authors,book_desc,book_format,book_pages,book_rating,book_rating_count,book_review_count,book_title,genres
0,Suzanne Collins,Winning will make you famous. Losing means cer...,Hardcover,0.078489,1.255539,21.54848,17.46639,The Hunger Games,Young Adult|Fiction|Science Fiction|Dystopia|F...
1,J.K. Rowling|Mary GrandPré,There is a door at the end of a silent corrido...,Paperback,2.050361,1.818092,7.815707,3.361315,Harry Potter and the Order of the Phoenix,Fantasy|Young Adult|Fiction
2,Harper Lee,The unforgettable novel of a childhood in a sl...,Paperback,-0.120288,1.030518,14.543218,8.473107,To Kill a Mockingbird,Classics|Fiction|Historical|Historical Fiction...
3,Jane Austen|Anna Quindlen|Mrs. Oliphant|George...,«È cosa ormai risaputa che a uno scapolo in po...,Paperback,-0.299188,0.955511,9.442793,5.69198,Pride and Prejudice,Classics|Fiction|Romance
4,Stephenie Meyer,About three things I was absolutely positive.F...,Paperback,0.571457,-1.557227,16.660156,10.525195,Twilight,Young Adult|Fantasy|Romance|Paranormal|Vampire...
5,Markus Zusak,Trying to make sense of the horrors of World W...,Hardcover,0.786137,1.36805,5.620219,10.838415,The Book Thief,Historical|Historical Fiction|Fiction|Young Adult
6,C.S. Lewis|Pauline Baynes,"Journeys to the end of the world, fantastic cr...",Paperback,1.64088,0.955511,1.482455,0.724402,The Chronicles of Narnia,Fantasy|Classics|Fiction|Young Adult|Childrens
7,George Orwell,مزرعة الحيوانات هي رائعة جورج أورويل الخالدة.....,Paperback,-0.923349,-0.357114,8.579797,4.345467,Animal Farm,Classics|Fiction|Science Fiction|Dystopia|Fant...
8,Margaret Mitchell,Gone with the Wind is a novel written by Marga...,Paperback,2.714277,1.105525,3.580759,1.611268,Gone with the Wind,Classics|Historical|Historical Fiction|Fiction...
10,John Green,Despite the tumor-shrinking medical miracle th...,Hardcover,-0.164019,0.918007,11.133071,15.979315,The Fault in Our Stars,Young Adult|Fiction|Romance|Contemporary


In [6]:
english = []
texts = []
for doc in tqdm(nlp.pipe(data.book_desc,n_process = -1)):
    english.append(doc._.language == 'en')
    texts.append(" ".join([token.lemma_ for token in doc]))

29517it [01:39, 297.83it/s]


In [7]:
data["text_processed"] = texts
data = data[english]
data = data.sort_values("book_review_count", ascending = False)
data = data.drop_duplicates(subset = ["book_title"])
data = data.reset_index(drop = True)
data["genres"] = data.genres.str.replace("|", ", ")

In [115]:
vectorizer = CountVectorizer()

In [129]:
vectorizer = CountVectorizer()
genre_vec = vectorizer.fit_transform((data["genres"]))

print(type(genre_vec))
np.save(os.path.join("..","data","processed","genre_vec.npy"), genre_vec)
genre_vec = np.load(os.path.join("..","data","processed","genre_vec.npy"), allow_pickle = True)
genre_vec = sparse.csr_matrix(genre_vec.all())

genre_v

<class 'scipy.sparse.csr.csr_matrix'>


In [101]:
tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 3), min_df=0, stop_words='english')

In [102]:
tfidf_vec = tf.fit_transform((data["text_processed"]))


In [None]:
from sklearn.metrics.pairwise import cosine_similarity
cos_sim = cosine_similarity(tfidf_book_id, tfidf_book_id)

In [None]:
cos_sim_tfidf = cos_sim

cos_sim

In [None]:

# Storing indices of the data
indices_genre = pd.Series(data.book_title)
  
def recommendations_genre(title, cosine_sim = cos_sim_genre, indices = indices_genre):
    index = indices[indices == title].index[0]
    similarity_scores = pd.Series(cosine_sim[index])
    return similarity_scores

recs_genre = recommendations_genre("The Fault in Our Stars")

# Storing indices of the data
indices = pd.Series(data.book_title)
  
def recommendations(title, cosine_sim = cos_sim):
    index = indices[indices == title].index[0]
    similarity_scores = pd.Series(cosine_sim[index])
    return similarity_scores

In [None]:
def recommend(title,df = data, cos_sim_genre = cos_sim_genre,cos_sim_tfidf = cos_sim):
    indices = pd.Series(df.book_title)
    index = indices[indices == title].index[0]
    similarity_scores_tfidf = pd.Series(cos_sim_tfidf[index])
    similarity_scores_genre = pd.Series(cos_sim_genre[index])
    df["tfidf_sim"] = similarity_scores_tfidf
    df["genres_sim"] = similarity_scores_genre
    scaler = StandardScaler()
    df[['tfidf_sim', 'genres_sim']] = scaler.fit_transform(df[['tfidf_sim', 'genres_sim']])
    
    df["rec_score"] = df.tfidf_sim*1+df.genres_sim*0.5+df.book_review_count*0.2+df.book_rating*0.3
    
    df = df[["book_title","book_authors","genres","book_rating_count","book_rating","rec_score"]]
    return df[df.book_title != title].sort_values("rec_score", ascending = False).head(20)

In [None]:
recommend("1984")


In [None]:
data = data.drop(["rec_score","tfidf_sim","genres_sim","text_processed", "recs_genre", "recs_score","similarity"], axis = 1)

In [None]:
data.to_csv(os.path.join("..","data","processed","rec_catalog.csv"),index = False)

In [None]:
pd.read_csv(os.path.join("..","data","processed","rec_catalog.csv"))

In [None]:
cos_sim_tfidf.shape

In [54]:
genre_vec

<23276x674 sparse matrix of type '<class 'numpy.int64'>'
	with 138851 stored elements in Compressed Sparse Row format>

In [53]:
np.load(os.path.join("..","data","processed","genre_vec.npy"))

ValueError: Object arrays cannot be loaded when allow_pickle=False

In [None]:
test = np.load(os.path.join("..","data","processed","cos_sim_genre.npz"))

In [None]:
result = 1 - spatial.distance.cosine(dataSetI, dataSetII)

In [23]:
cos_sim_genre

NameError: name 'cos_sim_genre' is not defined

In [22]:
a = np.random.randint(0,10,(5,5))
np.array([[5, 2, 0, 4, 1],
       [4, 2, 8, 2, 4],
       [9, 7, 4, 9, 7],
       [4, 6, 0, 1, 3],
       [1, 1, 2, 5, 0]])

from sklearn.metrics.pairwise import cosine_similarity
cosine_similarity(a[None,:,3] , a.T[:-1])

array([[0.67449166, 0.68316129, 0.63100845, 1.        ]])

In [64]:
np.save(os.path.join("..","data","processed","genre_vec.npy"), genre_vec)

In [130]:
authors_vec = np.load(os.path.join("..","data","processed","authors_vec.npy"),allow_pickle  = True)

In [51]:
data["test"] = cosine_similarity(tfidf_book_id , tfidf_book_id[0]) + cosine_similarity(tfidf_book_id , tfidf_book_id[0])

In [52]:
data.sort_values("test",ascending = False)

Unnamed: 0,book_authors,book_desc,book_format,book_pages,book_rating,book_rating_count,book_review_count,book_title,genres,text_processed,test
0,Suzanne Collins,Winning will make you famous. Losing means cer...,Paperback,0.078489,1.255539,21.561603,17.472588,The Hunger Games,"Young Adult, Fiction, Science Fiction, Dystopi...",win will make you famous . lose mean certain d...,2.000000
8,Suzanne Collins,MY NAME IS KATNISS EVERDEEN. WHY AM I NOT DEAD...,Paperback,0.142098,0.130433,7.696353,10.590384,Mockingjay,"Young Adult, Science Fiction, Dystopia, Fictio...",MY name be KATNISS EVERDEEN . why be I not dea...,0.238693
10,Suzanne Collins,"Against all odds, Katniss Everdeen and Peeta M...",Paperback,0.499897,1.105525,8.158029,9.749782,Catching Fire,"Young Adult, Science Fiction, Dystopia, Fictio...","against all odd , Katniss Everdeen and Peeta M...",0.233473
14407,Kate Egan,"The definitive, richly illustrated, full-color...",Hardcover,-0.645060,1.818092,-0.177976,-0.292400,The World of the Hunger Games,"Young Adult, Science Fiction, Dystopia","the definitive , richly illustrate , full - co...",0.181738
10634,Emily Seife,The New York Times bestselling Hunger Games is...,Paperback,-0.899495,1.518064,0.069080,-0.270043,The Hunger Games Tribute Guide,"Young Adult, Science Fiction, Dystopia",the New York Times bestselle Hunger Games be n...,0.143714
...,...,...,...,...,...,...,...,...,...,...,...
12394,Jack Campbell,Admiral Geary and Captain Desjani return to th...,Paperback,0.301120,0.130433,-0.201619,-0.282107,Dreadnaught,"Science Fiction, Space, Space Opera, War, Mili...",Admiral Geary and Captain Desjani return to th...,0.000000
22100,Plato|Nicholas P. White,A fluent and accurate new translation of the d...,Paperback,-0.835887,0.430461,-0.239363,-0.312987,Sophist,"Philosophy, Classics, Nonfiction, Literature, ...",a fluent and accurate new translation of the d...,0.000000
12384,Elmore Leonard,"Before there was Raylan, there was Sisco... U....",Paperback,0.054636,-0.244603,-0.209544,-0.281997,Out of Sight,"Fiction, Mystery, Crime, Mystery, Thriller, My...","before there be Raylan , there be Sisco ... U....",0.000000
6995,Robert McKee,Robert McKee's screenwriting workshops have ea...,Hardcover,0.444240,0.955511,-0.208739,-0.223891,"Story: Substance, Structure, Style, and the Pr...","Language, Writing, Nonfiction, Culture, Film",Robert McKee 's screenwrite workshop have earn...,0.000000


<1x2811889 sparse matrix of type '<class 'numpy.float64'>'
	with 105 stored elements in Compressed Sparse Row format>

In [89]:
data = pd.read_csv(os.path.join("..","data","processed","rec_catalog.csv"))
def recommend(title,df = data, genre_vec = genre_vec, tfidf_vec = tfidf_vec):
    indices = pd.Series(df.book_title)
    index = indices[indices == title].index[0]
    print(index)
    df["tfidf_sim"] = cosine_similarity(tfidf_vec , tfidf_vec[index])
    df["genres_sim"] = cosine_similarity(genre_vec , genre_vec[index])

    scaler = StandardScaler()
    df[['tfidf_sim', 'genres_sim']] = scaler.fit_transform(df[['tfidf_sim', 'genres_sim']])
    
    df["rec_score"] = df.tfidf_sim*1+df.genres_sim*0.5+df.book_review_count*0.2+df.book_rating*0.3
    
    df = df[["book_title","book_authors","genres","book_rating_count","book_rating","rec_score"]]
    return df[df.book_title != title].sort_values("rec_score", ascending = False).head(20)

In [90]:
recommend("1984")

41


IndexError: too many indices for array: array is 0-dimensional, but 1 were indexed

In [113]:
genre_vec = np.load(os.path.join("..","data","processed","genre_vec.npy"),allow_pickle  = True)

In [114]:
cosine_similarity(tfidf_vec , tfidf_vec[41])

IndexError: too many indices for array: array is 0-dimensional, but 1 were indexed

In [142]:
"hello|world".split("|")

['hello', 'world']

In [165]:
vectorizer = CountVectorizer(tokenizer = test)

In [166]:
vectorizer.fit_transform(data.book_authors)

<23276x14662 sparse matrix of type '<class 'numpy.int64'>'
	with 31432 stored elements in Compressed Sparse Row format>

<method 'split' of 'str' objects>


In [158]:
callable(str.split("|"))

False

In [167]:
def split_authors(string):
    vector =  string.split("|")
    return vector

In [164]:
callable(test)

True