In [1]:
# https://github.com/nikitaa30/Content-based-Recommender-System/blob/master/recommender_system.py
# https://heartbeat.fritz.ai/recommender-systems-with-python-part-i-content-based-filtering-5df4940bd831
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

In [3]:
# Load the grouped reviews table from CSV into `ds` and preview its first 15 rows.
ds = pd.read_csv('Data/reviews10k_grouped_full.csv')
ds.head(15)

Unnamed: 0,book_id,goodreads_book_id,best_book_id,work_id,books_count,isbn,isbn13,original_publication_year,title,language_code,...,work_ratings_count,work_text_reviews_count,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5,authors_y,original_title_y,review_text
0,1,2767052,2767052,2792775,272,439023483,9780439000000.0,2008.0,"The Hunger Games (The Hunger Games, #1)",eng,...,4942365,155254,66715,127936,560092,1481305,2706317,Suzanne Collins,The Hunger Games,I cracked and finally picked this up. Very enj...
1,2,3,3,4640799,491,439554934,9780440000000.0,1997.0,Harry Potter and the Sorcerer's Stone (Harry P...,eng,...,4800065,75867,75504,101676,455024,1156318,3011543,"J.K. Rowling, Mary GrandPré",Harry Potter and the Philosopher's Stone,Tuve el gusto de leerlo antes de que saliera l...
2,3,41865,41865,3212258,226,316015849,9780316000000.0,2005.0,"Twilight (Twilight, #1)",en-US,...,3916824,95009,456191,436802,793319,875073,1355439,Stephenie Meyer,Twilight,"If I was 15, I would have LOVED this. It wasn'..."
3,4,2657,2657,3275794,487,61120081,9780061000000.0,1960.0,To Kill a Mockingbird,eng,...,3340896,72586,60427,117415,446835,1001952,1714267,Harper Lee,To Kill a Mockingbird,"Still relevant and beautiful.,R.I.P. Ms. Harpe..."
4,5,4671,4671,245494,1356,743273567,9780743000000.0,1925.0,The Great Gatsby,eng,...,2773745,51992,86236,197621,606158,936012,947718,F. Scott Fitzgerald,The Great Gatsby,I read this classic twice in high school. Its ...
5,6,11870085,11870085,16827462,226,525478817,9780525000000.0,2012.0,The Fault in Our Stars,eng,...,2478609,140739,47994,92723,327550,698471,1311871,John Green,The Fault in Our Stars,So.. well I know everyone's obsessed with this...
6,7,5907,5907,1540236,969,618260307,9780618000000.0,1937.0,The Hobbit,en-US,...,2196809,37653,46023,76784,288649,665635,1119718,J.R.R. Tolkien,The Hobbit or There and Back Again,Couldn't finish it. I was 5/6 of the way done ...
7,8,5107,5107,3036731,360,316769177,9780317000000.0,1951.0,The Catcher in the Rye,eng,...,2120637,44920,109383,185520,455042,661516,709176,J.D. Salinger,The Catcher in the Rye,"Certainly not his best, but it's Salinger so I..."
8,9,960,960,3338963,311,1416524797,9781417000000.0,2000.0,"Angels & Demons (Robert Langdon, #1)",en-CA,...,2078754,25112,77841,145740,458429,716569,680175,Dan Brown,Angels & Demons,"A quick read, and probably better than The Da ..."
9,10,1885,1885,3060926,3455,679783261,9780680000000.0,1813.0,Pride and Prejudice,eng,...,2191465,49152,54700,86485,284852,609755,1155673,Jane Austen,Pride and Prejudice,This great classic's color paints vivid pictur...


In [None]:
# TF-IDF features over uni-, bi- and tri-grams of each row's review text, with
# scikit-learn's built-in English stop-word list removed.
# min_df = 1 keeps every term, exactly like the integer 0 used before, but is
# accepted by scikit-learn's parameter validation (integer values must be >= 1).
tf = TfidfVectorizer(ngram_range = (1, 3), min_df = 1, stop_words = 'english')
# Missing reviews are treated as empty documents so the vectorizer only sees strings.
tfidf_matrix = tf.fit_transform(ds['review_text'].fillna(''))

In [None]:
# Dense pairwise similarity matrix between all rows of tfidf_matrix.
# TfidfVectorizer L2-normalises rows by default, so the dot product computed by
# linear_kernel is the cosine similarity.
cosine_similarities = linear_kernel(tfidf_matrix, tfidf_matrix)

# Neighbours stored per book (same count as before: the top 99 minus the book itself).
N_SIMILAR = 98

# Plain array of ids, so the positions returned by argsort are always used
# positionally and never confused with DataFrame index labels.
book_id_by_pos = ds['goodreads_book_id'].to_numpy()

# goodreads_book_id -> list of (similarity score, goodreads_book_id), best match first.
results = {}

for pos, book_id in enumerate(book_id_by_pos):
    row_scores = cosine_similarities[pos]
    # Take one extra candidate so the book itself can be removed explicitly;
    # with tied scores the book is not guaranteed to be the first hit.
    candidates = row_scores.argsort()[::-1][:N_SIMILAR + 1]
    neighbours = [i for i in candidates if i != pos][:N_SIMILAR]
    results[book_id] = [(row_scores[i], book_id_by_pos[i]) for i in neighbours]

print('done!')

def item(id):
    """Return the title of the book whose goodreads_book_id equals `id`.

    The earlier version returned the start of the book's review text (everything
    before the first ' - '), so recommendations were printed as review snippets
    instead of book names.

    Args:
        id: goodreads_book_id to look up in the module-level `ds` frame.

    Returns:
        The `title` value of the first matching row, as a string.

    Raises:
        IndexError: if no row of `ds` has this goodreads_book_id.
    """
    matching_titles = ds.loc[ds['goodreads_book_id'] == id, 'title']
    # str() keeps the string concatenation in recommend() safe for non-string cells.
    return str(matching_titles.iloc[0])

def recommend(book_id, num):
    """Print the first `num` entries stored in `results` for `book_id`.

    Each entry of `results[book_id]` is a (score, goodreads_book_id) pair; the
    book is rendered through `item()` and the score is printed next to it.
    Nothing is returned.
    """
    heading = ''.join(('Recommending ', str(num), ' books similar to ', item(book_id), '...'))
    print(heading)
    print('-------')
    for score, similar_id in results[book_id][:num]:
        line = ''.join(('Recommended: ', item(similar_id), ' (score:', str(score), ')'))
        print(line)
        
# Example run: print the 5 best matches for goodreads_book_id 2767052
# (The Hunger Games in the preview table above).
recommend(book_id = 2767052, num = 5)

## Building NLP Content-based RS
##### https://medium.com/@armandj.olivares/building-nlp-content-based-recommender-systems-b104a709c042


### Clean Text

In [None]:
import pandas as pd
import numpy as np
import nltk

# Download the NLTK data packages requested by this notebook.
# NOTE(review): no visible cell uses the 'averaged_perceptron_tagger' package — confirm it is still needed.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

from nltk.corpus import stopwords
import re
import string
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize
from nltk.corpus import stopwords
# English stop words from NLTK: `stop` is the list as returned, `stop_words_` a set of
# the same words, which black_txt uses for its membership test.
stop = stopwords.words('english')
stop_words_ = set(stop)
# Lemmatizer instance shared by clean_txt.
wn = WordNetLemmatizer()


In [None]:
def black_txt(token):
    """Return True when `token` should be kept.

    A token is kept only if it is not in `stop_words_`, is not a single
    punctuation character, and is longer than two characters.
    """
    if token in stop_words_:
        return False
    if token in list(string.punctuation):
        return False
    return len(token) > 2
  
def clean_txt(text):
  """Normalise a raw review string into space-separated lemmas.

  Steps: remove apostrophes, replace every run of digits / non-word characters
  with one space, remove the literal "nbsp", lower-case and tokenize, keep the
  tokens accepted by black_txt, lemmatize them as verbs, then filter the lemmas
  through black_txt once more before joining them with single spaces.
  """
  without_quotes = re.sub("'", "", text)
  collapsed = re.sub("(\\d|\\W)+", " ", without_quotes)
  collapsed = collapsed.replace("nbsp", "")
  lemmas = []
  for word in word_tokenize(collapsed.lower()):
    if black_txt(word):
      lemmas.append(wn.lemmatize(word, pos="v"))
  # Second pass: the filter is applied again to the lemmatized forms.
  return " ".join(filter(black_txt, lemmas))

In [None]:
# Rebind `ds` to the reviews table used by this section.
# NOTE(review): this path differs from 'Data/reviews10k_grouped_full.csv', which is read
# at the top of the notebook — confirm which file is intended.
ds = pd.read_csv('reviews_10k_grouped.csv')

In [None]:
# Coerce every review to str and normalise it with clean_txt; the column is overwritten.
ds['review_text'] = ds['review_text'].map(lambda raw_review: clean_txt(str(raw_review)))
ds.head()

### Extract features from text

In [None]:
# for tf-idf
from sklearn.feature_extraction.text import TfidfVectorizer
# Fit a default-configured TF-IDF model on the review_text column and keep the
# resulting document-term matrix (one row per row of ds).
tfidf_vectorizer = TfidfVectorizer()
review_corpus = ds['review_text']
tfidf_rev = tfidf_vectorizer.fit_transform(review_corpus)
tfidf_rev

In [None]:
# for CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer
# Fit a default-configured bag-of-words model on the review_text column and keep the
# resulting document-term count matrix (one row per row of ds).
count_vectorizer = CountVectorizer()
review_corpus = ds['review_text']
count_rev = count_vectorizer.fit_transform(review_corpus)
count_rev

### Evaluating the recommendations

In [None]:
# Title of the book the reader has already read; it is matched against `original_title`.
g = "The Hunger Games"
# Positions of the rows whose title equals `g`; the first one is used
# (IndexError if there is no match).
matching_positions, = np.where(ds['original_title'] == g)
index = matching_positions[0]
# Double brackets keep the selection a one-row DataFrame rather than a Series.
read_book = ds.iloc[[index]]
read_book
# Alternative lookup by id instead of title:
# read_book = ds.loc[ds['goodreads_book_id'] == g]

### Content based RS with tfidf

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
# Vectorize the read book with the already-fitted TF-IDF model.
book_tfidf = tfidf_vectorizer.transform(read_book['review_text'])
# Lazily score the read book against each row of tfidf_rev in turn, then
# materialise the per-row results into `output` (indexed by row position).
cos_similarity_tfidf = (cosine_similarity(book_tfidf, review_row) for review_row in tfidf_rev)
output = list(cos_similarity_tfidf)

### Content based RS with CountVectorizer

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
# Vectorize the read book with the already-fitted CountVectorizer.
book_count = count_vectorizer.transform(read_book['review_text'])
# Lazily score the read book against each row of count_rev in turn, then
# materialise the per-row results into `output2` (indexed by row position).
cos_similarity_countv = (cosine_similarity(book_count, review_row) for review_row in count_rev)
output2 = list(cos_similarity_countv)

### Top recommendations

In [None]:
def get_recommendation(top, ds, scores):
  """Build a table describing the recommended books.

  The earlier version read ids from an undefined `df_all` frame and filled every
  title with the global query title `g`; ids and titles now both come from the
  `ds` argument, so each row describes the recommended book itself.

  Args:
    top: row positions in `ds` of the recommended books, best first.
    ds: DataFrame with 'goodreads_book_id' and 'original_title' columns.
    scores: similarity scores, where scores[k] belongs to top[k].

  Returns:
    DataFrame with columns 'goodreads_book_id', 'original_title' and 'score',
    one row per entry of `top`, in the same order.
  """
  columns = ['goodreads_book_id', 'original_title', 'score']
  book_ids = ds['goodreads_book_id']
  titles = ds['original_title']
  # `top` holds positions (it is built from range(len(...))), hence .iloc.
  rows = [
      (book_ids.iloc[pos], titles.iloc[pos], score)
      for pos, score in zip(top, scores)
  ]
  # Built in one go instead of growing the frame cell by cell.
  return pd.DataFrame(rows, columns=columns)

In [None]:
# for tfidf
# Rank row positions by their TF-IDF similarity to the read book and keep the best ten.
# The key reads `output` (TF-IDF scores); it previously read `output2`, which made this
# cell rank by the CountVectorizer scores while reporting TF-IDF scores.
top = sorted(range(len(output)), key=lambda i: output[i][0][0], reverse=True)[:10]
list_scores = [output[i][0][0] for i in top]
get_recommendation(top, ds, list_scores)

In [None]:
# CountVectorizer ranking: row positions ordered by their entry in output2
# (highest first), cut to the ten best.
ranked_positions = sorted(range(len(output2)), key=lambda pos: output2[pos], reverse=True)
top = ranked_positions[:10]
# Unwrap the scalar score from each per-row similarity result.
list_scores = [output2[pos][0][0] for pos in top]
get_recommendation(top, ds, list_scores)

In [None]:
# build a wordcloud

from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

In [None]:
# Join the review texts (at most the first 10,000 rows) into one string for the cloud.
# `ds.review_text` is a Series and cannot be called, so the rows are selected with head().
bunch_text = " ".join(ds['review_text'].head(10000).astype(str))
# Separate name so the nltk `stopwords` corpus imported earlier is not shadowed by this set.
wordcloud_stopwords = set(STOPWORDS)
wordcloud = WordCloud(stopwords=wordcloud_stopwords, background_color="white", colormap= "magma").generate(bunch_text)

In [None]:
# Draw the generated word cloud on an 11x11 figure with the axes hidden.
# NOTE(review): `plt` is not imported in any visible cell; on a fresh kernel this cell
# needs `import matplotlib.pyplot as plt` to run.
plt.figure(figsize=[11,11])
plt.imshow(wordcloud, interpolation="sinc")
plt.axis("off")
plt.show()

## How to build a content-based RS with Natural Language Processing
##### https://towardsdatascience.com/how-to-build-from-scratch-a-content-based-movie-recommender-with-natural-language-processing-25ad400eb243

In [None]:
import pandas as pd
from rake_nltk import Rake
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer

# Allow up to 100 columns in DataFrame displays, then load the reviews table into `df`
# and preview its first rows.
pd.set_option('display.max_columns', 100)
df = pd.read_csv('reviews_10k_grouped.csv')
df.head()

In [None]:
# Build a `Key_words` column holding the RAKE key words of each review as one
# space-separated string (the text format CountVectorizer consumes).
# The values are collected in a list and assigned once, because writing to the
# `row` yielded by iterrows() only changes a copy and never reaches `df`.

# instantiating Rake once; by default it uses english stopwords from NLTK
# and discards all punctuation characters as well
r = Rake()

key_words = []
for review in df['review_text']:
    # non-string cells (e.g. missing reviews) yield no key words
    text = review if isinstance(review, str) else ""

    # extracting the words by passing the text
    r.extract_keywords_from_text(text)

    # get_word_degrees() maps each key word to its score; only the words are kept
    key_words.append(" ".join(r.get_word_degrees().keys()))

df['Key_words'] = key_words

# dropping the review_text column
df = df.drop(columns = ['review_text'])
df.head()

In [None]:
# Bag-of-words count matrix built from the Key_words column.
count = CountVectorizer()
count_matrix = count.fit_transform(df['Key_words'])

# Pairwise cosine similarity between all rows of the count matrix; when the second
# argument is omitted, cosine_similarity compares the matrix with itself.
cosine_sim = cosine_similarity(count_matrix)

In [None]:
# Series of goodreads_book_id values in row order; the function below uses it to
# translate between book ids and row positions of the similarity matrix.
book_ids = pd.Series(df.goodreads_book_id)

def recommendations(book_id, cosine_sim = cosine_sim):
    """Return the goodreads_book_id of the 10 books most similar to `book_id`.

    The earlier version looked the book up through the undefined names `indices`
    and `title`, and returned row labels rather than book ids.

    Args:
        book_id: goodreads_book_id of the query book.
        cosine_sim: square similarity matrix whose rows and columns follow the
            row order of `book_ids`.

    Returns:
        List of up to 10 goodreads_book_id values, most similar first, never
        including `book_id` itself.

    Raises:
        KeyError: if `book_id` does not occur in `book_ids`.
    """
    # getting the row position of the book that matches the id
    matching_positions, = np.where(book_ids.to_numpy() == book_id)
    if matching_positions.size == 0:
        raise KeyError(book_id)
    idx = int(matching_positions[0])

    # creating a Series with the similarity scores in descending order;
    # its index labels are the row positions of the compared books
    score_series = pd.Series(cosine_sim[idx]).sort_values(ascending = False)

    # getting the positions of the 10 most similar books, removing the query book
    # explicitly instead of assuming it always sorts first
    top_10_indexes = list(score_series.drop(idx).index[:10])

    # translating the positions back to goodreads_book_id values
    recommended_books = book_ids.iloc[top_10_indexes].tolist()

    return recommended_books