In [1]:
import pandas as pd
import numpy as np

In [2]:
# These pickles were produced by the previous notebook (see BX-EDA).
# read_pickle can execute arbitrary code, so only load files you created yourself.
books_path = 'books_cleaned.pkl'
reviews_path = 'user_reviews.pkl'
books = pd.read_pickle(books_path)
user_reviews = pd.read_pickle(reviews_path)

Below is a quick reminder of what the two tables look like.

In [3]:
# Preview the first rows of the books table.
books.head()

Unnamed: 0,book_id,title,author,published,publisher,image_small,image_medium,image_large
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...
5,399135782,The Kitchen God's Wife,Amy Tan,1991,Putnam Pub Group,http://images.amazon.com/images/P/0399135782.0...,http://images.amazon.com/images/P/0399135782.0...,http://images.amazon.com/images/P/0399135782.0...


In [4]:
# Preview the first rows of the reviews table (user attributes plus book_id and score).
user_reviews.head()

Unnamed: 0,user_id,age,income_grp,economy,Longitude,Latitude,book_id,score
0,2,18.0,2,6,-64.803015,17.955006,195153448,0
1,20,19.0,2,6,-64.803015,17.955006,425163091,0
2,42,17.0,2,6,-64.803015,17.955006,553582747,7
3,51,34.0,2,6,-64.803015,17.955006,440225701,9
4,56,24.0,2,6,-64.803015,17.955006,671623249,7


In [5]:
# Print (rows, columns) for the reviews table first, then for the books table.
for frame in (user_reviews, books):
    print(frame.shape)

(596550, 8)
(182759, 8)


In [6]:
# Inner-join every review with the metadata of the reviewed book (key: book_id).
# user_reviews already holds the user attributes next to each rating.
combined = user_reviews.merge(books, on='book_id', how='inner')

In [7]:
# Preview the merged table: one row per review, with the book columns appended.
combined.head()

Unnamed: 0,user_id,age,income_grp,economy,Longitude,Latitude,book_id,score,title,author,published,publisher,image_small,image_medium,image_large
0,2,18.0,2,6,-64.803015,17.955006,195153448,0,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...
1,20,19.0,2,6,-64.803015,17.955006,425163091,0,Chocolate Jesus,Stephan Jaramillo,1998,Berkley Publishing Group,http://images.amazon.com/images/P/0425163091.0...,http://images.amazon.com/images/P/0425163091.0...,http://images.amazon.com/images/P/0425163091.0...
2,95703,31.0,2,6,-64.803015,17.955006,425163091,9,Chocolate Jesus,Stephan Jaramillo,1998,Berkley Publishing Group,http://images.amazon.com/images/P/0425163091.0...,http://images.amazon.com/images/P/0425163091.0...,http://images.amazon.com/images/P/0425163091.0...
3,198776,35.0,2,6,-64.803015,17.955006,425163091,0,Chocolate Jesus,Stephan Jaramillo,1998,Berkley Publishing Group,http://images.amazon.com/images/P/0425163091.0...,http://images.amazon.com/images/P/0425163091.0...,http://images.amazon.com/images/P/0425163091.0...
4,275970,46.0,2,6,-64.803015,17.955006,425163091,0,Chocolate Jesus,Stephan Jaramillo,1998,Berkley Publishing Group,http://images.amazon.com/images/P/0425163091.0...,http://images.amazon.com/images/P/0425163091.0...,http://images.amazon.com/images/P/0425163091.0...


For the recommendation algorithm we do not need some of the columns: the image URLs, the exact title, the author and the publisher. (Later on, I will try to extract some information from them that might be useful during recommendation.)

In [8]:
# Remove the three cover-image URL columns, which the recommendation step does not use.
# The result is rebound to `combined` rather than mutating the frame in place.
image_columns = ['image_small', 'image_medium', 'image_large']
combined = combined.drop(columns=image_columns)

In [9]:
# After much deliberation, every review with a score of 0 is removed: those rows
# are not considered reliable ratings.
has_real_rating = combined['score'].ne(0)
combined = combined.loc[has_real_rating].copy()

# Only about 200,000 of the 1,200,000 rows survived the cleaning (maybe that is too strict).

In [10]:
# Keep only the books that still appear in `combined`, i.e. books that have at
# least one non-zero rating left.
rated_book_ids = combined['book_id'].unique()
books = books.loc[books['book_id'].isin(rated_book_ids)].copy()

Instead of working directly with object-type columns such as author and publisher, I decided to replace each of them with the median rating score of that author or publisher.

In [11]:
# Lookup table (hash map): publisher -> median rating score over all reviews of
# that publisher's books.
by_publisher = dict(combined.groupby('publisher')['score'].median())

# Attach the publisher's median score to every row as a new feature column.
# The column keeps its "avg_" name for compatibility, although the statistic is a median.
#
# Series.map performs the dictionary lookup in a single vectorised pass and yields
# NaN when the publisher is missing. Indexing the dict row by row would raise
# KeyError in that case, because groupby leaves NaN keys out of its result.
combined['avg_publisher_score'] = combined['publisher'].map(by_publisher)

In [12]:
# Same feature for authors: author -> median rating score over all reviews of
# that author's books.
by_author = dict(combined.groupby('author')['score'].median())

# The column keeps its "avg_" name for compatibility, although the statistic is a median.
# Series.map is vectorised and returns NaN for a missing author, where a per-row
# dict lookup would raise KeyError (groupby drops NaN keys).
combined['avg_author_score'] = combined['author'].map(by_author)

In [13]:
# Author, publisher and title are needed again after the model-building part, so
# they stay in `combined`; the modelling frame `df` is a copy without those columns.
text_columns = ['title', 'author', 'publisher']
df = combined.drop(columns=text_columns).copy()

In [14]:
# This is what the final df looks like (last six rows).
df.tail(6)

Unnamed: 0,user_id,age,income_grp,economy,Longitude,Latitude,book_id,score,published,avg_publisher_score,avg_author_score
531243,121030,49.0,3,6,11.788629,-0.5866,2264031476,6,2001,7.0,7.0
531244,121030,49.0,3,6,11.788629,-0.5866,2264032405,7,2001,7.0,7.0
531245,140933,30.0,2,2,7.406277,43.752746,2226109501,8,1999,8.0,7.5
531247,247624,26.0,5,7,32.36908,1.274693,974433926,7,2004,7.0,7.0
531251,209389,34.0,5,7,46.704737,-19.371896,2253053287,9,1990,8.0,7.0
531252,209389,34.0,5,7,46.704737,-19.371896,2253097888,7,1994,8.0,7.0


In [15]:
# Number of distinct users left in df.
df['user_id'].nunique()

29331

In [16]:
# (rows, columns) of df after removing the zero-score reviews.
df.shape

(193058, 11)

In [17]:
# Number of distinct books left in df.
df['book_id'].nunique()

93700

For now, let's take one step back and create a sparse matrix with one row per unique user, one column per unique book id, and the user's rating of that book as the value.

First, I am going to remove the data that I can't trust: books that were rated by very few people, and users who rated only a few books.

In [18]:
# Ratings received per book, most-rated first; books with fewer than
# `min_ratings_per_book` ratings are discarded.
min_ratings_per_book = 5
by_book = (
    df.groupby('book_id')['user_id']
    .count()
    .sort_values(ascending=False)
    .reset_index(name='count')
)
by_book = by_book[by_book['count'] >= min_ratings_per_book]


In [19]:
# Ratings given per user, most active first; users with fewer than
# `min_ratings_per_user` ratings are discarded.
min_ratings_per_user = 10
by_user = (
    df.groupby('user_id')['book_id']
    .count()
    .sort_values(ascending=False)
    .reset_index(name='count')
)
by_user = by_user[by_user['count'] >= min_ratings_per_user]

In [20]:
# Keep a review only when both its user and its book passed the activity thresholds.
from_active_user = df['user_id'].isin(by_user['user_id'])
for_popular_book = df['book_id'].isin(by_book['book_id'])
df = df.loc[from_active_user & for_popular_book].copy()

In [21]:
# Wide user x book table of scores. Duplicate (user, book) pairs are averaged
# ('mean' is pivot_table's default, spelled out here); pairs without a rating become 0.
user_book_pivot = df.pivot_table(
    index='user_id',
    columns='book_id',
    values='score',
    aggfunc='mean',
).fillna(0)

In [22]:
from scipy.sparse import csr_matrix

# Compressed-sparse-row copy of the pivot values (same row and column order as the pivot).
user_book_sparse = csr_matrix(user_book_pivot.to_numpy())

In [23]:
# Sanity check: expand the first user's row of the sparse matrix back to a dense matrix.
user_book_sparse[0].todense()

matrix([[0., 0., 0., ..., 0., 0., 0.]])

In [24]:
from sklearn.neighbors import NearestNeighbors

In [25]:
# Nearest-neighbour index created without arguments, i.e. with the library defaults.
# NOTE(review): the defaults are believed to be 5 neighbours and Euclidean distance;
# cosine distance is often preferred for sparse rating vectors — confirm this is intended.
neighbors = NearestNeighbors()

In [26]:
# Fit the index on the sparse user x book matrix, so each user row is one sample.
neighbors.fit(user_book_sparse)

NearestNeighbors()

In [27]:
# Query the fitted index with the user stored at row position 666 of the pivot.
# Selecting with a one-element list keeps the row two-dimensional: shape (1, n_books).
query_vector = user_book_pivot.iloc[[666]].to_numpy()
distances, suggestions = neighbors.kneighbors(query_vector)

In [28]:
# Distances from the query user to each returned neighbour.
distances

array([[ 0.        , 51.30302135, 51.7107339 , 51.75905718, 51.90375709]])

In [29]:
# Row positions (in user_book_pivot) of the returned neighbours.
suggestions

array([[ 666, 2054, 1057,  234, 2094]], dtype=int64)

In [30]:
# Translate the neighbours' row positions back into user_id labels,
# printing one line per query row.
for neighbor_positions in suggestions:
    print(user_book_pivot.index[neighbor_positions])

Int64Index([55892, 165759, 86680, 19969, 168464], dtype='int64', name='user_id')


In [31]:
# Ratings of the user at row position 54 as a one-column frame indexed by book_id.
# NOTE(review): `ff` is not used in any of the cells visible here.
ff = user_book_pivot.iloc[54].to_frame()

In [32]:
# Preview the user x book pivot (rows: user_id, columns: book_id, values: score or 0).
user_book_pivot.head()

book_id,0006485200,0006551971,0006742939,0007110928,0007122039,0007154615,000716226X,0007170866,0020198817,0020198906,...,849550152X,8495618605,8806142100,880781210X,9074336329,950491036X,9580464162,9681501225,9726101794,9871138148
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
242,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
254,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
388,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
505,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
643,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [33]:
# Tally of the books rated by the neighbours:
# book_id -> [number of neighbours who rated it, sum of their scores].
recommended_books = {}

# Row position (in user_book_pivot) of the user we are recommending for.
curr = 666

# kneighbors returns one row of neighbour positions per query row; flatten it to 1-D.
# np.ravel leaves an already flat array unchanged, so this cell can be re-run safely.
# Taking `suggestions[0]` a second time would reduce the flat array to its first element.
suggestions = np.ravel(suggestions)

# The query user comes back as its own nearest neighbour; leave it out.
suggestions = suggestions[suggestions != curr]

In [34]:
# Neighbour row positions after removing the query user itself.
suggestions

array([2054, 1057,  234, 2094], dtype=int64)

In [35]:


# Start from an empty tally so that re-running this cell does not double-count.
recommended_books = {}

for i in suggestions:
    # Scores given by the neighbour at row position `i`, indexed by book_id.
    neighbor_scores = user_book_pivot.iloc[i]

    # The pivot stores 0 where no rating exists, so only positive scores count.
    rated = neighbor_scores[neighbor_scores > 0]

    # Walk the (book_id, score) pairs directly instead of re-filtering a frame
    # once per book.
    for book, score in zip(rated.index, rated.to_numpy()):
        # entry[0]: number of neighbours who rated the book
        # entry[1]: sum of the scores those neighbours gave it
        entry = recommended_books.setdefault(book, [0, 0.0])
        entry[0] += 1
        entry[1] += score
    
    

In [36]:
# Resulting tally: book_id -> [number of neighbours who rated it, sum of their scores].
recommended_books 

{'044022165X': [1, 5.0],
 '0345402871': [1, 6.0],
 '044023722X': [1, 5.0],
 '034540288X': [1, 3.0],
 '0446532231': [1, 7.0],
 '0812548051': [1, 8.0],
 '0064407667': [1, 10.0],
 '0064407683': [1, 7.0],
 '0380709562': [1, 7.0],
 '0440407524': [1, 10.0],
 '0842321942': [1, 5.0]}