In [51]:
# Import the libraries used by the analysis cells below.
import pandas as pd
import numpy as np
# Fixed: the function is `train_test_split`; `train_test_split_test_split` does
# not exist in sklearn.model_selection, so this cell failed on a fresh kernel
# and the later split cell had no `train_test_split` to call.
from sklearn.model_selection import train_test_split

####  Read the books dataset and explore it


In [2]:
# Load the books catalogue from CSV, then preview its first rows.
books = pd.read_csv(filepath_or_buffer="BX-Books.csv", encoding="utf-8")
books.head()

Unnamed: 0,isbn,book_title,book_author,year_of_publication,publisher
0,312956762,Die Hard: With a Vengeance - A Novel,D. Chiel,1995,St Martins Pr
1,034545006X,The Taking,J. D. Landis,2003,Ballantine Books
2,1579651372,Ruskin's Rose: A Venetian Love Story,Mimma Balia,2000,Artisan Publishers
3,416652700,The Pooh cook book: inspired by \Winnie-the-Po...,";Katie Stewart""",1971,Methuen
4,1853114103,Lord Have Mercy,"\\""Ron\""""",2001,Canterbury Press


In [3]:
# Inspect the last rows of the books frame.
books.tail()

Unnamed: 0,isbn,book_title,book_author,year_of_publication,publisher
271353,1573222623,Drinking Coffee Elsewhere (Today Show Book Clu...,ZZ Packer,2003,Riverhead Books
271354,1573222348,Drinking Coffee Elsewhere (Alex Awards (Awards)),ZZ Packer,2003,Riverhead Books
271355,1573223786,Drinking Coffee Elsewhere,Zz Packer,2004,Riverhead Books
271356,9627982032,The Credit Suisse Guide to Managing Your Perso...,,1995,Edinburgh Financial Publishing
271357,751352497,A+ Quiz Masters:01 Earth,,1999,Dorling Kindersley


In [4]:
# Summarise the column dtypes and non-null counts of the books frame.
books.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 271358 entries, 0 to 271357
Data columns (total 5 columns):
 #   Column               Non-Null Count   Dtype 
---  ------               --------------   ----- 
 0   isbn                 271358 non-null  object
 1   book_title           271358 non-null  object
 2   book_author          271356 non-null  object
 3   year_of_publication  271358 non-null  int64 
 4   publisher            271356 non-null  object
dtypes: int64(1), object(4)
memory usage: 10.4+ MB


#### Clean up NaN values

In [5]:
# Show every book record that has at least one missing field.
has_missing_field = books.isna().any(axis="columns")
books[has_missing_field]

Unnamed: 0,isbn,book_title,book_author,year_of_publication,publisher
70531,193169656X,Tyrant Moon,Elaine Corvidae,2002,
159822,1931696993,Finders Keepers,Linnea Sinclair,2001,
271356,9627982032,The Credit Suisse Guide to Managing Your Perso...,,1995,Edinburgh Financial Publishing
271357,751352497,A+ Quiz Masters:01 Earth,,1999,Dorling Kindersley


In [6]:
# Discard every row that has a missing value in any column.
books = books.dropna(axis="index", how="any")

In [7]:
# 4 records (the rows containing null values) have been dropped.
books.shape

(271354, 5)

#### Read the data where ratings are given by users



In [8]:
# Load the user ratings; read as latin-1 because utf-8 decoding doesn't work for this file.
ratings = pd.read_csv(filepath_or_buffer="BX-Book-Ratings.csv", encoding="latin-1")

In [9]:
# Preview the first rows of the ratings frame.
ratings.head()


Unnamed: 0,user_id,isbn,rating
0,276725,034545104X,0
1,276726,155061224,5
2,276727,446520802,0
3,276729,052165615X,3
4,276729,521795028,6


In [10]:
# Summarise the column dtypes and non-null counts of the ratings frame.
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1048575 entries, 0 to 1048574
Data columns (total 3 columns):
 #   Column   Non-Null Count    Dtype 
---  ------   --------------    ----- 
 0   user_id  1048575 non-null  int64 
 1   isbn     1048575 non-null  object
 2   rating   1048575 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 24.0+ MB


In [13]:
# Keep only the first 10,000 ratings; the full dataset caused an Out of Memory error.
ratings = ratings.iloc[:10000]


In [17]:
# Preview the first rows of the truncated ratings frame.
ratings.head()

Unnamed: 0,user_id,isbn,rating
0,276725,034545104X,0
1,276726,155061224,5
2,276727,446520802,0
3,276729,052165615X,3
4,276729,521795028,6


In [18]:
# Attach the book metadata to each rating by joining on the shared `isbn` column.
df_merged = ratings.merge(books, on="isbn")

In [20]:
# Preview the merged ratings + book metadata frame.
df_merged.head()

Unnamed: 0,user_id,isbn,rating,book_title,book_author,year_of_publication,publisher
0,276725,034545104X,0,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books
1,276726,155061224,5,Rites of Passage,Judith Rae,2001,Heinle
2,276727,446520802,0,The Notebook,Nicholas Sparks,1996,Warner Books
3,278418,446520802,0,The Notebook,Nicholas Sparks,1996,Warner Books
4,276729,052165615X,3,Help!: Level 1,Philip Prowse,1999,Cambridge University Press


In [21]:
# Number of distinct users and distinct books present after the merge.
df_merged["user_id"].nunique(), df_merged["isbn"].nunique()

(828, 8051)


* Convert the ISBN values to numeric ids, assigned in a consistent order

* Convert the user_id values to numeric ids, assigned in a consistent order

* Map both user_id and ISBN to sequential ids, i.e., 0 ... n-1


In [22]:
# Distinct user ids and ISBNs found in the merged frame.
unique_users_list = pd.unique(df_merged["user_id"])
unique_books_list = pd.unique(df_merged["isbn"])


In [28]:
# Give each distinct user / book a sequential index from 0 to n-1.
unique_users_dict = dict(zip(unique_users_list, range(len(unique_users_list))))
unique_books_dict = dict(zip(unique_books_list, range(len(unique_books_list))))

In [37]:
# Spot-check the sequential index assigned to one user id.
unique_users_dict[276726]

1

In [45]:
# Lookup helpers that translate raw identifiers into sequential indices.
def mapping_users(key):
    """Return the sequential index assigned to the user id ``key``.

    The value is read from the module-level ``unique_users_dict``; a key that
    is not in the dictionary raises ``KeyError``.
    """
    ordered_index = unique_users_dict[key]
    return ordered_index
def mapping_books(key):
    """Return the sequential index assigned to the ISBN ``key``.

    The value is read from the module-level ``unique_books_dict``; a key that
    is not in the dictionary raises ``KeyError``.
    """
    ordered_index = unique_books_dict[key]
    return ordered_index
    

In [46]:
# Store the sequential ids next to the raw identifiers.
df_merged["ordered_user_id"] = df_merged["user_id"].map(mapping_users)
df_merged["ordered_book_id"] = df_merged["isbn"].map(mapping_books)

In [48]:
# Preview the merged frame with the new ordered id columns.
df_merged.head()

Unnamed: 0,user_id,isbn,rating,book_title,book_author,year_of_publication,publisher,ordered_user_id,ordered_book_id
0,276725,034545104X,0,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,0,0
1,276726,155061224,5,Rites of Passage,Judith Rae,2001,Heinle,1,1
2,276727,446520802,0,The Notebook,Nicholas Sparks,1996,Warner Books,2,2
3,278418,446520802,0,The Notebook,Nicholas Sparks,1996,Warner Books,3,2
4,276729,052165615X,3,Help!: Level 1,Philip Prowse,1999,Cambridge University Press,4,3


Re-index the columns to build a matrix

In [49]:
# Keep the sequential ids first, then the rating and the book metadata;
# the raw `user_id` and `isbn` columns are not in the list and are dropped.
matrix_columns = [
    "ordered_user_id",
    "ordered_book_id",
    "rating",
    "book_title",
    "book_author",
    "year_of_publication",
    "publisher",
]
df_merged = df_merged.reindex(columns=matrix_columns)

In [50]:
# Preview the frame after re-ordering its columns.
df_merged.head()

Unnamed: 0,ordered_user_id,ordered_book_id,rating,book_title,book_author,year_of_publication,publisher
0,0,0,0,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books
1,1,1,5,Rites of Passage,Judith Rae,2001,Heinle
2,2,2,0,The Notebook,Nicholas Sparks,1996,Warner Books
3,3,2,0,The Notebook,Nicholas Sparks,1996,Warner Books
4,4,3,3,Help!: Level 1,Philip Prowse,1999,Cambridge University Press


Split your data into two sets (training and testing)



In [52]:
# Hold out 20% of the rows for testing. A fixed random_state makes the shuffle
# reproducible, so re-running the notebook yields the same train/test partition.
df_train, df_test = train_test_split(df_merged, test_size=0.2, random_state=42)

Unnamed: 0,ordered_user_id,ordered_book_id,rating,book_title,book_author,year_of_publication,publisher
1266,268,1022,8,Why I Am a Catholic,Garry Wills,2002,Houghton Mifflin Company
6595,3,5968,0,Dinosaurs and More Dinosaurs,M. Jean Craig,1986,Scholastic Paperbacks (Mm)
6145,3,5524,0,Grievous Sin (Peter Decker &amp; Rina Lazarus ...,Faye Kellerman,1994,Fawcett Books
3218,533,2692,0,Agent 146: The True Story of a Nazi Spy in Ame...,Erich Gimpel,2003,Berkley Publishing Group
7681,3,7048,0,The Monday to Friday Cookbook,Michele Urvater,1991,Workman Publishing
...,...,...,...,...,...,...,...
2222,362,1827,0,Point of Origin,Patricia Daniels Cornwell,1998,Putnam Pub Group
79,28,67,0,Die Scheibenwelt. Zwei Romane in einem Band. D...,Terry Pratchett,1993,Heyne
2275,373,1870,6,The House on Mango Street (Vintage Contemporar...,SANDRA CISNEROS,1991,Vintage
1567,8,1263,0,The 7 Powers of Questions: Secrets to Successf...,Dorothy Leeds,2000,Perigee Books


#### Graveyard (unused scratch cells)

In [None]:
# Use the ISBNs found in the ratings, not the books catalogue: not every book has a rating.
rated_isbns = ratings["isbn"].unique()
sorted_unique_isbn = sorted(rated_isbns)

len(sorted_unique_isbn)

In [None]:
# Load the users dataset (read as latin-1).
users = pd.read_csv(filepath_or_buffer="BX-Users.csv", encoding="latin-1")

In [None]:
# Summarise the column dtypes and non-null counts of the users frame.
users.info()

In [None]:
# Convert the user ids to strings, drop repeats, and sort them.
# Fixed: the list is named "unique" but was built without de-duplicating, so a
# repeated id would make it longer than the id -> index dict built from it.
# NOTE(review): sorting strings is lexicographic ("10" sorts before "9") —
# confirm this is the intended order.
user_ids = users["user_id"].astype(str)
sorted_unique_user_ids = sorted(user_ids.unique())

In [None]:
# assigning unique values from 0 to n-1 for isbns and user_ids
# NOTE(review): this expression only creates an enumerate object and displays
# it; nothing is assigned here.

enumerate(sorted_unique_isbn)

In [None]:
# Ordered lookups: each ISBN / user id is mapped to its position in the sorted list.
numeric_isbn = dict(zip(sorted_unique_isbn, range(len(sorted_unique_isbn))))
numeric_user_ids = dict(zip(sorted_unique_user_ids, range(len(sorted_unique_user_ids))))

#### Build Matrix

In [None]:
import pandas as pd
import numpy as np

# Initialize an all-zero matrix labelled by user id (rows) and ISBN (columns).
# Fixed: the original referenced `sorted_user_ids`, `sorted_isbns` and
# `ratings_df`, none of which are defined in this notebook, and read an 'ISBN'
# column although the ratings column is named 'isbn'.
# NOTE(review): a dense matrix over every user id and ISBN may not fit in
# memory — confirm the sizes before running.
user_item_matrix = pd.DataFrame(
    np.zeros((len(sorted_unique_user_ids), len(sorted_unique_isbn))),
    index=sorted_unique_user_ids,
    columns=sorted_unique_isbn,
)

# Fill in the matrix with the ratings from the dataset.
for row in ratings.itertuples(index=False):
    # The row labels are user ids converted to strings, so convert the id
    # before the lookup; an int label would not match any existing row label.
    user_item_matrix.at[str(row.user_id), row.isbn] = row.rating


In [None]:
# Users are rows, books are columns, ratings are values.
# Fixed: the shape was (n_books, n_users), i.e. transposed relative to the
# layout stated in the line above.
user_books_matrix = pd.DataFrame(
    np.zeros((len(sorted_unique_user_ids), len(sorted_unique_isbn)))
)


In [None]:
# Number of entries in the sorted ISBN list.
len(sorted_unique_isbn)

In [None]:
# Preview the first few sorted ISBNs instead of dumping the whole list into the output.
sorted_unique_isbn[:10]

In [None]:
# Number of distinct users and distinct ISBNs in the ratings frame.
distinct_counts = ratings[["user_id", "isbn"]].nunique()
distinct_counts["user_id"], distinct_counts["isbn"]

In [None]:
# Preview the first few sorted user ids instead of dumping the whole list into the output.
sorted_unique_user_ids[:10]

In [None]:
# Preview the first few (user id, index) pairs instead of dumping the whole dict into the output.
list(numeric_user_ids.items())[:10]

In [None]:
# Number of entries in the user id -> index dict.
len(numeric_user_ids)

In [None]:
# Number of keys in the user id -> index dict (same value as len(numeric_user_ids)).
len(numeric_user_ids.keys())

In [None]:
# Number of entries in the sorted user id list, for comparison with the dict size.
len(sorted_unique_user_ids)