# Matrix Factorization
Using Singular Value Decomposition

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from numpy import int64
import warnings
warnings.filterwarnings("ignore")

import requests
import IPython.display as Disp
import plotly.express as px

import sklearn
from IPython.core.display import display, HTML
from sklearn.decomposition import TruncatedSVD

In [2]:
# Load the three datasets (';'-separated, latin-1 encoded CSV files).
def _read_csv_skipping_bad_lines(path):
    """Read a ';'-separated latin-1 CSV, skipping rows with too many fields.

    ``error_bad_lines`` was deprecated in pandas 1.3 and removed in pandas 2.0.
    Its replacement, ``on_bad_lines='warn'``, keeps the same behaviour (report
    the malformed line, then skip it). Older pandas versions do not know the
    new keyword and raise ``TypeError``; in that case fall back to the legacy
    spelling so the notebook runs on both old and new installations.
    """
    common = {'sep': ';', 'encoding': "latin-1"}
    try:
        return pd.read_csv(path, on_bad_lines='warn', **common)
    except TypeError:
        # pandas < 1.3: `on_bad_lines` is not a recognised keyword.
        return pd.read_csv(path, error_bad_lines=False, **common)


books = _read_csv_skipping_bad_lines('Books.csv')
books.columns = ['ISBN', 'bookTitle', 'bookAuthor', 'yearOfPublication', 'publisher', 'imageUrlS', 'imageUrlM', 'imageUrlL']
users = _read_csv_skipping_bad_lines('Users.csv')
users.columns = ['userID', 'Location', 'Age']
ratings = _read_csv_skipping_bad_lines('Book-Ratings.csv')
ratings.columns = ['userID', 'ISBN', 'bookRating']

print("Books Data:    ", books.shape)
print("Users Data:    ", users.shape)
print("Books-ratings: ", ratings.shape)

b'Skipping line 6452: expected 8 fields, saw 9\nSkipping line 43667: expected 8 fields, saw 10\nSkipping line 51751: expected 8 fields, saw 9\n'
b'Skipping line 92038: expected 8 fields, saw 9\nSkipping line 104319: expected 8 fields, saw 9\nSkipping line 121768: expected 8 fields, saw 9\n'
b'Skipping line 144058: expected 8 fields, saw 9\nSkipping line 150789: expected 8 fields, saw 9\nSkipping line 157128: expected 8 fields, saw 9\nSkipping line 180189: expected 8 fields, saw 9\nSkipping line 185738: expected 8 fields, saw 9\n'
b'Skipping line 209388: expected 8 fields, saw 9\nSkipping line 220626: expected 8 fields, saw 9\nSkipping line 227933: expected 8 fields, saw 11\nSkipping line 228957: expected 8 fields, saw 10\nSkipping line 245933: expected 8 fields, saw 9\nSkipping line 251296: expected 8 fields, saw 9\nSkipping line 259941: expected 8 fields, saw 9\nSkipping line 261529: expected 8 fields, saw 9\n'


Books Data:     (271360, 8)
Users Data:     (278858, 3)
Books-ratings:  (1149780, 3)


In [4]:
# The small and large cover-image URL columns are not needed; only the
# medium-sized URL (imageUrlM) is kept.
books = books.drop(columns=['imageUrlS', 'imageUrlL'])


In [5]:
# Number of missing values in each column of `books`.
books.isna().sum()

ISBN                 0
bookTitle            0
bookAuthor           1
yearOfPublication    0
publisher            2
imageUrlM            0
dtype: int64

In [6]:
# Manual corrections to individual `books` rows, addressed by index label.

# Fill the single missing author and the two missing publishers with 'Other'.
books.at[187689, 'bookAuthor'] = 'Other'

books.at[128890, 'publisher'] = 'Other'
books.at[129037, 'publisher'] = 'Other'

# Show full cell contents when frames are displayed (no column-width truncation).
pd.set_option('display.max_colwidth', None)

# Overwrite publisher, year, title and author for three rows.
# NOTE(review): presumably these rows had shifted fields in the raw CSV -
# verify against the source file.
# Each group below writes to its OWN row; previously the title/author of the
# second and third groups were written to row 209538 by mistake, leaving that
# row with the wrong book and rows 221678/220731 unrepaired.
books.at[209538, 'publisher'] = 'DK Publishing Inc'
books.at[209538, 'yearOfPublication'] = 2000
books.at[209538, 'bookTitle'] = 'DK Readers: Creating the X-Men, How It All Began (Level 4: Proficient Readers)'
books.at[209538, 'bookAuthor'] = 'Michael Teitelbaum'

books.at[221678, 'publisher'] = 'DK Publishing Inc'
books.at[221678, 'yearOfPublication'] = 2000
books.at[221678, 'bookTitle'] = 'DK Readers: Creating the X-Men, How Comic Books Come to Life (Level 4: Proficient Readers)'
books.at[221678, 'bookAuthor'] = 'James Buckley'

books.at[220731, 'publisher'] = 'Gallimard'
books.at[220731, 'yearOfPublication'] = '2003'
books.at[220731, 'bookTitle'] = 'Peuple du ciel - Suivi de Les bergers '
# NOTE(review): the author string below is kept byte-for-byte as it was;
# it looks mis-encoded - confirm whether it should be normalised.
books.at[220731, 'bookAuthor'] = 'Jean-Marie Gustave Le ClÃ?Â©zio'

# Convert the year column to integers, then replace years above 2021 and
# years equal to 0 with 2002.
books['yearOfPublication'] = books['yearOfPublication'].astype(int)
books.loc[books['yearOfPublication'] > 2021, 'yearOfPublication'] = 2002
books.loc[books['yearOfPublication'] == 0, 'yearOfPublication'] = 2002

In [7]:
# Upper-case the ISBNs, drop fully duplicated rows (keeping the last
# occurrence), renumber the index from 0 and print a summary of the frame.
books = (
    books
    .assign(ISBN=books['ISBN'].str.upper())
    .drop_duplicates(keep='last')
    .reset_index(drop=True)
)
books.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 271047 entries, 0 to 271046
Data columns (total 6 columns):
 #   Column             Non-Null Count   Dtype 
---  ------             --------------   ----- 
 0   ISBN               271047 non-null  object
 1   bookTitle          271047 non-null  object
 2   bookAuthor         271047 non-null  object
 3   yearOfPublication  271047 non-null  int64 
 4   publisher          271047 non-null  object
 5   imageUrlM          271047 non-null  object
dtypes: int64(1), object(5)
memory usage: 12.4+ MB


In [8]:
# Rounded mean age over the users whose age lies in [10, 90].
required = users[users['Age'].between(10, 90)]
mean = round(required['Age'].mean())

# Ages above 80 or below 10, and missing ages, are replaced by that mean;
# the column is then stored as integers.
# NOTE(review): the mean is computed over ages up to 90 while values above 80
# are replaced - confirm which upper bound is intended.
implausible_age = (users['Age'] > 80) | (users['Age'] < 10)
users.loc[implausible_age, 'Age'] = mean
users['Age'] = users['Age'].fillna(mean).astype(int)

In [9]:
# Split the Location column on ', ' and store the first three pieces as
# separate City / State / Country columns.
list_ = users.Location.str.split(', ')

# Placeholder strings that are treated as "no value" and replaced by 'other'.
_INVALID_LOCATION_PARTS = (' ', '', 'n/a', ',')


def _clean_location_part(parts, position):
    """Return ``(value, missing)`` for ``parts[position]``.

    ``value`` is the lower-cased piece, or ``'other'`` when the piece does not
    exist or is one of the placeholder strings; ``missing`` is True in the
    latter case.
    """
    if position >= len(parts) or parts[position] in _INVALID_LOCATION_PARTS:
        return 'other', True
    return parts[position].lower(), False


city = []
state = []
country = []
count_no_state = 0
count_no_country = 0

# Replace invalid entries with 'other' and count missing states/countries.
for parts in list_:
    # A missing Location does not split into a list; treat it as having no pieces.
    if not isinstance(parts, list):
        parts = []

    city.append(_clean_location_part(parts, 0)[0])

    state_value, state_missing = _clean_location_part(parts, 1)
    state.append(state_value)
    if state_missing:
        count_no_state += 1

    # The country check now inspects the country piece itself; it used to
    # compare the *state* piece against ',' by mistake.
    country_value, country_missing = _clean_location_part(parts, 2)
    country.append(country_value)
    if country_missing:
        count_no_country += 1

users = users.drop('Location', axis=1)

# For city values containing '/', keep only the text before the first '/'.
temp = [ent.split('/')[0] for ent in city]

df_city = pd.DataFrame(temp, columns=['City'])
df_state = pd.DataFrame(state, columns=['State'])
df_country = pd.DataFrame(country, columns=['Country'])

# Append the three new columns (aligned on the index) to `users`.
users = pd.concat([users, df_city, df_state, df_country], axis=1)

In [10]:
# Drop fully duplicated user rows (keeping the last occurrence) and renumber
# the index from 0.
users = users.drop_duplicates(keep='last').reset_index(drop=True)

In [12]:
import re

# Check the ISBN syntax: prints "True" when every rating ISBN consists only of
# ASCII letters and digits, "False" as soon as any other character is present.
reg = "[^A-Za-z0-9]"
k = []
flag = 1 if any(re.search(reg, isbn) for isbn in ratings['ISBN']) else 0

print("False" if flag == 1 else "True")

False


In [13]:
# Strip non-alphanumeric characters from rating ISBNs, but only when the
# cleaned value already exists in the books dataset; other values are left
# untouched.
bookISBN = books['ISBN'].tolist()
reg = "[^A-Za-z0-9]"

# Set membership is O(1); scanning the list once per row made this cell
# quadratic in practice.
known_isbns = set(bookISBN)

# Only rows that actually contain an extra character need any work.
needs_cleaning = ratings['ISBN'].str.contains(reg, regex=True, na=False)
cleaned = ratings.loc[needs_cleaning, 'ISBN'].str.replace(reg, "", regex=True)
matched = cleaned[cleaned.isin(known_isbns)]
ratings.loc[matched.index, 'ISBN'] = matched

# NOTE(review): books ISBNs are upper-cased earlier in the notebook while the
# rating ISBNs are upper-cased only in a later cell, so this comparison is
# case-sensitive - confirm that is intended.

In [14]:
# Upper-case the rating ISBNs, then drop fully duplicated rows (keeping the
# last occurrence) and renumber the index from 0.
ratings['ISBN'] = ratings['ISBN'].str.upper()
ratings = ratings.drop_duplicates(keep='last').reset_index(drop=True)

In [15]:
# Preview the first five cleaned rating rows.
ratings.head(5)

Unnamed: 0,userID,ISBN,bookRating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6


In [16]:
# Cleaning is finished: combine the three frames into one. Inner joins keep
# only ratings whose ISBN exists in `books` and whose userID exists in `users`.
dataset = (
    books
    .merge(ratings, on='ISBN', how='inner')
    .merge(users, on='userID', how='inner')
)
dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1031609 entries, 0 to 1031608
Data columns (total 12 columns):
 #   Column             Non-Null Count    Dtype 
---  ------             --------------    ----- 
 0   ISBN               1031609 non-null  object
 1   bookTitle          1031609 non-null  object
 2   bookAuthor         1031609 non-null  object
 3   yearOfPublication  1031609 non-null  int64 
 4   publisher          1031609 non-null  object
 5   imageUrlM          1031609 non-null  object
 6   userID             1031609 non-null  int64 
 7   bookRating         1031609 non-null  int64 
 8   Age                1031609 non-null  int64 
 9   City               1031609 non-null  object
 10  State              1031609 non-null  object
 11  Country            1031609 non-null  object
dtypes: int64(4), object(8)
memory usage: 102.3+ MB


In [18]:
# Split the merged data by rating value: dataset1 holds the rows with a
# non-zero rating, dataset2 the rows whose rating is 0.
has_nonzero_rating = dataset['bookRating'] != 0
dataset1 = dataset[has_nonzero_rating].reset_index(drop=True)
dataset2 = dataset[~has_nonzero_rating].reset_index(drop=True)

In [19]:
# Preview the first five rows that carry a non-zero rating.
dataset1.head(5)

Unnamed: 0,ISBN,bookTitle,bookAuthor,yearOfPublication,publisher,imageUrlM,userID,bookRating,Age,City,State,Country
0,0002005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.01.MZZZZZZZ.jpg,8,5,35,timmins,ontario,canada
1,074322678X,Where You'll Find Me: And Other Stories,Ann Beattie,2002,Scribner,http://images.amazon.com/images/P/074322678X.01.MZZZZZZZ.jpg,8,5,35,timmins,ontario,canada
2,0887841740,The Middle Stories,Sheila Heti,2004,House of Anansi Press,http://images.amazon.com/images/P/0887841740.01.MZZZZZZZ.jpg,8,5,35,timmins,ontario,canada
3,1552041778,Jane Doe,R. J. Kaiser,1999,Mira Books,http://images.amazon.com/images/P/1552041778.01.MZZZZZZZ.jpg,8,5,35,timmins,ontario,canada
4,1567407781,The Witchfinder (Amos Walker Mystery Series),Loren D. Estleman,1998,Brilliance Audio - Trade,http://images.amazon.com/images/P/1567407781.01.MZZZZZZZ.jpg,8,6,35,timmins,ontario,canada


In [20]:
# `df` is another name for `dataset1` (the same object, not a copy).
df = dataset1
df.iloc[:2]

Unnamed: 0,ISBN,bookTitle,bookAuthor,yearOfPublication,publisher,imageUrlM,userID,bookRating,Age,City,State,Country
0,0002005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.01.MZZZZZZZ.jpg,8,5,35,timmins,ontario,canada
1,074322678X,Where You'll Find Me: And Other Stories,Ann Beattie,2002,Scribner,http://images.amazon.com/images/P/074322678X.01.MZZZZZZZ.jpg,8,5,35,timmins,ontario,canada


In [21]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,yearOfPublication,userID,bookRating,Age
count,384074.0,384074.0,384074.0,384074.0
mean,1995.797294,136033.307285,7.626864,36.162047
std,7.421494,80482.520076,1.84129,10.106724
min,1376.0,8.0,1.0,10.0
25%,1993.0,67591.0,7.0,31.0
50%,1998.0,133811.0,8.0,35.0
75%,2001.0,206219.0,9.0,40.0
max,2021.0,278854.0,10.0,80.0


### Which books received the most ratings?

In [22]:
# Count the ratings per title and keep the six titles with the highest
# counts as a one-column DataFrame indexed by title.
ratings_per_title = df.groupby('bookTitle')['bookRating'].count()
df_1 = ratings_per_title.sort_values(ascending=False).nlargest(6).to_frame()

In [25]:
# Add an 'Image' column holding a single-space placeholder for every row.
df_1 = df_1.assign(Image=" ")
def path_to_image_html(path):
    """Wrap an image URL/path string in an ``<img>`` tag that is 60 wide.

    The path is inserted verbatim (no HTML escaping is applied).
    """
    return "".join(('<img src="', path, '" width="60" >'))

In [26]:
# Store the first cover URL found for each title in the 'Image' column.
# The first row is skipped, matching the `df_1[1:]` slice used when the table
# is displayed. NOTE(review): confirm that skipping it is intentional.
for title in df_1.index[1:]:
    url = df.loc[df['bookTitle'] == title, 'imageUrlM'].unique()
    # Single .loc write by label: the former `df_1['Image'][i] = ...` was
    # chained assignment with a positional integer on a title-indexed Series,
    # which is not guaranteed to modify df_1.
    df_1.loc[title, 'Image'] = url[0]

In [27]:
# Columns whose cell values should be rendered as <img> tags.
image_cols = ['Image']
format_dict = {image_col: path_to_image_html for image_col in image_cols}

# Render every row except the first as HTML; escaping is disabled so the
# generated <img> tags are interpreted by the browser.
table_html = df_1.iloc[1:].to_html(escape=False, formatters=format_dict)
display(HTML(table_html))

Unnamed: 0_level_0,bookRating,imageUrlM,Image
bookTitle,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Wild Animus,581,,
The Da Vinci Code,495,,
The Secret Life of Bees,413,,
The Nanny Diaries: A Novel,393,,
The Red Tent (Bestselling Backlist),383,,


In [None]:
# User x title matrix of ratings; user/title pairs without a rating are filled with 0.
df_pivot = pd.pivot_table(
    df,
    index='userID',
    columns='bookTitle',
    values='bookRating',
    fill_value=0,
)