# Criando um Sistema de Recomendação de Livros

In [1]:
# Importando os pacotes a serem utilizados
import pandas as pd
import numpy as np


In [2]:
# Importar o arquivo com os livros e visualizar as primeiras linhas
livros = pd.read_csv("BX_Books.csv", sep=";", encoding="latin1")
livros.head(3)


Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...


In [3]:
livros.shape

(271379, 8)

In [4]:
# Importando o arquivo de avaliações e avaliando as primeiras linhas
avaliacoes = pd.read_csv("BX-Book-Ratings.csv", sep=";", encoding="latin1")
avaliacoes.head()

Unnamed: 0,User-ID,ISBN,Book-Rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6


#Pré Processamento dos Dados

In [5]:
# Filtrando somente as colunas necessárias e renomeando nome das variaveis

# Seleciona somente as variaveis que iremos utilizar
livros = livros [['ISBN','Book-Title','Book-Author']]

# Renomeia as variaveis
livros.rename(columns={'ISBN':'ID_LIVRO','Book-Title':'TITULO','Book-Author':'AUTOR'}, inplace = True)

# Exibe as primeiras linhas do arquivo tratado
livros.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  livros.rename(columns={'ISBN':'ID_LIVRO','Book-Title':'TITULO','Book-Author':'AUTOR'}, inplace = True)


Unnamed: 0,ID_LIVRO,TITULO,AUTOR
0,195153448,Classical Mythology,Mark P. O. Morford
1,2005018,Clara Callan,Richard Bruce Wright
2,60973129,Decision in Normandy,Carlo D'Este
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata
4,393045218,The Mummies of Urumchi,E. J. W. Barber


In [6]:
# Filtrando somente as colunas necessários e renomeando nome das variaveis

# Seleciona somente as variaveis que iremos utilizar
avaliacoes = avaliacoes [['User-ID','ISBN','Book-Rating']]

# Renomeia as variaveis
avaliacoes.rename(columns = {'User-ID':'ID_USUARIO','ISBN':'ID_LIVRO','Book-Rating':'AVALIACAO'}, inplace = True)

# Exibe as primeiras linhas do arquivo tratado
avaliacoes.head()



Unnamed: 0,ID_USUARIO,ID_LIVRO,AVALIACAO
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6


In [7]:
# Contar a quantidade de avaliações igual a 0
quantidade_avaliacao_zero = (avaliacoes['AVALIACAO'] == 0).sum()

# Exibir a quantidade de avaliações igual a 0
print("Quantidade de avaliações igual a 0:", quantidade_avaliacao_zero)



Quantidade de avaliações igual a 0: 716109


In [8]:
# Excluir as instâncias com avaliação igual a 0
avaliacoes = avaliacoes[avaliacoes['AVALIACAO'] != 0]

# Exibir o DataFrame resultante após a exclusão
print(avaliacoes)

         ID_USUARIO     ID_LIVRO  AVALIACAO
1            276726   0155061224          5
3            276729   052165615X          3
4            276729   0521795028          6
6            276736   3257224281          8
7            276737   0600570967          6
...             ...          ...        ...
1149773      276704   0806917695          5
1149775      276704   1563526298          9
1149777      276709   0515107662         10
1149778      276721   0590442449         10
1149779      276723  05162443314          8

[433671 rows x 3 columns]


In [9]:
# Calcular a contagem de avaliações por ID_LIVRO
contagem_avaliacoes = avaliacoes.groupby('ID_LIVRO').size().reset_index(name='QTDE_AVALIACOES')

# Exibir o resultado
print(contagem_avaliacoes)


              ID_LIVRO  QTDE_AVALIACOES
0           0330299891                1
1           0375404120                1
2           9022906116                1
3             #6612432                1
4         '9607092910'                1
...                ...              ...
185968  \8888809228\""                1
185969  \9170010242\""                1
185970      ooo7156103                1
185971     ´3499128624                1
185972       Ô½crosoft                1

[185973 rows x 2 columns]


In [10]:
# Mesclar a base de dados "livros" com a contagem de avaliações usando o ID_LIVRO como chave de junção
livros = pd.merge(livros, contagem_avaliacoes, on='ID_LIVRO', how='left', suffixes=('_livros', '_avaliacoes'))

# Exibir o DataFrame resultante
livros.head(10)



Unnamed: 0,ID_LIVRO,TITULO,AUTOR,QTDE_AVALIACOES
0,0195153448,Classical Mythology,Mark P. O. Morford,
1,0002005018,Clara Callan,Richard Bruce Wright,9.0
2,0060973129,Decision in Normandy,Carlo D'Este,2.0
3,0374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,6.0
4,0393045218,The Mummies of Urumchi,E. J. W. Barber,
5,0399135782,The Kitchen God's Wife,Amy Tan,17.0
6,0425176428,What If?: The World's Foremost Military Histor...,Robert Cowley,1.0
7,0671870432,PLEADING GUILTY,Scott Turow,1.0
8,0679425608,Under the Black Flag: The Romance and the Real...,David Cordingly,
9,074322678X,Where You'll Find Me: And Other Stories,Ann Beattie,1.0


In [11]:
livros.shape

(271379, 4)

In [12]:
# Verificando se há valores nulos
livros.isna().sum()

ID_LIVRO                0
TITULO                  0
AUTOR                   2
QTDE_AVALIACOES    121537
dtype: int64

In [13]:
# Excluindo os valores nulos
livros.dropna(inplace = True)

In [14]:
# Verificando se há valores nulos
livros.isna().sum()

ID_LIVRO           0
TITULO             0
AUTOR              0
QTDE_AVALIACOES    0
dtype: int64

In [15]:
livros.shape

(149840, 4)

In [16]:
# Verificando se há valores nulos
avaliacoes.isna().sum()

ID_USUARIO    0
ID_LIVRO      0
AVALIACAO     0
dtype: int64

In [17]:
# Verificando a quantidade de avaliacoes por usuarios
avaliacoes['ID_USUARIO'].value_counts()

ID_USUARIO
11676     8524
98391     5802
153662    1969
189835    1906
23902     1395
          ... 
114079       1
114081       1
114096       1
114115       1
276723       1
Name: count, Length: 77805, dtype: int64

In [18]:
# Vamos utilizar o ID_USUARIO somente de usuários que fizeram mais de 9 avaliações
qt_avaliacoes = avaliacoes['ID_USUARIO'].value_counts() > 9
y = qt_avaliacoes[qt_avaliacoes].index
y.shape

(7334,)

In [19]:
# Visualizando os usuarios selecionados
y

Index([ 11676,  98391, 153662, 189835,  23902,  76499, 171118, 235105,  16795,
       248718,
       ...
       197664, 168999,   5741, 127168,  92486,  28372,  33832, 178880, 241306,
       204790],
      dtype='int64', name='ID_USUARIO', length=7334)

In [20]:
# visualizando o tamanho do dataset Avaliações
avaliacoes.shape

(433671, 3)

In [21]:
# Pegando somente avaliacoes dos usuarios que avaliaram mais de 9 vezes
avaliacoes = avaliacoes[avaliacoes['ID_USUARIO'].isin(y)]

In [22]:
# visualizando o tamanho do dataset Avaliações
avaliacoes.shape

(295561, 3)

In [23]:
# Visualizando os DataFrame Avaliacoes
avaliacoes.head()

Unnamed: 0,ID_USUARIO,ID_LIVRO,AVALIACAO
133,276822,60096195,10
134,276822,141310340,9
135,276822,142302198,10
136,276822,156006065,9
137,276822,375821813,9


In [24]:
livros.head()

Unnamed: 0,ID_LIVRO,TITULO,AUTOR,QTDE_AVALIACOES
1,2005018,Clara Callan,Richard Bruce Wright,9.0
2,60973129,Decision in Normandy,Carlo D'Este,2.0
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,6.0
5,399135782,The Kitchen God's Wife,Amy Tan,17.0
6,425176428,What If?: The World's Foremost Military Histor...,Robert Cowley,1.0


In [25]:
# Vamos usar os livros que possuem somente uma quantidade de avaliações superior a 9 avaliações
livros = livros[livros['QTDE_AVALIACOES'] > 9]

In [26]:
# Agrupando a quantidade de livros por autor
livros_autor = livros['AUTOR'].value_counts()
livros_autor.head(20)

AUTOR
Stephen King            113
Nora Roberts             93
Danielle Steel           57
Mary Higgins Clark       50
Dean R. Koontz           47
Tom Clancy               43
V.C. Andrews             41
James Patterson          40
Anne Rice                32
Anne McCaffrey           31
Lilian Jackson Braun     30
Sandra Brown             28
Johanna Lindsey          27
John Grisham             26
John Sandford            26
Sue Grafton              25
Janet Evanovich          25
Douglas Adams            24
Maeve Binchy             23
Catherine Coulter        23
Name: count, dtype: int64

In [27]:
# Visualizar os tipos de dados das variaveis
livros.info()
avaliacoes.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5444 entries, 5 to 131842
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   ID_LIVRO         5444 non-null   object 
 1   TITULO           5444 non-null   object 
 2   AUTOR            5444 non-null   object 
 3   QTDE_AVALIACOES  5444 non-null   float64
dtypes: float64(1), object(3)
memory usage: 212.7+ KB
<class 'pandas.core.frame.DataFrame'>
Index: 295561 entries, 133 to 1149747
Data columns (total 3 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   ID_USUARIO  295561 non-null  int64 
 1   ID_LIVRO    295561 non-null  object
 2   AVALIACAO   295561 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 9.0+ MB


In [28]:
# Remover as letras da coluna ID_LIVRO e manter apenas os números (dataset livros)
livros['ID_LIVRO'] = livros['ID_LIVRO'].str.replace(r'\D+', '')

# Converter a coluna ID_LIVRO para o tipo inteiro
#livros['ID_LIVRO'] = livros['ID_LIVRO'].astype(int)

# Visualizar os tipos de dados das variaveis
livros.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5444 entries, 5 to 131842
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   ID_LIVRO         5444 non-null   object 
 1   TITULO           5444 non-null   object 
 2   AUTOR            5444 non-null   object 
 3   QTDE_AVALIACOES  5444 non-null   float64
dtypes: float64(1), object(3)
memory usage: 212.7+ KB


In [29]:
# Remover as letras da coluna ID_LIVRO e manter apenas os números
avaliacoes['ID_LIVRO'] = avaliacoes['ID_LIVRO'].str.replace(r'\D+', '')

# Remover as linhas com valores vazios na coluna ID_LIVRO
avaliacoes = avaliacoes[avaliacoes['ID_LIVRO'] != '']

# Converter a coluna ID_LIVRO para o tipo inteiro
#avaliacoes['ID_LIVRO'] = avaliacoes['ID_LIVRO'].astype(int)

# Visualizar os tipos de dados das variaveis
print(avaliacoes.dtypes)

ID_USUARIO     int64
ID_LIVRO      object
AVALIACAO      int64
dtype: object


In [30]:
# Concatenando os dataframes
avaliacoes_e_livros = avaliacoes.merge(livros, on = 'ID_LIVRO')
avaliacoes_e_livros.head()

Unnamed: 0,ID_USUARIO,ID_LIVRO,AVALIACAO,TITULO,AUTOR,QTDE_AVALIACOES
0,276822,60096195,10,The Boy Next Door,Meggin Cabot,53.0
1,278554,60096195,9,The Boy Next Door,Meggin Cabot,53.0
2,7125,60096195,8,The Boy Next Door,Meggin Cabot,53.0
3,7346,60096195,8,The Boy Next Door,Meggin Cabot,53.0
4,8067,60096195,10,The Boy Next Door,Meggin Cabot,53.0


In [31]:
# Verificando a quantidade de livros com avaliacoes pelo tamanho do arquivo
avaliacoes_e_livros.shape

(88402, 6)

In [32]:
# Verificando se há valores nulos
avaliacoes_e_livros.isna().sum()

ID_USUARIO         0
ID_LIVRO           0
AVALIACAO          0
TITULO             0
AUTOR              0
QTDE_AVALIACOES    0
dtype: int64

In [33]:
# Visualizando as primeiras 20 linhas do arquivo
avaliacoes_e_livros.head(10)

Unnamed: 0,ID_USUARIO,ID_LIVRO,AVALIACAO,TITULO,AUTOR,QTDE_AVALIACOES
0,276822,60096195,10,The Boy Next Door,Meggin Cabot,53.0
1,278554,60096195,9,The Boy Next Door,Meggin Cabot,53.0
2,7125,60096195,8,The Boy Next Door,Meggin Cabot,53.0
3,7346,60096195,8,The Boy Next Door,Meggin Cabot,53.0
4,8067,60096195,10,The Boy Next Door,Meggin Cabot,53.0
5,13552,60096195,7,The Boy Next Door,Meggin Cabot,53.0
6,15819,60096195,10,The Boy Next Door,Meggin Cabot,53.0
7,25409,60096195,9,The Boy Next Door,Meggin Cabot,53.0
8,28204,60096195,8,The Boy Next Door,Meggin Cabot,53.0
9,35320,60096195,8,The Boy Next Door,Meggin Cabot,53.0


In [34]:
# Vamos descartar os valores duplicados, para que não tenha problemas de termos o mesmo usuário avaliando o mesmo livro mais de uma vez
avaliacoes_e_livros.drop_duplicates(['ID_USUARIO','ID_LIVRO'], inplace = True)

In [35]:
# Visualizando se houve alteração na quantidade de registros
avaliacoes_e_livros.shape

(88402, 6)

In [36]:
# Exclusão da variavel ID_LIVRO porque não iremos utiliza-la
del avaliacoes_e_livros['ID_LIVRO']

In [37]:
# DataFrame sem a variavel ID_LIVRO
avaliacoes_e_livros.head(50)

Unnamed: 0,ID_USUARIO,AVALIACAO,TITULO,AUTOR,QTDE_AVALIACOES
0,276822,10,The Boy Next Door,Meggin Cabot,53.0
1,278554,9,The Boy Next Door,Meggin Cabot,53.0
2,7125,8,The Boy Next Door,Meggin Cabot,53.0
3,7346,8,The Boy Next Door,Meggin Cabot,53.0
4,8067,10,The Boy Next Door,Meggin Cabot,53.0
5,13552,7,The Boy Next Door,Meggin Cabot,53.0
6,15819,10,The Boy Next Door,Meggin Cabot,53.0
7,25409,9,The Boy Next Door,Meggin Cabot,53.0
8,28204,8,The Boy Next Door,Meggin Cabot,53.0
9,35320,8,The Boy Next Door,Meggin Cabot,53.0


In [38]:
# PIVOT - para que cada ID_USUARIO seja uma coluna com o respectivo valor de nota para cada livro avaliado
livros_pivot = avaliacoes_e_livros.pivot_table(columns = 'ID_USUARIO', index = 'TITULO', values = 'AVALIACAO')


# Avaliar o arquivo transformado para PIVOT
livros_pivot.head(20)

ID_USUARIO,242,243,254,388,446,503,505,507,625,638,...,278314,278356,278390,278418,278535,278554,278582,278633,278843,278851
TITULO,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
'Salem's Lot,,,,,,,,,,,...,,,,,,,,,,
10 Lb. Penalty,,,,,,,,,,,...,,,,,,,,,,
100 Selected Poems by E. E. Cummings,,,,,,,,,,,...,,,,,,,,,,
"14,000 Things to Be Happy About",,,,,,,,,,,...,,,,,,,,,,5.0
16 Lighthouse Road,,,,,,,,,,,...,,,,,,,,,,
1984,,,9.0,,,,,,,,...,,,,,,,,,,
1st to Die: A Novel,,,,,,,,,,,...,,,,,10.0,,,,,
2001: A Space Odyssey,,,,,,,,,,,...,,,,,,,,,,
2010: Odyssey Two,,,,,,,,,,,...,,,,,,,,,,
203 Ways to Drive a Man Wild in Bed,,,,,,,,,,,...,,,,,,,,,,


In [39]:
# Os valores que são nulos iremos preencher com ZERO
livros_pivot.fillna(0, inplace = True)
livros_pivot.head()

ID_USUARIO,242,243,254,388,446,503,505,507,625,638,...,278314,278356,278390,278418,278535,278554,278582,278633,278843,278851
TITULO,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
'Salem's Lot,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10 Lb. Penalty,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
100 Selected Poems by E. E. Cummings,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"14,000 Things to Be Happy About",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0
16 Lighthouse Road,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [40]:
# Vamos importar o csr_matrix do pacote SciPy
# Esse método possibilita criarmos uma matriz sparsa
from scipy.sparse import csr_matrix


# Vamos transformar o nosso dataset em uma matriz sparsa
livros_sparse = csr_matrix(livros_pivot)

In [41]:
# Tipo do objeto
type(livros_sparse)

In [42]:
# Vamos importar o algoritmo KNN do SciKit Learn
from sklearn.neighbors import NearestNeighbors
from sklearn.model_selection import train_test_split

In [43]:
# Dividir os dados em conjunto de treinamento e teste
train_data, test_data = train_test_split(livros_sparse, test_size=0.2, random_state=42)


In [44]:
# Criando e treinando o modelo preditivo
modelo = NearestNeighbors(algorithm = 'brute')
modelo.fit(train_data)

In [45]:
from sklearn.metrics import precision_score, recall_score, f1_score, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.neighbors import NearestNeighbors
from scipy.sparse import csr_matrix

# Fazendo previsões
distances, indices = modelo.kneighbors(test_data)

# Calculando RMSE
test_predictions = np.zeros(test_data.shape)
for i in range(test_data.shape[0]):
    test_predictions[i, :] = np.mean(train_data[indices[i], :], axis=0)

rmse = np.sqrt(mean_squared_error(test_data.toarray(), test_predictions))
print("RMSE:", rmse)

# Converte as avaliações em binário: 1 se a avaliação for maior que o limite, 0 caso contrário
threshold = 3.5
predicted_binary = (test_predictions > threshold).astype(int)
actual_binary = (test_data > threshold).astype(int)

# Calcula as métricas de classificação
precision = precision_score(actual_binary, predicted_binary, average='weighted')
recall = recall_score(actual_binary, predicted_binary, average='weighted')
f1 = f1_score(actual_binary, predicted_binary, average='weighted')

print("Precisão:", precision)
print("Recall:", recall)
print("F1-score:", f1)

RMSE: 0.4140654415775425


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Precisão: 0.13567778260678398
Recall: 0.04006171819033449
F1-score: 0.048296443292717636


  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))


## Previsão de sugestões de livros

In [46]:
#The Boy Next Door
distances, sugestions = modelo.kneighbors(livros_pivot.filter(items = ['The Boy Next Door'], axis=0).values.reshape(1, -1))

for i in range(len(sugestions)):
    print(livros_pivot.index[sugestions[i]])

Index(['The GREAT GATSBY (Scribner Classic)', 'Brave the Wild Wind',
       'The Carnivorous Carnival (A Series of Unfortunate Events, Book 9)',
       'Outlander', 'Amy and Isabelle'],
      dtype='object', name='TITULO')


In [47]:
#Artemis Fowl (Artemis Fowl, Book 1)
distances, sugestions = modelo.kneighbors(livros_pivot.filter(items = ['Artemis Fowl (Artemis Fowl, Book 1)'], axis=0).values.reshape(1, -1))

for i in range(len(sugestions)):
    print(livros_pivot.index[sugestions[i]])

Index(['Mansfield Park (Penguin Popular Classics)', 'Devil's Claw',
       'Midnight In the Garden of Good and Evil', 'Outlander',
       'I'll Take Manhattan'],
      dtype='object', name='TITULO')


In [48]:
#Hoot (Newbery Honor Book)
distances, sugestions = modelo.kneighbors(livros_pivot.filter(items = ['Hoot (Newbery Honor Book)'], axis=0).values.reshape(1, -1))

for i in range(len(sugestions)):
    print(livros_pivot.index[sugestions[i]])

Index(['Tales from Watership Down', 'Outlander', 'I'll Take Manhattan',
       'In Cold Blood (Vintage International)', 'The Alibi'],
      dtype='object', name='TITULO')
