# Sistema de recomendación basado en contenido

In [20]:
# La librería pandas es un paquete de Python que proporciona estructuras de datos similares a los dataframes de R.
# Pandas depende de Numpy, la librería que añade un potente tipo matricial a Python
import pandas as pd
import numpy as np
from numpy import dot
from numpy.linalg import norm 

# Importación de datos

In [21]:
# Constants
PATH = 'data.csv'  # relative path, resolved against the notebook's working directory
# Load the raw table; the cell's last expression displays (rows, columns).
df = pd.read_csv(PATH)
df.shape

(100000, 10)

In [22]:
# Preview the first rows of the raw frame to check column names and value ranges.
df.head()

Unnamed: 0,book_id,author_id,book_genre,reader_id,num_pages,book_rating,publisher_id,publish_year,book_price,text_lang
0,655,52,4,11482,300,4,8,2012,94,7
1,2713,90,3,6479,469,1,8,2012,33,5
2,409,17,2,25472,435,1,12,2001,196,4
3,1150,234,10,23950,529,2,23,2019,79,2
4,2424,390,5,13046,395,2,20,2010,200,4


# Recomendación de libros

**Normalización de datos**

In [23]:
def normalize(data):
    """Scale a 1-D numeric sequence by its maximum so values fall in [0, 1].

    If the smallest value is negative, every value is first shifted up by
    ``abs(min)`` so that the minimum becomes 0; the (shifted) values are then
    divided by their maximum. Non-negative input is only divided by its
    maximum (it is *not* shifted so that its minimum maps to 0).

    Parameters
    ----------
    data : array-like of numbers
        Values to scale (list, tuple, NumPy array, pandas ``.values``, ...).

    Returns
    -------
    list of float
        Scaled values in the original order. An empty input gives ``[]``;
        if the maximum after shifting is 0 (e.g. all values equal and
        non-positive) every entry is ``0.0`` instead of dividing by zero.
    """
    values = np.asarray(data, dtype=float)
    if values.size == 0:
        return []

    min_val = values.min()
    if min_val < 0:
        # Subtracting a negative minimum is the same as adding abs(min).
        values = values - min_val

    max_val = values.max()
    if max_val == 0:
        # Nothing to scale by: avoid a division by zero.
        return [0.0] * values.size

    return (values / max_val).tolist()

**Normaliza las columnas num_pages, book_rating y book_price**

In [24]:
# Add a max-scaled copy of each numeric feature (target column -> source column).
norm_targets = {
    'num_pages_norm': 'num_pages',
    'book_rating_norm': 'book_rating',
    'book_price_norm': 'book_price',
}
for target_col, source_col in norm_targets.items():
    df[target_col] = normalize(df[source_col].values)

**Variables Categóricas**

In [25]:
def ohe(df, enc_col, prefix=None):
    """Append one-hot (dummy) columns for ``enc_col`` to ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    enc_col : label
        Name of the categorical column to encode. The column itself is kept.
    prefix : str, optional
        Prefix passed to ``pd.get_dummies``. With the default ``None`` the
        new columns are labelled with the raw category values, which can
        produce duplicate labels when two encoded columns share values;
        pass a prefix (e.g. ``enc_col``) to keep the labels unique.

    Returns
    -------
    pandas.DataFrame
        ``df`` with one extra indicator column per distinct value of
        ``enc_col``, row-aligned with the original index.
    """
    # get_dummies keeps df's index, so concat aligns row by row whatever the
    # index is. Resetting only the dummy frame's index would misalign the two
    # frames for any df that does not carry a default RangeIndex.
    ohe_df = pd.get_dummies(df[enc_col], prefix=prefix)
    return pd.concat([df, ohe_df], axis=1)

**One-hot encoding de publish_year, book_genre y text_lang**

In [26]:
# One-hot encode each categorical column in turn; every pass appends the
# indicator columns for one source column to the frame.
for categorical_col in ('publish_year', 'book_genre', 'text_lang'):
    df = ohe(df=df, enc_col=categorical_col)

**Elimina columnas redundantes**

In [27]:
# Source columns that are now represented by their normalised / one-hot versions.
cols = [
    'publish_year',
    'book_genre',
    'num_pages',
    'book_rating',
    'book_price',
    'text_lang',
]
# NOTE(review): author_id, reader_id and publisher_id stay in the frame as raw
# numeric features; their magnitude presumably dominates the cosine similarity
# computed later -- confirm whether they should be dropped or encoded.
df = df.drop(columns=cols).set_index('book_id')

**Cálculo de similitud de coseno**

In [32]:
class CBRecommend():
    """Content-based recommender using cosine similarity between feature rows.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per item, indexed by item id; every column is treated as a
        numeric feature. The frame is stored by reference and ``recommend``
        adds / overwrites a ``'sim'`` column on it.
    """

    def __init__(self, df):
        self.df = df

    def cosine_sim(self, v1, v2):
        """Return the cosine similarity of two 1-D vectors.

        Returns ``0.0`` when either vector has zero norm, instead of
        dividing by zero.
        """
        denom = norm(v1) * norm(v2)
        if denom == 0:
            return 0.0
        return dot(v1, v2) / denom

    def recommend(self, book_id, n_rec):
        """Return the ``n_rec`` rows most similar to ``book_id``.

        Parameters
        ----------
        book_id : label
            Index label of the query item. If the label occurs several times
            in the index, the first matching row is used as the query vector.
        n_rec : int
            Number of rows to return.

        Returns
        -------
        pandas.DataFrame
            The ``n_rec`` rows of ``self.df`` with the largest ``'sim'``
            value, including the ``'sim'`` column. The query row itself is
            part of the ranking.

        Raises
        ------
        KeyError
            If ``book_id`` is not present in the index.
        """
        # A 'sim' column left behind by a previous call must not be treated as
        # a feature, otherwise repeated calls change the similarities. A
        # boolean mask is used (rather than a label list) so that frames with
        # duplicated column labels are selected column-for-column.
        is_feature = [label != 'sim' for label in self.df.columns]
        features = self.df.loc[:, is_feature]
        matrix = features.to_numpy(dtype=float)

        # List-based .loc always yields a frame, even when book_id is
        # duplicated in the index; take the first matching row.
        inputVec = features.loc[[book_id]].to_numpy(dtype=float)[0]

        # calculate similarity of input book_id vector w.r.t all other vectors
        dots = matrix @ inputVec
        denom = norm(matrix, axis=1) * norm(inputVec)
        # Rows with a zero norm (or a zero query vector) get similarity 0.
        sims = np.divide(dots, denom, out=np.zeros_like(dots), where=denom != 0)
        self.df['sim'] = sims

        # returns top n user specified books
        return self.df.nlargest(columns='sim', n=n_rec)

In [33]:
# Work on a random subset of the catalogue. The seed makes the subset (and
# therefore every output below) reproducible on Restart & Run All, and the
# size is capped so smaller frames do not make `sample` raise.
SAMPLE_SIZE = 1000
SEED = 42
t = df.sample(n=min(SAMPLE_SIZE, len(df)), random_state=SEED).copy()
cbr = CBRecommend(df = t)

In [34]:
# Inspect the feature rows held by the recommender (the sampled frame `t`).
cbr.df.head()

Unnamed: 0_level_0,author_id,reader_id,publisher_id,num_pages_norm,book_rating_norm,book_price_norm,2000,2001,2002,2003,...,8,9,10,1,2,3,4,5,6,7
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
481,64,29554,16,0.485714,0.1,0.165,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
297,178,10703,17,0.534286,0.9,0.735,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1964,166,7168,31,0.38,0.8,0.295,0,0,0,1,...,1,0,0,0,0,0,0,1,0,0
2404,168,2599,9,0.418571,0.3,0.54,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2175,255,27763,18,0.342857,0.5,0.125,0,0,0,0,...,1,0,0,0,0,0,0,1,0,0


In [38]:
# Ask for the 5 rows most similar to the first book of the sample.
# NOTE(review): the query book is ranked too, so it shows up in its own
# recommendations -- confirm whether it should be filtered out.
cbr.recommend(book_id = t.index[0], n_rec =5 )

Unnamed: 0_level_0,author_id,reader_id,publisher_id,num_pages_norm,book_rating_norm,book_price_norm,2000,2001,2002,2003,...,9,10,1,2,3,4,5,6,7,sim
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
481,64,29554,16,0.485714,0.1,0.165,0,0,0,0,...,0,0,0,0,1,0,0,0,0,1.0
507,63,27557,14,0.417143,0.3,0.955,0,0,0,1,...,0,0,1,0,0,0,0,0,0,1.0
1612,55,24444,18,0.107143,0.7,0.85,0,0,0,0,...,0,0,0,0,0,1,0,0,0,1.0
2831,52,26984,17,0.25,0.1,0.08,0,0,0,0,...,0,0,0,0,1,0,0,0,0,1.0
95,39,18265,16,0.49,0.3,0.385,0,0,0,0,...,0,0,0,0,0,0,0,1,0,1.0
