In [1]:
import random
import warnings
# NOTE(review): wildcard import. The cells below use `pd`, `np`, `datetime` and several
# helper functions (e.g. `capitalize_words`, `user_favs`, `get_info_book`) that are not
# imported explicitly in this notebook; presumably they all arrive through this line.
# TODO confirm and replace with explicit imports so the dependencies are visible.
from utils import *
# Silences every warning category for the whole session, including pandas
# deprecation / SettingWithCopy warnings raised by later cells.
warnings.filterwarnings("ignore")

## Reading sources

In [2]:
# Load the raw book catalogue and normalise the column names to lower case.
df_books_complete = pd.read_csv('../datos/Books.csv', low_memory=False)
df_books_complete.columns = df_books_complete.columns.str.lower()
df_books = df_books_complete[['isbn', 'book-title', 'book-author', 'year-of-publication', 'publisher', 'image-url-l']].copy()

# A few rows carry text instead of a year in 'year-of-publication'
# (e.g. 'DK Publishing Inc', 'Gallimard'). Instead of listing the offending
# strings by hand, coerce the column to numeric and drop every row that cannot
# be parsed, so an unexpected bad value no longer crashes the integer cast.
publication_year = pd.to_numeric(df_books['year-of-publication'], errors='coerce')
has_valid_year = publication_year.notna()
df_books = df_books[has_valid_year].copy()
df_books['year-of-publication'] = publication_year[has_valid_year].astype(int)

df_books['book-author'] = df_books['book-author'].map(capitalize_words)

# Age of the book in years, relative to the year in which the cell is executed
# (so the values, and the filter below, change from one calendar year to the next).
# A publication year of 0 is mapped to an antiquity of 0.
current_year = datetime.datetime.now().year
df_books['antiquity'] = np.where(df_books['year-of-publication'] != 0,
                                 current_year - df_books['year-of-publication'],
                                 0)

# Keep only strictly positive antiquities. This removes books dated in the future,
# books dated in the current year, and books whose publication year is 0.
df_books = (
    df_books[df_books['antiquity'] > 0]
    .drop(['year-of-publication', 'publisher'], axis=1)
    .drop_duplicates()
    .copy()
)
print(f'Number of records: {df_books.shape}')
df_books.head(2)

Number of records: (266726, 5)


Unnamed: 0,isbn,book-title,book-author,image-url-l,antiquity
0,195153448,Classical Mythology,Mark P. O. Morford,http://images.amazon.com/images/P/0195153448.0...,21
1,2005018,Clara Callan,Richard Bruce Wright,http://images.amazon.com/images/P/0002005018.0...,22


In [4]:
# Sanity check: each ISBN should map to exactly one antiquity and one title.
# Report the *largest* number of distinct values found for any ISBN; a printed 1
# means the invariant holds for every book. (Taking only the first element of
# `.unique()` would print 1 even when some ISBNs had several distinct values.)
max_antiquities_per_isbn = df_books.groupby('isbn')['antiquity'].nunique().max()
max_titles_per_isbn = df_books.groupby('isbn')['book-title'].nunique().max()
print(f"Valores unicos de la variable antiquity para cada libro: {max_antiquities_per_isbn}")
print(f"Valores unicos de la variable book-title para cada libro: {max_titles_per_isbn}")

Valores unicos de la variable antiquity para cada libro: 1
Valores unicos de la variable book-title para cada libro: 1


In [5]:
# Read the raw ratings file and lower-case its column headers so they match
# the naming used for the other tables in this notebook.
df_ratings = (
    pd.read_csv('../datos/Ratings.csv', low_memory=False)
    .rename(columns=str.lower)
)
print('Number of records: {0}'.format(df_ratings.shape))
df_ratings.head(2)

Number of records: (1149780, 3)


Unnamed: 0,user-id,isbn,book-rating
0,276725,034545104X,0
1,276726,0155061224,5


In [6]:
# Average rating per ISBN, returned as a flat frame with columns 'isbn' and 'book-rating'.
# NOTE(review): ratings equal to 0 are included in the mean; if 0 encodes
# "no explicit rating" in this dataset, the averages are biased downwards - confirm.
df_ratings_bybooks = (
    df_ratings
    .groupby('isbn', as_index=False)
    .agg({'book-rating': 'mean'})
)
df_ratings_bybooks.head(2)

Unnamed: 0,isbn,book-rating
0,330299891,3.0
1,375404120,1.5


In [7]:
# Number of distinct ISBNs rated by each user, as columns 'user-id' and 'nbooks'.
df_users_books = (
    df_ratings
    .groupby('user-id')['isbn']
    .nunique()
    .reset_index(name='nbooks')
)
df_users_books.head(2)

Unnamed: 0,user-id,nbooks
0,2,1
1,7,1


In [8]:
# Load the users table and normalise the column names to lower case.
df_users = pd.read_csv('../datos/Users.csv', low_memory=False)
df_users.columns = df_users.columns.str.lower()

# 'location' is a comma-separated string such as "nyc, new york, usa".
# The country is the text after the last comma; strip the surrounding whitespace
# so the value is "usa" rather than " usa". The vectorised string accessor also
# propagates a missing location as NaN instead of raising inside a lambda.
df_users['country'] = df_users['location'].str.rsplit(',', n=1).str[-1].str.strip()

print(f'Number of records: {df_users.shape}')
# Missing-value count per column.
print(df_users.isna().sum().to_dict())
df_users.head(2)

Number of records: (278858, 4)
{'user-id': 0, 'location': 0, 'age': 110762, 'country': 0}


Unnamed: 0,user-id,location,age,country
0,1,"nyc, new york, usa",,usa
1,2,"stockton, california, usa",18.0,usa


# Collaborative Filtering
---

https://medium.com/swlh/how-to-build-simple-recommender-systems-in-python-647e5bcd78bd

https://www.analyticsvidhya.com/blog/2018/06/comprehensive-guide-recommendation-engine-python/

https://unipython.com/como-desarrollar-un-sistema-de-recomendacion-en-python/?utm_content=cmp-true

notebook de ejemplo: https://www.kaggle.com/code/hilalmleykeyuksel/book-recommender

In [9]:
# Build one row per (user, rated book): join users with their ratings, attach the
# book attributes by ISBN, order by user and keep only the columns used downstream.
df_coll_filter = (
    df_users
    .merge(df_ratings, how='inner', on='user-id')
    .merge(df_books, how='inner', on=['isbn'])
    .sort_values('user-id')
    .reset_index(drop=True)
    .drop(columns=['location', 'age', 'country', 'isbn'])
)
print(f"shape: {df_coll_filter.shape}")
df_coll_filter.head()

shape: (1017069, 6)


Unnamed: 0,user-id,book-rating,book-title,book-author,image-url-l,antiquity
0,2,0,Classical Mythology,Mark P. O. Morford,http://images.amazon.com/images/P/0195153448.0...,21
1,8,0,The Kitchen God's Wife,Amy Tan,http://images.amazon.com/images/P/0399135782.0...,32
2,8,0,What If?: The World's Foremost Military Histor...,Robert Cowley,http://images.amazon.com/images/P/0425176428.0...,23
3,8,0,Decision in Normandy,Carlo D'este,http://images.amazon.com/images/P/0060973129.0...,32
4,8,0,PLEADING GUILTY,Scott Turow,http://images.amazon.com/images/P/0671870432.0...,30


In [10]:
# Keep only the users that appear in more than MIN_RATINGS_PER_USER rows of
# df_coll_filter, i.e. drop users with that many ratings or fewer.
# (The previous comment mentioned 250, but the threshold applied has always been 100.)
MIN_RATINGS_PER_USER = 100
ratings_per_user = df_coll_filter['user-id'].value_counts()
df_filter = df_coll_filter[df_coll_filter['user-id'].map(ratings_per_user) > MIN_RATINGS_PER_USER].copy()

# Pivot to a user x book-title matrix: one row per user, one column per title,
# each cell holding the rating the user gave to that title (pivot_table averages
# the ratings when a user has several rows for the same title).
# Titles the user has not rated are filled with 0.
# The fill is chained instead of copying the (very wide) pivot and filling in place.
df_users_pivot = (
    df_filter
    .pivot_table(index=["user-id"], columns=["book-title"], values="book-rating")
    .fillna(0)
)
print(df_users_pivot.shape)
df_users_pivot.head(2)

(1621, 175413)


book-title,"A Light in the Storm: The Civil War Diary of Amelia Martin, Fenwick Island, Delaware, 1861 (Dear America)",Always Have Popsicles,Apple Magic (The Collector's series),Beyond IBM: Leadership Marketing and Finance for the 1990s,Clifford Visita El Hospital (Clifford El Gran Perro Colorado),Dark Justice,Deceived,"Earth Prayers From around the World: 365 Prayers, Poems, and Invocations for Honoring the Earth",Final Fantasy Anthology: Official Strategy Guide (Brady Games),Flight of Fancy: American Heiresses (Zebra Ballad Romance),...,Ã?coute ma diffÃ©rence (Le Temps des femmes),Ã?ngeles fugaces (Falling Angels),Ã?Â?ber das Fernsehen.,Ã?Â?ber die Freiheit.,Ã?Â?ber die Pflicht zum Ungehorsam gegen den Staat.,Ã?Â?berraschung am Valentinstag.,Ã?Â?lpiraten.,Ã?Â?rger mit Produkt X. Roman.,Ã?Â?stlich der Berge.,Ã?Â?thique en toc
user-id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
254,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
507,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
# Pick the user who will receive the recommendations.
# A dedicated, seeded generator makes the choice reproducible across
# "Restart & Run All" without touching the global `random` state; change the seed
# to demo a different user. Sampling from the rating rows (not from the distinct
# ids) means users with more ratings are more likely to be picked.
USER_SAMPLING_SEED = 42
user_id = random.Random(USER_SAMPLING_SEED).choice(df_filter["user-id"].values)

# Look the profile up by column name rather than by column position, so the cell
# keeps working if columns are added to or reordered in df_users.
user_profile = df_users.loc[df_users['user-id'] == user_id].iloc[0]
location, edad, country = user_profile['location'], user_profile['age'], user_profile['country']

# Every title this user has rated; passed below so they can be excluded from the suggestions
# (presumably - the filtering itself happens inside the helper; TODO confirm).
all_books_read_user_id = df_filter.loc[df_filter['user-id'] == user_id, 'book-title'].tolist()

# `user_favs` is a helper that is not defined in this notebook. Call it once and
# derive the other variables from that single result instead of calling it twice
# with identical arguments.
user_favorite = user_favs(id=user_id, df_filter=df_filter)
user_choice_df = pd.DataFrame(user_favorite)
n = len(user_choice_df)

print(f"🟦 USER ID: {user_id}; {edad} años de {location}")
print("Libros favoritos son:")
display(get_info_book(df_books=df_books_complete, listbooks=user_favorite['book-title'].tolist()))

# The two helpers below are also defined outside this notebook; judging by their names
# they find the most similar users and collect books from them - TODO confirm in utils.
user_based_rec = get_users_with_highest_similarity(new_df=df_filter, id=user_id, df_users_pivot=df_users_pivot, df_coll_filter=df_coll_filter)
books_for_user = get_recomendation_collaborative_filtering(new_df=df_filter, most_similar_users=user_based_rec, books_reading_user_id=all_books_read_user_id, n_books=5)
print("Las recomendaciones son:")
display(get_info_book(df_books=df_books_complete, listbooks=books_for_user))

🟦 USER ID: 63714; 29.0 años de milton keynes, england, united kingdom
Libros favoritos son:


Unnamed: 0,book-title,book-author,publisher,year-of-publication
0,Hogfather,Terry Pratchett,HarperTorch,1999
1,"Transformers, Book 1",Scott Ciencin,I Books,2003
2,Dawkins vs. Gould : Survival of the Fittest,Kim Sterelny,Icon Books UK,2001
3,Cretaceous-Tertiary Mass Extinctions: Biotic a...,Norman MacLeod,W. W. Norton &amp; Company,1996
4,Evolution: A Case of Stating the Obvious,Derek Hough,Berkley Publishing Group,1997


Las recomendaciones son:


Unnamed: 0,book-title,book-author,publisher,year-of-publication
0,Life of Pi,Yann Martel,Harcourt,2002
1,The Da Vinci Code,Dan Brown,Doubleday,2003
2,Siddhartha,Hermann Hesse,New Directions Publishing Corporation,1951
3,And Then There Were None,Agatha Christie,Pocket,1984
4,"The Waste Lands (The Dark Tower, Book 3)",Stephen King,Donald m Grant,1991


# Content-Based Recommendations
---

In [12]:
# Join users, ratings and book attributes into one row per (user, rated book),
# ordered by user, keeping only the columns needed for the recommendations below.
# NOTE(review): this repeats the join that builds df_coll_filter earlier in the notebook;
# it could be replaced by a copy of that frame if no helper mutates it - TODO confirm.
df_based_content = (
    pd.merge(df_users, df_ratings, how='inner', on='user-id')
    .merge(df_books, how='inner', on=['isbn'])
    .sort_values('user-id')
    .reset_index(drop=True)
    .drop(['location', 'age', 'country', 'isbn'], axis='columns')
)
print(f"shape: {df_based_content.shape}")
df_based_content.head()

shape: (1017069, 6)


Unnamed: 0,user-id,book-rating,book-title,book-author,image-url-l,antiquity
0,2,0,Classical Mythology,Mark P. O. Morford,http://images.amazon.com/images/P/0195153448.0...,21
1,8,0,The Kitchen God's Wife,Amy Tan,http://images.amazon.com/images/P/0399135782.0...,32
2,8,0,What If?: The World's Foremost Military Histor...,Robert Cowley,http://images.amazon.com/images/P/0425176428.0...,23
3,8,0,Decision in Normandy,Carlo D'este,http://images.amazon.com/images/P/0060973129.0...,32
4,8,0,PLEADING GUILTY,Scott Turow,http://images.amazon.com/images/P/0671870432.0...,30


In [13]:
# Count how many rating rows each title has, so that titles that appear too
# rarely can be left out of the recommendations.
# A title is considered rare when it has RARE_TITLE_MAX_RATINGS rows or fewer.
RARE_TITLE_MAX_RATINGS = 100
title_counts = df_based_content["book-title"].value_counts()

# Name the counts column explicitly. Since pandas 2.0 `value_counts()` returns a
# Series called "count" (with the index named after the original column), so
# wrapping it in a DataFrame and indexing it with "book-title" raises KeyError.
# Building the frame this way gives the same layout on old and new pandas.
df_count_books = title_counts.rename_axis(None).to_frame(name="book-title")

df_rare = df_count_books[df_count_books["book-title"] <= RARE_TITLE_MAX_RATINGS].index
df_common_books = df_based_content[~df_based_content["book-title"].isin(df_rare)]

In [14]:
# Reshape to a user x title matrix: one row per user, one column per remaining
# (non-rare) title, each cell holding that user's rating. Titles a user has not
# rated stay as NaN here (no fill is applied).
df_common_books_pivot = pd.pivot_table(
    df_common_books,
    values="book-rating",
    index=["user-id"],
    columns=["book-title"],
)
df_common_books_pivot.head()

book-title,1984,1st to Die: A Novel,24 Hours,2nd Chance,4 Blondes,A Beautiful Mind: The Life of Mathematical Genius and Nobel Laureate John Nash,A Bend in the Road,A Case of Need,"A Child Called \It\"": One Child's Courage to Survive""",A Civil Action,...,Without Remorse,"Wizard and Glass (The Dark Tower, Book 4)",Women Who Run with the Wolves,"Word Freak: Heartbreak, Triumph, Genius, and Obsession in the World of Competitive Scrabble Players",Wuthering Heights,Year of Wonders,You Belong To Me,Zen and the Art of Motorcycle Maintenance: An Inquiry into Values,Zoya,"\O\"" Is for Outlaw"""
user-id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
8,,,,,,,,,,,...,,,,,,,,,,
9,,,,,,,,,,,...,,,,,,,,,,
14,,,,,,,,,,,...,,,,,,,,,,
16,,,,,,,,,,,...,,,,,,,,,,
17,,,,,,,,,,,...,,,,,,,,,,


In [15]:
# Recommendations for a single reference title.
# `recommendation_based_content` is a helper defined outside this notebook; it receives
# the user x title matrix, the full ratings frame and the title to start from.
bookTitle = 'Me Talk Pretty One Day'
recommendation_based_content(
    bookTitle=bookTitle,
    df_based_content=df_based_content,
    df_common_books_pivot=df_common_books_pivot,
)

Unnamed: 0,book-title,book-rating,book-author,year-of-publication
0,The Shelters of Stone (Earth's Children Series...,3.827273,Jean M. Auel,2002
1,"The Eye of the World (The Wheel of Time, Book 1)",3.733333,Robert Jordan,1990
2,Atlantis Found,3.308411,R. Garcia Y. Robertson,1997
3,The Face,3.104762,Dan Mcneill,1998
4,The Woman Next Door,2.847134,Barbara Delinsky,2001
