In [1]:
import ast

import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

In [2]:
# Load the raw ratings table and preview its first rows.
ratings = pd.read_csv(filepath_or_buffer='./data/train.csv')
ratings.head(n=5)

Unnamed: 0,book_id,user_id,rating
0,7260,20145,3.5
1,243238,85182,4.0
2,9135,45973,1.0
3,18671,63554,3.0
4,243293,81002,5.0


In [3]:
# Load the augmented book metadata table and preview its first rows.
books = pd.read_csv(filepath_or_buffer='./data/books_augmented.csv')
books.head(n=5)

Unnamed: 0,ISBN,book_id,title,subtitle,authors,publisher,published_date,description,categories,language
0,374157065,3,Flu,The Story of the Great Influenza Pandemic of 1...,['Gina Bari Kolata'],Macmillan,1999,"""Scientists have recently discovered shards of...",['Medical'],en
1,440234743,18,The Testament,,['John Grisham'],Island,1999,Heart of darkness... In a plush Virginia offic...,['Adventure stories'],en
2,452264464,19,Beloved,A Novel,['Toni Morrison'],Plume Books,1988,WINNER OF THE NOBEL PRIZE IN LITERATURE.,['Fiction'],en
3,609804618,20,Our Dumb Century,,['Scott Dikkers'],Crown,1999,The Onion has quickly become the world's most ...,['Humor'],en
4,1841721522,21,New Vegetarian,Bold and Beautiful Recipes for Every Occasion,['Celia Brooks Brown'],,2001,In New Vegetarian Celia Brooks Brown presents ...,['International cooking'],en


In [4]:
# User Features
# One row per user_id with summary statistics of the ratings that user gave.
# std_rating uses pandas' default sample standard deviation, so it is NaN
# for a user with a single rating.
user_features = (
    ratings
    .groupby('user_id')['rating']
    .agg(
        num_ratings='count',
        avg_rating='mean',
        std_rating='std',
        max_rating='max',
        min_rating='min',
    )
    .reset_index()
)
user_features.head()

Unnamed: 0,user_id,num_ratings,avg_rating,std_rating,max_rating,min_rating
0,37,11,1.409091,0.700649,2.5,1.0
1,70,5,1.8,0.447214,2.5,1.5
2,76,7,1.428571,0.449868,2.0,1.0
3,96,7,2.5,1.154701,5.0,1.5
4,122,11,2.681818,0.40452,3.5,2.0


In [5]:
# Book Features
# One row per book_id with summary statistics of the ratings that book received.
# std_rating uses pandas' default sample standard deviation, so it is NaN
# for a book with a single rating.
books_features = (
    ratings
    .groupby('book_id')['rating']
    .agg(
        num_ratings='count',
        avg_rating='mean',
        std_rating='std',
        max_rating='max',
        min_rating='min',
    )
    .reset_index()
)

books_features.head()

Unnamed: 0,book_id,num_ratings,avg_rating,std_rating,max_rating,min_rating
0,1,3,2.333333,0.763763,3.0,1.5
1,3,1,1.5,,1.5,1.5
2,5,6,1.833333,0.983192,3.5,1.0
3,18,72,2.256944,0.884105,4.5,1.0
4,19,35,2.028571,0.984758,5.0,1.0


In [6]:
# Merging with books dataset
# Left join: every rated book is kept even when it has no metadata row in `books`
# (those rows get NaN for published_date / categories / publisher).
books_features = books_features.merge(
    books[['book_id', 'published_date', 'categories', 'publisher']],
    on='book_id',
    how='left',
)

# Extracting publish year: the first run of four digits found in published_date
books_features['publish_year'] = books_features['published_date'].str.extract(r'(\d{4})')

# Keep the first category
# `categories` holds the string form of a Python list (e.g. "['Medical']").
# ast.literal_eval parses only literals, so — unlike eval() — text coming from
# the CSV can never be executed as code.
books_features['categories'] = books_features['categories'].apply(
    lambda x: ast.literal_eval(x) if pd.notnull(x) else []
)
# An empty list (missing or empty categories) becomes None, then "Unknown".
books_features['categories'] = books_features['categories'].apply(lambda x: x[0] if x else None)
books_features['categories'] = books_features['categories'].fillna("Unknown")


books_features.head()

Unnamed: 0,book_id,num_ratings,avg_rating,std_rating,max_rating,min_rating,published_date,categories,publisher,publish_year
0,1,3,2.333333,0.763763,3.0,1.5,,Unknown,,
1,3,1,1.5,,1.5,1.5,1999.0,Medical,Macmillan,1999.0
2,5,6,1.833333,0.983192,3.5,1.0,,Unknown,,
3,18,72,2.256944,0.884105,4.5,1.0,1999.0,Adventure stories,Island,1999.0
4,19,35,2.028571,0.984758,5.0,1.0,1988.0,Fiction,Plume Books,1988.0


In [7]:
# Joining ratings and books for category-related features
# Left join: ratings for books without a metadata row are kept with NaN categories.
ratings_books = ratings.merge(books[['book_id', 'categories']], on='book_id', how='left')

# Pick the first category from the list
# `categories` holds the string form of a Python list (e.g. "['Fiction']").
# ast.literal_eval parses only literals, so — unlike eval() — text coming from
# the CSV can never be executed as code.
ratings_books['categories'] = ratings_books['categories'].apply(
    lambda x: ast.literal_eval(x) if pd.notnull(x) else []
)
ratings_books['categories'] = ratings_books['categories'].apply(lambda x: x[0] if x else None)

# Ratings whose book has no usable category are grouped under "Unknown".
ratings_books['categories'] = ratings_books['categories'].fillna("Unknown")

ratings_books.head()

Unnamed: 0,book_id,user_id,rating,categories
0,7260,20145,3.5,California
1,243238,85182,4.0,Unknown
2,9135,45973,1.0,Biography & Autobiography
3,18671,63554,3.0,Juvenile Fiction
4,243293,81002,5.0,Unknown


In [8]:
# Average rating of each category
# Mean of all ratings whose book falls in the category, attached to each book row.
category_avg_rating = (
    ratings_books
    .groupby('categories')['rating']
    .mean()
    .rename('avg_category_rating')
    .reset_index()
)
books_features = books_features.merge(category_avg_rating, how='left', on='categories')

# Average rating of each publisher
# The publisher is looked up per rating via book_id, then ratings are averaged per publisher.
publisher_avg_rating = (
    ratings_books
    .merge(books[['book_id', 'publisher']], on='book_id')
    .groupby('publisher')['rating']
    .mean()
    .rename('avg_publisher_rating')
    .reset_index()
)
books_features = books_features.merge(publisher_avg_rating, how='left', on='publisher')

books_features.head()

Unnamed: 0,book_id,num_ratings,avg_rating,std_rating,max_rating,min_rating,published_date,categories,publisher,publish_year,avg_category_rating,avg_publisher_rating
0,1,3,2.333333,0.763763,3.0,1.5,,Unknown,,,2.869511,
1,3,1,1.5,,1.5,1.5,1999.0,Medical,Macmillan,1999.0,1.88961,2.192081
2,5,6,1.833333,0.983192,3.5,1.0,,Unknown,,,2.869511,
3,18,72,2.256944,0.884105,4.5,1.0,1999.0,Adventure stories,Island,1999.0,2.159468,2.256944
4,19,35,2.028571,0.984758,5.0,1.0,1988.0,Fiction,Plume Books,1988.0,2.15541,2.102612


In [9]:
# drop duplicates
# Keep the first row for each key so both feature tables have one row per id.
books_features = books_features.drop_duplicates(subset=['book_id'], keep='first')
user_features = user_features.drop_duplicates(subset=['user_id'], keep='first')

# save features
# index=False: the positional index is not written as a column.
books_features.to_csv(path_or_buf='./data/books_features.csv', index=False)
user_features.to_csv(path_or_buf='./data/user_features.csv', index=False)