In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
import torch
import pickle
import matplotlib.pyplot as plt

from utils import SEED

In [None]:
# Apply the seaborn-v0_8 matplotlib style sheet to every figure drawn below.
plt.style.use('seaborn-v0_8')

In [None]:
# Load the three Book-Crossing tables. They share the same CSV dialect, so the
# reader options are declared once and reused for each file.
csv_options = dict(sep=';', encoding='latin-1', on_bad_lines='skip')

df = pd.read_csv('data/Book-Crossing/BX-Book-Ratings.csv', **csv_options)
users = pd.read_csv('data/Book-Crossing/BX-Users.csv', **csv_options)
books = pd.read_csv('data/Book-Crossing/BX-Books.csv', **csv_options)

In [None]:
# Preview the first five rating rows.
df.head(n=5)

In [None]:
# Preview the first five user rows.
users.head(n=5)

In [None]:
# Preview the first five book rows.
books.head(n=5)

In [None]:
# ISBN -> title and ISBN -> author lookup dictionaries, built from the full
# (unfiltered) books table.
books_by_isbn = books.set_index('ISBN')
bookid_title = books_by_isbn['Book-Title'].to_dict()
bookid_author = books_by_isbn['Book-Author'].to_dict()

In [None]:
# Keep only ratings whose book and user both exist in the metadata tables.
has_known_book = df['ISBN'].isin(books['ISBN'].unique())
has_known_user = df['User-ID'].isin(users['User-ID'].unique())
df = df.loc[has_known_book & has_known_user]

In [None]:
# Keep ratings of 8 or higher, then draw a reproducible sample of 100000 rows.
high_ratings = df.loc[df['Book-Rating'].ge(8)]
df = high_ratings.sample(n=100000, random_state=SEED)

In [None]:
# Restrict the metadata tables to books and users that appear in the sampled
# ratings. `.copy()` makes each result an independent frame: later cells add
# and overwrite columns on `books` and `users`, and doing that on a
# boolean-mask slice triggers pandas' SettingWithCopyWarning.
books = books[books['ISBN'].isin(df['ISBN'])].copy()
users = users[users['User-ID'].isin(df['User-ID'])].copy()

In [None]:
# Number of books (non-null ISBNs) per publisher, most frequent first.
publisher_counts = (
    books
    .groupby('Publisher')
    .agg(publisher_count=('ISBN', 'count'))
    .sort_values(by='publisher_count', ascending=False)
)
publisher_counts.head(n=10)

In [None]:
# Left panel: bar chart of the 20 most frequent publishers.
# Right panel: histogram of books-per-publisher on a log-scaled y axis.
figname = 'publishers_count'

fig, ax = plt.subplots(1, 2, figsize=(9, 4))
bar_ax, hist_ax = ax[0], ax[1]

top_publishers = publisher_counts.head(20)
bar_ax.bar(top_publishers.index, top_publishers['publisher_count'])

# Re-apply the existing tick positions and labels, rotated so the names fit.
ticks = bar_ax.get_xticklabels()
tick_positions = [t.get_position()[0] for t in ticks]
tick_labels = [t.get_text() for t in ticks]
bar_ax.set_xticks(tick_positions, tick_labels, rotation=90)

hist_ax.hist(publisher_counts.values, bins=30)
hist_ax.set_yscale('log')

bar_ax.set_xlabel('Publisher', fontweight='bold')
bar_ax.set_ylabel('Number of Occurrences', fontweight='bold')
bar_ax.set_title('Top-20 Publishers', fontweight='bold')

hist_ax.set_xlabel('Number of Occurrences', fontweight='bold')
hist_ax.set_ylabel('Number of Publishers', fontweight='bold')
hist_ax.set_title('Publishers Histogram', fontweight='bold');

plt.tight_layout()

In [None]:
# Attach each book's publisher frequency as a `publisher_count` column.
books = pd.merge(
    books,
    publisher_counts,
    how='left',
    left_on='Publisher',
    right_index=True,
)
books.head(n=5)

In [None]:
# Fill value for books without a publisher count.
# NOTE: despite the variable name, this is the median, not the mean.
mean_publisher_counts = publisher_counts.loc[:, 'publisher_count'].median()

In [None]:
# Replace missing publisher counts with the fill value computed above.
books = books.assign(
    publisher_count=books['publisher_count'].fillna(mean_publisher_counts)
)

In [None]:
# Number of books (non-null ISBNs) per author, most frequent first.
author_counts = (
    books
    .groupby('Book-Author')
    .agg(author_count=('ISBN', 'count'))
    .sort_values(by='author_count', ascending=False)
)
author_counts.head(n=10)

In [None]:
# Left panel: bar chart of the 20 most frequent authors.
# Right panel: histogram of books-per-author on a log-scaled y axis.
figname = 'author_count'

fig, ax = plt.subplots(1, 2, figsize=(9, 4))
bar_ax, hist_ax = ax[0], ax[1]

top_authors = author_counts.head(20)
bar_ax.bar(top_authors.index, top_authors['author_count'])

# Re-apply the existing tick positions and labels, rotated so the names fit.
ticks = bar_ax.get_xticklabels()
tick_positions = [t.get_position()[0] for t in ticks]
tick_labels = [t.get_text() for t in ticks]
bar_ax.set_xticks(tick_positions, tick_labels, rotation=90)

hist_ax.hist(author_counts.values, bins=30)
hist_ax.set_yscale('log')

bar_ax.set_xlabel('Author', fontweight='bold')
bar_ax.set_ylabel('Number of Occurrences', fontweight='bold')
bar_ax.set_title('Top-20 Authors', fontweight='bold')

hist_ax.set_xlabel('Number of Occurrences', fontweight='bold')
hist_ax.set_ylabel('Number of Authors', fontweight='bold')
hist_ax.set_title('Authors Histogram', fontweight='bold');

plt.tight_layout()

In [None]:
# Attach each book's author frequency as an `author_count` column.
books = pd.merge(
    books,
    author_counts,
    how='left',
    left_on='Book-Author',
    right_index=True,
)
books.head(n=5)

In [None]:
# Drop the text and image-URL columns; the remaining columns are the ISBN key,
# the publication year and the two count features.
unused_book_columns = [
    'Book-Title',
    'Book-Author',
    'Publisher',
    'Image-URL-S',
    'Image-URL-M',
    'Image-URL-L',
]
books = books.drop(columns=unused_book_columns)

In [None]:
# Cast the publication year to integer so it can be range-filtered below.
books = books.astype({'Year-Of-Publication': int})

In [None]:
# Mean publication year over the 1950-2004 range (inclusive), truncated to an
# integer. It is used as the replacement for out-of-range years.
year_in_range = books['Year-Of-Publication'].between(1950, 2004)
mean_year_of_publication = int(
    books.loc[year_in_range, 'Year-Of-Publication'].mean()
)

In [None]:
# Overwrite years before 1950 or after 2004 with the in-range mean year.
year_out_of_range = ~books['Year-Of-Publication'].between(1950, 2004)
books.loc[year_out_of_range, 'Year-Of-Publication'] = mean_year_of_publication

In [None]:
# Histogram of the cleaned publication years.
figname = 'years_of_publication'

fig, ax = plt.subplots(figsize=(4, 4))

year_values = books['Year-Of-Publication']
ax.hist(year_values, bins=10);

ax.set_title('Year Of Publication Histogram', fontweight='bold')
ax.set_xlabel('Year', fontweight='bold')
ax.set_ylabel('Number of Occurrences', fontweight='bold')

plt.tight_layout()

In [None]:
# Min-max scale the publication year to [0, 1]; the scaler needs a 2-D column.
books_year_scaler = MinMaxScaler()
year_column = books['Year-Of-Publication'].values.reshape(-1, 1)
books_year_scaled = books_year_scaler.fit_transform(year_column)
books['Year-Of-Publication'] = books_year_scaled

In [None]:
# Min-max scale publisher_count to [0, 1] with its own scaler.
# Fix: this used to call `books_year_scaler.fit_transform`, which refit the
# year scaler on publisher counts and left `books_publisher_count_scaler`
# unfitted, so neither scaler could be used for inverse transforms afterwards.
books_publisher_count_scaler = MinMaxScaler()
books_publisher_count_scaled = books_publisher_count_scaler.fit_transform(
    books['publisher_count'].values.reshape(-1, 1)
)
books['publisher_count'] = books_publisher_count_scaled

In [None]:
# Distribution of raw author counts, before the log transform.
books['author_count'].hist(bins=10)

In [None]:
# Replace author_count with its natural logarithm.
books = books.assign(author_count=np.log(books['author_count']))

In [None]:
# Distribution of author counts after the log transform.
books['author_count'].hist(bins=10)

In [None]:
# Min-max scale the log author_count to [0, 1] with its own scaler.
# Fix: this used to call `books_year_scaler.fit_transform`, which refit the
# year scaler on author counts and left `books_author_count_scaler` unfitted.
books_author_count_scaler = MinMaxScaler()
books_author_count_scaled = books_author_count_scaler.fit_transform(
    books['author_count'].values.reshape(-1, 1)
)
books['author_count'] = books_author_count_scaled

In [None]:
# Preview the engineered book features.
books.head(n=5)

In [None]:
# Mean of the scaled author_count (pandas skips NaN by default); used as the
# fill value for missing author counts.
mean_author_count = books.loc[:, 'author_count'].mean()
mean_author_count

In [None]:
# Replace missing author counts with the mean computed above.
books = books.assign(
    author_count=books['author_count'].fillna(mean_author_count)
)

In [None]:
# Preview the user table before feature engineering.
users.head(n=5)

In [None]:
# The country is taken to be the last ', '-separated component of Location.
users.loc[:, 'Country'] = users['Location'].map(
    lambda location: location.split(', ')[-1]
)

In [None]:
# Number of users per parsed country, most frequent first.
country_counts = (
    users
    .groupby('Country')
    .agg(country_count=('User-ID', 'count'))
    .sort_values(by='country_count', ascending=False)
)
country_counts.head(n=10)

In [None]:
# Total number of users in every country except the most frequent one.
country_counts.iloc[1:, :].sum(axis=0)

In [None]:
# User count of the single most frequent country, for comparison.
country_counts.iloc[0, :]

In [None]:
# Left panel: bar chart of the 20 most frequent countries.
# Right panel: histogram of users-per-country.
figname = 'country_count'

# Rows whose parsed country is the literal ',' are excluded from the bar
# chart. The filter is computed once instead of twice.
top_countries = country_counts[country_counts.index != ','].head(20)

fig, ax = plt.subplots(1, 2, figsize=(9, 4))
ax[0].bar(top_countries.index, top_countries['country_count'])

# Re-apply the existing tick positions and labels, rotated so the names fit.
ticks = ax[0].get_xticklabels()
ax[0].set_xticks(
    [t.get_position()[0] for t in ticks],
    [t.get_text() for t in ticks],
    rotation=90
)

# Fix: this panel is labelled "Countries Histogram" but previously plotted
# `author_counts` (copy-paste from the authors figure).
ax[1].hist(country_counts.values, bins=30)
# ax[1].set_yscale('log')

ax[0].set_xlabel('Country', fontweight='bold')
ax[0].set_ylabel('Number of Occurrences', fontweight='bold')
ax[0].set_title('Top-20 Countries', fontweight='bold')

ax[1].set_xlabel('Number of Occurrences', fontweight='bold')
ax[1].set_ylabel('Number of Countries', fontweight='bold')
ax[1].set_title('Countries Histogram', fontweight='bold');

plt.tight_layout()

In [None]:
# Attach each user's country frequency as a `country_count` column.
users = pd.merge(
    users,
    country_counts,
    how='left',
    left_on='Country',
    right_index=True,
)
users.head(n=5)

In [None]:
# Fraction of users whose Age is missing.
# Fix: the previous denominator was `users['Age'].count()`, which excludes
# NaN, so the cell showed missing / non-missing rather than missing / total.
users['Age'].isna().mean()

In [None]:
# Treat ages below 10 or above 90 as missing so they are imputed later.
age_is_implausible = (users['Age'] < 10) | (users['Age'] > 90)
users.loc[age_is_implausible, 'Age'] = np.nan

In [None]:
# Mean of the remaining ages, truncated to an integer, used as the fill value.
users_mean_age = int(users.loc[:, 'Age'].mean())
users_mean_age

In [None]:
# Impute missing ages with the mean age.
# Fix: `users.loc[:, 'Age'].fillna(..., inplace=True)` fills the Series that
# `.loc` returns, which may be a temporary copy (always under copy-on-write),
# leaving `users['Age']` with its NaNs. Assigning the result back always works.
users['Age'] = users['Age'].fillna(users_mean_age)

In [None]:
# Histogram of user ages after outlier masking and imputation.
figname = 'users_age'

fig, ax = plt.subplots(figsize=(4, 4))

ax.hist(users['Age'], bins=10);

ax.set_xlabel('Age', fontweight='bold')
ax.set_ylabel('Number of Occurrences', fontweight='bold')
# Fix: the title was misspelled as 'Useres Age Histogram'.
ax.set_title('Users Age Histogram', fontweight='bold')

plt.tight_layout()

In [None]:
# Min-max scale Age to [0, 1]; the scaler needs a 2-D column.
users_age_scaler = MinMaxScaler()
age_column = users['Age'].values.reshape(-1, 1)
users_age_scaled = users_age_scaler.fit_transform(age_column)
users['Age'] = users_age_scaled

In [None]:
# Distribution of per-user country counts, before binarising.
users['country_count'].hist(bins=10)

In [None]:
# Collapse country_count into a binary flag: 1 when the user's country has
# more than 15000 users, otherwise 0 (missing counts also become 0).
is_large_country = users['country_count'] > 15000
users['country_count'] = np.where(is_large_country, 1, 0)

In [None]:
# Drop the raw location text and the parsed country; only numeric features and
# the User-ID key remain.
users = users.drop(columns=['Location', 'Country'])

In [None]:
# Preview the engineered user features.
users.head(n=5)

In [None]:
# Assign contiguous integer ids to users and items in order of first
# appearance in the ratings, and build the reverse lookups.
unique_user_ids = df['User-ID'].unique()
unique_isbns = df['ISBN'].unique()

user_mapping = dict(zip(unique_user_ids, range(len(unique_user_ids))))
item_mapping = dict(zip(unique_isbns, range(len(unique_isbns))))

user_mapping_inv = dict(zip(user_mapping.values(), user_mapping.keys()))
item_mapping_inv = dict(zip(item_mapping.values(), item_mapping.keys()))

In [None]:
# Graph sizes.
num_users = len(user_mapping)
num_items = len(item_mapping)
num_total = num_users + num_items

# One (user, item) edge per rating row: row 0 holds the mapped user ids and
# row 1 the mapped item ids.
mapped_users = [user_mapping[raw_id] for raw_id in df['User-ID']]
mapped_items = [item_mapping[isbn] for isbn in df['ISBN']]
user_ids = torch.LongTensor(mapped_users)
item_ids = torch.LongTensor(mapped_items)
edge_index = torch.stack((user_ids, item_ids))

In [None]:
# Add the contiguous id and sort by it, so the row order of `users` follows
# the user ids used in `edge_index`.
users.loc[:, 'ID'] = users['User-ID'].map(lambda raw_id: user_mapping[raw_id])
users = users.sort_values(by='ID')

In [None]:
# Add the contiguous id and sort by it, so the row order of `books` follows
# the item ids used in `edge_index`.
books.loc[:, 'ID'] = books['ISBN'].map(lambda isbn: item_mapping[isbn])
books = books.sort_values(by='ID')

In [None]:
# Convert the feature columns (everything except the key columns) to tensors
# of torch's default dtype.
user_feature_array = users.drop(columns=['User-ID', 'ID']).values
item_feature_array = books.drop(columns=['ISBN', 'ID']).values

users_features = torch.Tensor(user_feature_array)
items_features = torch.Tensor(item_feature_array)

In [None]:
# Split edge positions 75/25 into train and validation sets, then select the
# matching columns of `edge_index`.
edge_positions = range(len(df))
train_index, val_index = train_test_split(
    edge_positions,
    test_size=0.25,
    random_state=SEED,
)

train_edge_index = edge_index[:, train_index]
val_edge_index = edge_index[:, val_index]

In [None]:
# Bundle the feature tensors and edge splits into the dict that gets pickled.
book_crossing_dataset = dict(
    users_features=users_features,
    items_features=items_features,
    train_edge_index=train_edge_index,
    val_edge_index=val_edge_index,
)

In [None]:
# Share of validation users that never appear in the training edges.
val_users = val_edge_index[0, :].unique()
train_users = train_edge_index[0, :].unique()

cold_users_mask = ~torch.isin(val_users, train_users)
cold_users = val_users[cold_users_mask]
n_cold_users = len(cold_users)
n_users = val_users.shape[0]
n_cold_users / n_users

In [None]:
# Share of validation items that never appear in the training edges.
val_items = val_edge_index[1, :].unique()
train_items = train_edge_index[1, :].unique()

cold_items_mask = ~torch.isin(val_items, train_items)
cold_items = val_items[cold_items_mask]
n_cold_items = len(cold_items)
# Fix: the denominator previously counted unique values of row 0 (users), so
# the ratio divided cold items by the number of validation users.
n_items = val_items.shape[0]
n_cold_items / n_items

In [None]:
# Serialise the dataset dict to disk with pickle.
with open('datasets/book_crossing_dataset.bin', mode='wb') as f:
    pickle.dump(obj=book_crossing_dataset, file=f)