# Book Recommendation System
###  [Dataset](https://www.kaggle.com/datasets/arashnic/book-recommendation-dataset?resource=download)

In [1]:
import numpy as np
import pandas as pd

In [2]:
books = pd.read_csv("Books.csv")
ratings = pd.read_csv("Ratings.csv")
users = pd.read_csv("Users.csv")

  books = pd.read_csv("Books.csv")


## Подготовка данных
### Books

In [3]:
books.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 271360 entries, 0 to 271359
Data columns (total 8 columns):
 #   Column               Non-Null Count   Dtype 
---  ------               --------------   ----- 
 0   ISBN                 271360 non-null  object
 1   Book-Title           271360 non-null  object
 2   Book-Author          271359 non-null  object
 3   Year-Of-Publication  271360 non-null  object
 4   Publisher            271358 non-null  object
 5   Image-URL-S          271360 non-null  object
 6   Image-URL-M          271360 non-null  object
 7   Image-URL-L          271357 non-null  object
dtypes: object(8)
memory usage: 16.6+ MB


In [4]:
books = books.drop(['Publisher', 'Image-URL-S', 'Image-URL-M', 'Image-URL-L'], axis=1)
books.rename(columns={'Book-Author': 'Author', 'Year-Of-Publication': 'Year'}, inplace=True)

In [5]:
from sklearn.preprocessing import StandardScaler
from statistics import median

books['Author'] = pd.factorize(books['Author'])[0]

books['Year'] = pd.to_numeric(books['Year'], errors='coerce')
books['Year'] = books['Year'].map(lambda x: books['Year'].mean() if x == 0 else x)
books['Year'] = StandardScaler().fit_transform(np.array(books['Year']).reshape(-1,1)) 

In [6]:
books.head()

Unnamed: 0,ISBN,Book-Title,Author,Year
0,195153448,Classical Mythology,0,0.95069
1,2005018,Clara Callan,1,0.843722
2,60973129,Decision in Normandy,2,-0.225955
3,374157065,Flu: The Story of the Great Influenza Pandemic...,3,0.629787
4,393045218,The Mummies of Urumchi,4,0.629787


### Users

In [7]:
from sklearn.preprocessing import MinMaxScaler

users['Location'] = users['Location'].map(lambda x: x.split(',')[-1].lstrip())
users['Location'] = pd.factorize(users['Location'])[0]

users['Age'] = users['Age'].fillna(users['Age'].mean())
users['Age'] = MinMaxScaler().fit_transform(users[['Age']])

In [8]:
users.head()

Unnamed: 0,User-ID,Location,Age
0,1,0,0.142424
1,2,0,0.07377
2,3,1,0.142424
3,4,2,0.069672
4,5,3,0.142424


### Ratings

In [9]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1149780 entries, 0 to 1149779
Data columns (total 3 columns):
 #   Column       Non-Null Count    Dtype 
---  ------       --------------    ----- 
 0   User-ID      1149780 non-null  int64 
 1   ISBN         1149780 non-null  object
 2   Book-Rating  1149780 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 26.3+ MB


In [10]:
ratings.head()

Unnamed: 0,User-ID,ISBN,Book-Rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6


### Объединение датасетов

In [11]:
df = ratings.merge(users, on='User-ID').merge(books, on='ISBN')

In [12]:
df

Unnamed: 0,User-ID,ISBN,Book-Rating,Location,Age,Book-Title,Author,Year
0,276725,034545104X,0,0,0.142424,Flesh Tones: A Novel,2027,0.950690
1,2313,034545104X,5,0,0.094262,Flesh Tones: A Novel,2027,0.950690
2,6543,034545104X,0,0,0.139344,Flesh Tones: A Novel,2027,0.950690
3,8680,034545104X,5,0,0.008197,Flesh Tones: A Novel,2027,0.950690
4,10314,034545104X,9,0,0.142424,Flesh Tones: A Novel,2027,0.950690
...,...,...,...,...,...,...,...,...
1031131,276688,0517145553,0,0,0.142424,Mostly Harmless,153,0.201916
1031132,276688,1575660792,7,0,0.142424,Gray Matter,6628,0.308884
1031133,276690,0590907301,0,0,0.176230,Triplet Trouble and the Class Trip (Triplet Tr...,2720,0.415852
1031134,276704,0679752714,0,0,0.142424,A Desert of Pure Feeling (Vintage Contemporaries),13043,0.415852


Мощности моего пк недостаточно для вычислений с датасетом такого объема, поэтому обрежем датасет

In [13]:
df_cut = df.iloc[:200000, :]
df_cut

Unnamed: 0,User-ID,ISBN,Book-Rating,Location,Age,Book-Title,Author,Year
0,276725,034545104X,0,0,0.142424,Flesh Tones: A Novel,2027,0.950690
1,2313,034545104X,5,0,0.094262,Flesh Tones: A Novel,2027,0.950690
2,6543,034545104X,0,0,0.139344,Flesh Tones: A Novel,2027,0.950690
3,8680,034545104X,5,0,0.008197,Flesh Tones: A Novel,2027,0.950690
4,10314,034545104X,9,0,0.142424,Flesh Tones: A Novel,2027,0.950690
...,...,...,...,...,...,...,...,...
199995,242118,0440224675,8,0,0.188525,Hannibal,335,0.736755
199996,242247,0440224675,0,0,0.172131,Hannibal,335,0.736755
199997,244708,0440224675,8,3,0.142424,Hannibal,335,0.736755
199998,245604,0440224675,8,0,0.176230,Hannibal,335,0.736755


## Коллаборативная фильтрация

In [14]:
user_item_matrix = df_cut.pivot_table(index='User-ID', columns='ISBN', values='Book-Rating')

In [15]:
user_item_matrix = user_item_matrix.fillna(0)
user_item_matrix

ISBN,0002005018,0002240114,000225669X,0002558122,0002740230,0006379702,0006485294,0006542808,0006543545,0006546684,...,9508521481,9681500555,9681500830,9722100718,9722509713,9724115380,9724119378,9726101794,9871138016,9995585227
User-ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
278846,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
278849,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
278851,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
278852,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### Косинусное сходство между пользователями

In [16]:
from sklearn.metrics.pairwise import cosine_similarity

users_similarity = cosine_similarity(user_item_matrix)
users_similarity_df = pd.DataFrame(users_similarity, index=user_item_matrix.index, columns=user_item_matrix.index)

In [17]:
users_similarity_df

User-ID,2,8,9,10,12,14,16,17,19,20,...,278832,278836,278838,278843,278844,278846,278849,278851,278852,278854
User-ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
278846,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
278849,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
278851,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
278852,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


#### Функция для вывода названий книг по их id

In [18]:
def book_titles_by_id(id_list):
    titles_list = [books.loc[books['ISBN'] == i, 'Book-Title'].values[0] for i in id_list]
    return titles_list

####  Рекомендация

In [19]:
def collaborative_recommendations(user_id, n_recs=10):
    # Оценки текущего пользователя
    user_ratings = user_item_matrix.loc[user_id]
    # Вычисление взвешенной суммы оценок по схожим пользователям
    similar_users = users_similarity_df[user_id]
    weighted_ratings = user_item_matrix.T.dot(similar_users)
    weighted_ratings = weighted_ratings / similar_users.sum()
    # Исключение оцененных книг
    recommendations = weighted_ratings[~user_ratings.index.isin(user_ratings[user_ratings > 0].index)]
    # Выбор n рекомендаций
    top_recs = recommendations.sort_values(ascending=False).head(n_recs)
    top_recs_dict = top_recs.iloc[:].to_dict()
    return top_recs_dict

In [20]:
# Пример для пользователя с id=12
book_titles_by_id(collaborative_recommendations(user_id=12).keys())

['Clara Callan',
 'The Conquest',
 'Blood and Water and Other Tales',
 'GARDEN OF SHADOWS (Dollanger Saga (Paperback))',
 'FAST LANES (Washington Square)',
 'Tank Sergeant',
 'ARIZONA AMES',
 'Hotel New Hampshire',
 "Eddie's Menagerie",
 'Samantha Slade: Monster-Sitter (Samantha Slade, No 1)']

## Контентная фильтрация
#### По названиям книг

In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

# Сокращение датасета книг из-за нехватки памяти для вычислений
books_cut = books.iloc[:30000, :]

# TF-IDF матрица для названий книг
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(books_cut['Book-Title'])

# Вычисление сходства между книгами
title_similarity = linear_kernel(tfidf_matrix, tfidf_matrix)
title_similarity_df = pd.DataFrame(title_similarity, index=books_cut['ISBN'], columns=books_cut['ISBN'])

In [22]:
def get_recs(book_id, similarity_df, n_recs=10):
    sim_scores = similarity_df[book_id]
    top_similar_books = sim_scores.sort_values(ascending=False).head(n_recs + 1)
    top_similar_dict = top_similar_books.iloc[:].to_dict()
    return top_similar_dict

def title_recs(book_id, n_recs=10):
    return get_recs(book_id, title_similarity_df, n_recs)

#### По автору и году издания

In [23]:
import scipy.sparse as sp

# Сокращение датасета книг из-за нехватки памяти для вычислений
books_cut_2 = books.iloc[:5000, :]

# Нормализация "id" авторов
author_scaled = MinMaxScaler().fit_transform(books_cut_2[['Author']])

# Объединение признаков в единый вектор с назначеним весов
book_features = sp.hstack([sp.csr_matrix(author_scaled)*0.4, 
                        sp.csr_matrix(books_cut_2['Year'].to_numpy().reshape(-1, 1))*0.6])
books_similarity = linear_kernel(book_features, book_features)
books_similarity_df = pd.DataFrame(books_similarity, index=books_cut_2['ISBN'], columns=books_cut_2['ISBN'])

In [24]:
def author_and_year_recs(book_id, n_recs=10):
    return get_recs(book_id, books_similarity_df, n_recs)

#### Объединим в единое целое

In [25]:
def content_based_recommendations(book_id, num_recs=10):
    recs_title = title_recs(book_id, num_recs)
    recs_book = author_and_year_recs(book_id, num_recs)  
    # Объединяем словари, складывая веса, если книга встречается в обоих
    for key, value in recs_book.items():
        if key in recs_title:
            recs_title[key] += value
        else:
            recs_title[key] = value
    
    top_recs = sorted(recs_title.items(), key=lambda x: x[1], reverse=True)[:num_recs]
    top_recs_dict = {i[0]: i[1] for i in top_recs}
    
    return top_recs_dict

In [26]:
# Пример для книги с id=0374157065
book_titles_by_id(content_based_recommendations(book_id='0374157065'))

['Flu: The Story of the Great Influenza Pandemic of 1918 and the Search for the Virus That Caused It',
 'Red Dwarf',
 'Funny in Farsi : A Memoir of Growing Up Iranian in America',
 'The Wedding Knight',
 'A Death in Vienna',
 'Good Grief : A Novel',
 'Three Weeks with My Brother',
 'The Art of Mending : A Novel (Berg, Elizabeth)',
 'Brick Lane : A Novel',
 'The Red Hat Society(TM) : Fun and Friendship After Fifty']

## Гибридная фильтрация

In [27]:
def hybrid_recommendations(user_id, book_id, num_recs=10):
    # Рекомендаций на основе коллаборативной фильтрации
    collab_recs = collaborative_recommendations(user_id, num_recs)
    # Рекомендаций на основе контентной фильтрации
    content_recs = content_based_recommendations(book_id, num_recs)
    # Объединяем результаты, складывая веса, если книга встречается в обоих
    hybrid_recs = collab_recs.copy()
    for key, value in content_recs.items():
        if key in hybrid_recs:
            hybrid_recs[key] += value
        else:
            hybrid_recs[key] = value
            
    sorted_recs = sorted(hybrid_recs.items(), key=lambda x: x[1], reverse=True)
    recs = [rec[0] for rec in sorted_recs][:num_recs]
    
    return book_titles_by_id(recs)

In [28]:
# Пример
hybrid_recommendations(user_id=12, book_id='0374157065')

['Flu: The Story of the Great Influenza Pandemic of 1918 and the Search for the Virus That Caused It',
 'Red Dwarf',
 'Funny in Farsi : A Memoir of Growing Up Iranian in America',
 'The Wedding Knight',
 'A Death in Vienna',
 'Good Grief : A Novel',
 'Three Weeks with My Brother',
 'The Art of Mending : A Novel (Berg, Elizabeth)',
 'Brick Lane : A Novel',
 'The Red Hat Society(TM) : Fun and Friendship After Fifty']