In [1]:
import numpy as np
import pandas as pd 


import os

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

from tqdm import tqdm
from scipy.sparse import coo_matrix
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import linear_model
from sklearn.preprocessing import Normalizer

**Загрузим данные**

In [2]:
books = pd.read_csv('input/books.csv', encoding = "ISO-8859-1")
books.head()

Unnamed: 0,id,book_id,best_book_id,work_id,books_count,isbn,isbn13,authors,original_publication_year,original_title,...,ratings_count,work_ratings_count,work_text_reviews_count,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5,image_url,small_image_url
0,1,2767052,2767052,2792775,272,439023483,9780439000000.0,Suzanne Collins,2008.0,The Hunger Games,...,4780653,4942365,155254,66715,127936,560092,1481305,2706317,https://images.gr-assets.com/books/1447303603m...,https://images.gr-assets.com/books/1447303603s...
1,2,3,3,4640799,491,439554934,9780440000000.0,"J.K. Rowling, Mary GrandPrÃ©",1997.0,Harry Potter and the Philosopher's Stone,...,4602479,4800065,75867,75504,101676,455024,1156318,3011543,https://images.gr-assets.com/books/1474154022m...,https://images.gr-assets.com/books/1474154022s...
2,3,41865,41865,3212258,226,316015849,9780316000000.0,Stephenie Meyer,2005.0,Twilight,...,3866839,3916824,95009,456191,436802,793319,875073,1355439,https://images.gr-assets.com/books/1361039443m...,https://images.gr-assets.com/books/1361039443s...
3,4,2657,2657,3275794,487,61120081,9780061000000.0,Harper Lee,1960.0,To Kill a Mockingbird,...,3198671,3340896,72586,60427,117415,446835,1001952,1714267,https://images.gr-assets.com/books/1361975680m...,https://images.gr-assets.com/books/1361975680s...
4,5,4671,4671,245494,1356,743273567,9780743000000.0,F. Scott Fitzgerald,1925.0,The Great Gatsby,...,2683664,2773745,51992,86236,197621,606158,936012,947718,https://images.gr-assets.com/books/1490528560m...,https://images.gr-assets.com/books/1490528560s...


In [3]:
books.shape

(10000, 23)

In [4]:
books.columns

Index(['id', 'book_id', 'best_book_id', 'work_id', 'books_count', 'isbn',
       'isbn13', 'authors', 'original_publication_year', 'original_title',
       'title', 'language_code', 'average_rating', 'ratings_count',
       'work_ratings_count', 'work_text_reviews_count', 'ratings_1',
       'ratings_2', 'ratings_3', 'ratings_4', 'ratings_5', 'image_url',
       'small_image_url'],
      dtype='object')

In [5]:
ratings = pd.read_csv('input/ratings.csv', encoding = "ISO-8859-1")
ratings.head()

Unnamed: 0,book_id,user_id,rating
0,1,314,5
1,1,439,3
2,1,588,5
3,1,1169,4
4,1,1185,4


In [6]:
book_tags = pd.read_csv('input/book_tags.csv', encoding = "ISO-8859-1")
book_tags.head()

Unnamed: 0,goodreads_book_id,tag_id,count
0,1,30574,167697
1,1,11305,37174
2,1,11557,34173
3,1,8717,12986
4,1,33114,12716


In [7]:
tags = pd.read_csv('input/tags.csv')
tags.tail()

Unnamed: 0,tag_id,tag_name
34247,34247,Ｃhildrens
34248,34248,Ｆａｖｏｒｉｔｅｓ
34249,34249,Ｍａｎｇａ
34250,34250,ＳＥＲＩＥＳ
34251,34251,ｆａｖｏｕｒｉｔｅｓ


In [8]:
tags_join_DF = pd.merge(book_tags, tags, left_on='tag_id', right_on='tag_id', how='inner')
tags_join_DF.head()

Unnamed: 0,goodreads_book_id,tag_id,count,tag_name
0,1,30574,167697,to-read
1,2,30574,24549,to-read
2,3,30574,496107,to-read
3,5,30574,11909,to-read
4,6,30574,298,to-read


In [9]:
tags_join_DF.sort_values('goodreads_book_id')

Unnamed: 0,goodreads_book_id,tag_id,count,tag_name
0,1,30574,167697,to-read
258362,1,33165,338,youth
250046,1,17213,347,kindle
246919,1,27535,348,shelfari-favorites
245888,1,16799,351,juvenile
...,...,...,...,...
918363,33288638,2541,9,angsty
956326,33288638,1126,7,5-star-reads
358501,33288638,18680,11,loved
954279,33288638,29125,10,sweet-romance


In [10]:
to_read = pd.read_csv('input/to_read.csv')
to_read.head()

Unnamed: 0,user_id,book_id
0,1,112
1,1,235
2,1,533
3,1,1198
4,1,1874


Функция **TfidfVectorizer** из scikit-learn, которая преобразует **текст в векторы признаков**, которые можно использовать в качестве входных данных для модели.

  **Косинусное сходство** для вычисления числового значения, обозначающего сходство между двумя книгами.

In [11]:
books['authors']

0                    Suzanne Collins
1       J.K. Rowling, Mary GrandPrÃ©
2                    Stephenie Meyer
3                         Harper Lee
4                F. Scott Fitzgerald
                    ...             
9995                   Ilona Andrews
9996                  Robert A. Caro
9997                 Patrick O'Brian
9998                 Peggy Orenstein
9999                     John Keegan
Name: authors, Length: 10000, dtype: object

In [12]:
tf = TfidfVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, stop_words='english')
tfidf_matrix = tf.fit_transform(books['authors'])#(books['title'])
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [13]:
cosine_sim

array([[1., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 1.]])

Функция, которая возвращает 20 наиболее похожих книг на основе оценки косинусного сходства.

In [14]:
titles = books['title']
indices = pd.Series(books.index, index=books['title'])

# Функция, возвращающая рекомендацию книг, основанную на cos sim авторов книг
def authors_recommendations(title):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:21]
    book_indices = [i[0] for i in sim_scores]
    return titles.iloc[book_indices]

In [15]:
authors_recommendations('The Hobbit').head(20)

18      The Fellowship of the Ring (The Lord of the Ri...
154            The Two Towers (The Lord of the Rings, #2)
160     The Return of the King (The Lord of the Rings,...
188     The Lord of the Rings (The Lord of the Rings, ...
963     J.R.R. Tolkien 4-Book Boxed Set: The Hobbit an...
4975        Unfinished Tales of NÃºmenor and Middle-Earth
2308                               The Children of HÃºrin
610              The Silmarillion (Middle-Earth Universe)
8271                   The Complete Guide to Middle-Earth
1128     The History of the Hobbit, Part One: Mr. Baggins
465                             The Hobbit: Graphic Novel
0                 The Hunger Games (The Hunger Games, #1)
1       Harry Potter and the Sorcerer's Stone (Harry P...
2                                 Twilight (Twilight, #1)
3                                   To Kill a Mockingbird
4                                        The Great Gatsby
5                                  The Fault in Our Stars
7             

Будем рекомендовать книги, используя теги, предоставленные для книг.

In [16]:
books_with_tags = pd.merge(books, tags_join_DF, left_on='book_id', right_on='goodreads_book_id', how='inner')

In [17]:
books_with_tags

Unnamed: 0,id,book_id,best_book_id,work_id,books_count,isbn,isbn13,authors,original_publication_year,original_title,...,ratings_2,ratings_3,ratings_4,ratings_5,image_url,small_image_url,goodreads_book_id,tag_id,count,tag_name
0,1,2767052,2767052,2792775,272,439023483,9.780439e+12,Suzanne Collins,2008.0,The Hunger Games,...,127936,560092,1481305,2706317,https://images.gr-assets.com/books/1447303603m...,https://images.gr-assets.com/books/1447303603s...,2767052,30574,11314,to-read
1,1,2767052,2767052,2792775,272,439023483,9.780439e+12,Suzanne Collins,2008.0,The Hunger Games,...,127936,560092,1481305,2706317,https://images.gr-assets.com/books/1447303603m...,https://images.gr-assets.com/books/1447303603s...,2767052,11305,10836,fantasy
2,1,2767052,2767052,2792775,272,439023483,9.780439e+12,Suzanne Collins,2008.0,The Hunger Games,...,127936,560092,1481305,2706317,https://images.gr-assets.com/books/1447303603m...,https://images.gr-assets.com/books/1447303603s...,2767052,11557,50755,favorites
3,1,2767052,2767052,2792775,272,439023483,9.780439e+12,Suzanne Collins,2008.0,The Hunger Games,...,127936,560092,1481305,2706317,https://images.gr-assets.com/books/1447303603m...,https://images.gr-assets.com/books/1447303603s...,2767052,8717,35418,currently-reading
4,1,2767052,2767052,2792775,272,439023483,9.780439e+12,Suzanne Collins,2008.0,The Hunger Games,...,127936,560092,1481305,2706317,https://images.gr-assets.com/books/1447303603m...,https://images.gr-assets.com/books/1447303603s...,2767052,33114,25968,young-adult
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999907,10000,8914,8914,11817,31,375700455,9.780376e+12,John Keegan,1998.0,The First World War,...,345,2031,4138,3069,https://images.gr-assets.com/books/1403194704m...,https://images.gr-assets.com/books/1403194704s...,8914,16529,4,john-keegan
999908,10000,8914,8914,11817,31,375700455,9.780376e+12,John Keegan,1998.0,The First World War,...,345,2031,4138,3069,https://images.gr-assets.com/books/1403194704m...,https://images.gr-assets.com/books/1403194704s...,8914,32805,3,world-war
999909,10000,8914,8914,11817,31,375700455,9.780376e+12,John Keegan,1998.0,The First World War,...,345,2031,4138,3069,https://images.gr-assets.com/books/1403194704m...,https://images.gr-assets.com/books/1403194704s...,8914,32156,3,war-ww1
999910,10000,8914,8914,11817,31,375700455,9.780376e+12,John Keegan,1998.0,The First World War,...,345,2031,4138,3069,https://images.gr-assets.com/books/1403194704m...,https://images.gr-assets.com/books/1403194704s...,8914,20285,3,modern-european-history


In [18]:
tf1 = TfidfVectorizer(max_df=0.95, analyzer='word',ngram_range=(1, 2),min_df=0, stop_words='english')
tfidf_matrix1 = tf1.fit_transform(books_with_tags['tag_name'].head(10000))
cosine_sim1 = linear_kernel(tfidf_matrix1, tfidf_matrix1)

In [19]:
cosine_sim1

array([[1., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 1.]])

In [20]:

titles1 = books['title']
indices1 = pd.Series(books.index, index=books['title'])

def tags_recommendations(title):
    idx = indices1[title]
    sim_scores = list(enumerate(cosine_sim1[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:21]
    book_indices = [i[0] for i in sim_scores]
    return titles.iloc[book_indices]

In [21]:
tags_recommendations('The Hobbit').head(20)

16                  Catching Fire (The Hunger Games, #2)
31                                       Of Mice and Men
107         Confessions of a Shopaholic (Shopaholic, #1)
125                            Dune (Dune Chronicles #1)
149                                         The Red Tent
206               One for the Money (Stephanie Plum, #1)
214                                     Ready Player One
231                  The Gunslinger (The Dark Tower, #1)
253               Shiver (The Wolves of Mercy Falls, #1)
313                              Inkheart (Inkworld, #1)
325                                       White Oleander
405       The New Drawing on the Right Side of the Brain
412                                 The Three Musketeers
425                              A Confederacy of Dunces
505                          The One (The Selection, #3)
513                    The Adventures of Sherlock Holmes
525                  Darkly Dreaming Dexter (Dexter, #1)
566                            

Рекомендация книг с использованием авторов и тегов одновременно.

In [22]:
temp_df = books_with_tags.groupby('book_id')['tag_name'].apply(' '.join).reset_index()
temp_df.head()

Unnamed: 0,book_id,tag_name
0,1,to-read fantasy favorites currently-reading yo...
1,2,to-read fantasy favorites currently-reading yo...
2,3,to-read fantasy favorites currently-reading yo...
3,5,to-read fantasy favorites currently-reading yo...
4,6,to-read fantasy young-adult fiction harry-pott...


In [23]:
books = pd.merge(books, temp_df, left_on='book_id', right_on='book_id', how='inner')

In [24]:
books.head()

Unnamed: 0,id,book_id,best_book_id,work_id,books_count,isbn,isbn13,authors,original_publication_year,original_title,...,work_ratings_count,work_text_reviews_count,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5,image_url,small_image_url,tag_name
0,1,2767052,2767052,2792775,272,439023483,9780439000000.0,Suzanne Collins,2008.0,The Hunger Games,...,4942365,155254,66715,127936,560092,1481305,2706317,https://images.gr-assets.com/books/1447303603m...,https://images.gr-assets.com/books/1447303603s...,to-read fantasy favorites currently-reading yo...
1,2,3,3,4640799,491,439554934,9780440000000.0,"J.K. Rowling, Mary GrandPrÃ©",1997.0,Harry Potter and the Philosopher's Stone,...,4800065,75867,75504,101676,455024,1156318,3011543,https://images.gr-assets.com/books/1474154022m...,https://images.gr-assets.com/books/1474154022s...,to-read fantasy favorites currently-reading yo...
2,3,41865,41865,3212258,226,316015849,9780316000000.0,Stephenie Meyer,2005.0,Twilight,...,3916824,95009,456191,436802,793319,875073,1355439,https://images.gr-assets.com/books/1361039443m...,https://images.gr-assets.com/books/1361039443s...,to-read fantasy favorites currently-reading yo...
3,4,2657,2657,3275794,487,61120081,9780061000000.0,Harper Lee,1960.0,To Kill a Mockingbird,...,3340896,72586,60427,117415,446835,1001952,1714267,https://images.gr-assets.com/books/1361975680m...,https://images.gr-assets.com/books/1361975680s...,to-read favorites currently-reading young-adul...
4,5,4671,4671,245494,1356,743273567,9780743000000.0,F. Scott Fitzgerald,1925.0,The Great Gatsby,...,2773745,51992,86236,197621,606158,936012,947718,https://images.gr-assets.com/books/1490528560m...,https://images.gr-assets.com/books/1490528560s...,to-read favorites currently-reading young-adul...


In [25]:
books['corpus'] = (pd.Series(books[['authors', 'tag_name']]
                .fillna('')
                .values.tolist()
                ).str.join(' '))

books['corpus'].head()

0    Suzanne Collins to-read fantasy favorites curr...
1    J.K. Rowling, Mary GrandPrÃ© to-read fantasy f...
2    Stephenie Meyer to-read fantasy favorites curr...
3    Harper Lee to-read favorites currently-reading...
4    F. Scott Fitzgerald to-read favorites currentl...
Name: corpus, dtype: object

In [27]:
tf_corpus = TfidfVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, stop_words='english')
tfidf_matrix_corpus = tf_corpus.fit_transform(books['corpus'])
cosine_sim_corpus = linear_kernel(tfidf_matrix_corpus, tfidf_matrix_corpus)

titles = books['title']
indices = pd.Series(books.index, index=books['title'])

def corpus_recommendations(title):
    idx = indices1[title]
    sim_scores = list(enumerate(cosine_sim_corpus[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:21]
    book_indices = [i[0] for i in sim_scores]
    return titles.iloc[book_indices]

corpus_recommendations("The Hobbit")


KeyboardInterrupt



In [None]:
corpus_recommendations("Twilight (Twilight, #1)")

In [None]:
corpus_recommendations("Romeo and Juliet")

In [None]:
ratings

Unnamed: 0,book_id,user_id,rating
0,1,314,5
1,1,439,3
2,1,588,5
3,1,1169,4
4,1,1185,4
...,...,...,...
981751,10000,48386,5
981752,10000,49007,4
981753,10000,49383,5
981754,10000,50124,5


In [None]:
books

Unnamed: 0,id,book_id,best_book_id,work_id,books_count,isbn,isbn13,authors,original_publication_year,original_title,...,work_text_reviews_count,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5,image_url,small_image_url,tag_name,corpus
0,1,2767052,2767052,2792775,272,439023483,9.780439e+12,Suzanne Collins,2008.0,The Hunger Games,...,155254,66715,127936,560092,1481305,2706317,https://images.gr-assets.com/books/1447303603m...,https://images.gr-assets.com/books/1447303603s...,to-read fantasy favorites currently-reading yo...,Suzanne Collins to-read fantasy favorites curr...
1,2,3,3,4640799,491,439554934,9.780440e+12,"J.K. Rowling, Mary GrandPrÃ©",1997.0,Harry Potter and the Philosopher's Stone,...,75867,75504,101676,455024,1156318,3011543,https://images.gr-assets.com/books/1474154022m...,https://images.gr-assets.com/books/1474154022s...,to-read fantasy favorites currently-reading yo...,"J.K. Rowling, Mary GrandPrÃ© to-read fantasy f..."
2,3,41865,41865,3212258,226,316015849,9.780316e+12,Stephenie Meyer,2005.0,Twilight,...,95009,456191,436802,793319,875073,1355439,https://images.gr-assets.com/books/1361039443m...,https://images.gr-assets.com/books/1361039443s...,to-read fantasy favorites currently-reading yo...,Stephenie Meyer to-read fantasy favorites curr...
3,4,2657,2657,3275794,487,61120081,9.780061e+12,Harper Lee,1960.0,To Kill a Mockingbird,...,72586,60427,117415,446835,1001952,1714267,https://images.gr-assets.com/books/1361975680m...,https://images.gr-assets.com/books/1361975680s...,to-read favorites currently-reading young-adul...,Harper Lee to-read favorites currently-reading...
4,5,4671,4671,245494,1356,743273567,9.780743e+12,F. Scott Fitzgerald,1925.0,The Great Gatsby,...,51992,86236,197621,606158,936012,947718,https://images.gr-assets.com/books/1490528560m...,https://images.gr-assets.com/books/1490528560s...,to-read favorites currently-reading young-adul...,F. Scott Fitzgerald to-read favorites currentl...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,9996,7130616,7130616,7392860,19,441019455,9.780441e+12,Ilona Andrews,2010.0,Bayou Moon,...,1180,105,575,3538,7860,6778,https://images.gr-assets.com/books/1307445460m...,https://images.gr-assets.com/books/1307445460s...,to-read fantasy favorites currently-reading fi...,Ilona Andrews to-read fantasy favorites curren...
9996,9997,208324,208324,1084709,19,067973371X,9.780680e+12,Robert A. Caro,1990.0,Means of Ascent,...,395,303,551,1737,3389,6972,https://s.gr-assets.com/assets/nophoto/book/11...,https://s.gr-assets.com/assets/nophoto/book/50...,to-read favorites currently-reading books-i-ow...,Robert A. Caro to-read favorites currently-rea...
9997,9998,77431,77431,2393986,60,039330762X,9.780393e+12,Patrick O'Brian,1977.0,The Mauritius Command,...,374,11,111,1191,4240,5180,https://images.gr-assets.com/books/1455373531m...,https://images.gr-assets.com/books/1455373531s...,to-read favorites currently-reading fiction bo...,Patrick O'Brian to-read favorites currently-re...
9998,9999,8565083,8565083,13433613,7,61711527,9.780062e+12,Peggy Orenstein,2011.0,Cinderella Ate My Daughter: Dispatches from th...,...,1988,275,1002,3765,4577,2375,https://images.gr-assets.com/books/1279214118m...,https://images.gr-assets.com/books/1279214118s...,to-read favorites currently-reading books-i-ow...,Peggy Orenstein to-read favorites currently-re...


Напоминание про ДЗ 1


### ДЗ 2. Content-based рекомендация.  Гибкий дедлайн 11 октября. Жесткий дедлайн 18 октября (оценка - 1 балл)



1. Приведите данные датасета ratings к виду датафрейма со строками-пользователями, столбцами-книгами и рейтингами на пересечении

user_vectors = #TO DO

(проверка: размерность датасета должна быть (53424, 10000) )

In [76]:
ratings = pd.read_csv('input/ratings.csv', encoding = "ISO-8859-1")
ratings.head()

Unnamed: 0,book_id,user_id,rating
0,1,314,5
1,1,439,3
2,1,588,5
3,1,1169,4
4,1,1185,4


In [77]:
bs = max(ratings.book_id)
us = max(ratings.user_id)
dct = dict(zip([i for i in tqdm(range(1, bs + 1))], [np.zeros(us, dtype='int8') for _ in range(bs)]))
for ind, line in tqdm(ratings.iterrows()):
    dct[line['book_id']][line['user_id'] - 1] = line['rating']

100%|████████████████████████████████████████████████████████████████████████████████████| 10000/10000 [00:00<?, ?it/s]
981756it [01:02, 15658.53it/s]


In [78]:
df = pd.DataFrame(dct, index=range(1, us + 1))

In [79]:
df.shape

(53424, 10000)

2. Создайте векторы, характеризующие пользоваталей (для content-based лучше это делать по дополнительным данным о пользователях, но так как тут у нас их нет - сделаем вектора-признаки (размерностью 100) на основе оценок) 

Делать будем с помощью метода понижения размерности PCA до 100.

In [80]:
pca = PCA(n_components=100)
user_embedd = pca.fit_transform(df)

In [81]:
user_embedd.shape

(53424, 100)

In [82]:
user_embed = pd.DataFrame({'user_id': df.index, 'user_vec': user_embedd.tolist()})
user_embed.head()

Unnamed: 0,user_id,user_vec
0,1,"[-0.3057577790596991, -0.33992516896680086, -0..."
1,2,"[-0.3130608866418843, -0.3409979701362326, -0...."
2,3,"[-0.30871348543166083, -0.3337901464927845, -0..."
3,4,"[-0.2814574950511793, -0.26087864199609656, -0..."
4,5,"[-0.3150963619909823, -0.3159408937866643, -0...."


3. Объедините три датасета:  
    * ratings
    * pca_user_vectors(векторы-признаки для каждого пользователя)
    * tf-idf на основе books['corpus'] для каждой книги
    
В итоге у вас должен получиться датафрейм с вектором пользователя, вектором книги и таргет-рейтинг.

In [83]:
vectorizer = TfidfVectorizer(max_df=0.95, min_df=5, stop_words='english', max_features=300)

In [84]:
new_books = pd.DataFrame({'book_vec':vectorizer.fit_transform(books['corpus']).todense().tolist(), 'book_id':books['book_id']})

In [85]:
big_ds = user_embed.merge(ratings, how='inner', on=['user_id'])
big_ds = big_ds.merge(new_books, how='inner', on=['book_id'])
big_ds.head()

Unnamed: 0,user_id,user_vec,book_id,rating,book_vec
0,2,"[-0.3130608866418843, -0.3409979701362326, -0....",9762,4,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,977,"[-0.3149784823122985, -0.3657540689036181, -0....",9762,5,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,1867,"[-0.31575035595912593, -0.3512254925718979, -0...",9762,2,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,2602,"[0.5142370501891345, 3.256770881079572, -0.787...",9762,3,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,2674,"[-0.31244143431435295, -0.341435855758464, -0....",9762,3,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


4. Разбейте на train/valid set

In [86]:
u_ids = np.unique(big_ds['user_id'])
train_index = np.random.choice(u_ids, replace=False, size=int(len(u_ids) * 0.75))
test_index = list(set(u_ids) - set(train_index))

In [87]:
assert(len(train_index) + len(test_index) == len(u_ids))

In [88]:
train = big_ds[big_ds['user_id'].isin(train_index)]
test = big_ds[big_ds['user_id'].isin(test_index)]

train['emb'] = train['user_vec'] + train['book_vec']
test['emb'] = test['user_vec'] + test['book_vec']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['emb'] = train['user_vec'] + train['book_vec']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['emb'] = test['user_vec'] + test['book_vec']


In [89]:
np.array(list(map(lambda x: np.array(x, dtype=np.float32), train['emb'].tolist())))

array([[-0.31306088, -0.34099796, -0.14852278, ...,  0.        ,
         0.        ,  0.        ],
       [-0.31497848, -0.36575407, -0.18939672, ...,  0.        ,
         0.        ,  0.        ],
       [-0.31575036, -0.3512255 , -0.15600841, ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [-0.36413676, -0.25569016,  1.7298709 , ...,  0.        ,
         0.        ,  0.        ],
       [-0.2725889 ,  1.2990155 ,  5.6456695 , ...,  0.        ,
         0.        ,  0.        ],
       [-0.21346605,  0.80348694,  3.0620763 , ...,  0.        ,
         0.        ,  0.        ]], dtype=float32)

5. Обучите любую модель машинного обучения (для задачи регрессии (линейную или ансамбль деревьев)), сделайте прогноз и посчитайте метрики RMSE, MSE

In [90]:
clf1 = linear_model.Ridge()

In [91]:
clf1.fit(train['emb'].tolist(), train.rating.tolist())

In [92]:
y_pred = clf1.predict(test['emb'].tolist())

In [93]:
y_pred

array([3.93490489, 3.66362206, 3.69217879, ..., 3.97387575, 4.35682138,
       3.83512   ])

In [94]:
print(f"MSE: {np.square(y_pred - test.rating).sum() / test.rating.shape[0] : .4f}")

MSE:  0.9330


6. Добавьте другие признаки по книгам из books

In [95]:
# big_ds.drop('original_publication_year', axis=1, inplace=True)
# big_ds.drop('emb', axis=1, inplace=True)

In [96]:
big_ds.head()

Unnamed: 0,user_id,user_vec,book_id,rating,book_vec
0,2,"[-0.3130608866418843, -0.3409979701362326, -0....",9762,4,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,977,"[-0.3149784823122985, -0.3657540689036181, -0....",9762,5,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,1867,"[-0.31575035595912593, -0.3512254925718979, -0...",9762,2,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,2602,"[0.5142370501891345, 3.256770881079572, -0.787...",9762,3,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,2674,"[-0.31244143431435295, -0.341435855758464, -0....",9762,3,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [97]:
big_ds = big_ds.merge(books[['book_id', 'original_publication_year']], how='inner', on='book_id')
big_ds['original_publication_year'] = big_ds['original_publication_year'].fillna(0)

In [98]:
big_ds['emb'] = big_ds['user_vec'] + big_ds['book_vec']

In [99]:
for ind, line in big_ds.iterrows():
    big_ds.iloc[ind]['emb'].append(line['original_publication_year'])

In [100]:
u_ids = np.unique(big_ds['user_id'])
train_index = np.random.choice(u_ids, replace=False, size=int(len(u_ids) * 0.75))
test_index = list(set(u_ids) - set(train_index))

train = big_ds[big_ds['user_id'].isin(train_index)]
test = big_ds[big_ds['user_id'].isin(test_index)]

In [101]:
train.head()

Unnamed: 0,user_id,user_vec,book_id,rating,book_vec,original_publication_year,emb
0,2,"[-0.3130608866418843, -0.3409979701362326, -0....",9762,4,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",2002.0,"[-0.3130608866418843, -0.3409979701362326, -0...."
2,1867,"[-0.31575035595912593, -0.3512254925718979, -0...",9762,2,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",2002.0,"[-0.31575035595912593, -0.3512254925718979, -0..."
3,2602,"[0.5142370501891345, 3.256770881079572, -0.787...",9762,3,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",2002.0,"[0.5142370501891345, 3.256770881079572, -0.787..."
5,3854,"[-0.3116485299628897, -0.34931491510786905, -0...",9762,2,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",2002.0,"[-0.3116485299628897, -0.34931491510786905, -0..."
7,5408,"[-0.3236632806281596, -0.3553714009352343, -0....",9762,3,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",2002.0,"[-0.3236632806281596, -0.3553714009352343, -0...."


In [102]:
clf = linear_model.Ridge(alpha=1e-2)
clf.fit(train['emb'].tolist(), train.rating.tolist())

In [103]:
y_pred = clf.predict(test['emb'].tolist())

In [104]:
print(f"MSE: {np.square(y_pred - test.rating).sum() / test.rating.shape[0] : .4f}")

MSE:  0.9468


7. Сделайте тестовый датасет для пользователей и книг, которые находятся в датасете to-read:
    * pca_user_vectors(векторы-признаки для каждого пользователя)
    * tf-idf на основе books['corpus'] для каждой книги

In [105]:
to_read = pd.read_csv('input/to_read.csv', encoding = "ISO-8859-1")
to_read.head()

Unnamed: 0,user_id,book_id
0,1,112
1,1,235
2,1,533
3,1,1198
4,1,1874


In [106]:
to_read = to_read.merge(user_embed, how='inner', on=['user_id'])
to_read = to_read.merge(new_books, how='inner', on=['book_id'])

In [107]:
to_read.head()

Unnamed: 0,user_id,book_id,user_vec,book_vec
0,1,112,"[-0.3057577790596991, -0.33992516896680086, -0...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,9,112,"[-0.275616743152314, -0.32306771061669637, -0....","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,72,112,"[-0.3098891275509174, -0.3348491672872679, -0....","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,101,112,"[-0.28957342102018785, -0.32928933921105424, -...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,164,112,"[-0.17179962138589017, -0.24655312627930281, -...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [108]:
to_read = to_read.merge(books[['book_id', 'original_publication_year']], how='inner', on='book_id')
#to_read['original_publication_year'] = to_read['original_publication_year'].fillna(0)
to_read['emb'] = to_read['user_vec'] + to_read['book_vec']
# for ind, line in to_read.iterrows():
#     to_read.iloc[ind]['emb'].append(line['original_publication_year'])

8. Сделайте для них прогноз

In [109]:
res = clf1.predict(to_read['emb'].tolist())
print(len(res[res > 4]))
print(len(res[res < 4]))

6698
80795


In [110]:
res

array([3.67447856, 3.677854  , 3.67255607, ..., 3.64295889, 3.59992268,
       3.93821997])