# Эксперименты с books доменом

In [1]:
%load_ext autoreload
%autoreload 2

import sys
sys.path.append('../')

In [2]:
import pandas as pd
import numpy as np
from pathlib import Path
from tqdm import tqdm

from scipy import sparse
from sklearn.preprocessing import normalize

import implicit
from lightfm import LightFM



In [3]:
data_path = Path('../data')

In [4]:
df_books_rating = pd.read_parquet(data_path / 'df_books_rating.parquet')

In [530]:
# Turn every rating equal to 0 into NaN so it can be imputed afterwards.
# NOTE(review): presumably 0 marks an interaction without an explicit rating — confirm against the dataset docs.
# This mutates df_books_rating in place, so any table displayed before this ran still shows the zeros.
df_books_rating['Book-Rating'] = df_books_rating['Book-Rating'].replace(0, np.nan)

In [540]:
# Fill each missing rating with the mean rating of the same ISBN.
# `transform('mean')` returns a Series aligned to the original row index, so the
# assignment stays row-aligned regardless of how the installed pandas version
# handles group keys in `groupby.apply` results.
# ISBNs that have no non-missing rating at all keep NaN, because their group mean is NaN.
df_books_rating['Book-Rating'] = df_books_rating['Book-Rating'].fillna(
    df_books_rating.groupby('ISBN', sort=False)['Book-Rating'].transform('mean')
)

In [5]:
df_books_rating

Unnamed: 0,User-ID,ISBN,Book-Rating
10,276746,b0425115801,0
11,276746,b0449006522,0
12,276746,b0553561618,0
13,276746,b055356451X,0
16,276747,b0060976845,9
...,...,...,...
1149766,276704,b0441007813,0
1149767,276704,b0446353957,0
1149768,276704,b0446605409,0
1149771,276704,b0743211383,7


In [31]:
# Encode the labels in column 0 (users) and column 1 (items) as contiguous integer positions.
# `rows` / `cols` hold the sorted unique labels; `r_pos` / `c_pos` give, for every
# interaction, the index of its label inside `rows` / `cols`.
rows, r_pos = np.unique(df_books_rating.values[:, 0], return_inverse=True)
cols, c_pos = np.unique(df_books_rating.values[:, 1], return_inverse=True)

In [32]:
# Build the user x item matrix from (value, (row, col)) triplets; the values are
# column 2 of the frame cast to int. Duplicate (row, col) pairs are summed by scipy.
sparse_interactions = sparse.csc_matrix((df_books_rating.values[:, 2].astype(int), (r_pos, c_pos)))

In [33]:
sparse_interactions

<27498x9673 sparse matrix of type '<class 'numpy.intc'>'
	with 430478 stored elements in Compressed Sparse Column format>

In [187]:
# Scale every row of the user x item matrix to unit L2 norm (axis=1 normalises rows, i.e. users).
# NOTE(review): item columns are not normalised here, so products of columns are
# not strict cosine similarities between items — confirm this is intended.
Pui = normalize(sparse_interactions, norm='l2', axis=1)

In [188]:
# Item x item score matrix: for scipy sparse matrices `*` is matrix multiplication,
# so entry (i, j) is the dot product of item columns i and j of the row-normalised matrix.
sim = Pui.T * Pui

In [189]:
sim

<9673x9673 sparse matrix of type '<class 'numpy.float64'>'
	with 11437274 stored elements in Compressed Sparse Column format>

In [14]:
books_df = pd.read_parquet(data_path/'books_bd.parquet')

In [15]:
books_df[books_df['Book-Title'].str.contains('Dorian')]

Unnamed: 0,Book-Title,Book-Author,ISBN,Year-Of-Publication,Image-URL-M
109090,Le portrait de Dorian Gray,Oscar Wilde,b2253002887,1972,http://images.amazon.com/images/P/2253002887.0...
145420,Picture of Dorian Gray,Oscar Wilde,b0140620338,0,http://images.amazon.com/images/P/0140620338.0...
145423,Picture of Dorian Gray (Wordsworth Classics),Oscar Wilde,b1853260150,1997,http://images.amazon.com/images/P/1853260150.0...
209725,The Picture of Dorian Gray,Oscar Wilde,b1593080255,1983,http://images.amazon.com/images/P/1593080255.0...
209726,The Picture of Dorian Gray (Bantam Classics),OSCAR WILDE,b0553212540,1982,http://images.amazon.com/images/P/0553212540.0...
209728,The Picture of Dorian Gray (Dover Thrift Editi...,Oscar Wilde,b0486278077,1993,http://images.amazon.com/images/P/0486278077.0...
209731,The Picture of Dorian Gray (Modern Library (Pa...,OSCAR WILDE,b0375751513,1998,http://images.amazon.com/images/P/0375751513.0...
209734,The Picture of Dorian Gray (Penguin Classic),Oscar Wilde,b014043187X,1985,http://images.amazon.com/images/P/014043187X.0...


In [63]:
# ISBNs that make up the example reading history.
book_isbn = ['b0802130119', 'b0553211757', 'b1853260150']

# Positions of those ISBNs inside `cols`. np.where returns them in ascending
# index order, not in the order they are listed in `book_isbn`.
book_id = np.where(np.isin(cols,book_isbn))[0]
book_id

array([5916, 8348, 9386], dtype=int64)

In [342]:
a = sim[book_id].toarray()

In [343]:
# argsort sorts each row of `a` ascending; `[0]` keeps only the first row and
# `[-20:]` its 20 highest-scoring column indices (best one last).
# NOTE(review): when `a` has one row per history item, the other rows are ignored here —
# confirm that recommending from the first history item only is intended.
recs = a.argsort()[0][-20:]

In [344]:
recs = cols[recs]
recs

array(['b0061099155', 'b0375708278', 'b0385486804', 'b0385494785',
       'b006103004X', 'b0345435796', 'b1400032806', 'b0688171877',
       'b0553266055', 'b039304016X', 'b0385489129', 'b0743203178',
       'b0142002267', 'b0151001006', 'b0140280553', 'b0618002235',
       'b0618002227', 'b0345417623', 'b0345436911', 'b0393307050'],
      dtype=object)

In [345]:
books_df[books_df['ISBN'].isin(recs)]

Unnamed: 0,Book-Title,Book-Author,ISBN,Year-Of-Publication,Image-URL-M
10555,"Ahab's Wife: Or, The Star-Gazer",Sena Jeter Naslund,b0688171877,1999,http://images.amazon.com/images/P/0688171877.0...
11670,All Things Wise and Wonderful,James Herriot,b0553266055,0,http://images.amazon.com/images/P/0553266055.0...
25927,Blind Man's Bluff: The Untold Story of America...,Sherry Sontag,b006103004X,1998,http://images.amazon.com/images/P/006103004X.0...
75147,Galileo's Daughter: A Historical Memoir of Sci...,Dava Sobel,b0140280553,1999,http://images.amazon.com/images/P/0140280553.0...
96788,Into Thin Air : A Personal Account of the Mt. ...,JON KRAKAUER,b0385494785,1998,http://images.amazon.com/images/P/0385494785.0...
96873,Into the Wild,Jon Krakauer,b0385486804,1996,http://images.amazon.com/images/P/0385486804.0...
97740,"Isaac's Storm: A Man, a Time, and the Deadlies...",Erik Larson,b0375708278,2000,http://images.amazon.com/images/P/0375708278.0...
121725,Master and Commander (Aubrey-Maturin (Paperback)),Patrick O'Brian,b0393307050,1990,http://images.amazon.com/images/P/0393307050.0...
130732,My Losing Season (Alex Awards (Awards)),Pat Conroy,b0385489129,2002,http://images.amazon.com/images/P/0385489129.0...
136228,Nothing Like It In the World : The Men Who Bui...,Stephen E. Ambrose,b0743203178,2001,http://images.amazon.com/images/P/0743203178.0...


In [323]:
book_id = [8348, 579, 445, 875]

In [373]:
# Add up the similarity rows of all history items to get one score per catalogue item.
# The result is kept two-dimensional (a single row), which is why later cells index it as a[0].
a = np.array(sim[book_id].sum(axis=0))

In [377]:
sorted(a[0])[-20:]

[0.2175628337591385,
 0.2334142099946892,
 0.2608695652173913,
 0.26118459829586455,
 0.27639086259242884,
 0.2861643329692475,
 0.3038208337718157,
 0.3337833805882951,
 0.33405340894127217,
 0.49438767541180056,
 0.495575221238938,
 0.4970017713928928,
 0.4999999999999999,
 0.5013613279534618,
 0.5040160411276088,
 0.5043529258691772,
 0.5160671781734036,
 1.4976203163043833,
 2.0285944218812055,
 5.870760869792546]

In [378]:
a.argsort()[0]

array([   0, 5917, 5916, ..., 3165, 3146, 8348], dtype=int64)

In [326]:
recs = a.argsort()[0][-20:]

In [327]:
recs

array([1495,  545, 7112, 8837, 2719,  893,  981, 3085, 7418, 5426, 2734,
       1677, 6478, 5411, 4252,  316,  579,  875,  445, 8348], dtype=int64)

In [328]:
recs = cols[recs]
recs

array(['b0330332775', 'b0061097357', 'b0671617028', 'b0877017883',
       'b0380789035', 'b0140430725', 'b0142001805', 'b0385494785',
       'b0671880187', 'b0452282152', 'b0380791994', 'b0345348036',
       'b0553579312', 'b0452280621', 'b0441003257', 'b0060955775',
       'b0062513982', 'b0140366784', 'b0061044431', 'b0802130119'],
      dtype=object)

In [329]:
books_df[books_df['ISBN'].isin(recs)]

Unnamed: 0,Book-Title,Book-Author,ISBN,Year-Of-Publication,Image-URL-M
13015,American Gods,Neil Gaiman,b0380789035,2001,http://images.amazon.com/images/P/0380789035.0...
22552,Beloved,Toni Morrison,b0452280621,0,http://images.amazon.com/images/P/0452280621.0...
28916,Bridget Jones's Diary,Helen Fielding,b0330332775,1996,http://images.amazon.com/images/P/0330332775.0...
30598,By the River Piedra I Sat Down and Wept,Paulo Coelho,b0062513982,1996,http://images.amazon.com/images/P/0062513982.0...
57473,Dreaming of the Bones,Deborah Crombie,b0553579312,1998,http://images.amazon.com/images/P/0553579312.0...
77553,Girl with a Pearl Earring,Tracy Chevalier,b0452282152,2001,http://images.amazon.com/images/P/0452282152.0...
79153,Good Omens,Neil Gaiman,b0441003257,1996,http://images.amazon.com/images/P/0441003257.0...
80729,Griffin &amp; Sabine: An Extraordinary Corresp...,Nick Bantock,b0877017883,1991,http://images.amazon.com/images/P/0877017883.0...
96788,Into Thin Air : A Personal Account of the Mt. ...,JON KRAKAUER,b0385494785,1998,http://images.amazon.com/images/P/0385494785.0...
99143,Jane Eyre (Puffin Classics),Charlotte Bronte,b0140366784,1991,http://images.amazon.com/images/P/0140366784.0...


In [10]:
class CosineDistanceRecommender:
    """Item-to-item recommender driven by a precomputed item x item similarity matrix.

    ``fit`` builds a user x item rating matrix, L2-normalises each user row and
    stores ``Pui.T @ Pui`` in ``self.simularity``. ``recommend`` scores every
    catalogue item by summing the similarity rows of the items in a history.
    """

    def __init__(self):
        pass

    def fit(self, interactions_data, rating_col=None):
        """Build the item x item similarity matrix from an interactions frame.

        Parameters
        ----------
        interactions_data : pandas.DataFrame
            Must contain the columns ``'user'`` and ``'item'``.
        rating_col : str, optional
            Column holding the interaction values. When omitted, the first
            column that is neither ``'user'`` nor ``'item'`` is used; if no such
            column exists every interaction counts as 1.

        Raises
        ------
        ValueError
            If the rating column contains NaN (it cannot be cast to int).

        Sets ``self.rows`` / ``self.cols`` (sorted unique user / item labels)
        and ``self.simularity`` (sparse item x item matrix).
        """
        self.rows, r_pos = np.unique(interactions_data['user'].values, return_inverse=True)
        self.cols, c_pos = np.unique(interactions_data['item'].values, return_inverse=True)

        # Take the ratings from the frame that was passed in, not from a notebook
        # global, so that they always line up with r_pos / c_pos.
        if rating_col is None:
            value_cols = [c for c in interactions_data.columns if c not in ('user', 'item')]
            rating_col = value_cols[0] if value_cols else None

        if rating_col is None:
            ratings = np.ones(len(r_pos), dtype=int)
        else:
            ratings = np.asarray(interactions_data[rating_col].values, dtype=float)
            if np.isnan(ratings).any():
                raise ValueError(
                    "rating column {!r} contains NaN; fill or drop missing ratings before fit".format(rating_col)
                )
            # Ratings are stored as integers, so fractional values are truncated.
            ratings = ratings.astype(int)

        # Duplicate (user, item) pairs are summed by the sparse constructor.
        sparse_interactions = sparse.csc_matrix(
            (ratings, (r_pos, c_pos)),
            shape=(len(self.rows), len(self.cols)),
        )

        # axis=1 normalises rows, i.e. each user's rating vector gets unit L2 norm.
        Pui = normalize(sparse_interactions, norm='l2', axis=1)

        # `@` is matrix multiplication for both sparse matrices and sparse arrays.
        self.simularity = Pui.T @ Pui

    def recommend(self, user_history, n_to_recommend, mode='probabilistic'):
        """Recommend items for a history of item labels.

        Parameters
        ----------
        user_history : sequence
            Item labels the user already interacted with; labels that are not
            in ``self.cols`` are ignored when scoring.
        n_to_recommend : int
            Maximum number of items to return.
        mode : {'probabilistic', 'deterministic'}
            ``'deterministic'`` returns the highest-scoring items, best first.
            ``'probabilistic'`` samples items without replacement with
            probability proportional to their score (uses ``np.random``).

        Returns
        -------
        (recs, a) : tuple
            ``recs`` is an array of at most ``n_to_recommend`` item labels that
            are not in ``user_history``; ``a`` is the single-row 2-D array of
            summed similarity scores over all items in ``self.cols``.

        Raises
        ------
        ValueError
            If ``mode`` is unknown, or if ``mode='probabilistic'`` and all
            scores are zero (no sampling distribution can be formed).
        """
        if mode not in ('probabilistic', 'deterministic'):
            raise ValueError(
                "unknown mode {!r}; expected 'probabilistic' or 'deterministic'".format(mode)
            )

        book_id = np.where(np.isin(self.cols, user_history))[0]
        a = np.array(self.simularity[book_id].sum(axis=0))

        # Ask for len(book_id) extra candidates because history items are removed below.
        n_candidates = n_to_recommend + len(book_id)

        if mode == 'probabilistic':
            total = a.sum()
            if not total > 0:
                raise ValueError(
                    "cannot sample recommendations: all similarity scores are zero for this history"
                )
            p = a[0] / total
            # Sampling without replacement cannot return more items than have non-zero probability.
            n_candidates = min(n_candidates, int(np.count_nonzero(p)))
            recs = np.random.choice(self.cols, p=p, size=n_candidates, replace=False)
        else:
            order = a.argsort()[0]
            # Explicit start index instead of a negative slice: `[-0:]` would select everything.
            recs = self.cols[order[max(len(order) - n_candidates, 0):]]

        recs = recs[~np.isin(recs, user_history)]
        recs = recs[max(len(recs) - n_to_recommend, 0):]
        return recs[::-1], a

In [11]:
recomender = CosineDistanceRecommender()

In [12]:
recomender.fit(df_books_rating.rename({'User-ID': 'user', 'ISBN': 'item'}, axis=1))

In [16]:
recs, a = recomender.recommend(['b0802130119', 'b0553211757', 'b1853260150'], 20, mode='deterministic')
for i in recs:
    print(books_df[books_df['ISBN'] == i]['Book-Title'].item())

Brave New World
Good Omens
Beloved
The Princess Bride: S Morgenstern's Classic Tale of True Love and High Adventure
A Pale View of Hills (Vintage International)
Westing Game
Into Thin Air : A Personal Account of the Mt. Everest Disaster
Griffin &amp; Sabine: An Extraordinary Correspondence
The Poisonwood Bible
One Hundred Years of Solitude
Naked
American Gods
Orlando: A Biography
Rabbit at Rest
A Clockwork Orange (Norton Paperback Fiction)
Waiting: The True Confessions of a Waitress
Persepolis : The Story of a Childhood (Alex Awards (Awards))
Frankenstein (Dover Thrift Editions)
Filth
In the Beauty of the Lilies


In [562]:
# Monte-Carlo estimate (1000 trials) of the average 0-based position at which
# ISBN 'b0374522928' appears in a 20-item sample drawn with probability
# proportional to the scores in `a` (np.random.choice samples with replacement by default).
res = 0
for _ in range(1000):
    try:
        # `.item()` only succeeds when exactly one sampled entry matches the ISBN.
        res += np.where('b0374522928'==np.random.choice(cols, p=(a[0]/a.sum()), size=20))[0].item()
    except ValueError: 
        # Raised both when the ISBN is absent and when it was drawn more than once;
        # either case is counted as position 21.
        res += 21
res/1000

20.971

In [557]:
recs

array(['b0380730847', 'b0385491026', 'b0385334583', 'b0060987103',
       'b0064407667', 'b0140088946', 'b0060199652', 'b0449219623',
       'b3453007867', 'b0553579606', 'b0812566785', 'b0380807149',
       'b1551669080', 'b031284879X', 'b0385494785', 'b0821764527',
       'b0767902831', 'b0099521016', 'b0786866586', 'b0374522928'],
      dtype=object)

In [503]:
np.random.choice(cols, p=(a[0]/a.sum()), size=20+len(book_id))

array(['b0553279572', 'b0446605239', 'b0385483503', 'b0140274146',
       'b1551666839', 'b0140444173', 'b0140449132', 'b0140620869',
       'b0373825064', 'b0140449132', 'b0140449132', 'b0517618141',
       'b0140043519', 'b1593080255', 'b0393312836', 'b0060977493',
       'b1853260363', 'b0679745653', 'b0880389052', 'b0385721234',
       'b0880389052', 'b0140043519', 'b0811807843'], dtype=object)

In [504]:
a.argsort()[0][-(20+len(book_id)):]

array([9242, 5758, 8860, 8859,  889,  834, 6573, 5952, 7051, 1697,  965,
        760, 1148,  936, 8475, 6576, 3061, 6579,  713,  340, 7762, 9371,
        920], dtype=int64)

In [180]:
model_l = LightFM(loss='warp')

In [181]:
# Same user x item matrix as before, but with an explicit shape that has one more
# row than there are known users; that last row is empty at this point.
# NOTE(review): presumably the extra row is reserved for a new, unseen user — confirm.
sparse_interactions = sparse.csc_matrix((df_books_rating.values[:, 2].astype(int), (r_pos, c_pos)), shape=(len(rows)+1, len(cols)))

In [182]:
a = model_l.fit(sparse_interactions, epochs=20, verbose=True)

Epoch 0
Epoch 1
Epoch 2
Epoch 3
Epoch 4
Epoch 5
Epoch 6
Epoch 7
Epoch 8
Epoch 9
Epoch 10
Epoch 11
Epoch 12
Epoch 13
Epoch 14
Epoch 15
Epoch 16
Epoch 17
Epoch 18
Epoch 19


In [191]:
# Score every item column for user index 27498, keep the 20 highest scores
# (argsort is ascending, so the slice is reversed to put the best first),
# then map the column indices back to ISBNs.
# NOTE(review): 27498 is hard-coded; it presumably equals len(rows), i.e. the extra last row — confirm.
recs_l = model_l.predict([27498], [i for i in range(len(cols))]).argsort()[-20:][::-1]
recs_l = cols[recs_l]

In [192]:
for i in recs_l:
    print(books_df[books_df['ISBN'] == i]['Book-Title'].item())

The Lovely Bones
Wild Animus
The Da Vinci Code
Harry Potter and the Sorcerer's Stone (Harry Potter (Paperback))
The Nanny Diaries
Bridget Jones's Diary
The Secret Life of Bees
Divine Secrets of the Ya-Ya Sisterhood
Angels &amp; Demons
Life of Pi
To Kill a Mockingbird
Where the Heart Is (Oprah's Book Club (Paperback))
Interview with the Vampire
Harry Potter and the Chamber of Secrets (Book 2)
Harry Potter and the Order of the Phoenix (Book 5)
The Catcher in the Rye
The Girls' Guide to Hunting and Fishing
Snow Falling on Cedars
The Bridges of Madison County
The Summons


In [193]:
# Single-row matrix with one column per item: value 5 at every position listed in book_id.
sp_row = sparse.coo_matrix(
    ([5] * len(book_id), ([0] * len(book_id), book_id)),
    shape=(1, sparse_interactions.shape[1]),
)

In [194]:
sparse_interactions[-1].todense().sum()

0

In [195]:
# Overwrite the last row of the interaction matrix with the single-row matrix sp_row.
# This mutates sparse_interactions in place, so re-running earlier cells that read it gives different results.
sparse_interactions[-1] = sp_row

In [196]:
sparse_interactions[-1].todense().sum()

15

In [200]:
model_l.fit_partial(sparse_interactions, epochs=10, verbose=True)

Epoch 0
Epoch 1
Epoch 2
Epoch 3
Epoch 4
Epoch 5
Epoch 6
Epoch 7
Epoch 8
Epoch 9


<lightfm.lightfm.LightFM at 0x1af1c054a08>

In [201]:
# Same scoring as before the partial fit: top 20 item columns for user index 27498,
# best first, mapped back to ISBNs.
# NOTE(review): 27498 is hard-coded; it presumably equals len(rows), i.e. the extra last row — confirm.
recs_l = model_l.predict([27498], [i for i in range(len(cols))]).argsort()[-20:][::-1]
recs_l = cols[recs_l]

In [202]:
for i in recs_l:
    print(books_df[books_df['ISBN'] == i]['Book-Title'].item())

Ender's Game (Ender Wiggins Saga (Paperback))
High Fidelity
The Catcher in the Rye
Stupid White Men ...and Other Sorry Excuses for the State of the Nation!
Coraline
Jurassic Park
American Gods
Lolita (Vintage International)
Dune (Remembering Tomorrow)
Artemis Fowl (Artemis Fowl, Book 1)
Girl with a Pearl Earring
Atlas Shrugged
Brave New World
Girl, Interrupted
Lord of the Flies
Neverwhere
Watership Down
The Perfect Storm : A True Story of Men Against the Sea
Fast Food Nation: The Dark Side of the All-American Meal
The Hobbit : The Enchanting Prelude to The Lord of the Rings


In [164]:
model_i = implicit.als.AlternatingLeastSquares()

  "Intel MKL BLAS detected. Its highly recommend to set the environment "


In [172]:
model_i.fit(sparse_interactions)



  0%|          | 0/15 [00:00<?, ?it/s]

In [173]:
sparse_interactions.shape

(27499, 9673)

In [174]:
book_id

array([5916, 8348, 9386], dtype=int64)

In [175]:
# Single-row matrix with one column per item: value 5 at every position listed in book_id.
sp_row = sparse.coo_matrix(
    ([5] * len(book_id), ([0] * len(book_id), book_id)),
    shape=(1, sparse_interactions.shape[1]),
)

In [176]:
# Ask the ALS model for 20 items given the single-row history sp_row (converted to CSR);
# `[0]` keeps the first element of the returned value, which the next cell uses to index `cols`.
# NOTE(review): presumably recalculate_user=True makes the model derive the user factors from
# sp_row rather than from the stored factors of user 0 — confirm against the installed implicit version.
i_recs = model_i.recommend(0, sp_row.tocsr(), recalculate_user=True, N=20)[0]

In [177]:
i_recs = cols[i_recs]

In [178]:
for i in i_recs:
    print(books_df[books_df['ISBN'] == i]['Book-Title'].item())

Brave New World
The Nanny Diaries
Slaughterhouse Five or the Children's Crusade: A Duty Dance With Death
Siddhartha
The Firm
Cat's Cradle
Lolita (Vintage International)
Daisy Fay and the Miracle Man
Confessions of a Shopaholic (Summer Display Opportunity)
House of Leaves
A Clockwork Orange (Norton Paperback Fiction)
Beloved
Bastard Out of Carolina
Confessions of an Ugly Stepsister
The Pelican Brief
1984
Skinny Legs and All
Blindness (Harvest Book)
Hannibal
Ishmael: An Adventure of the Mind and Spirit
