In [21]:
import pandas as pd
import numpy as np
import pickle
from tqdm import tqdm

In [31]:
## Loading the filtered data
df = pd.read_csv("collaborative_data.csv")
df.head()

Unnamed: 0,isbn,user,rating
0,2005018,8,5
1,2005018,11400,0
2,2005018,11676,8
3,2005018,41385,0
4,2005018,67544,8


In [32]:
"""

Now pivot the dataframe as 
columns : user
row     : isbn
values  : rating

=> row will work as data point for each books

"""
pt = df.pivot(columns='user', index='isbn', values='rating')
pt

user,8,99,242,243,254,383,388,408,424,446,...,278522,278535,278554,278563,278582,278633,278637,278771,278843,278851
isbn,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0002005018,5.0,,,,,,,,,,...,,,,,,,,,,
0002251760,,,,,,,,,,,...,,,,,,,,,,
0002255081,,,,,,,,,,,...,,,,,,,,,,
0002257203,,,,,,,,,,,...,,,,,,,,,,
0002259834,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9724119378,,,,,,,,,,,...,,,,,,,,,,
9726101794,,,,,,,,,,,...,,,,,,,,,,
9871138016,,,,,,,,,,,...,,,,,,,,,,
9871138148,,,,,,,,,,,...,,,,,,,,,,


In [None]:
## There are so many NaN value
## Fill this values with 0
pt.fillna(0, inplace=True)

In [34]:
isbn_index = pt.index
isbn_index

Index(['0002005018', '0002251760', '0002255081', '0002257203', '0002259834',
       '0002558122', '0006480764', '000648302X', '0006485200', '0006485936',
       ...
       '950491036X', '958704049X', '9681336089', '9681500830', '9681500954',
       '9724119378', '9726101794', '9871138016', '9871138148', 'B00009EF82'],
      dtype='object', name='isbn', length=17478)

In [35]:
pickle.dump(isbn_index, open("isbn_index.pkl", 'wb'))
print("isbn_index is dumped!")

isbn_index is dumped!


In [None]:
## Now find the similarity between the isbn through cosine similarity 

from sklearn.metrics.pairwise import cosine_similarity

In [None]:
## Finding the cosine similarity
similarity = cosine_similarity(pt)
pickle.dump(similarity, open('similarity.pkl', 'wb'))

In [1]:
print("Work starting...")

Work starting...


In [2]:
%%time
import pickle
import numpy as np
print("Loading similarity file...")
similarity = pickle.load(open('similarity.pkl', 'rb'))

Loading similarity file...
CPU times: total: 13.6 s
Wall time: 3min 4s


In [3]:
%%time
## Now mapping the id of each book with similarity score
## similarity file is of much more size so
## put every point of data into a seperate folder
from tqdm import tqdm

cnt = 0
for sim in tqdm(similarity):
    score_with_id = [d for d in enumerate(sim)]
    score_with_id = np.array(score_with_id)
    
    ## save the file in pickle
    with open(f"temp/{cnt}.pkl", 'wb') as file:
        pickle.dump(score_with_id, file)
    cnt += 1

100%|██████████| 17478/17478 [16:27<00:00, 17.70it/s] 


CPU times: total: 10min 56s
Wall time: 16min 30s


In [None]:
%%time

total = 17478
filtered_matrix = []

for ind in tqdm(range(total)):
    arr = pickle.load(open(f'temp/{ind}.pkl', 'rb'))
    sorted_arr = np.array(sorted(arr, key=lambda x : x[1], reverse=True)[1:11])
    sorted_arr = list(map(int, sorted_arr[:, 0]))
    filtered_matrix.append(sorted_arr)

In [30]:
pickle.dump(filtered_matrix, open('filter.pkl', 'wb'))
print("filter is dumped!")

filter is dumped!
