In [2]:
print(" Project: Books Recommendation using item based collaborative filtering " )

 Project: Books Recommendation using item based collaborative filtering 


In [3]:
print(" Epic 1 - load, clean - data cleaning and preparation " )

 Epic 1 - load, clean - data cleaning and preparation 


In [4]:
# dependecies
print(" --------- Step 1 --------- ")

## E1
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display

## E3
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors

## E4
import pickle

 --------- Step 1 --------- 


In [5]:
# load ds books
print(" --------- Step 2 --------- ")

books = pd.read_csv(
    "./dataset/BX-Books.csv",
    sep=";",
    on_bad_lines="skip",
    encoding="latin-1",
    dtype={
        "ISBN": str,
        "Book-Title": str,
        "Book-Author": str,
        "Year-Of-Publication": str,
        "Publisher": str,
        "Image-URL-S": str,
        "Image-URL-M": str,
        "Image-URL-L": str
    }
)

# row, col
print(books.shape)
print("\n")
display(books.head(4))
print("\n")
print(books["Year-Of-Publication"].unique())

 --------- Step 2 --------- 
(271360, 8)




Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...




['2002' '2001' '1991' '1999' '2000' '1993' '1996' '1988' '2004' '1998'
 '1994' '2003' '1997' '1983' '1979' '1995' '1982' '1985' '1992' '1986'
 '1978' '1980' '1952' '1987' '1990' '1981' '1989' '1984' '0' '1968' '1961'
 '1958' '1974' '1976' '1971' '1977' '1975' '1965' '1941' '1970' '1962'
 '1973' '1972' '1960' '1966' '1920' '1956' '1959' '1953' '1951' '1942'
 '1963' '1964' '1969' '1954' '1950' '1967' '2005' '1957' '1940' '1937'
 '1955' '1946' '1936' '1930' '2011' '1925' '1948' '1943' '1947' '1945'
 '1923' '2020' '1939' '1926' '1938' '2030' '1911' '1904' '1949' '1932'
 '1928' '1929' '1927' '1931' '1914' '2050' '1934' '1910' '1933' '1902'
 '1924' '1921' '1900' '2038' '2026' '1944' '1917' '1901' '2010' '1908'
 '1906' '1935' '1806' '2021' '2012' '2006' 'DK Publishing Inc' 'Gallimard'
 '1909' '2008' '1378' '1919' '1922' '1897' '2024' '1376' '2037']


In [6]:
# clean ds
# clean Year-Of-Publication column
print(" --------- Step 3 --------- ")
print('\n')

print(" --------- invalid values -> NaN /clear strings --------- ")
books["Year-Of-Publication"] = pd.to_numeric(books["Year-Of-Publication"], errors="coerce")
print(books["Year-Of-Publication"].unique())
print('\n')

print(" --------- outof range -> NaN  --------- ")
books.loc[(books["Year-Of-Publication"] < 1900) | (books["Year-Of-Publication"] > 2024), "Year-Of-Publication"] = pd.NA
print(books["Year-Of-Publication"].unique())
print('\n')

# assign 'Unknown' to missing values 
books["Year-Of-Publication"] = books["Year-Of-Publication"].astype("object")
books["Year-Of-Publication"] = books["Year-Of-Publication"].fillna("Unknown")


# convert the years to integers
books["Year-Of-Publication"] = books["Year-Of-Publication"].apply(lambda x: str(int(x)) if x != "Unknown" else x)

print("books['Year-Of-Publication'].unique():")
print(books["Year-Of-Publication"].unique())
print('\n')

 --------- Step 3 --------- 


 --------- invalid values -> NaN /clear strings --------- 
[2002. 2001. 1991. 1999. 2000. 1993. 1996. 1988. 2004. 1998. 1994. 2003.
 1997. 1983. 1979. 1995. 1982. 1985. 1992. 1986. 1978. 1980. 1952. 1987.
 1990. 1981. 1989. 1984.    0. 1968. 1961. 1958. 1974. 1976. 1971. 1977.
 1975. 1965. 1941. 1970. 1962. 1973. 1972. 1960. 1966. 1920. 1956. 1959.
 1953. 1951. 1942. 1963. 1964. 1969. 1954. 1950. 1967. 2005. 1957. 1940.
 1937. 1955. 1946. 1936. 1930. 2011. 1925. 1948. 1943. 1947. 1945. 1923.
 2020. 1939. 1926. 1938. 2030. 1911. 1904. 1949. 1932. 1928. 1929. 1927.
 1931. 1914. 2050. 1934. 1910. 1933. 1902. 1924. 1921. 1900. 2038. 2026.
 1944. 1917. 1901. 2010. 1908. 1906. 1935. 1806. 2021. 2012. 2006.   nan
 1909. 2008. 1378. 1919. 1922. 1897. 2024. 1376. 2037.]


 --------- outof range -> NaN  --------- 
[2002. 2001. 1991. 1999. 2000. 1993. 1996. 1988. 2004. 1998. 1994. 2003.
 1997. 1983. 1979. 1995. 1982. 1985. 1992. 1986. 1978. 1980. 1952. 1987.
 1990. 

In [7]:
# extract needed and change format
print(" --------- Step 4 --------- ")

print(books.columns)
books = books[['ISBN', 'Book-Title', 'Book-Author', 'Year-Of-Publication', 'Publisher','Image-URL-L']]
print(books.columns)
books.rename(columns= {
    "ISBN": "isbn",
    "Book-Title": "title",
    "Book-Author": "author",
    "Year-Of-Publication": "year",
    "Publisher": "publisher",
    "Image-URL-L": "image_url",
},inplace=True)

print(books.columns)
    

 --------- Step 4 --------- 
Index(['ISBN', 'Book-Title', 'Book-Author', 'Year-Of-Publication', 'Publisher',
       'Image-URL-S', 'Image-URL-M', 'Image-URL-L'],
      dtype='object')
Index(['ISBN', 'Book-Title', 'Book-Author', 'Year-Of-Publication', 'Publisher',
       'Image-URL-L'],
      dtype='object')
Index(['isbn', 'title', 'author', 'year', 'publisher', 'image_url'], dtype='object')


In [8]:
# load ds users
print(" --------- Step 5 --------- ")

users = pd.read_csv(
    "./dataset/BX-Users.csv",
    sep=";",
    on_bad_lines="skip",
    encoding="latin-1",
    dtype={
        "User-ID": str,
        "Location": str,
        "Age": "object"
    }
)

# row, col
print(users.shape)
print("\n")
display(users.head(4))
print("\n")

 --------- Step 5 --------- 
(278858, 3)




Unnamed: 0,User-ID,Location,Age
0,1,"nyc, new york, usa",
1,2,"stockton, california, usa",18.0
2,3,"moscow, yukon territory, russia",
3,4,"porto, v.n.gaia, portugal",17.0






In [9]:
# clean Age column
print(" --------- Step 6 --------- ")
print('\n')

print(" --------- invalid values -> NaN /clear strings --------- ")
users["Age"] = pd.to_numeric(users["Age"], errors="coerce")
print(users["Age"].unique())
print('\n')

# assign 'Unknown' to missing values 
users["Age"] = users["Age"].fillna("Unknown")

# convert Age to integers and filter out unrealistic ages
users["Age"] = users["Age"].apply(
    lambda x: int(x) if isinstance(x, float) and 2 <= x <= 120 else "Unknown"
)

print("users['Age'].unique():")
print(users["Age"].unique())
print('\n')

 --------- Step 6 --------- 


 --------- invalid values -> NaN /clear strings --------- 
[ nan  18.  17.  61.  26.  14.  25.  19.  46.  55.  32.  24.  20.  34.
  23.  51.  31.  21.  44.  30.  57.  43.  37.  41.  54.  42.  50.  39.
  53.  47.  36.  28.  35.  13.  58.  49.  38.  45.  62.  63.  27.  33.
  29.  66.  40.  15.  60.   0.  79.  22.  16.  65.  59.  48.  72.  56.
  67.   1.  80.  52.  69.  71.  73.  78.   9.  64. 103. 104.  12.  74.
  75. 231.   3.  76.  83.  68. 119.  11.  77.   2.  70.  93.   8.   7.
   4.  81. 114. 230. 239.  10.   5. 148. 151.   6. 101. 201.  96.  84.
  82.  90. 123. 244. 133.  91. 128.  94.  85. 141. 110.  97. 219.  86.
 124.  92. 175. 172. 209. 212. 237.  87. 162. 100. 156. 136.  95.  89.
 106.  99. 108. 210.  88. 199. 147. 168. 132. 159. 186. 152. 102. 116.
 200. 115. 226. 137. 207. 229. 138. 109. 105. 228. 183. 204.  98. 223.
 113. 208. 107. 157. 111. 146. 118. 220. 143. 140. 189. 127.]


users['Age'].unique():
['Unknown' 18 17 61 26 14 25 19 46 55 32 2

In [10]:
# change format
print(" --------- Step 7 --------- ")

print(users.columns)

users.rename(columns={
    "User-ID": "user_id",
    "Location": "location",
    "Age": "age"
}, inplace=True)

print(users.columns)

 --------- Step 7 --------- 
Index(['User-ID', 'Location', 'Age'], dtype='object')
Index(['user_id', 'location', 'age'], dtype='object')


In [11]:
# load ds ratings
print(" --------- Step 8 --------- ")

ratings = pd.read_csv(
    "./dataset/BX-Book-Ratings.csv",
    sep=";",
    on_bad_lines="skip",
    encoding="latin-1",
    dtype={
        "User-ID": str,
        "ISBN": str,
        "Book-Rating": str
    }
)

# row, col
print(ratings.shape)
print("\n")
display(ratings.head(8))
print("\n")

 --------- Step 8 --------- 
(1149780, 3)




Unnamed: 0,User-ID,ISBN,Book-Rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6
5,276733,2080674722,0
6,276736,3257224281,8
7,276737,0600570967,6






In [12]:
# clean Book-Rating column
print(" --------- Step 9 --------- ")
print('\n')

print(" --------- invalid values -> NaN /clear strings --------- ")
ratings["Book-Rating"] = pd.to_numeric(ratings["Book-Rating"], errors="coerce")
print(ratings["Book-Rating"].unique())
print('\n')

# convert Book-Rating to integers and filter out unrealistic ratings
ratings["Book-Rating"] = ratings["Book-Rating"].apply(
    lambda x: int(x) if 0 <= x <= 10 else pd.NA
)

print("ratings['Book-Rating'].unique():")
print(ratings["Book-Rating"].unique())
print('\n')

 --------- Step 9 --------- 


 --------- invalid values -> NaN /clear strings --------- 
[ 0  5  3  6  8  7 10  9  4  1  2]


ratings['Book-Rating'].unique():
[ 0  5  3  6  8  7 10  9  4  1  2]




In [13]:
# change format
print(" --------- Step 10 --------- ")

print(ratings.columns)

ratings.rename(columns={
    "User-ID": "user_id",
    "ISBN": "isbn",
    "Book-Rating": "book_rating"
}, inplace=True)

print(ratings.columns)

 --------- Step 10 --------- 
Index(['User-ID', 'ISBN', 'Book-Rating'], dtype='object')
Index(['user_id', 'isbn', 'book_rating'], dtype='object')


In [14]:
print(" Epic 2 - filtering and creating our final df - feature engineering " )

 Epic 2 - filtering and creating our final df - feature engineering 


In [15]:
# data preprocessing | ( filter users rated more than 200 and filter based of number of ratings > 50 ) (relevant data, reduces noise, enhances performance) 
print(" --------- Step 11 --------- ")

# users rated more than 200
print(" --------- Step 11 | Part 1 --------- ")

# check how many books each user has rated
ratings['user_id'].value_counts()

 --------- Step 11 --------- 
 --------- Step 11 | Part 1 --------- 


user_id
11676     13602
198711     7550
153662     6109
98391      5891
35859      5850
          ...  
116180        1
116166        1
116154        1
116137        1
276723        1
Name: count, Length: 105283, dtype: int64

In [16]:
# users in this df
ratings['user_id'].unique().shape

(105283,)

In [17]:
x = ratings['user_id'].value_counts() > 200
x

user_id
11676      True
198711     True
153662     True
98391      True
35859      True
          ...  
116180    False
116166    False
116154    False
116137    False
276723    False
Name: count, Length: 105283, dtype: bool

In [18]:
# users > 200
x[x].shape

(899,)

In [19]:
# filter based on index
y = x[x].index
y

Index(['11676', '198711', '153662', '98391', '35859', '212898', '278418',
       '76352', '110973', '235105',
       ...
       '260183', '73681', '44296', '155916', '9856', '274808', '28634',
       '59727', '268622', '188951'],
      dtype='object', name='user_id', length=899)

In [20]:
# filter users rated more than 200
ratings = ratings[ratings['user_id'].isin(y)]
# row, col
ratings.shape

(526356, 3)

In [21]:
# total number of rating for each title > 50
print(" --------- Step 11 | Part 2 --------- ")
display(books.head(2))

 --------- Step 11 | Part 2 --------- 


Unnamed: 0,isbn,title,author,year,publisher,image_url
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...


In [22]:
display(ratings.head(2))

Unnamed: 0,user_id,isbn,book_rating
1456,277427,002542730X,10
1457,277427,0026217457,0


In [23]:
# merging data
ratings_with_books=ratings.merge(books, on='isbn')
# display(ratings_with_books.head(4))
# verify:
ratings_with_books_sorted = ratings_with_books.sort_values(by='title')
display(ratings_with_books_sorted.head(4))

Unnamed: 0,user_id,isbn,book_rating,title,author,year,publisher,image_url
165535,96448,590567330,9,A Light in the Storm: The Civil War Diary of ...,Karen Hesse,1999,Hyperion Books for Children,http://images.amazon.com/images/P/0590567330.0...
70253,35859,590567330,0,A Light in the Storm: The Civil War Diary of ...,Karen Hesse,1999,Hyperion Books for Children,http://images.amazon.com/images/P/0590567330.0...
297750,172742,964147726,0,Always Have Popsicles,Rebecca Harvin,1994,Rebecca L. Harvin,http://images.amazon.com/images/P/0964147726.0...
345423,198711,942320093,0,Apple Magic (The Collector's series),Martina Boudreau,1984,Amer Cooking Guild,http://images.amazon.com/images/P/0942320093.0...


In [24]:
ratings_with_books.shape

(487671, 8)

In [25]:
# add a new column that shows the total number of ratings for each title (useful for collaborative filtering)
number_of_ratings = ratings_with_books.groupby('title')['book_rating'].count().reset_index()
display(number_of_ratings)

Unnamed: 0,title,book_rating
0,A Light in the Storm: The Civil War Diary of ...,2
1,Always Have Popsicles,1
2,Apple Magic (The Collector's series),1
3,Beyond IBM: Leadership Marketing and Finance ...,1
4,Clifford Visita El Hospital (Clifford El Gran...,1
...,...,...
160264,Ã?Â?ber die Pflicht zum Ungehorsam gegen den S...,3
160265,Ã?Â?lpiraten.,1
160266,Ã?Â?rger mit Produkt X. Roman.,1
160267,Ã?Â?stlich der Berge.,1


In [26]:
number_of_ratings.rename(columns={
    "book_rating": "number_rating",
}, inplace=True)
display(number_of_ratings)

Unnamed: 0,title,number_rating
0,A Light in the Storm: The Civil War Diary of ...,2
1,Always Have Popsicles,1
2,Apple Magic (The Collector's series),1
3,Beyond IBM: Leadership Marketing and Finance ...,1
4,Clifford Visita El Hospital (Clifford El Gran...,1
...,...,...
160264,Ã?Â?ber die Pflicht zum Ungehorsam gegen den S...,3
160265,Ã?Â?lpiraten.,1
160266,Ã?Â?rger mit Produkt X. Roman.,1
160267,Ã?Â?stlich der Berge.,1


In [27]:
# merging data
final_merge = ratings_with_books.merge(number_of_ratings, on='title')
display(final_merge)

Unnamed: 0,user_id,isbn,book_rating,title,author,year,publisher,image_url,number_rating
0,277427,002542730X,10,Politically Correct Bedtime Stories: Modern Ta...,James Finn Garner,1994,John Wiley &amp; Sons Inc,http://images.amazon.com/images/P/002542730X.0...,82
1,277427,0026217457,0,Vegetarian Times Complete Cookbook,Lucy Moll,1995,John Wiley &amp; Sons,http://images.amazon.com/images/P/0026217457.0...,7
2,277427,003008685X,8,Pioneers,James Fenimore Cooper,1974,Thomson Learning,http://images.amazon.com/images/P/003008685X.0...,1
3,277427,0030615321,0,"Ask for May, Settle for June (A Doonesbury book)",G. B. Trudeau,1982,Henry Holt &amp; Co,http://images.amazon.com/images/P/0030615321.0...,1
4,277427,0060002050,0,On a Wicked Dawn (Cynster Novels),Stephanie Laurens,2002,Avon Books,http://images.amazon.com/images/P/0060002050.0...,13
...,...,...,...,...,...,...,...,...,...
487666,275970,1931868123,0,There's a Porcupine in My Outhouse: Misadventu...,Mike Tougias,2002,Capital Books (VA),http://images.amazon.com/images/P/1931868123.0...,1
487667,275970,3411086211,10,Die Biene.,Sybil GrÃ?Â¤fin SchÃ?Â¶nfeldt,1993,"Bibliographisches Institut, Mannheim",http://images.amazon.com/images/P/3411086211.0...,1
487668,275970,3829021860,0,The Penis Book,Joseph Cohen,1999,Konemann,http://images.amazon.com/images/P/3829021860.0...,1
487669,275970,4770019572,0,Musashi,Eiji Yoshikawa,1995,Kodansha International (JPN),http://images.amazon.com/images/P/4770019572.0...,1


In [28]:
final_merge.shape

(487671, 9)

In [29]:
final_rating = final_merge[final_merge['number_rating'] >= 50 ]
display(final_rating)

Unnamed: 0,user_id,isbn,book_rating,title,author,year,publisher,image_url,number_rating
0,277427,002542730X,10,Politically Correct Bedtime Stories: Modern Ta...,James Finn Garner,1994,John Wiley &amp; Sons Inc,http://images.amazon.com/images/P/002542730X.0...,82
13,277427,0060930535,0,The Poisonwood Bible: A Novel,Barbara Kingsolver,1999,Perennial,http://images.amazon.com/images/P/0060930535.0...,133
15,277427,0060934417,0,Bel Canto: A Novel,Ann Patchett,2002,Perennial,http://images.amazon.com/images/P/0060934417.0...,108
18,277427,0061009059,9,One for the Money (Stephanie Plum Novels (Pape...,Janet Evanovich,1995,HarperTorch,http://images.amazon.com/images/P/0061009059.0...,108
24,277427,006440188X,0,The Secret Garden,Frances Hodgson Burnett,1998,HarperTrophy,http://images.amazon.com/images/P/006440188X.0...,79
...,...,...,...,...,...,...,...,...,...
487505,275970,1400031354,0,Tears of the Giraffe (No.1 Ladies Detective Ag...,Alexander McCall Smith,2002,Anchor,http://images.amazon.com/images/P/1400031354.0...,84
487506,275970,1400031362,0,Morality for Beautiful Girls (No.1 Ladies Dete...,Alexander McCall Smith,2002,Anchor,http://images.amazon.com/images/P/1400031362.0...,60
487579,275970,1573229725,0,Fingersmith,Sarah Waters,2002,Riverhead Books,http://images.amazon.com/images/P/1573229725.0...,59
487618,275970,1586210661,9,Me Talk Pretty One Day,David Sedaris,2001,Time Warner Audio Major,http://images.amazon.com/images/P/1586210661.0...,146


In [30]:
final_rating.shape

(61853, 9)

In [31]:
# verify duplicates
print(" --------- Step 12 --------- ")

dummy = final_rating.copy()
dummy['is_duplicate'] = dummy.duplicated(subset=['title', 'user_id'], keep=False)
dummy = dummy.sort_values(by=['user_id', 'title'])

display(dummy.sort_values(by='is_duplicate').tail(20))
#display(dummy.sort_values(by='is_duplicate'))

 --------- Step 12 --------- 


Unnamed: 0,user_id,isbn,book_rating,title,author,year,publisher,image_url,number_rating,is_duplicate
246783,143175,1586215361,0,The Jester,James Patterson,2003,Time Warner Audio Major,http://images.amazon.com/images/P/1586215361.0...,68,True
246156,143175,0316602051,9,The Jester,James Patterson,2003,"Little, Brown",http://images.amazon.com/images/P/0316602051.0...,68,True
128412,76151,0440194903,0,When the Wind Blows,John Saul,1990,Dell,http://images.amazon.com/images/P/0440194903.0...,175,True
128338,76151,0316693324,10,When the Wind Blows,James Patterson,1998,Replica Books,http://images.amazon.com/images/P/0316693324.0...,175,True
462193,258938,0446676098,10,The Notebook,Nicholas Sparks,1999,Warner Books,http://images.amazon.com/images/P/0446676098.0...,241,True
462188,258938,0446605239,10,The Notebook,Nicholas Sparks,1998,Warner Books,http://images.amazon.com/images/P/0446605239.0...,241,True
246781,143175,1586212869,0,The Beach House,James Patterson,2002,Time Warner Audio Major,http://images.amazon.com/images/P/1586212869.0...,142,True
17498,11676,0330332775,0,Bridget Jones's Diary,Helen Fielding,1997,Picador (UK),http://images.amazon.com/images/P/0330332775.0...,277,True
246392,143175,0446612545,9,The Beach House,James Patterson,2003,Warner Books,http://images.amazon.com/images/P/0446612545.0...,142,True
298778,173415,067942573X,0,Taltos: Lives of the Mayfair Witches,Anne Rice,1994,Alfred A. Knopf,http://images.amazon.com/images/P/067942573X.0...,53,True


In [32]:
final_df=final_rating.drop_duplicates(subset=['title', 'user_id'])

In [33]:
dummy2 = final_df.copy()
dummy2['is_duplicate'] = dummy2.duplicated(subset=['title', 'user_id'], keep=False)
dummy2 = dummy2.sort_values(by=['title', 'user_id'])
display(dummy2.sort_values(by='is_duplicate'))


Unnamed: 0,user_id,isbn,book_rating,title,author,year,publisher,image_url,number_rating,is_duplicate
182516,104636,0451524934,0,1984,George Orwell,1990,Signet Book,http://images.amazon.com/images/P/0451524934.0...,76,False
261553,153662,0060096195,0,The Boy Next Door,Meggin Cabot,2002,Avon Trade,http://images.amazon.com/images/P/0060096195.0...,55,False
276373,162639,0060096195,0,The Boy Next Door,Meggin Cabot,2002,Avon Trade,http://images.amazon.com/images/P/0060096195.0...,55,False
279049,164027,0060096195,7,The Boy Next Door,Meggin Cabot,2002,Avon Trade,http://images.amazon.com/images/P/0060096195.0...,55,False
281176,165308,0060096195,0,The Boy Next Door,Meggin Cabot,2002,Avon Trade,http://images.amazon.com/images/P/0060096195.0...,55,False
...,...,...,...,...,...,...,...,...,...,...
155588,91832,0440414806,0,Holes (Yearling Newbery),LOUIS SACHAR,2000,Yearling,http://images.amazon.com/images/P/0440414806.0...,55,False
172481,98741,0440414806,0,Holes (Yearling Newbery),LOUIS SACHAR,2000,Yearling,http://images.amazon.com/images/P/0440414806.0...,55,False
180563,102967,0449702545,8,Homecoming,Cynthia Voigt,1990,Fawcett Books,http://images.amazon.com/images/P/0449702545.0...,61,False
79138,38273,0440414806,10,Holes (Yearling Newbery),LOUIS SACHAR,2000,Yearling,http://images.amazon.com/images/P/0440414806.0...,55,False


In [34]:
final_df.shape

(59850, 9)

In [35]:
print(" Epic 3 - preparing data and training Nearest Neighbors Model " )

 Epic 3 - preparing data and training Nearest Neighbors Model 


In [36]:
book_pivot=final_df.pivot_table(columns="user_id", index="title", values="book_rating")
book_pivot

user_id,100459,100644,100846,100906,101209,101305,101851,101876,102275,102359,...,95932,95991,96054,96448,97874,98297,98391,9856,98741,98758
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1984,,,,,,,,,,,...,,,,,,,,,,
1st to Die: A Novel,,,,,,,,,,,...,,,,,,,,,,
2nd Chance,,0.0,,,,,,,9.0,,...,,,,,,,9.0,,,
4 Blondes,,,,,,,,,,,...,,,,,,,,,,
84 Charing Cross Road,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Year of Wonders,,,,9.0,,8.0,,,,,...,,,,,,,,,,
You Belong To Me,,,,,,,,,0.0,,...,,,,,,,,,,
Zen and the Art of Motorcycle Maintenance: An Inquiry into Values,,,,,0.0,,,,,,...,,,,,,,,,,
Zoya,,,,,,,,,,,...,,,,,,,,,,


In [37]:
book_pivot.shape

(742, 888)

In [38]:
book_pivot.fillna(0, inplace=True)
book_pivot

user_id,100459,100644,100846,100906,101209,101305,101851,101876,102275,102359,...,95932,95991,96054,96448,97874,98297,98391,9856,98741,98758
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1984,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1st to Die: A Novel,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2nd Chance,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,9.0,0.0,0.0,0.0
4 Blondes,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
84 Charing Cross Road,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Year of Wonders,0.0,0.0,0.0,9.0,0.0,8.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
You Belong To Me,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Zen and the Art of Motorcycle Maintenance: An Inquiry into Values,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Zoya,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [39]:
# model preparation and training
book_sparse = csr_matrix(book_pivot) # fix prob 0

In [40]:
model = NearestNeighbors(algorithm="brute")

In [41]:
model.fit(book_sparse)

In [103]:
# pre test ( skip, use it after executing the whole code )

# find book to test
# keyword = "kill"
# matching_books = book_pivot.index[books_name.str.contains(keyword, case=False, na=False)]
# display(matching_books)

Index(['A Time to Kill',
       'K Is for Killer (Kinsey Millhone Mysteries (Paperback))', 'Killjoy',
       'The Killing Game: Only One Can Win...and the Loser Dies',
       'To Kill a Mockingbird'],
      dtype='object', name='title')

In [43]:
# pre test ( skip )

# find id of book to test
#book_index = book_pivot.index.get_loc('Harry Potter and the Chamber of Secrets (Book 2)')
book_index = np.where(book_pivot.index == 'Harry Potter and the Chamber of Secrets (Book 2)')[0][0]
display(book_index)

237

In [44]:
# test 1
distance, suggestion = model.kneighbors(book_pivot.iloc[200,:].values.reshape(1,-1), n_neighbors=6)

In [45]:
distance

array([[ 0.        , 22.24859546, 22.60530911, 23.43074903, 23.68543856,
        24.14539294]])

In [46]:
suggestion

array([[200, 372, 485, 320, 184, 536]], dtype=int64)

In [47]:
for i in range(len(suggestion)):
    print(book_pivot.index[suggestion[i]])

Index(['Fatal Cure', 'No Safe Place', 'Table For Two', 'Long After Midnight',
       'Exclusive', 'The Cradle Will Fall'],
      dtype='object', name='title')


In [48]:
# test 2
def recommend_book(name):
    book_id=book_pivot.index.get_loc(name)
    distance, suggestion = model.kneighbors(book_pivot.iloc[book_id,:].values.reshape(1,-1), n_neighbors=6)
    for i in range(len(suggestion)):
        result=book_pivot.index[suggestion[i]]
        for j in result:
            print(j)

recommend_book('To Kill a Mockingbird')

To Kill a Mockingbird
Drowning Ruth
Master of the Game
Echoes
Pleading Guilty
Winter Moon


In [49]:
# will be used later on
books_name = book_pivot.index
books_name

Index(['1984', '1st to Die: A Novel', '2nd Chance', '4 Blondes',
       '84 Charing Cross Road', 'A Bend in the Road', 'A Case of Need',
       'A Child Called \It\": One Child's Courage to Survive"',
       'A Civil Action', 'A Cry In The Night',
       ...
       'Winter Solstice', 'Wish You Well', 'Without Remorse',
       'Wizard and Glass (The Dark Tower, Book 4)', 'Wuthering Heights',
       'Year of Wonders', 'You Belong To Me',
       'Zen and the Art of Motorcycle Maintenance: An Inquiry into Values',
       'Zoya', '\O\" Is for Outlaw"'],
      dtype='object', name='title', length=742)

In [50]:
print(" Epic 4 - Saving model " )

 Epic 4 - Saving model 


In [51]:
pickle.dump(model, open("artifacts/model.pkl", "wb"))
pickle.dump(books_name, open("artifacts/books_name.pkl", "wb"))
pickle.dump(book_pivot, open("artifacts/book_pivot.pkl", "wb"))
pickle.dump(final_df, open("artifacts/final_rating.pkl", "wb"))