## ML - Recommender Systems - Lab - SOLUTION
We work on a dataset with 3 tables


1.   Books
2.   Users
3.   Ratings



In [3]:
# The following imports are needed throughout the lab exercise
import re
import pickle
import operator
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from collections import Counter
from scipy.sparse import csr_matrix
from pandas.api.types import is_numeric_dtype
from sklearn.neighbors import NearestNeighbors
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

import warnings
warnings.filterwarnings("ignore")

##0 - Read in the dataset

In [4]:
books = pd.read_csv('Books/Books.csv', delimiter=';', encoding='ISO-8859-1', on_bad_lines='skip')
users = pd.read_csv('Books/Users.csv', delimiter=';', encoding='ISO-8859-1', on_bad_lines='skip')
ratings = pd.read_csv('Books/Ratings.csv', delimiter=';', encoding='ISO-8859-1', on_bad_lines='skip')

books.shape
users.shape
ratings.shape

(1149780, 3)

##1 - Data pre-processing
The following steps should be done on the datasets: 

**Books**


*   Drop the Image URL features
*   Check for null values, replace with 'Other' (Hint: use `.loc` and `.iloc`)
*   Check the `Year-Of-Publication` column, what's wrong there? (Before 'fixing' the issue, it is better to increase the column width so the entire value of `Book-Title`can be seen (there is hidden information in that field)
*   There are three tuples with another anomaly, can you find them? Fix it manually
*   Convert publication year to `int`
*   Replace invalid years with the modus of the year column
*   Make sure the ISBN column is in uppercase, remove duplicate rows

In [5]:
#Fill in the code
books.columns

Index(['ISBN', 'Book-Title', 'Book-Author', 'Year-Of-Publication', 'Publisher',
       'Image-URL-S', 'Image-URL-M', 'Image-URL-L'],
      dtype='object')

In [6]:
books.head()

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...
4,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...


In [7]:
# Drop the image URL features
books.drop(['Image-URL-S', 'Image-URL-M', 'Image-URL-L'], axis=1, inplace=True)
books.head()

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux
4,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company


In [8]:
# Check for null values, replace with 'Other'
books.isnull().sum()

ISBN                   0
Book-Title             0
Book-Author            2
Year-Of-Publication    0
Publisher              2
dtype: int64

In [9]:
books.loc[books['Book-Author'].isnull(),:]

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher
118033,751352497,A+ Quiz Masters:01 Earth,,1999,Dorling Kindersley
187689,9627982032,The Credit Suisse Guide to Managing Your Perso...,,1995,Edinburgh Financial Publishing


In [10]:
books.loc[books['Publisher'].isnull(),:]

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher
128890,193169656X,Tyrant Moon,Elaine Corvidae,2002,
129037,1931696993,Finders Keepers,Linnea Sinclair,2001,


In [11]:
books.iloc[187689]['Book-Author'] = 'Other'
books.iloc[128890]['Publisher'] = 'Other'
books.iloc[129037]['Publisher'] = 'Other'

In [12]:
# Check the year column, what's wrong there? 
books['Year-Of-Publication'].unique()
# There are publishers in the year column ... 

array([2002, 2001, 1991, 1999, 2000, 1993, 1996, 1988, 2004, 1998, 1994,
       2003, 1997, 1983, 1979, 1995, 1982, 1985, 1992, 1986, 1978, 1980,
       1952, 1987, 1990, 1981, 1989, 1984, 0, 1968, 1961, 1958, 1974,
       1976, 1971, 1977, 1975, 1965, 1941, 1970, 1962, 1973, 1972, 1960,
       1966, 1920, 1956, 1959, 1953, 1951, 1942, 1963, 1964, 1969, 1954,
       1950, 1967, 2005, 1957, 1940, 1937, 1955, 1946, 1936, 1930, 2011,
       1925, 1948, 1943, 1947, 1945, 1923, 2020, 1939, 1926, 1938, 2030,
       1911, 1904, 1949, 1932, 1928, 1929, 1927, 1931, 1914, 2050, 1934,
       1910, 1933, 1902, 1924, 1921, 1900, 2038, 2026, 1944, 1917, 1901,
       2010, 1908, 1906, 1935, 1806, 2021, '2000', '1995', '1999', '2004',
       '2003', '1990', '1994', '1986', '1989', '2002', '1981', '1993',
       '1983', '1982', '1976', '1991', '1977', '1998', '1992', '1996',
       '0', '1997', '2001', '1974', '1968', '1987', '1984', '1988',
       '1963', '1956', '1970', '1985', '1978', '1973', '1980'

In [13]:
pd.set_option('display.max_colwidth', None)

In [14]:
books.loc[books['Year-Of-Publication'] == 'DK Publishing Inc', :]

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher
209538,078946697X,"DK Readers: Creating the X-Men, How It All Began (Level 4: Proficient Readers)\"";Michael Teitelbaum""",2000,DK Publishing Inc,http://images.amazon.com/images/P/078946697X.01.THUMBZZZ.jpg
221678,0789466953,"DK Readers: Creating the X-Men, How Comic Books Come to Life (Level 4: Proficient Readers)\"";James Buckley""",2000,DK Publishing Inc,http://images.amazon.com/images/P/0789466953.01.THUMBZZZ.jpg


In [15]:
books.loc[books['Year-Of-Publication'] == 'Gallimard',]

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher
220731,2070426769,"Peuple du ciel, suivi de 'Les Bergers\"";Jean-Marie Gustave Le ClÃ?Â©zio""",2003,Gallimard,http://images.amazon.com/images/P/2070426769.01.THUMBZZZ.jpg


In [16]:
books.iloc[205538]['Publisher'] = 'DK Publishing Inc'
books.iloc[209538]['Year-Of-Publication'] = 2000
books.iloc[209538]['Book-Title'] = 'DK Readers: Creating the X-Men, How It All Began (Level 4: Proficient Readers)'
books.iloc[209538]['Book-Author'] = 'Michael Teitelbaum'

books.iloc[221678]['Publisher'] = 'DK Publishing Inc'
books.iloc[221678]['Year-Of-Publication'] = 2000
books.iloc[209538]['Book-Title'] = 'DK Readers: Creating the X-Men, How Comic Books Come to Life (Level 4: Proficient Readers)'
books.iloc[209538]['Book-Author'] = 'James Buckley'

books.iloc[220731]['Publisher'] = 'Gallimard'
books.iloc[220731]['Year-Of-Publication'] = '2003'
books.iloc[209538]['Book-Title'] = 'Peuple du ciel - Suivi de Les bergers '
books.iloc[209538]['Book-Author'] = 'Jean-Marie Gustave Le ClÃ?Â©zio'

In [17]:
# Convert the year column to int
books['Year-Of-Publication'] = books['Year-Of-Publication'].astype(int)
print(sorted(list(books['Year-Of-Publication'].unique())))

[np.int64(0), np.int64(1376), np.int64(1378), np.int64(1806), np.int64(1897), np.int64(1900), np.int64(1901), np.int64(1902), np.int64(1904), np.int64(1906), np.int64(1908), np.int64(1909), np.int64(1910), np.int64(1911), np.int64(1914), np.int64(1917), np.int64(1919), np.int64(1920), np.int64(1921), np.int64(1922), np.int64(1923), np.int64(1924), np.int64(1925), np.int64(1926), np.int64(1927), np.int64(1928), np.int64(1929), np.int64(1930), np.int64(1931), np.int64(1932), np.int64(1933), np.int64(1934), np.int64(1935), np.int64(1936), np.int64(1937), np.int64(1938), np.int64(1939), np.int64(1940), np.int64(1941), np.int64(1942), np.int64(1943), np.int64(1944), np.int64(1945), np.int64(1946), np.int64(1947), np.int64(1948), np.int64(1949), np.int64(1950), np.int64(1951), np.int64(1952), np.int64(1953), np.int64(1954), np.int64(1955), np.int64(1956), np.int64(1957), np.int64(1958), np.int64(1959), np.int64(1960), np.int64(1961), np.int64(1962), np.int64(1963), np.int64(1964), np.int64(1

In [18]:
# Replace invalid years with the modus of the year column

# here we choose to use the built-in mode() function of the statistics library
# it is possible to handcraft your own method as well
import statistics
modeYear = statistics.mode(books['Year-Of-Publication'])
modeYear

2002

In [19]:
books.loc[books['Year-Of-Publication'] > 2021, 'Year-Of-Publication'] = modeYear
books.loc[books['Year-Of-Publication'] == 0, 'Year-Of-Publication'] = modeYear

In [20]:
# Make sure the ISBN column is in uppercase, remove duplicates
books['ISBN'] = books['ISBN'].str.upper()

books.drop_duplicates(keep='last', inplace=True)
books.reset_index(drop=True, inplace=True)

books.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 271047 entries, 0 to 271046
Data columns (total 5 columns):
 #   Column               Non-Null Count   Dtype 
---  ------               --------------   ----- 
 0   ISBN                 271047 non-null  object
 1   Book-Title           271047 non-null  object
 2   Book-Author          271046 non-null  object
 3   Year-Of-Publication  271047 non-null  int64 
 4   Publisher            271047 non-null  object
dtypes: int64(1), object(4)
memory usage: 10.3+ MB


In [21]:
books.head(3)

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial


**Ratings**


*   Check for null values.
*   Cast `Rating` and `User-ID` column to `int`
*   Make sure the ISBN column values are aligned with those of the `Books` table, otherwise drop the record.
*   Make sure the ISBN column is in uppercase, remove duplicate rows

In [22]:
ratings.columns

Index(['User-ID', 'ISBN', 'Book-Rating'], dtype='object')

In [23]:
ratings.head()

Unnamed: 0,User-ID,ISBN,Book-Rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6


In [24]:
# Check for null values
ratings.isnull().sum()

User-ID        0
ISBN           0
Book-Rating    0
dtype: int64

In [25]:
# Check Rating and User-ID column
print(is_numeric_dtype(ratings['Book-Rating']))
print(is_numeric_dtype(ratings['User-ID']))

True
True


In [26]:
# Make sure the ISBN column values are aligned with those of the Books table, otherwise drop
# This code is given

## checking ISBN
flag = 0
k =[]
reg = "[^A-Za-z0-9]"

for x in ratings['ISBN']:
    z = re.search(reg,x)    
    if z:
        flag = 1

if flag == 1:
    print("False")
else:
    print("True")

False


In [27]:
## remove extra characters from ISBN (from ratings dataset) existing in books dataset
bookISBN = books['ISBN'].tolist() 
reg = "[^A-Za-z0-9]" 
for index, row_Value in ratings.iterrows():
    z = re.search(reg, row_Value['ISBN'])    
    if z:
        f = re.sub(reg,"",row_Value['ISBN'])
        if f in bookISBN:
            ratings.at[index , 'ISBN'] = f

In [28]:
# Make sure the ISBN column is in uppercase, remove duplicates
ratings['ISBN'] = ratings['ISBN'].str.upper()

ratings.drop_duplicates(keep='last', inplace=True)
ratings.reset_index(drop=True, inplace=True)

ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1149776 entries, 0 to 1149775
Data columns (total 3 columns):
 #   Column       Non-Null Count    Dtype 
---  ------       --------------    ----- 
 0   User-ID      1149776 non-null  int64 
 1   ISBN         1149776 non-null  object
 2   Book-Rating  1149776 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 26.3+ MB


In [29]:
ratings.head()

Unnamed: 0,User-ID,ISBN,Book-Rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6


**Users**


*   Check for null values.
*   Replace the invalid ages (age should be between 10 and 80) in the `Age` column with the mean value.
*   Split the `location` column in `City`, `State` and `Country` if a value is `null` replace it with `other`.
*   Remove duplicate rows

In [30]:
users.columns

Index(['User-ID', 'Location', 'Age'], dtype='object')

In [31]:
users.head(3)

Unnamed: 0,User-ID,Location,Age
0,1,"nyc, new york, usa",
1,2,"stockton, california, usa",18.0
2,3,"moscow, yukon territory, russia",


In [32]:
# Check for null values
print(users.isna().sum())
# There are no missing values :-)

User-ID          0
Location         0
Age         110762
dtype: int64


In [33]:
# Replace the invalid ages, fill null values and change datatype to int
print(sorted(list(users['Age'].unique())))

[np.float64(nan), np.float64(0.0), np.float64(1.0), np.float64(2.0), np.float64(3.0), np.float64(4.0), np.float64(5.0), np.float64(6.0), np.float64(7.0), np.float64(8.0), np.float64(9.0), np.float64(10.0), np.float64(11.0), np.float64(12.0), np.float64(13.0), np.float64(14.0), np.float64(15.0), np.float64(16.0), np.float64(17.0), np.float64(18.0), np.float64(19.0), np.float64(20.0), np.float64(21.0), np.float64(22.0), np.float64(23.0), np.float64(24.0), np.float64(25.0), np.float64(26.0), np.float64(27.0), np.float64(28.0), np.float64(29.0), np.float64(30.0), np.float64(31.0), np.float64(32.0), np.float64(33.0), np.float64(34.0), np.float64(35.0), np.float64(36.0), np.float64(37.0), np.float64(38.0), np.float64(39.0), np.float64(40.0), np.float64(41.0), np.float64(42.0), np.float64(43.0), np.float64(44.0), np.float64(45.0), np.float64(46.0), np.float64(47.0), np.float64(48.0), np.float64(49.0), np.float64(50.0), np.float64(51.0), np.float64(52.0), np.float64(53.0), np.float64(54.0), np

In [34]:
validAge = users[users['Age'] <= 80]
validAge = validAge[validAge['Age'] >= 10]

mean = round(validAge['Age'].mean())
mean

35

In [35]:
users.loc[users['Age'] > 80, 'Age'] = mean
users.loc[users['Age'] < 10, 'Age'] = mean
users['Age'] = users['Age'].fillna(mean)
users['Age'] = users['Age'].astype(int)

In [36]:
# Split the location column appropriately
# this code is given

list_ = users.Location.str.split(', ')

city = []
state = []
country = []
count_no_state = 0    
count_no_country = 0

for i in range(0,len(list_)):
    if list_[i][0] == ' ' or list_[i][0] == '' or list_[i][0]=='n/a' or list_[i][0] == ',':  #removing invalid entries too
        city.append('other')
    else:
        city.append(list_[i][0].lower())

    if(len(list_[i])<2):
        state.append('other')
        country.append('other')
        count_no_state += 1
        count_no_country += 1
    else:
        if list_[i][1] == ' ' or list_[i][1] == '' or list_[i][1]=='n/a' or list_[i][1] == ',':   #removing invalid entries 
            state.append('other')
            count_no_state += 1            
        else:
            state.append(list_[i][1].lower())
        
        if(len(list_[i])<3):
            country.append('other')
            count_no_country += 1
        else:
            if list_[i][2] == ''or list_[i][1] == ',' or list_[i][2] == ' ' or list_[i][2] == 'n/a':
                country.append('other')
                count_no_country += 1
            else:
                country.append(list_[i][2].lower())
        
users = users.drop('Location',axis=1)

temp = []
for ent in city:
    c = ent.split('/')            #handling cases where city/state entries from city list as state is already given 
    temp.append(c[0])

df_city = pd.DataFrame(temp,columns=['City'])
df_state = pd.DataFrame(state,columns=['State'])
df_country = pd.DataFrame(country,columns=['Country'])

users = pd.concat([users, df_city], axis=1)
users = pd.concat([users, df_state], axis=1)
users = pd.concat([users, df_country], axis=1)

print(count_no_country)   #printing the number of countries didnt have any values 
print(count_no_state)     #printing the states which didnt have any values

4659
16044


In [37]:
# Remove duplicate rows
users.drop_duplicates(keep='last', inplace=True)
users.reset_index(drop=True, inplace=True)

In [38]:
users.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 278858 entries, 0 to 278857
Data columns (total 5 columns):
 #   Column   Non-Null Count   Dtype 
---  ------   --------------   ----- 
 0   User-ID  278858 non-null  int64 
 1   Age      278858 non-null  int64 
 2   City     278858 non-null  object
 3   State    278858 non-null  object
 4   Country  278858 non-null  object
dtypes: int64(2), object(3)
memory usage: 10.6+ MB


In [39]:
users.head()

Unnamed: 0,User-ID,Age,City,State,Country
0,1,35,nyc,new york,usa
1,2,18,stockton,california,usa
2,3,35,moscow,yukon territory,russia
3,4,17,porto,v.n.gaia,portugal
4,5,35,farnborough,hants,united kingdom


**Merge the tables**

Merge (inner join) the `Books`, `Users` and `Ratings` tables to one

In [40]:
dataset = pd.merge(books, ratings, on='ISBN', how='inner')
dataset = pd.merge(dataset, users, on='User-ID', how='inner')
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1031609 entries, 0 to 1031608
Data columns (total 11 columns):
 #   Column               Non-Null Count    Dtype 
---  ------               --------------    ----- 
 0   ISBN                 1031609 non-null  object
 1   Book-Title           1031609 non-null  object
 2   Book-Author          1031608 non-null  object
 3   Year-Of-Publication  1031609 non-null  int64 
 4   Publisher            1031609 non-null  object
 5   User-ID              1031609 non-null  int64 
 6   Book-Rating          1031609 non-null  int64 
 7   Age                  1031609 non-null  int64 
 8   City                 1031609 non-null  object
 9   State                1031609 non-null  object
 10  Country              1031609 non-null  object
dtypes: int64(4), object(7)
memory usage: 86.6+ MB


**Split the dataset**

Splitting is done on the `Book-Rating` column, it can have an implicit rating (equal to 0) or and explicit rating (!= 0)

In [41]:
explicit = dataset[dataset['Book-Rating'] != 0]
explicit = explicit.reset_index(drop=True)
explicit.shape

(384074, 11)

In [42]:
implicit = dataset[dataset['Book-Rating'] == 0]
implicit = implicit.reset_index(drop=True)
implicit.shape

(647535, 11)

##2 - Recommendation systems

**2.1 - Most popular book**

Sort the entire dataset on the ratings and recommend the top $n$ books.

**2.2 - Most popular book in a country**

Sort the entire dataset like in 2.1, but now filter according to a country to recommend the top $n$ books.

**2.3 - Use the average weighted rating**

Calculate the weighted score with: 
\begin{equation}WR = \left({{v} \over {v} + {m}} \cdot R\right) + \left({{ m} \over { v} + { m}} \cdot C\right) \end{equation}

where:

*   $v$ (```rating_count```) is the number of ratings received
*   $m$ is the minimum number of total ratings to be listed
*   $R$ (```rating_average```) is the average rating of the book
*   $C$ is the mean rating across all books

**2.4 Collaborative Filter - User-item based**

CF system with cosine similarity in ratings by different users to recommend the books. Prefer to take a subset of books with high ratings.

**2.5 Content based Filter**

Calculate similarities in Book Titles, make use of the TF-IDF feature vector. Because of computational issues, prefer only books with very high ratings.


In [43]:
# We make sure our system is dynamic, so we first ask the user for a book name and the number of books to recommend
bookName = input("What book do you like? ") # Harry Potter and the Sorcerer's Stone (Harry Potter (Paperback))
nrOfRec = int(input("How many books do you want us to recommend? ")) #5

In [44]:
# Most popular book
def mostPopularBook(dataframe, n):
  if (n >= 1 and n <= len(dataframe)):
    data = pd.DataFrame(dataframe.groupby('ISBN')['Book-Rating'].count()).sort_values('Book-Rating', ascending=False).head(n)
    result = pd.merge(data, books, on='ISBN')
    return result
  return "Invalid nr of books entered."

In [45]:
print("We recommend ", nrOfRec, " popular books: ")
mostPopularBook(explicit, nrOfRec)

We recommend  4  popular books: 


Unnamed: 0,ISBN,Book-Rating,Book-Title,Book-Author,Year-Of-Publication,Publisher
0,316666343,707,The Lovely Bones: A Novel,Alice Sebold,2002,"Little, Brown"
1,971880107,581,Wild Animus,Rich Shapero,2004,Too Far
2,385504209,488,The Da Vinci Code,Dan Brown,2003,Doubleday
3,312195516,383,The Red Tent (Bestselling Backlist),Anita Diamant,1998,Picador USA


In [46]:
# Most popular book in a country
place = input("Enter the name of a country: ").lower() # India
booksPerCountry = explicit[explicit['Country'] == place]
mostPopularBook(booksPerCountry,nrOfRec)

Unnamed: 0,ISBN,Book-Rating,Book-Title,Book-Author,Year-Of-Publication,Publisher
0,0971880107,3,Wild Animus,Rich Shapero,2004,Too Far
1,2742708448,2,Pourquoi j'ai mangÃ?Â© mon pÃ?Â¨re,Roy Lewis,1996,Actes Sud
2,2266104535,2,Et Si C'Etait Vrai / If This Were Only True,Marc Levy,2001,Pocket
3,184195425X,2,Life of Pi,Yann Martel,2004,Pub Group West


In [47]:
# Average Weighted Rating
def avgWR(newdf, df):
  newdf['Average Rating'] = 0
  for x in range(len(newdf)):
    l = list(df.loc[df['Book-Title'] == newdf['Book-Title'][x]]['Book-Rating'])
    newdf['Average Rating'][x] = sum(l)/len(l)
  return newdf

In [48]:

counts = explicit['Book-Title'].value_counts()
df = pd.DataFrame()
df['Book-Title'] = counts.index
df['Total-Ratings'] = df['Book-Title'].map(lambda item : counts[item])

#df = avgWR(df, explicit)
#df.to_pickle('weightedData.pkl')

In [49]:
# if the pickle file is already made:
df = pd.read_pickle('Books/weightedData.pkl')


In [50]:
C = df['Average Rating'].mean()
m = df['Total-Ratings'].quantile(0.90)

In [51]:
def WR(x,m=m,C=C):
  v = x['Total-Ratings']
  R = x['Average Rating']
  return (v/(v+m)*R) + (m/(m+v)*C)

In [52]:
df = df.loc[df['Total-Ratings'] >= m]

df['score'] = df.apply(WR, axis=1)
df = df.sort_values('score', ascending=False)

print("Because you like ", bookName, ", we recommend the following ", nrOfRec, " books: \n")
df.head(nrOfRec)

Because you like  clara , we recommend the following  4  books: 



Unnamed: 0,Book-Title,Total-Ratings,Average Rating,score
4794,Postmarked Yesteryear: 30 Rare Holiday Postcards,11,10,9.189906
7272,The Sneetches and Other Stories,8,10,9.002961
17,Harry Potter and the Prisoner of Azkaban (Book 3),277,9,8.971768
28,Harry Potter and the Goblet of Fire (Book 4),247,9,8.968407


In [53]:
# Collaborative Filter (User-Item based)
# we pick only books with a rating > 60
counts = explicit['Book-Title'].value_counts()
df = pd.DataFrame()
df['Book-Title'] = counts.index
df['Total-Ratings'] = df['Book-Title'].map(lambda item: counts[item])

df = explicit.merge(df, left_on='Book-Title', right_on='Book-Title', how='left')
df = df.drop(['Year-Of-Publication', 'Publisher', 'Age', 'City', 'State', 'Country'], axis=1)

In [54]:
treshold = 60
subsetExplicit = df[df['Total-Ratings'] >= treshold]
subsetExplicit = subsetExplicit.reset_index(drop=True)
#subsetExplicit.head()

In [55]:
# User-Item CF
# first create a new dataframe grouped by ISBN number (index) and only containing User-ID and Book-Rating
userRatingDf = pd.DataFrame()
userRatingDf['ISBN'] = subsetExplicit['ISBN']
userRatingDf['Book-Rating'] = subsetExplicit['Book-Rating']
userRatingDf['User-ID'] = subsetExplicit['User-ID']
userRatingDf = userRatingDf[['User-ID','Book-Rating']].groupby(userRatingDf['ISBN'])
#userRatingDf.head()

In [56]:
# Vectorizing the group keys and calculating the similarity matrix
listOfDictonaries=[]
indexMap = {}
reverseIndexMap = {}
ptr=0

for groupKey in userRatingDf.groups.keys():  #this runs over all isbn numbers
    tempDict={}
    groupDF = userRatingDf.get_group(groupKey)
    for i in range(0,len(groupDF)):
        tempDict[groupDF.iloc[i,0]] = groupDF.iloc[i,1]
    indexMap[ptr]=groupKey
    reverseIndexMap[groupKey] = ptr
    ptr=ptr+1
    listOfDictonaries.append(tempDict)

dictVectorizer = DictVectorizer(sparse=True)
vector = dictVectorizer.fit_transform(listOfDictonaries)
pairwiseSimilarity = cosine_similarity(vector)

In [57]:
vector

<1500x22940 sparse matrix of type '<class 'numpy.float64'>'
	with 57376 stored elements in Compressed Sparse Row format>

In [58]:
# make a function to print all book details
def printDetails(bookID):
  print(explicit[explicit['ISBN']==bookID]['Book-Title'].values[0])

In [66]:
# make a function to give the top n recommendations
def getCFRec(bookID):
  CFRecList = []
  row = reverseIndexMap[bookID]
  print("Input book: ", printDetails(bookID), "\n recommendations: \n")
  mn=0
  similar=[]
  print(row)
  for i in np.argsort(pairwiseSimilarity[row])[:-2][::-1]:
    if explicit[explicit['ISBN']==indexMap[i]]['Book-Title'].values[0] not in similar:
      if mn>=nrOfRec:
        break
      mn+=1
      similar.append(explicit[explicit['ISBN']==indexMap[i]]['Book-Title'].values[0])
      printDetails(indexMap[i])
      CFRecList.append(explicit[explicit['ISBN']==indexMap[i]]['Book-Title'].values[0])
  return CFRecList

In [65]:
k = list(explicit['Book-Title'])
m = list(explicit['ISBN'])

CFRecList = getCFRec(m[k.index(bookName)])

ValueError: 'clara' is not in list

In [61]:
# Content Based Filter
# we first select books with a very high rating > 80
treshold = 80
subsetExplicit = df[df['Total-Ratings'] >= treshold]
subsetExplicit = subsetExplicit.reset_index(drop=True)

In [62]:
# create the TF-IDF vectorizer
tf = TfidfVectorizer(ngram_range=(1, 2), min_df = 1, stop_words='english')
tfidf_matrix = tf.fit_transform(subsetExplicit['Book-Title'])
tfidf_matrix.shape

(44652, 1112)

In [63]:
# normalize the dataframe and calculate the cosine similarity matrix
normalized_df = tfidf_matrix.astype(np.float32)
cosine_similarities = cosine_similarity(normalized_df, normalized_df)
cosine_similarities.shape

(44652, 44652)

In [64]:
# print the recommended books according to the content filter
print("The content filter recommends the following books:\n")
isbn = books.loc[books['Book-Title'] == bookName].reset_index(drop = True).iloc[0]['ISBN']
content = []

idx = subsetExplicit.index[subsetExplicit['ISBN'] == isbn].tolist()[0]
similar_indices = cosine_similarities[idx].argsort()[::-1]
similar_items = []
for i in similar_indices:
    if subsetExplicit['Book-Title'][i] != bookName and subsetExplicit['Book-Title'][i] not in similar_items and len(similar_items) < nrOfRec:
        similar_items.append(subsetExplicit['Book-Title'][i])
        content.append(subsetExplicit['Book-Title'][i])

for book in similar_items:
    print(book)

The content filter recommends the following books:



IndexError: single positional indexer is out-of-bounds