## ML - Recommender Systems - Lab
We work on a dataset with 3 tables


1.   Books
2.   Users
3.   Ratings



In [1]:
# The following imports are needed throughout the lab exercise
import re
import pickle
import operator
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from collections import Counter
from scipy.sparse import csr_matrix
from pandas.api.types import is_numeric_dtype
from sklearn.neighbors import NearestNeighbors
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

import warnings
warnings.filterwarnings("ignore")

##0 - Read in the dataset

In [2]:
# All three tables share the same layout: ';'-separated, ISO-8859-1 encoded,
# and rows that cannot be parsed are skipped instead of aborting the load.
csvOptions = {'delimiter': ';', 'encoding': 'ISO-8859-1', 'on_bad_lines': 'skip'}

books = pd.read_csv('Books/Books.csv', **csvOptions)
users = pd.read_csv('Books/Users.csv', **csvOptions)
ratings = pd.read_csv('Books/Ratings.csv', **csvOptions)

#books.shape
#users.shape
#ratings.shape

##1 - Data pre-processing
The following steps should be done on the datasets: 

**Books**


*   Drop the Image URL features
*   Check for null values, replace with 'Other' (Hint: use `.loc` and `.iloc`)
*   Check the `Year-Of-Publication` column, what's wrong there? (Before 'fixing' the issue, it is better to increase the column width so the entire value of `Book-Title` can be seen; there is hidden information in that field.)
*   There are three tuples with another anomaly, can you find them? Fix it manually
*   Convert publication year to `int`
*   Replace invalid years with the mode of the year column
*   Make sure the ISBN column is in uppercase, remove duplicate rows

In [3]:
#Fill in the code
books.columns

Index(['ISBN', 'Book-Title', 'Book-Author', 'Year-Of-Publication', 'Publisher',
       'Image-URL-S', 'Image-URL-M', 'Image-URL-L'],
      dtype='object')

In [4]:
books.head()

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...
4,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...


In [None]:
# Drop the image URL features
#books.drop(fill in ...)
books.head()

In [None]:
# Check for null values, replace with 'Other'
#books. 

In [None]:
#

In [None]:
#

In [None]:
# use books.iloc ...

In [None]:
# Check the year column, what's wrong there? 
books['Year-Of-Publication'].unique()
# There are publishers in the year column ... 

In [None]:
pd.set_option('display.max_colwidth', None)

In [None]:
#books.loc[books['...'] == 'DK Publishing Inc', :]

In [None]:
#books.loc[books['Year-Of-Publication'] == '...',]

In [None]:
# assign in a single .loc step: chained books.iloc[i]['col'] = ... only modifies a temporary copy
#books.loc[209538, 'Publisher'] = 'DK Publishing Inc'
#books.loc[209538, 'Year-Of-Publication'] = 2000
#books.loc[209538, 'Book-Title'] = 'DK Readers: Creating the X-Men, How It All Began (Level 4: Proficient Readers)'
#books.loc[209538, 'Book-Author'] = 'Michael Teitelbaum'

# do something similar for the other two books

In [None]:
# Convert the year column to int and print a list of the publication years
# ...

In [None]:
# Replace invalid years with the mode of the year column

# here we choose to use the built-in mode() function of the statistics library
# it is possible to handcraft your own method as well
import statistics
# modeYear is the most frequent value found in the whole column as it stands at this point,
# so any invalid years still present take part in the count
modeYear = statistics.mode(books['Year-Of-Publication'])
modeYear

In [None]:
# replace the invalid publication years with the mode calculated above

In [None]:
# Make sure the ISBN column is in uppercase, remove duplicates
# ...

books.info()

In [None]:
books.head(3)

**Ratings**


*   Check for null values.
*   Cast `Rating` and `User-ID` column to `int`
*   Make sure the ISBN column values are aligned with those of the `Books` table, otherwise drop the record.
*   Make sure the ISBN column is in uppercase, remove duplicate rows

In [None]:
ratings.columns

In [None]:
ratings.head()

In [None]:
# Check for null values


In [None]:
# Check Rating and User-ID column
print(is_numeric_dtype(ratings['Book-Rating']))
print(is_numeric_dtype(ratings['User-ID']))

In [None]:
# Make sure the ISBN column values are aligned with those of the Books table, otherwise drop
# This code is given

## checking ISBN
# Prints "True" when every ISBN in ratings consists only of letters and digits,
# and "False" when at least one ISBN contains any other character.
reg = "[^A-Za-z0-9]"
k = []

# every ISBN is evaluated (no short-circuit), one boolean per row
hasExtraCharacters = [re.search(reg, isbn) is not None for isbn in ratings['ISBN']]
flag = 1 if any(hasExtraCharacters) else 0

print("False" if flag == 1 else "True")

In [None]:
## remove extra characters from ISBN (from ratings dataset) existing in books dataset
# An ISBN in ratings is only rewritten when stripping its non-alphanumeric characters
# yields an ISBN that occurs in books; every other value is left untouched.
bookISBN = books['ISBN'].tolist()
# set membership is constant time; scanning the list once per rating row was quadratic
knownISBN = set(bookISBN)
reg = "[^A-Za-z0-9]"

# vectorized substitution instead of a Python-level iterrows() loop
strippedISBN = ratings['ISBN'].str.replace(reg, "", regex=True)
# a value that changes under the substitution is exactly one that contained an extra character
fixable = (strippedISBN != ratings['ISBN']) & strippedISBN.isin(knownISBN)
ratings.loc[fixable, 'ISBN'] = strippedISBN[fixable]

In [None]:
# Make sure the ISBN column is in uppercase, remove duplicates
# ...

ratings.info()

In [None]:
ratings.head()

**Users**


*   Check for null values.
*   Replace the invalid ages (age should be between 10 and 80) in the `Age` column with the mean value.
*   Split the `Location` column into `City`, `State` and `Country`; if a value is `null`, replace it with `other`.
*   Remove duplicate rows

In [None]:
users.columns

In [None]:
users.head(3)

In [None]:
# Check for null values
# ...

In [None]:
# Replace the invalid ages, fill null values and change datatype to int
print(sorted(list(users['Age'].unique())))

In [None]:
validAge = #...

mean = #...
mean

In [None]:
# ...

In [None]:
# Split the location column appropriately
# this code is given

# Location entries that carry no usable information
invalidParts = {' ', '', 'n/a', ','}


def locationPart(parts, position):
    """Return the lower-cased entry of `parts` at `position`.

    Returns None when the location is missing (not a list), has no entry at
    `position`, or holds one of the placeholder values in `invalidParts`.
    """
    if not isinstance(parts, list) or len(parts) <= position:
        return None
    value = parts[position]
    return None if value in invalidParts else value.lower()


list_ = users.Location.str.split(', ')

city = []
state = []
country = []
count_no_state = 0
count_no_country = 0

# iterate over the values themselves so the loop does not depend on the index labels being 0..n-1
for parts in list_:
    cityPart = locationPart(parts, 0)
    statePart = locationPart(parts, 1)
    # the country is validated on entry 2 only; the earlier check looked at entry 1 for ',',
    # which discarded a valid country whenever the state slot held ','
    countryPart = locationPart(parts, 2)

    # a city written as "city/state" keeps only the part before the '/', the state has its own column
    city.append('other' if cityPart is None else cityPart.split('/')[0])

    if statePart is None:
        count_no_state += 1
        state.append('other')
    else:
        state.append(statePart)

    if countryPart is None:
        count_no_country += 1
        country.append('other')
    else:
        country.append(countryPart)

# build the new columns on the same index as users so the concat lines up row by row
locationColumns = pd.DataFrame({'City': city, 'State': state, 'Country': country}, index=users.index)
users = pd.concat([users.drop('Location', axis=1), locationColumns], axis=1)

print(count_no_country)   # number of users without a usable country
print(count_no_state)     # number of users without a usable state

In [None]:
# Remove duplicate rows and don't forget to reset the index!!
# ...

In [None]:
users.info()

In [None]:
users.head()

**Merge the tables**

Merge (inner join) the `Books`, `Users` and `Ratings` tables to one

In [None]:
# Inner joins: a row survives only if its ISBN occurs in both books and ratings
# and its User-ID occurs in users.
booksWithRatings = books.merge(ratings, how='inner', on='ISBN')
dataset = booksWithRatings.merge(users, how='inner', on='User-ID')
dataset.info()

**Split the dataset**

Splitting is done on the `Book-Rating` column: it can hold an implicit rating (equal to 0) or an explicit rating (!= 0).

In [None]:
# explicit ratings: every row whose Book-Rating differs from 0, renumbered from 0
isExplicit = dataset['Book-Rating'].ne(0)
explicit = dataset.loc[isExplicit].reset_index(drop=True)
explicit.shape

In [None]:
# implicit ratings: every row whose Book-Rating equals 0, renumbered from 0
isImplicit = dataset['Book-Rating'].eq(0)
implicit = dataset.loc[isImplicit].reset_index(drop=True)
implicit.shape

##2 - Recommendation systems

**2.1 - Most popular book**

Sort the entire dataset on the ratings and recommend the top $n$ books.

**2.2 - Most popular book in a country**

Sort the entire dataset like in 2.1, but now filter according to a country to recommend the top $n$ books.

**2.3 - Use the average weighted rating**

Calculate the weighted score with: 
\begin{equation}WR = \left({{v} \over {v} + {m}} \cdot R\right) + \left({{ m} \over { v} + { m}} \cdot C\right) \end{equation}

where:

*   $v$ (```rating_count```) is the number of ratings received
*   $m$ is the minimum number of total ratings to be listed
*   $R$ (```rating_average```) is the average rating of the book
*   $C$ is the mean rating across all books

**2.4 Collaborative Filter - User-item based**

CF system with cosine similarity in ratings by different users to recommend the books. Prefer to take a subset of books with high ratings.

**2.5 Content based Filter**

Calculate similarities in Book Titles, make use of the TF-IDF feature vector. Because of computational issues, prefer only books with very high ratings.


In [None]:
# We make sure our system is dynamic, so we first ask the user for a book name and the number of books to recommend
bookName = input("What book do you like? ") # Harry Potter and the Sorcerer's Stone (Harry Potter (Paperback))
nrOfRec = int(input("How many books do you want us to recommend? ")) #5

In [None]:
# Most popular book
def mostPopularBook(dataframe, n):
  if (n >= 1 and n <= len(dataframe)):
    data = #...
    result = #...
    return result
  return "Invalid nr of books entered."

In [None]:
print("We recommend ", nrOfRec, " popular books: ")
mostPopularBook(explicit, nrOfRec)

In [None]:
# Most popular book in a country
place = input("Enter the name of a country: ").lower() # India
booksPerCountry = #...
mostPopularBook(booksPerCountry,nrOfRec)

In [None]:
# Average Weighted Rating
def avgWR(newdf, df):
  newdf['Average Rating'] = 0
  for x in range(len(newdf)):
    #...
  return newdf

In [None]:
# One row per distinct title with the number of explicit ratings it received,
# in the order produced by value_counts().
counts = explicit['Book-Title'].value_counts()
df = pd.DataFrame({
    'Book-Title': counts.index,
    'Total-Ratings': counts.to_numpy(),
})

#df = avgWR(df, explicit)
#df.to_pickle('weightedData.pkl')

In [None]:
# if the pickle file is already made:
df = pd.read_pickle('Books/weightedData.pkl')


In [None]:
C = #...
m = #...

In [None]:
def WR(x,m=m,C=C):
  v = #...
  R = #...
  return #...

In [None]:
df = #df.loc[...]

df['score'] = df.apply(WR, axis=1)
df = df.sort_values('score', ascending=False)

print("Because you like ", bookName, ", we recommend the following ", nrOfRec, " books: \n")
df.head(nrOfRec)

In [None]:
# Collaborative Filter (User-Item based)
# Total-Ratings is the number of explicit ratings per title; it is attached to every
# rating row here so the frame can be filtered on it afterwards.
counts = explicit['Book-Title'].value_counts()
ratingTotals = pd.DataFrame({
    'Book-Title': counts.index,
    'Total-Ratings': counts.to_numpy(),
})

df = explicit.merge(ratingTotals, on='Book-Title', how='left')
# columns that play no role in the collaborative filter
unusedColumns = ['Year-Of-Publication', 'Publisher', 'Age', 'City', 'State', 'Country']
df = df.drop(unusedColumns, axis=1)

In [None]:
# set the threshold to 60 and keep only the books with at least 60 ratings (Total-Ratings >= 60)
#...


In [None]:
# User-Item CF
# first create a new dataframe grouped by ISBN number (index) and only containing User-ID and Book-Rating
userRatingDf = #..
#...


In [None]:
# Vectorizing the group keys and calculating the similarity matrix
# Each group becomes one dictionary (first column -> second column of the group) and
# therefore one row of `vector`; indexMap / reverseIndexMap translate between the
# row number and the group key in both directions.
listOfDictonaries=[]
indexMap = {}
reverseIndexMap = {}

for row, groupKey in enumerate(userRatingDf.groups.keys()):  #this runs over all isbn numbers
    groupDF = userRatingDf.get_group(groupKey)
    # pair the two columns in one pass instead of one .iloc scalar lookup per cell;
    # presumably column 0 is User-ID and column 1 is Book-Rating -- confirm against userRatingDf
    tempDict = dict(zip(groupDF.iloc[:, 0].to_numpy(), groupDF.iloc[:, 1].to_numpy()))
    indexMap[row] = groupKey
    reverseIndexMap[groupKey] = row
    listOfDictonaries.append(tempDict)

dictVectorizer = DictVectorizer(sparse=True)
vector = dictVectorizer.fit_transform(listOfDictonaries)
pairwiseSimilarity = #cosine similarity on vector

In [None]:
# make a function to print all book details
def printDetails(bookID):
  #print(...)

In [None]:
# make a function to give the top n recommendations
def getCFRec(bookID):
  CFRecList = []
  row = reverseIndexMap[bookID]
  print("Input book: ", printDetails(bookID), "\n recommendations: \n")
  mn=0
  similar=[]
  print(row)
  #for i in np.argsort(pairwiseSimilarity[row])[:-2][::-1]:
    # ...
    # ...
    # ...
      #CFRecList.append(...)
  return CFRecList

In [None]:
k = list(explicit['Book-Title'])
m = list(explicit['ISBN'])

CFRecList = getCFRec(m[k.index(bookName)])

In [None]:
# Content Based Filter
# we first select books with a very high rating > 80
treshold = #...
#...

In [None]:
# create the TF-IDF vectorizer
#...
#...
tfidf_matrix.shape

In [None]:
# normalize the dataframe and calculate the cosine similarity matrix
#...
#...
cosine_similarities.shape

In [None]:
# print the recommended books according to the content filter
print("The content filter recommends the following books:\n")
isbn = books.loc[books['Book-Title'] == bookName].reset_index(drop = True).iloc[0]['ISBN']
content = []

idx = subsetExplicit.index[subsetExplicit['ISBN'] == isbn].tolist()[0]
similar_indices = cosine_similarities[idx].argsort()[::-1]
similar_items = []
for i in similar_indices:
#...
#...
#...

for book in similar_items:
    print(book)