<a href="https://colab.research.google.com/github/Vaselefth/Recommendation_System_Project/blob/main/Recommendation_System.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
import string
import re
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

# Read the three ';'-separated, ISO-8859-1 encoded CSV files into DataFrames.
# Malformed rows are skipped and reported via `on_bad_lines='warn'`
# (pandas >= 1.3). This replaces `error_bad_lines=False`, which newer pandas
# releases no longer accept, while keeping the same skip-and-warn behaviour.
books = pd.read_csv('BX-Books.csv', sep=';', encoding="ISO-8859-1", on_bad_lines='warn', low_memory=False)
users = pd.read_csv('BX-Users.csv', sep=';', encoding="ISO-8859-1", on_bad_lines='warn', low_memory=False)
ratings = pd.read_csv('BX-Book-Ratings.csv', sep=';', encoding="ISO-8859-1", on_bad_lines='warn', low_memory=False)

# Preprocess
# Drop the columns that we don't need: positional columns 4-7 of books and
# positional columns 1-2 of users.
books = books.drop(columns=books.columns[[4, 5, 6, 7]])
users = users.drop(columns=users.columns[[1, 2]])

# Drop rows with missing values from all datasets
books, users, ratings = books.dropna(), users.dropna(), ratings.dropna()

# 1st step of preprocess. Keep only the ratings of users that have at least
# 5 ratings, then (counting only the ratings that are left) keep only the
# ratings of books that have at least 10 ratings.
User_counts = ratings['User-ID'].value_counts()
kept_user_ids = User_counts[User_counts >= 5].index
ratings = ratings.loc[ratings['User-ID'].isin(kept_user_ids)]

ISBN_counts = ratings['ISBN'].value_counts()
kept_isbns = ISBN_counts[ISBN_counts >= 10].index
ratings = ratings.loc[ratings['ISBN'].isin(kept_isbns)]

# Restrict books and users to the ones that still appear in the filtered
# ratings. The inner merge yields one row per matching rating, so the
# repeated rows are collapsed again with drop_duplicates().
books = books.merge(ratings[['ISBN']], on='ISBN', how='inner').drop_duplicates()
users = users.merge(ratings[['User-ID']], on='User-ID', how='inner').drop_duplicates()

# 2nd step of preprocess. Make book titles a list of keywords with a variety of methods
# Function for removing NonAscii characters
def remove_non_Ascii(title):
    """Return `title` with every character whose code point is >= 128 removed."""
    # Encoding to ASCII with errors="ignore" silently drops every character
    # that is not representable, i.e. everything outside the 0-127 range.
    return title.encode("ascii", "ignore").decode("ascii")

# Function for converting into lower case
def to_lower_case(title):
    """Return a lower-cased copy of `title`."""
    lowered = title.lower()
    return lowered

# Function for removing stop words
# Cache for the stop-word set, filled on the first call of remove_stop_words()
# so the corpus is not queried again for every single title.
_STOP_WORDS_CACHE = {}


def remove_stop_words(title):
    """Return `title` without English stop words.

    The title is split on whitespace, every word found in NLTK's English
    stop-word list is dropped, and the remaining words are joined back with
    single spaces. The comparison is case-sensitive, so callers are expected
    to lower-case the title first.
    """
    stops = _STOP_WORDS_CACHE.get("english")
    if stops is None:
        # Built lazily: importing this module must not require the corpus.
        stops = frozenset(stopwords.words("english"))
        _STOP_WORDS_CACHE["english"] = stops
    return " ".join(word for word in title.split() if word not in stops)

# Function for removing punctuation
def remove_punctuation(title):
    """Return `title` with every character of `string.punctuation` removed."""
    # A translation table whose third argument lists the characters to delete.
    strip_table = str.maketrans('', '', string.punctuation)
    return title.translate(strip_table)

# Function for removing the html tags
def remove_html_tags(title):
    """Return `title` with every `<...>` span removed.

    The match is non-greedy, so each tag is removed on its own and the text
    between two tags is kept.
    """
    return re.sub('<.*?>', '', title)

# Function for tokenizing the title
def tokenizer(title):
    """Split `title` into tokens with NLTK's `word_tokenize` and return them."""
    return word_tokenize(title)

# Function for stemming the title
# One shared Porter stemmer instance, reused for every title
ps = PorterStemmer()


def stemmer(title):
    """Return a list holding the Porter stem of every token in `title`.

    `title` is an iterable of tokens (the pipeline passes the list produced
    by `tokenizer`); the result has one stem per token, in the same order.
    """
    return list(map(ps.stem, title))

# Applying the methods to reform book titles as keywords.
# HTML tags are stripped BEFORE punctuation: remove_punctuation() deletes the
# '<' and '>' characters, so running it first would leave nothing for
# remove_html_tags() to match and the tag names would survive as keywords.
books['Title-Keywords'] = books['Book-Title'].apply(remove_non_Ascii)
books['Title-Keywords'] = books['Title-Keywords'].apply(to_lower_case)
books['Title-Keywords'] = books['Title-Keywords'].apply(remove_stop_words)
books['Title-Keywords'] = books['Title-Keywords'].apply(remove_html_tags)
books['Title-Keywords'] = books['Title-Keywords'].apply(remove_punctuation)
# From here on every entry is a list of tokens instead of a string
books['Title-Keywords'] = books['Title-Keywords'].apply(tokenizer)
books['Title-Keywords'] = books['Title-Keywords'].apply(stemmer)
   
# Function for finding the top 3 books rated by a User
def find_favorite_books(user):
    """Return the (up to) 3 highest-rated books of `user`.

    Parameters:
        user: value to look up in the 'User-ID' column of `ratings`.

    Returns:
        DataFrame with the columns 'ISBN' and 'Book-Rating', holding at most
        3 rows ordered from the highest rating downwards.

    Raises:
        KeyError: if `user` has no row in `ratings`.
    """
    # A boolean mask always yields a DataFrame. Selecting through
    # set_index('User-ID').loc[user] instead collapses to a Series when the
    # user has exactly one rating, which breaks the column selection.
    user_ratings = ratings.loc[ratings['User-ID'] == user, ['ISBN', 'Book-Rating']]
    if user_ratings.empty:
        raise KeyError(user)
    return user_ratings.reset_index(drop=True).nlargest(3, 'Book-Rating')

# Function for finding which books a user has already read
def find_books_already_read(user):
    """Return every book that `user` has rated.

    Parameters:
        user: value to look up in the 'User-ID' column of `ratings`.

    Returns:
        DataFrame with the columns 'ISBN' and 'Book-Rating', one row per
        rating of the user, indexed 0..n-1.

    Raises:
        KeyError: if `user` has no row in `ratings`.
    """
    # A boolean mask always yields a DataFrame. Selecting through
    # set_index('User-ID').loc[user] instead collapses to a Series when the
    # user has exactly one rating, which breaks the column selection.
    user_ratings = ratings.loc[ratings['User-ID'] == user, ['ISBN', 'Book-Rating']]
    if user_ratings.empty:
        raise KeyError(user)
    return user_ratings.reset_index(drop=True)

# Function for creating a user profile based on top 3 books that he rated
def create_user_profile(user):
    """Build the profile of `user` from the top 3 books that the user rated.

    Parameters:
        user: user id, passed on to `find_favorite_books`.

    Returns:
        A list of three lists, in this order:
        [0] the title keywords of all favorite books, concatenated,
        [1] the author of each favorite book,
        [2] the publication year of each favorite book.

    Raises:
        KeyError: if a favorite ISBN is not present in `books` (and whatever
            `find_favorite_books` raises for the given user).
    """
    favorite_isbns = find_favorite_books(user)['ISBN']

    # Index the catalogue by ISBN once per call instead of once per looked-up
    # field; set_index builds a new frame every time it is called.
    books_by_isbn = books.set_index('ISBN')

    titles = []
    authors = []
    publication_years = []
    for isbn in favorite_isbns:
        book = books_by_isbn.loc[isbn]
        # Title keywords are already lists, so they are merged, not nested
        titles.extend(book['Title-Keywords'])
        authors.append(book['Book-Author'])
        publication_years.append(book['Year-Of-Publication'])

    return [titles, authors, publication_years]

# Function for calculating the jaccard similarity between 2 lists
def jaccard_similarity(list1, list2):
    """Return the Jaccard similarity |A & B| / |A | B| of two collections.

    Both arguments are reduced to sets first, so repeated items count once
    (a profile built from several titles usually repeats keywords).

    Parameters:
        list1, list2: iterables of hashable items.

    Returns:
        A float between 0.0 and 1.0. Two empty inputs share no item and
        give 0.0 instead of a division by zero.
    """
    first, second = set(list1), set(list2)
    union_size = len(first | second)
    if union_size == 0:
        return 0.0
    return len(first & second) / union_size

# Function for calculating the dice coefficient between 2 lists
def dice_coefficient(a, b):
    """Return the Dice coefficient 2 * |A & B| / (|A| + |B|) of two collections.

    Both arguments are reduced to sets first, so repeated items count once.

    Parameters:
        a, b: iterables of hashable items.

    Returns:
        A float between 0.0 and 1.0. Two empty inputs share no item and
        give 0.0 instead of a division by zero.
    """
    a_items, b_items = set(a), set(b)
    total_size = len(a_items) + len(b_items)
    if total_size == 0:
        return 0.0
    return 2.0 * len(a_items & b_items) / total_size

# Function for calculating the minimum difference between the top 3 years and the year of each book
def publication_years_difference_similarity(year, top_3_years):
    """Return the year similarity between a book and the closest favorite year.

    The similarity is 1 - d / 2005, where d is the MINIMUM absolute
    difference between `year` and the years in `top_3_years`; the ratio is
    rounded to 4 decimals. Taking the minimum of the per-year similarities
    instead would score the book against the favorite year that is furthest
    away, i.e. use the maximum difference.

    Parameters:
        year: publication year of the candidate book (number).
        top_3_years: iterable of years; every entry is converted with int().

    Returns:
        A float; 1.0 when `year` equals one of the favorite years.

    Raises:
        ValueError: if `top_3_years` is empty.
    """
    # 2005 is the normalisation constant of the original formula
    # (presumably the most recent year in the dataset - confirm).
    smallest_gap = min(abs(year - int(favorite_year)) for favorite_year in top_3_years)
    return 1 - round(smallest_gap / 2005, 4)

# Function for finding if the author is the same with one of the top 3 authors 
def same_author_similarity(author, top_3_authors):
    """Return 1 if `author` is one of `top_3_authors`, otherwise 0."""
    return 1 if author in top_3_authors else 0

# Function for calculating the title keywords similarity
def title_similarity(mode, title, top_3_titles):
    """Return the keyword similarity between `title` and `top_3_titles`.

    `mode` selects the measure: "Jaccard" uses `jaccard_similarity`, "Dice"
    uses `dice_coefficient`. Any other mode returns None.
    """
    if mode == "Jaccard":
        return jaccard_similarity(title, top_3_titles)
    if mode == "Dice":
        return dice_coefficient(title, top_3_titles)
    return None

# Function for calculating the weighted jaccard similarity and dice coefficient
def similarity_score(mode, title, author, year):
    """Combine the three partial similarities into one weighted score.

    Weights (title, author, year) are 0.2 / 0.4 / 0.4 for mode "Jaccard" and
    0.5 / 0.3 / 0.2 for mode "Dice". The result is rounded to 4 decimals.
    Any other mode returns None.
    """
    if mode == "Jaccard":
        title_weight, author_weight, year_weight = 0.2, 0.4, 0.4
    elif mode == "Dice":
        title_weight, author_weight, year_weight = 0.5, 0.3, 0.2
    else:
        return None
    weighted_sum = title_weight * title + author_weight * author + year_weight * year
    return round(weighted_sum, 4)

# Function for calculating the FINAL similarity in order to make the recommendations
def final_similarity(mode, book, profile):
    """Return the final weighted similarity between `book` and a user profile.

    Parameters:
        mode: "Jaccard" or "Dice"; forwarded to `title_similarity` and
            `similarity_score`.
        book: a row of `books`; 'Year-Of-Publication', 'Book-Author' and
            'Title-Keywords' are read from it.
        profile: the list built by `create_user_profile`
            ([title keywords, authors, publication years]).
    """
    # Partial scores, computed in the order year -> author -> title
    by_year = publication_years_difference_similarity(
        int(book['Year-Of-Publication']), profile[2]
    )
    by_author = same_author_similarity(book['Book-Author'], profile[1])
    by_title = title_similarity(mode, book['Title-Keywords'], profile[0])

    return similarity_score(mode, by_title, by_author, by_year)

# Function for returning the title of a Book based on ISBN
def get_title_from_ISBN(ISBN):
    """Return the 'Book-Title' stored in `books` for the given `ISBN`."""
    books_by_isbn = books.set_index('ISBN')
    matching_book = books_by_isbn.loc[ISBN]
    return matching_book['Book-Title']

# Function for making the recommendations
def recommend_books(mode, profile, user_id):
    """Return the 10 books that are most similar to a user profile.

    Parameters:
        mode: "Jaccard" or "Dice"; selects the similarity measure and the
            name of the score column ("Jaccard-Similarity" or
            "Dice-Coefficient"). Any other value gives a column named ''.
        profile: the list built by `create_user_profile`.
        user_id: id of the user; books this user already rated get no score.

    Returns:
        DataFrame with the columns 'ISBN' and the score column, holding the
        first 10 rows after sorting by score in descending order. Books the
        user already rated stay in the frame with a missing score, which
        sorts last.
    """
    similarity_mode = ''
    if mode == "Jaccard":
        similarity_mode = "Jaccard-Similarity"
    elif mode == "Dice":
        similarity_mode = "Dice-Coefficient"

    # Built once: a set gives constant-time membership tests, instead of
    # rebuilding and scanning a list for every book of the catalogue.
    already_read = set(find_books_already_read(user_id)['ISBN'])

    scores = [
        None if book['ISBN'] in already_read else final_similarity(mode, book, profile)
        for _, book in books.iterrows()
    ]

    # The whole column is assigned in one step. Writing cell by cell through
    # `frame.loc[index][column] = value` is chained assignment and may only
    # modify a temporary copy, leaving every score empty.
    recommended_books = pd.DataFrame(books['ISBN'])
    recommended_books[similarity_mode] = scores

    return recommended_books.sort_values([similarity_mode], ascending=False).head(10)

# Pick 5 random user ids, compute the Jaccard and the Dice recommendations
# for each of them and export every top-10 table to its own ';'-separated file
random_users = users['User-ID'].sample(5).tolist()
print(random_users)

# The 10 predictions of every user, one DataFrame per user and measure
jaccard_list, dice_list = [], []

for user_id in random_users:
    profile = create_user_profile(user_id)

    # Jaccard-based recommendations, with the readable title added
    recommended_books_with_Jaccard_Similarity = recommend_books("Jaccard", profile, user_id)
    recommended_books_with_Dice_Coefficient = recommend_books("Dice", profile, user_id)

    recommended_books_with_Jaccard_Similarity["Book-Title"] = (
        recommended_books_with_Jaccard_Similarity["ISBN"].apply(get_title_from_ISBN)
    )
    recommended_books_with_Dice_Coefficient["Book-Title"] = (
        recommended_books_with_Dice_Coefficient["ISBN"].apply(get_title_from_ISBN)
    )

    jaccard_list.append(recommended_books_with_Jaccard_Similarity)
    dice_list.append(recommended_books_with_Dice_Coefficient)

    # One output file per user and measure; the file names carry no extension
    recommended_books_with_Jaccard_Similarity.to_csv(
        f"{user_id} 10 Jaccard Recommendations", sep=';', index=False
    )
    recommended_books_with_Dice_Coefficient.to_csv(
        f"{user_id} 10 Dice Recommendations", sep=';', index=False
    )
