# In [None]:
import numpy as np
import pandas as pd
import collections
from collections import Counter
from itertools import repeat, chain 
import nltk 
nltk.download('punkt')
nltk.download('stopwords')
from nltk.stem import PorterStemmer
stemming = PorterStemmer()
from nltk.corpus import stopwords
# One stopword set per language; sets give O(1) membership checks when filtering title tokens.
(stopsEnglish, stopsGerman, stopsItalian,
 stopsFrench, stopsRussian, stopsSpanish) = (
    set(stopwords.words(language))
    for language in ("english", "german", "italian", "french", "russian", "spanish")
)

#Read the CSVs and create DataFrames with Pandas.

# Load the three semicolon-separated BX CSV files (ratings, books, users).
RatingsDF = pd.read_csv('BX-Book-Ratings.csv', sep=';', encoding='unicode_escape')
BooksDF = pd.read_csv('BX-Books.csv', sep=';', encoding='latin-1')
UsersDF = pd.read_csv('BX-Users.csv', sep=';', encoding='latin-1')

# Rename 'User-ID' to 'UserID': the hyphen gets in the way of attribute-style
# column access such as UsersDF.UserID, which the code below relies on.
RatingsDF = RatingsDF.rename({'User-ID': 'UserID'}, axis='columns')
UsersDF = UsersDF.rename({'User-ID': 'UserID'}, axis='columns')

# Preprocess

#Drop ratings for books that do not exist in BX-Books.
RatingsDF = RatingsDF[RatingsDF.ISBN.isin(BooksDF.ISBN)]

#Keep only ratings for books that have at least 10 ratings (.ge(10) is ">= 10").
newBookRatings = RatingsDF[RatingsDF.groupby('ISBN')['ISBN'].transform('count').ge(10)]

#Keep only ratings from users who rated at least 5 of the remaining books (.ge(5) is ">= 5").
finalRatingsDF = newBookRatings[newBookRatings.groupby('UserID')['UserID'].transform('count').ge(5)]

#Keep only books which are in finalRatingsDF.
#.copy() makes an independent frame: new columns are assigned to newBooksDF later,
#and doing that on a slice of BooksDF triggers pandas' SettingWithCopyWarning.
newBooksDF = BooksDF[BooksDF.ISBN.isin(finalRatingsDF.ISBN)].copy()

#Keep only users that are in finalRatingsDF (copied for the same reason).
newUsersDF = UsersDF[UsersDF.UserID.isin(finalRatingsDF.UserID)].copy()

#Create the keyword column from the book titles: cast every title to a string
#and lower-case it in one step. Later stages replace it with token lists.
newBooksDF['Keywords-Book-Title'] = newBooksDF['Book-Title'].astype(str).str.lower()

#Tokenization:

def identify_tokens(row):
    """Tokenize a row's keyword text and keep only purely alphabetic tokens.

    row: a mapping/Series with a 'Keywords-Book-Title' string entry.
    Returns the list of tokens for which str.isalpha() is true, which drops
    numbers and punctuation.
    """
    return list(filter(str.isalpha, nltk.word_tokenize(row['Keywords-Book-Title'])))


# Replace each title string with its list of alphabetic tokens (row-wise apply).
newBooksDF['Keywords-Book-Title'] = newBooksDF.apply(identify_tokens, axis=1)


#Stemming:

def stem_list(row):
    """Return the stemmed form of every token in row['Keywords-Book-Title'].

    Uses the module-level Porter stemmer (`stemming`); token order is preserved.
    """
    return list(map(stemming.stem, row['Keywords-Book-Title']))

#Remove stopwords (English, German, Italian, French, Russian, Spanish) BEFORE stemming:
#the stopword lists contain unstemmed words, and the Porter stemmer rewrites many of
#them (e.g. "this" -> "thi", "was" -> "wa"), so filtering after stemming lets them through.
#All six languages are merged into one set so the column is filtered in a single pass.
allStopwords = stopsEnglish | stopsGerman | stopsItalian | stopsFrench | stopsRussian | stopsSpanish
newBooksDF['Keywords-Book-Title'] = newBooksDF['Keywords-Book-Title'].apply(
    lambda tokens: [token for token in tokens if token not in allStopwords])

#Stem the remaining tokens.
newBooksDF['Keywords-Book-Title'] = newBooksDF.apply(stem_list, axis=1)

#Calculate some similarities for the Recommender System.

#Calculating Dice Coefficient for the lists.

def Dice_coefficient(list1, list2):
    """Return the Dice coefficient 2*|A & B| / (|A| + |B|) of two token lists.

    Both inputs are reduced to sets first, so duplicates do not count.
    Returns 0.0 when both inputs are empty (nothing in common) instead of
    raising ZeroDivisionError.
    """
    s1 = set(list1)
    s2 = set(list2)
    total = len(s1) + len(s2)
    if total == 0:
        return 0.0
    return (2 * len(s1 & s2)) / total

#Calculating Jaccard Similarity for the lists.

def jaccard_similarity(list1,list2):
    """Return the Jaccard similarity |A & B| / |A | B| of two token lists.

    Both inputs are reduced to sets first, so duplicates do not count.
    Returns 0.0 when both inputs are empty (empty union) instead of raising
    ZeroDivisionError.
    """
    s1 = set(list1)
    s2 = set(list2)
    union = s1 | s2
    if not union:
        return 0.0
    return len(s1 & s2) / len(union)

#Checking for Writer Similarity.

def writer_similarity(writerList,writer):
    """Return 1 if `writer` is one of the entries of `writerList`, otherwise 0."""
    return 1 if writer in writerList else 0

#Calculating minimum difference from publication year(and applying normalization).

def year_similarity(year_list,year,max_year_span=2005):
    """Return a normalized similarity between `year` and the closest year in `year_list`.

    The similarity is 1 - (smallest absolute year difference / max_year_span),
    so an exact match with any preferred year gives 1.0. The previous
    implementation kept the *lowest* similarity, i.e. it scored against the
    farthest year rather than the minimum difference.

    year_list: iterable of values convertible with int().
    year: value convertible with int().
    max_year_span: normalization constant (defaults to the original 2005).
    Returns 1 when `year_list` is empty, as before.
    Raises ValueError/TypeError if a year cannot be converted with int().
    """
    target = int(year)
    gaps = [abs(int(candidate) - target) for candidate in year_list]
    if not gaps:
        return 1
    return 1 - min(gaps) / max_year_span

#Creating a 'User Profile' Dataframe with the books a user Prefers.

def user_preferences(user):
    """Build a simple preference profile for `user` from their top-rated books.

    Reads the module-level finalRatingsDF and newBooksDF frames.

    Returns a 5-tuple:
      - all rating rows of the user,
      - the newBooksDF rows for the (up to) 3 highest-rated books,
      - a flat list of the title keywords of those books,
      - the list of their authors,
      - the list of their publication years.
    """
    ratings_of_user = finalRatingsDF.loc[finalRatingsDF['UserID'] == user]
    top_rated = ratings_of_user.nlargest(3, 'Book-Rating')

    favourite_books = newBooksDF[newBooksDF.ISBN.isin(top_rated.ISBN)]

    # Each book carries its own keyword list; flatten them into one list.
    keywords = [
        keyword
        for book_keywords in favourite_books['Keywords-Book-Title'].tolist()
        for keyword in book_keywords
    ]
    authors = favourite_books['Book-Author'].tolist()
    years = favourite_books['Year-Of-Publication'].tolist()

    return ratings_of_user, favourite_books, keywords, authors, years


#Calculating similarity with every element of the Dataframe and creating two new Columns with these values
#for the 2 different similarities.


def similarity_computation(userid,tag_list,writer_list,year_list):
    """Score every row of the module-level `booksToSuggest` against a user profile.

    For each candidate book two weighted scores are written in place:
      - 'Jac-Similarity':   0.2*Jaccard(keywords) + 0.4*author match + 0.4*year similarity
      - 'Dice-Coefficient': 0.5*Dice(keywords)    + 0.3*author match + 0.2*year similarity

    `userid` is accepted but not used by the computation.
    Returns the (mutated) `booksToSuggest` frame.
    """
    for idx, book in booksToSuggest.iterrows():
        candidate_keywords = book['Keywords-Book-Title']

        keyword_jaccard = jaccard_similarity(tag_list, candidate_keywords)
        keyword_dice = Dice_coefficient(tag_list, candidate_keywords)
        author_match = writer_similarity(writer_list, book['Book-Author'])
        year_score = year_similarity(year_list, book['Year-Of-Publication'])

        booksToSuggest.at[idx, 'Jac-Similarity'] = (
            (keyword_jaccard*0.2) + (author_match*0.4) + (year_score*0.4)
        )
        booksToSuggest.at[idx, 'Dice-Coefficient'] = (
            (keyword_dice*0.5) + (author_match*0.3) + (year_score*0.2)
        )
    return booksToSuggest


#Calculating List Overlap.

def Overlap (list1,list2):
    """Return the overlap coefficient |A & B| / min(|A|, |B|) of two lists.

    Both inputs are reduced to sets first, so duplicates do not count.
    Returns 0.0 when either input is empty instead of raising
    ZeroDivisionError.
    """
    s1 = set(list1)
    s2 = set(list2)
    smaller = min(len(s1), len(s2))
    if smaller == 0:
        return 0.0
    return len(s1 & s2) / smaller


#Experiment No1: Select 5 random users and Export CSVs based on the Recommendation System.
#Exporting 5 CSVs for Jaccard Similarity and 5 CSVs for Dice Coefficient.


    
randUsers = newUsersDF['UserID'].sample(n=5).tolist()

for x in randUsers:

    dfUserScores, dfUserBooks, tag_list, writer_list, year_list = user_preferences(x)

    #Creating Dataframe with the Books that the User has not rated.
    #Exclude every book the user rated (dfUserScores), not only the top-3 books in
    #dfUserBooks; otherwise already-rated books can be recommended back to the user.
    #.copy() gives an independent frame, because similarity_computation writes
    #new columns into the module-level booksToSuggest in place.
    booksToSuggest = newBooksDF[~newBooksDF.ISBN.isin(dfUserScores.ISBN)].copy()

    #Calculating Similarity (fills 'Jac-Similarity' and 'Dice-Coefficient' on booksToSuggest).
    similarity_computation(x,tag_list,writer_list,year_list)

    #Fetching the 10 most similar Books per similarity measure, and Exporting CSVs for each User.
    booksToSuggestJaccard = booksToSuggest.nlargest(10,'Jac-Similarity')
    booksToSuggestDice = booksToSuggest.nlargest(10,'Dice-Coefficient')
    booksToSuggestJaccard.to_csv('Jaccard_User%dRecommendations.csv' %(x), index = None, header=True,sep = ';')
    booksToSuggestDice.to_csv('Dice_User%dRecommendations.csv' %(x), index = None, header=True,sep = ';')
    
    
    


# Experiment No2:Calculating Overlap.

OverlapList = []

for x in randUsers:

    #Fetching each User's created CSVs.
    #ISBN is forced to str: with dtype inference an all-digit ISBN column is parsed as
    #integers in one file while a column containing e.g. a trailing 'X' stays text in
    #the other, so identical ISBNs would never compare equal.
    thisUserDfJaccard = pd.read_csv('Jaccard_User%dRecommendations.csv' %(x) ,encoding = 'latin-1', delimiter = ';', dtype = {'ISBN': str})
    thisUserDfDice = pd.read_csv('Dice_User%dRecommendations.csv' %(x) ,encoding = 'latin-1', delimiter = ';', dtype = {'ISBN': str})

    #Creating Book Lists with each User's Results for both Jaccard and Dice.
    thisUserJaccardList = thisUserDfJaccard['ISBN'].tolist()
    thisUserDiceList = thisUserDfDice['ISBN'].tolist()

    #Average Overlap of the top-j prefixes (j = 1..10) of the two rankings.
    overlapSum = sum(
        Overlap(thisUserJaccardList[:j], thisUserDiceList[:j]) for j in range(1,11)
    )

    thisUserOverlap = overlapSum/10
    OverlapList.append(thisUserOverlap)
print (OverlapList)

#Experiment No3:Calculating Overlap Using a Golden Standard List.

OverlapListJac = []
OverlapListDice = []

for x in randUsers:
    overlapSumJac = 0
    overlapSumDice = 0

    #Fetch every user's created CSV.
    #ISBN is forced to str so both files yield comparable values; with dtype
    #inference one file may give integers and the other strings.
    thisUserDfJaccard = pd.read_csv('Jaccard_User%dRecommendations.csv' %(x) ,encoding = 'latin-1', delimiter = ';', dtype = {'ISBN': str})
    thisUserDfDice = pd.read_csv('Dice_User%dRecommendations.csv' %(x) ,encoding = 'latin-1', delimiter = ';', dtype = {'ISBN': str})

    #Creating a List with each user results
    thisUserListJac = thisUserDfJaccard['ISBN'].tolist()
    thisUserListDice = thisUserDfDice['ISBN'].tolist()

    #Merging the two Lists
    thisUserList =  thisUserListJac + thisUserListDice

    #'Golden Standard': distinct ISBNs ordered by how often they occur in the merged
    #list. Counter.most_common() already yields each ISBN once, most frequent first,
    #with ties kept in first-seen order.
    goldenStandard = [isbn for isbn, _ in Counter(thisUserList).most_common()]

    #Calculating overlap of every list with the top-j golden standard elements (j = 1..10).
    for j in range(1,11):
        overlapSumJac += Overlap( thisUserListJac[:j], goldenStandard[:j])
        overlapSumDice += Overlap( thisUserListDice[:j], goldenStandard[:j])

    thisUserOverlapJac = overlapSumJac/10
    thisUserOverlapDice = overlapSumDice/10
    OverlapListJac.append(thisUserOverlapJac)
    OverlapListDice.append(thisUserOverlapDice)

print (OverlapListJac,OverlapListDice)
            
