# Important Libraries

In [11]:
import time
from qwikidata.sparql  import return_sparql_query_results
from qwikidata.linked_data_interface import get_entity_dict_from_api
import pandas as pd
import numpy as np
import pickle as pkl
from collections import Counter
import urllib.request
import json
import string
from difflib import SequenceMatcher
from viapy.api import ViafAPI
import re
import math
import matplotlib.pyplot as plt
from scipy.signal import find_peaks

import warnings
# Ignore every FutureWarning for the rest of the session so that such notices
# (presumably raised by the libraries imported above) do not clutter cell output.
warnings.simplefilter(action='ignore', category=FutureWarning)

In [12]:
# Base directories for the project's data files; the second one is the
# sub-folder that holds the large files.
location_to_save, large_location_to_save = (
    "data/final_files/",
    "data/final_files/large_files/",
)

In [38]:
# Let pandas render up to 1000 rows before it truncates a displayed frame.
pd.options.display.max_rows = 1000

# Read the books and ratings tables

In [30]:
# Load the three CSV tables. In each one the first CSV column becomes the
# index and every missing value is replaced by an empty string.
entire_books = pd.read_csv(
    location_to_save + "items_books_with_readable_properties_correct_ISBN.csv",
    index_col=0,
    low_memory=False,
).fillna(value="")

entire_ratings = pd.read_csv(
    large_location_to_save + "entire_ratings_with_readable_properties_filtered_ISBNs.csv",
    index_col=0,
    low_memory=False,
).fillna(value="")

fairbook_ratings = pd.read_csv(
    location_to_save + "fairbook_ratings_with_readable_properties_filtered_ISBNs.csv",
    index_col=0,
    low_memory=False,
).fillna(value="")

# Replace ISBNs using the ISBN lookup dictionary

In [31]:
# Row count vs. number of distinct (ISBN, User-ID) pairs; the two numbers are
# equal when no user has more than one rating for the same ISBN.
fairbook_ratings.shape[0], fairbook_ratings.drop_duplicates(subset=["ISBN", "User-ID"]).shape[0]

(86782, 86782)

In [32]:
# Row count vs. number of distinct (ISBN, User-ID) pairs; the two numbers are
# equal when no user has more than one rating for the same ISBN.
entire_ratings.shape[0], entire_ratings.drop_duplicates(subset=["ISBN", "User-ID"]).shape[0]

(1031136, 1031136)

In [33]:
# Load the pickled lookup that the cells below index by a rating's ISBN to
# obtain its replacement ISBN.
# NOTE: unpickling executes arbitrary code, so only load a file you trust.
with open('ISBN_dict.pkl', mode='rb') as isbn_dict_file:
    large_ISBN_dict = pkl.load(isbn_dict_file)

In [34]:
# For every rating whose book has a non-empty alt_title, swap its ISBN for the
# one stored under that ISBN in large_ISBN_dict; all other rows keep their ISBN.
#
# The column is rebuilt in one positional pass instead of writing to the frame
# from inside an iterrows() loop: pandas advises against modifying a frame
# while iterating over it, and iterrows() builds a Series per row, which is slow.
# As before, an ISBN missing from large_ISBN_dict raises KeyError.
# NOTE: running this cell twice looks the already-replaced ISBNs up again.
fairbook_ratings["ISBN"] = [
    large_ISBN_dict[isbn] if alt_title != "" else isbn
    for isbn, alt_title in zip(fairbook_ratings["ISBN"], fairbook_ratings["alt_title"])
]

In [35]:
# Same check after the ISBN replacement: a smaller second number means some
# users now have several ratings that share one ISBN.
fairbook_ratings.shape[0], fairbook_ratings.drop_duplicates(subset=["ISBN", "User-ID"]).shape[0]

(86782, 86356)

In [39]:
# Show every rating whose (ISBN, User-ID) pair occurs more than once, keeping
# all occurrences and ordering them by ISBN so repeated pairs sit together.
fairbook_ratings.loc[
    fairbook_ratings.duplicated(subset=["ISBN", "User-ID"], keep=False)
].sort_values(by="ISBN")

Unnamed: 0,User-ID,ISBN,Book-Rating,title,author,year,publisher,alt_title,alt_author,correct_author,...,QID,sexuality,country,language,religion,gender,ethnicity,birthyear,deathyear,label
63246,203240,0020442203,8,"The Lion, the Witch and the Wardrobe (rpkg) (N...",C. S. Lewis,1994.0,HarperTrophy,"The Lion, the Witch and the Wardrobe",C. S. Lewis,C. S. Lewis,...,Q9204,,United Kingdom,English,"atheist,Anglicanism",male,"British people,Irish people",1898.0,1963.0,C. S. Lewis
63240,203240,0020442203,8,"Lion, the Witch and the Wardrobe",C.S. Lewis,1970.0,MacMillan Publishing Company.,"The Lion, the Witch and the Wardrobe",Clive Staples Lewis,,...,Q9204,,United Kingdom,English,"atheist,Anglicanism",male,"British people,Irish people",1898.0,1963.0,C. S. Lewis
50801,162030,006001203X,7,Dr. Atkins' New Diet Revolution,Robert C. Atkins,1997.0,Avon Books,Dr. Atkins' New Diet Revolution,"Robert C. Atkins, M.D.","Robert C. Atkins, M.D.",...,Q638020,,United States of America,,,male,,1930.0,2003.0,Robert Atkins
50794,162030,006001203X,8,Dr. Atkins' New Diet Revolution,Robert C. Atkins,2001.0,Avon Books,Dr. Atkins' New Diet Revolution,Robert C. Atkins,Robert C. Atkins,...,Q638020,,United States of America,,,male,,1930.0,2003.0,Robert Atkins
43544,137589,0060173289,9,Divine Secrets of the Ya-Ya Sisterhood : A Novel,Rebecca Wells,1996.0,HarperCollins,Divine Secrets of the Ya-Ya Sisterhood,Rebecca Wells,Rebecca Wells,...,Q1619374,,United States of America,,,female,,1953.0,,Rebecca Wells
43545,137589,0060173289,8,The Divine Secrets of the Ya-Ya Sisterhood: A ...,Rebecca Wells,2002.0,HarperTorch,Divine Secrets of the Ya-Ya Sisterhood,Rebecca Wells,Rebecca Wells,...,Q1619374,,United States of America,,,female,,1953.0,,Rebecca Wells
45015,141710,0060173289,10,Divine Secrets of the Ya-Ya Sisterhood : A Novel,Rebecca Wells,1996.0,HarperCollins,Divine Secrets of the Ya-Ya Sisterhood,Rebecca Wells,Rebecca Wells,...,Q1619374,,United States of America,,,female,,1953.0,,Rebecca Wells
45016,141710,0060173289,10,The Divine Secrets of the Ya-Ya Sisterhood: A ...,Rebecca Wells,2002.0,HarperTorch,Divine Secrets of the Ya-Ya Sisterhood,Rebecca Wells,Rebecca Wells,...,Q1619374,,United States of America,,,female,,1953.0,,Rebecca Wells
54502,173415,0060188731,10,Bel Canto: A Novel,Ann Patchett,2002.0,Perennial,Bel Canto,Ann Patchett,Ann Patchett,...,Q433485,,United States of America,,,female,,1963.0,,Ann Patchett
54499,173415,0060188731,10,Bel Canto,Ann Patchett,2001.0,HarperCollins Publishers,Bel Canto,Ann Patchett,Ann Patchett,...,Q433485,,United States of America,,,female,,1963.0,,Ann Patchett


In [40]:
# For every rating whose book has a non-empty alt_title, swap its ISBN for the
# one stored under that ISBN in large_ISBN_dict; all other rows keep their ISBN.
#
# The column is rebuilt in one positional pass instead of writing to the frame
# from inside an iterrows() loop: pandas advises against modifying a frame
# while iterating over it, and iterrows() builds a Series per row, which is
# very slow on a frame of this size (about a million rows).
# As before, an ISBN missing from large_ISBN_dict raises KeyError.
# NOTE: running this cell twice looks the already-replaced ISBNs up again.
entire_ratings["ISBN"] = [
    large_ISBN_dict[isbn] if alt_title != "" else isbn
    for isbn, alt_title in zip(entire_ratings["ISBN"], entire_ratings["alt_title"])
]

In [41]:
# Same check after the ISBN replacement: a smaller second number means some
# users now have several ratings that share one ISBN.
entire_ratings.shape[0], entire_ratings.drop_duplicates(subset=["ISBN", "User-ID"]).shape[0]

(1031136, 1021847)

In [42]:
# Show every rating whose (ISBN, User-ID) pair occurs more than once, keeping
# all occurrences and ordering them by ISBN so repeated pairs sit together.
entire_ratings.loc[
    entire_ratings.duplicated(subset=["ISBN", "User-ID"], keep=False)
].sort_values(by="ISBN")

Unnamed: 0,ISBN,title,author,year,publisher,alt_title,alt_author,correct_author,alt_first_author,viaf_id,...,country,language,religion,gender,ethnicity,birthyear,deathyear,label,User-ID,Book-Rating
190178,0002005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,Clara Callan,Richard Bruce Wright,Richard Bruce Wright,Richard Bruce Wright,64022406,...,Canada,,,male,,1937,2017,Richard B. Wright,11676,8
468688,0002005018,Clara Callan : A Novel,Richard B. Wright,2002,HarperCollins,Clara Callan,Richard B. Wright,Richard B. Wright,Richard B. Wright,64022406,...,Canada,,,male,,1937,2017,Richard B. Wright,11676,8
880104,0002190915,Life on Earth: A Natural History,David Attenborough,1981,Little Brown &amp; Co,Life on Earth,David Attenborough,David Attenborough,David Attenborough,41836007,...,United Kingdom,,,male,,1926,,David Attenborough,76626,10
790324,0002190915,Life on earth: A natural history,David Attenborough,1979,Collins,Life on Earth,David Attenborough,David Attenborough,David Attenborough,41836007,...,United Kingdom,,,male,,1926,,David Attenborough,76626,0
978884,0002200813,Lichens (New Naturalist),Oliver Gilbert,2000,Trafalgar Square,Lichens,O. L. Gilbert,O. L. Gilbert,O. L. Gilbert,92720950,...,United Kingdom,,,male,,1936,2005,Oliver Lathe Gilbert,122881,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
529551,9682105331,"Horror 2, Los Relatos De Twilight Zone",Stephen King,1995,Planeta Pub Corp,Horror 2,Stephen King,Stephen King,Stephen King,97113511,...,United States of America,English,,male,,1947,,Stephen King,189835,5
13144,9871138016,Cronica De Una Muerte Anunciada,Gabriel Marques Garcia,2002,Debols!llo,Crónica de una muerte anunciada,Gabriel García Márquez,Gabriel García Márquez,Gabriel García Márquez,54147956,...,Colombia,Spanish,,male,,1927,2014,Gabriel García Márquez,11676,0
449156,9871138016,Cronica De Una Muerte Anunciada,Gabriel Garcia Marquez,1996,Sudamericana,Crónica de una muerte anunciada,Gabriel García Márquez,Gabriel García Márquez,Gabriel García Márquez,54147956,...,Colombia,Spanish,,male,,1927,2014,Gabriel García Márquez,11676,8
969219,9971401169,Thailand: Seven Days in the Kingdom,William Warren,1989,Granite Impex Ltd,Thailand,William Warren|Times Editions (Singapore),William Warren,William Warren,44312142,...,United States of America,,,male,,1934,1990,W. W. Bartley III,114865,10
