In [10]:
from fuzzywuzzy import process, fuzz
import pandas as pd
import sys

In [11]:
dtype_books = {
    'User-ID': str,
    'Book-Title': str,
    'Book-Author': str,
    'Year-Of-Publication': str,
    'Publisher': str,
    'Image-URL-S': str,
    'Image-URL-M': str,
    'Image-URL-L': str,
}

# load books
books = pd.read_csv('../backend/BX-Books.csv',  encoding='cp1251', sep=';', dtype=dtype_books, on_bad_lines='skip')

# Normalize the text data for more effective comparison
books['Normalized-Title'] = books['Book-Title'].str.lower()
books['Normalized-Title'] = books['Normalized-Title'].str.replace('[^a-z ]', '', regex=True)
books['Normalized-Author'] = books['Book-Author'].str.lower()
books['Normalized-Author'] = books['Normalized-Author'].str.replace('[^a-z]', '', regex=True)
books

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L,Normalized-Title,Normalized-Author
0,0195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,classical mythology,markpomorford
1,0002005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,clara callan,richardbrucewright
2,0060973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,decision in normandy,carlodeste
3,0374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...,flu the story of the great influenza pandemic ...,ginabarikolata
4,0393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...,the mummies of urumchi,ejwbarber
...,...,...,...,...,...,...,...,...,...,...
271355,0440400988,There's a Bat in Bunk Five,Paula Danziger,1988,Random House Childrens Pub (Mm),http://images.amazon.com/images/P/0440400988.0...,http://images.amazon.com/images/P/0440400988.0...,http://images.amazon.com/images/P/0440400988.0...,theres a bat in bunk five,pauladanziger
271356,0525447644,From One to One Hundred,Teri Sloat,1991,Dutton Books,http://images.amazon.com/images/P/0525447644.0...,http://images.amazon.com/images/P/0525447644.0...,http://images.amazon.com/images/P/0525447644.0...,from one to one hundred,terisloat
271357,006008667X,Lily Dale : The True Story of the Town that Ta...,Christine Wicker,2004,HarperSanFrancisco,http://images.amazon.com/images/P/006008667X.0...,http://images.amazon.com/images/P/006008667X.0...,http://images.amazon.com/images/P/006008667X.0...,lily dale the true story of the town that tal...,christinewicker
271358,0192126040,Republic (World's Classics),Plato,1996,Oxford University Press,http://images.amazon.com/images/P/0192126040.0...,http://images.amazon.com/images/P/0192126040.0...,http://images.amazon.com/images/P/0192126040.0...,republic worlds classics,plato


In [12]:
books_by_author = books.groupby('Normalized-Author')

def are_titles_similar(title1, title2):
    """Check if titles are similar based on tokenization and element count variance."""
    tokens1 = set(title1.split())
    tokens2 = set(title2.split())

    # Check for at least one title containing all elements of the other
    if not tokens1.issubset(tokens2) and not tokens2.issubset(tokens1):
        return False

    # Allow for +/- 30% more or less elements in one of the titles
    len1, len2 = len(tokens1), len(tokens2)
    max_allowed = min(len1, len2) * 1.3
    min_allowed = max(len1, len2) * 0.7

    return len(tokens1) >= min_allowed and len(tokens2) <= max_allowed

def find_similar_books(row):
    final_isbns = []
    
    if row['Normalized-Author'] in books_by_author.groups:
        author_books = books.loc[books_by_author.groups[row['Normalized-Author']]]
        titles = author_books['Normalized-Title'].tolist()
        isbns = author_books['ISBN'].tolist()

        for index, compare_title in enumerate(titles):
            # Skip comparison with itself
            if compare_title == row['Normalized-Title']:
                continue
            
            if are_titles_similar(row['Normalized-Title'], compare_title):
                final_isbns.append(isbns[index])

    return tuple(final_isbns)

# Apply the modified function
books['Similar-ISBN'] = books.apply(find_similar_books, axis=1)
books

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L,Normalized-Title,Normalized-Author,Similar-ISBN
0,0195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,classical mythology,markpomorford,()
1,0002005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,clara callan,richardbrucewright,()
2,0060973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,decision in normandy,carlodeste,()
3,0374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...,flu the story of the great influenza pandemic ...,ginabarikolata,()
4,0393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...,the mummies of urumchi,ejwbarber,()
...,...,...,...,...,...,...,...,...,...,...,...
271355,0440400988,There's a Bat in Bunk Five,Paula Danziger,1988,Random House Childrens Pub (Mm),http://images.amazon.com/images/P/0440400988.0...,http://images.amazon.com/images/P/0440400988.0...,http://images.amazon.com/images/P/0440400988.0...,theres a bat in bunk five,pauladanziger,()
271356,0525447644,From One to One Hundred,Teri Sloat,1991,Dutton Books,http://images.amazon.com/images/P/0525447644.0...,http://images.amazon.com/images/P/0525447644.0...,http://images.amazon.com/images/P/0525447644.0...,from one to one hundred,terisloat,()
271357,006008667X,Lily Dale : The True Story of the Town that Ta...,Christine Wicker,2004,HarperSanFrancisco,http://images.amazon.com/images/P/006008667X.0...,http://images.amazon.com/images/P/006008667X.0...,http://images.amazon.com/images/P/006008667X.0...,lily dale the true story of the town that tal...,christinewicker,"(0060086661,)"
271358,0192126040,Republic (World's Classics),Plato,1996,Oxford University Press,http://images.amazon.com/images/P/0192126040.0...,http://images.amazon.com/images/P/0192126040.0...,http://images.amazon.com/images/P/0192126040.0...,republic worlds classics,plato,"(039395501X,)"


In [13]:
"""
# Group books by author
books_by_author = books.groupby('Normalized-Author')

def is_length_similar(len1, len2, threshold=0.5):
    return min(len1, len2) / max(len1, len2) > threshold

def find_similar_books(row, threshold=90, length_ratio_threshold=0.5):
    final_isbns = []
    
    if row['Normalized-Author'] in books_by_author.groups:
        author_books = books.loc[books_by_author.groups[row['Normalized-Author']]]
        titles = author_books['Normalized-Title'].tolist()
        isbns = author_books['ISBN'].tolist()

        for index, compare_title in enumerate(titles):
            # Skip comparison with itself
            if compare_title == row['Normalized-Title']:
                continue
            
            # Check if the lengths of the titles are somewhat similar
            if not is_length_similar(len(row['Normalized-Title']), len(compare_title), length_ratio_threshold):
                continue

            # Combine scoring functions for a more nuanced similarity assessment
            token_set_score = fuzz.token_set_ratio(row['Normalized-Title'], compare_title)
            simple_ratio_score = fuzz.ratio(row['Normalized-Title'], compare_title)

            # Require both scores to be above the threshold
            if token_set_score > threshold and simple_ratio_score > threshold:
                final_isbns.append(isbns[index])

    print(f"\rProcessed: {row['Normalized-Title'][:50]:50}", end='')
    sys.stdout.flush()

    return tuple(final_isbns)

# Apply the modified function
books['Similar-ISBN'] = books.apply(find_similar_books, axis=1)
print()
"""

'\n# Group books by author\nbooks_by_author = books.groupby(\'Normalized-Author\')\n\ndef is_length_similar(len1, len2, threshold=0.5):\n    return min(len1, len2) / max(len1, len2) > threshold\n\ndef find_similar_books(row, threshold=90, length_ratio_threshold=0.5):\n    final_isbns = []\n    \n    if row[\'Normalized-Author\'] in books_by_author.groups:\n        author_books = books.loc[books_by_author.groups[row[\'Normalized-Author\']]]\n        titles = author_books[\'Normalized-Title\'].tolist()\n        isbns = author_books[\'ISBN\'].tolist()\n\n        for index, compare_title in enumerate(titles):\n            # Skip comparison with itself\n            if compare_title == row[\'Normalized-Title\']:\n                continue\n            \n            # Check if the lengths of the titles are somewhat similar\n            if not is_length_similar(len(row[\'Normalized-Title\']), len(compare_title), length_ratio_threshold):\n                continue\n\n            # Combine scoring

In [14]:
books

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L,Normalized-Title,Normalized-Author,Similar-ISBN
0,0195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,classical mythology,markpomorford,()
1,0002005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,clara callan,richardbrucewright,()
2,0060973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,decision in normandy,carlodeste,()
3,0374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...,flu the story of the great influenza pandemic ...,ginabarikolata,()
4,0393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...,the mummies of urumchi,ejwbarber,()
...,...,...,...,...,...,...,...,...,...,...,...
271355,0440400988,There's a Bat in Bunk Five,Paula Danziger,1988,Random House Childrens Pub (Mm),http://images.amazon.com/images/P/0440400988.0...,http://images.amazon.com/images/P/0440400988.0...,http://images.amazon.com/images/P/0440400988.0...,theres a bat in bunk five,pauladanziger,()
271356,0525447644,From One to One Hundred,Teri Sloat,1991,Dutton Books,http://images.amazon.com/images/P/0525447644.0...,http://images.amazon.com/images/P/0525447644.0...,http://images.amazon.com/images/P/0525447644.0...,from one to one hundred,terisloat,()
271357,006008667X,Lily Dale : The True Story of the Town that Ta...,Christine Wicker,2004,HarperSanFrancisco,http://images.amazon.com/images/P/006008667X.0...,http://images.amazon.com/images/P/006008667X.0...,http://images.amazon.com/images/P/006008667X.0...,lily dale the true story of the town that tal...,christinewicker,"(0060086661,)"
271358,0192126040,Republic (World's Classics),Plato,1996,Oxford University Press,http://images.amazon.com/images/P/0192126040.0...,http://images.amazon.com/images/P/0192126040.0...,http://images.amazon.com/images/P/0192126040.0...,republic worlds classics,plato,"(039395501X,)"


In [15]:
duplicates = books[books['Similar-ISBN'].apply(len) > 1]
duplicates

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L,Normalized-Title,Normalized-Author,Similar-ISBN
14,1558746218,A Second Chicken Soup for the Woman's Soul (Ch...,Jack Canfield,1998,Health Communications,http://images.amazon.com/images/P/1558746218.0...,http://images.amazon.com/images/P/1558746218.0...,http://images.amazon.com/images/P/1558746218.0...,a second chicken soup for the womans soul chic...,jackcanfield,"(155874262X, 1558749209)"
19,0452264464,Beloved (Plume Contemporary Fiction),Toni Morrison,1994,Plume,http://images.amazon.com/images/P/0452264464.0...,http://images.amazon.com/images/P/0452264464.0...,http://images.amazon.com/images/P/0452264464.0...,beloved plume contemporary fiction,tonimorrison,"(0452280621, 0451161394, 8440656955, 009976011..."
51,0842342702,Left Behind: A Novel of the Earth's Last Days ...,Tim Lahaye,2000,Tyndale House Publishers,http://images.amazon.com/images/P/0842342702.0...,http://images.amazon.com/images/P/0842342702.0...,http://images.amazon.com/images/P/0842342702.0...,left behind a novel of the earths last days le...,timlahaye,"(0842329129, 0842329110, 0842316752, 0842354204)"
61,0679810307,Shabanu: Daughter of the Wind (Border Trilogy),SUZANNE FISHER STAPLES,1991,Laurel Leaf,http://images.amazon.com/images/P/0679810307.0...,http://images.amazon.com/images/P/0679810307.0...,http://images.amazon.com/images/P/0679810307.0...,shabanu daughter of the wind border trilogy,suzannefisherstaples,"(0744590116, 0394848152)"
70,1853260053,Tess of the D'Urbervilles (Wordsworth Classics),Thomas Hardy,1997,NTC/Contemporary Publishing Company,http://images.amazon.com/images/P/1853260053.0...,http://images.amazon.com/images/P/1853260053.0...,http://images.amazon.com/images/P/1853260053.0...,tess of the durbervilles wordsworth classics,thomashardy,"(0553211684, 0553211048, 0451524292, 156619146..."
...,...,...,...,...,...,...,...,...,...,...,...
271169,0395647398,"The Two Towers (The Lord of the Rings, Part 2)",J. R. R. Tolkien,1992,Houghton Mifflin,http://images.amazon.com/images/P/0395647398.0...,http://images.amazon.com/images/P/0395647398.0...,http://images.amazon.com/images/P/0395647398.0...,the two towers the lord of the rings part,jrrtolkien,"(0345272595, 0395489326, 0345253442, 061815396..."
271173,0553333062,"Anne of Green Gables Boxed Set, Vol. 1 (Anne o...",L.M. MONTGOMERY,1997,Laurel Leaf,http://images.amazon.com/images/P/0553333062.0...,http://images.amazon.com/images/P/0553333062.0...,http://images.amazon.com/images/P/0553333062.0...,anne of green gables boxed set vol anne of gr...,lmmontgomery,"(0340568879, 059042243X, 0140325670, 014032462..."
271230,9626345268,Moby Dick (Classic Literature with Classical M...,Herman Melville,1995,Naxos Audiobooks Ltd.,http://images.amazon.com/images/P/9626345268.0...,http://images.amazon.com/images/P/9626345268.0...,http://images.amazon.com/images/P/9626345268.0...,moby dick classic literature with classical music,hermanmelville,"(B00009APKU, 1561560936, 0553210076, 045151538..."
271258,0684857502,TEXASVILLE : A Novel,Larry McMurtry,1999,Simon &amp; Schuster,http://images.amazon.com/images/P/0684857502.0...,http://images.amazon.com/images/P/0684857502.0...,http://images.amazon.com/images/P/0684857502.0...,texasville a novel,larrymcmurtry,"(067165764X, 0671735179, 0671625330)"


In [16]:
duplicate_isbns = set(duplicates['Similar-ISBN'])
duplicate_isbns

{('0590475460', '0440423376'),
 ('0451124685', '0451156889'),
 ('0671547798', '0671008870'),
 ('0440404061', '0152524088'),
 ('0553211684',
  '0553211048',
  '0451524292',
  '1566191467',
  '0553210610',
  '0451519248',
  '1853268380',
  '0451516869',
  '0395051444',
  '0312793464'),
 ('0553239082', '0061002879'),
 ('0393096653',
  '0451513398',
  '3895082090',
  '0691014647',
  '0060955724',
  '0807014257'),
 ('0553561669', '0553078755', '0613080939'),
 ('0060179872', '0815411839'),
 ('0590429892', '0395428572'),
 ('0385730586', '0385729332', '0807205893'),
 ('0804900973',
  '0425098389',
  '0345327128',
  '0893754021',
  '0891040234',
  '0883011379',
  '0020195907'),
 ('0060975547', '0553344234', '0553268082', '0060975814'),
 ('0060512180', '0688085105'),
 ('0440495717', '0394925696', '0717289664'),
 ('0671877992', '3404139895', '0671577913'),
 ('0918956463', '0829713700'),
 ('0312266588', '0753810875', '1842121421'),
 ('0688014348', '0394867297'),
 ('0679415629', '0767904443'),
 ('0

In [17]:
len(duplicate_isbns)

7476

In [18]:
df = pd.read_csv('../backend/BX-Books.csv',  encoding='cp1251', sep=';', dtype=dtype_books, on_bad_lines='skip')

data = {}

for dup in duplicate_isbns:
    rows = df[df['ISBN'].isin(dup)]
    
    # Find the most common title among these duplicates
    all_titles = rows['Book-Title'].tolist()
    print("all_titles:", all_titles)
    most_common_title = max(set(all_titles), key=all_titles.count)
    
    keep = rows[rows['Book-Title'] == most_common_title].iloc[0]
    remove = rows[rows['ISBN'] != keep['ISBN']]
    data[keep['ISBN']] = remove
    print("keep:", keep['Book-Title'])
    books_copy = df.drop(remove.index)

df

all_titles: ['Enormous Egg', 'Enormous Egg']
keep: Enormous Egg
all_titles: ['Touch the Devil', 'Touch the Devil']
keep: Touch the Devil
all_titles: ['Kahless (Star Trek: The Next Generation)', 'Kahless (Star Trek: The Next Generation)']
keep: Kahless (Star Trek: The Next Generation)
all_titles: ['MARY POPPINS', 'Mary Poppins']
keep: MARY POPPINS
all_titles: ["Tess of the d'Urbervilles", "Tess of the D'Urbervilles", "Tess of the D'Urbervilles", "Tess of the D'Urbervilles", "Tess of the d'Urbervilles", 'Tess of the Durbervilles', 'Tess of the Durbervilles', "Tess of the D'Urbervilles", 'Tess of the Durbervilles', 'Tess of the Durbervilles']
keep: Tess of the Durbervilles
all_titles: ['Poirot Investigates', 'Poirot Investigates']
keep: Poirot Investigates
all_titles: ['Walden', 'Walden and Civil Disobedience', 'Walden', 'Walden', 'Walden', 'Walden']
keep: Walden
all_titles: ['Ishmael', 'Ishmael', 'Ishmael']
keep: Ishmael
all_titles: ['Marilyn Monroe: The Biography', 'Marilyn Monroe: The 

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
0,0195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...
1,0002005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...
2,0060973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...
3,0374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...
4,0393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...
...,...,...,...,...,...,...,...,...
271355,0440400988,There's a Bat in Bunk Five,Paula Danziger,1988,Random House Childrens Pub (Mm),http://images.amazon.com/images/P/0440400988.0...,http://images.amazon.com/images/P/0440400988.0...,http://images.amazon.com/images/P/0440400988.0...
271356,0525447644,From One to One Hundred,Teri Sloat,1991,Dutton Books,http://images.amazon.com/images/P/0525447644.0...,http://images.amazon.com/images/P/0525447644.0...,http://images.amazon.com/images/P/0525447644.0...
271357,006008667X,Lily Dale : The True Story of the Town that Ta...,Christine Wicker,2004,HarperSanFrancisco,http://images.amazon.com/images/P/006008667X.0...,http://images.amazon.com/images/P/006008667X.0...,http://images.amazon.com/images/P/006008667X.0...
271358,0192126040,Republic (World's Classics),Plato,1996,Oxford University Press,http://images.amazon.com/images/P/0192126040.0...,http://images.amazon.com/images/P/0192126040.0...,http://images.amazon.com/images/P/0192126040.0...


In [19]:
df.to_csv('../backend/BX-Books-Cleaned.csv', sep=';', index=False, encoding='cp1251')

In [20]:
dtype_ratings = {
    'User-ID': str,
    'ISBN': str,
    'Book-Rating': int,
}

ratings = pd.read_csv('../backend/BX-Book-Ratings.csv', encoding='cp1251', sep=';', dtype=dtype_ratings)
ratings

Unnamed: 0,User-ID,ISBN,Book-Rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6
...,...,...,...
1149775,276704,1563526298,9
1149776,276706,0679447156,0
1149777,276709,0515107662,10
1149778,276721,0590442449,10


In [21]:
for key, value in data.items():
    ratings.loc[ratings['ISBN'].isin(value['ISBN']), 'ISBN'] = key

ratings.to_csv('../backend/BX-Book-Ratings-Cleaned.csv', sep=';', index=False, encoding='cp1251')