In [100]:
from fuzzywuzzy import process, fuzz
import pandas as pd
import sys

In [101]:
dtype_books = {
    'User-ID': str,
    'Book-Title': str,
    'Book-Author': str,
    'Year-Of-Publication': str,
    'Publisher': str,
    'Image-URL-S': str,
    'Image-URL-M': str,
    'Image-URL-L': str,
}

# load books
books = pd.read_csv('../backend/BX-Books.csv',  encoding='cp1251', sep=';', dtype=dtype_books, on_bad_lines='skip')

# Normalize the text data for more effective comparison
books['Normalized-Title'] = books['Book-Title'].str.lower()
books['Normalized-Title'] = books['Normalized-Title'].str.replace('[^a-z ]', '', regex=True)
books['Normalized-Author'] = books['Book-Author'].str.lower()
books['Normalized-Author'] = books['Normalized-Author'].str.replace('[^a-z]', '', regex=True)
books

In [102]:
import pandas as pd

# Assuming 'books' DataFrame is already defined with 'Normalized-Author' and 'Normalized-Title'

# Group books by author
books_by_author = books.groupby('Normalized-Author')

def are_titles_similar(title1, title2):
    """Check if titles are similar based on tokenization and element count variance."""
    tokens1 = set(title1.split())
    tokens2 = set(title2.split())

    # Check for at least one title containing all elements of the other
    if not tokens1.issubset(tokens2) and not tokens2.issubset(tokens1):
        return False

    # Allow for +/- 30% more or less elements in one of the titles
    len1, len2 = len(tokens1), len(tokens2)
    max_allowed = max(len1, len2) * 1.3
    min_allowed = min(len1, len2) * 0.7

    return len(tokens1) >= min_allowed and len(tokens2) <= max_allowed

def find_similar_books(row):
    final_isbns = []
    
    if row['Normalized-Author'] in books_by_author.groups:
        author_books = books.loc[books_by_author.groups[row['Normalized-Author']]]
        titles = author_books['Normalized-Title'].tolist()
        isbns = author_books['ISBN'].tolist()

        for index, compare_title in enumerate(titles):
            # Skip comparison with itself
            if compare_title == row['Normalized-Title']:
                continue
            
            if are_titles_similar(row['Normalized-Title'], compare_title):
                final_isbns.append(isbns[index])

    return tuple(final_isbns)

# Apply the modified function
books['Similar-ISBN'] = books.apply(find_similar_books, axis=1)
books

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L,Normalized-Title,Normalized-Author,Similar-ISBN
0,0195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,classical mythology,markpomorford,()
1,0002005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,clara callan,richardbrucewright,()
2,0060973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,decision in normandy,carlodeste,()
3,0374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...,flu the story of the great influenza pandemic ...,ginabarikolata,()
4,0393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...,the mummies of urumchi,ejwbarber,()
...,...,...,...,...,...,...,...,...,...,...,...
271355,0440400988,There's a Bat in Bunk Five,Paula Danziger,1988,Random House Childrens Pub (Mm),http://images.amazon.com/images/P/0440400988.0...,http://images.amazon.com/images/P/0440400988.0...,http://images.amazon.com/images/P/0440400988.0...,theres a bat in bunk five,pauladanziger,"(0603550282,)"
271356,0525447644,From One to One Hundred,Teri Sloat,1991,Dutton Books,http://images.amazon.com/images/P/0525447644.0...,http://images.amazon.com/images/P/0525447644.0...,http://images.amazon.com/images/P/0525447644.0...,from one to one hundred,terisloat,()
271357,006008667X,Lily Dale : The True Story of the Town that Ta...,Christine Wicker,2004,HarperSanFrancisco,http://images.amazon.com/images/P/006008667X.0...,http://images.amazon.com/images/P/006008667X.0...,http://images.amazon.com/images/P/006008667X.0...,lily dale the true story of the town that tal...,christinewicker,"(0060086661,)"
271358,0192126040,Republic (World's Classics),Plato,1996,Oxford University Press,http://images.amazon.com/images/P/0192126040.0...,http://images.amazon.com/images/P/0192126040.0...,http://images.amazon.com/images/P/0192126040.0...,republic worlds classics,plato,"(039395501X, 0192833707, 0192829092)"


In [103]:
"""
# Group books by author
books_by_author = books.groupby('Normalized-Author')

def is_length_similar(len1, len2, threshold=0.5):
    return min(len1, len2) / max(len1, len2) > threshold

def find_similar_books(row, threshold=90, length_ratio_threshold=0.5):
    final_isbns = []
    
    if row['Normalized-Author'] in books_by_author.groups:
        author_books = books.loc[books_by_author.groups[row['Normalized-Author']]]
        titles = author_books['Normalized-Title'].tolist()
        isbns = author_books['ISBN'].tolist()

        for index, compare_title in enumerate(titles):
            # Skip comparison with itself
            if compare_title == row['Normalized-Title']:
                continue
            
            # Check if the lengths of the titles are somewhat similar
            if not is_length_similar(len(row['Normalized-Title']), len(compare_title), length_ratio_threshold):
                continue

            # Combine scoring functions for a more nuanced similarity assessment
            token_set_score = fuzz.token_set_ratio(row['Normalized-Title'], compare_title)
            simple_ratio_score = fuzz.ratio(row['Normalized-Title'], compare_title)

            # Require both scores to be above the threshold
            if token_set_score > threshold and simple_ratio_score > threshold:
                final_isbns.append(isbns[index])

    print(f"\rProcessed: {row['Normalized-Title'][:50]:50}", end='')
    sys.stdout.flush()

    return tuple(final_isbns)

# Apply the modified function
books['Similar-ISBN'] = books.apply(find_similar_books, axis=1)
print()
"""

'\n# Group books by author\nbooks_by_author = books.groupby(\'Normalized-Author\')\n\ndef is_length_similar(len1, len2, threshold=0.5):\n    return min(len1, len2) / max(len1, len2) > threshold\n\ndef find_similar_books(row, threshold=90, length_ratio_threshold=0.5):\n    final_isbns = []\n    \n    if row[\'Normalized-Author\'] in books_by_author.groups:\n        author_books = books.loc[books_by_author.groups[row[\'Normalized-Author\']]]\n        titles = author_books[\'Normalized-Title\'].tolist()\n        isbns = author_books[\'ISBN\'].tolist()\n\n        for index, compare_title in enumerate(titles):\n            # Skip comparison with itself\n            if compare_title == row[\'Normalized-Title\']:\n                continue\n            \n            # Check if the lengths of the titles are somewhat similar\n            if not is_length_similar(len(row[\'Normalized-Title\']), len(compare_title), length_ratio_threshold):\n                continue\n\n            # Combine scoring

In [104]:
books

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L,Normalized-Title,Normalized-Author,Similar-ISBN
0,0195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,classical mythology,markpomorford,()
1,0002005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,clara callan,richardbrucewright,()
2,0060973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,decision in normandy,carlodeste,()
3,0374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...,flu the story of the great influenza pandemic ...,ginabarikolata,()
4,0393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...,the mummies of urumchi,ejwbarber,()
...,...,...,...,...,...,...,...,...,...,...,...
271355,0440400988,There's a Bat in Bunk Five,Paula Danziger,1988,Random House Childrens Pub (Mm),http://images.amazon.com/images/P/0440400988.0...,http://images.amazon.com/images/P/0440400988.0...,http://images.amazon.com/images/P/0440400988.0...,theres a bat in bunk five,pauladanziger,"(0603550282,)"
271356,0525447644,From One to One Hundred,Teri Sloat,1991,Dutton Books,http://images.amazon.com/images/P/0525447644.0...,http://images.amazon.com/images/P/0525447644.0...,http://images.amazon.com/images/P/0525447644.0...,from one to one hundred,terisloat,()
271357,006008667X,Lily Dale : The True Story of the Town that Ta...,Christine Wicker,2004,HarperSanFrancisco,http://images.amazon.com/images/P/006008667X.0...,http://images.amazon.com/images/P/006008667X.0...,http://images.amazon.com/images/P/006008667X.0...,lily dale the true story of the town that tal...,christinewicker,"(0060086661,)"
271358,0192126040,Republic (World's Classics),Plato,1996,Oxford University Press,http://images.amazon.com/images/P/0192126040.0...,http://images.amazon.com/images/P/0192126040.0...,http://images.amazon.com/images/P/0192126040.0...,republic worlds classics,plato,"(039395501X, 0192833707, 0192829092)"


In [105]:
duplicates = books[books['Similar-ISBN'].apply(len) > 1]
duplicates

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L,Normalized-Title,Normalized-Author,Similar-ISBN
5,0399135782,The Kitchen God's Wife,Amy Tan,1991,Putnam Pub Group,http://images.amazon.com/images/P/0399135782.0...,http://images.amazon.com/images/P/0399135782.0...,http://images.amazon.com/images/P/0399135782.0...,the kitchen gods wife,amytan,"(0679748083, 1560542578)"
14,1558746218,A Second Chicken Soup for the Woman's Soul (Ch...,Jack Canfield,1998,Health Communications,http://images.amazon.com/images/P/1558746218.0...,http://images.amazon.com/images/P/1558746218.0...,http://images.amazon.com/images/P/1558746218.0...,a second chicken soup for the womans soul chic...,jackcanfield,"(155874262X, 1558749209)"
19,0452264464,Beloved (Plume Contemporary Fiction),Toni Morrison,1994,Plume,http://images.amazon.com/images/P/0452264464.0...,http://images.amazon.com/images/P/0452264464.0...,http://images.amazon.com/images/P/0452264464.0...,beloved plume contemporary fiction,tonimorrison,"(0452280621, 0451161394, 8440656955, 045226136..."
28,0345417623,Timeline,MICHAEL CRICHTON,2000,Ballantine Books,http://images.amazon.com/images/P/0345417623.0...,http://images.amazon.com/images/P/0345417623.0...,http://images.amazon.com/images/P/0345417623.0...,timeline,michaelcrichton,"(0375408738, 0375404376, 8811685087, 034544583X)"
29,0684823802,OUT OF THE SILENT PLANET,C.S. Lewis,1996,Scribner,http://images.amazon.com/images/P/0684823802.0...,http://images.amazon.com/images/P/0684823802.0...,http://images.amazon.com/images/P/0684823802.0...,out of the silent planet,cslewis,"(0743234901, 0684833646)"
...,...,...,...,...,...,...,...,...,...,...,...
271248,0451521390,Pathfinder,James Fenimore Cooper,1982,Signet Book,http://images.amazon.com/images/P/0451521390.0...,http://images.amazon.com/images/P/0451521390.0...,http://images.amazon.com/images/P/0451521390.0...,pathfinder,jamesfenimorecooper,"(0451522575, 0873953606, 0192829564, 0140390715)"
271258,0684857502,TEXASVILLE : A Novel,Larry McMurtry,1999,Simon &amp; Schuster,http://images.amazon.com/images/P/0684857502.0...,http://images.amazon.com/images/P/0684857502.0...,http://images.amazon.com/images/P/0684857502.0...,texasville a novel,larrymcmurtry,"(067165764X, 0671735179, 0671625330)"
271312,0671015885,Star Trek: First Contact (Star Trek),J.M. Dillard,1997,Simon &amp; Schuster (Trade Division),http://images.amazon.com/images/P/0671015885.0...,http://images.amazon.com/images/P/0671015885.0...,http://images.amazon.com/images/P/0671015885.0...,star trek first contact star trek,jmdillard,"(067100316X, 067100316x, 0671573918)"
271329,8420614556,Lewis Carroll: A Traves Del Espejo Y Lo Que Al...,Lewis Carroll,1986,Lectorum Pubns (Adult),http://images.amazon.com/images/P/8420614556.0...,http://images.amazon.com/images/P/8420614556.0...,http://images.amazon.com/images/P/8420614556.0...,lewis carroll a traves del espejo y lo que ali...,lewiscarroll,"(9504001068, 8420636258, 0747553734, 070641558..."


In [106]:
duplicate_isbns = set(duplicates['Similar-ISBN'])
duplicate_isbns

{('0425167313', '0399143130', '0099750813', '1567402542'),
 ('0140320970', '0874063167'),
 ('0515100536', '0880297123'),
 ('0671452665', '0671411241'),
 ('0060175966', '0694520667'),
 ('0226039269', '287714125X'),
 ('1857984986',
  '042503819X',
  '0425061701',
  '0425081117',
  '042507479X',
  '0425095312'),
 ('0802132189', '2253007153'),
 ('0345301870', '0345335090', '0441062253'),
 ('155874262X', '1558749209', '0439159849'),
 ('0226743403', '0736620621'),
 ('0064400581',
  '0060739444',
  '0060235284',
  '0060274069',
  '0440844444',
  '0064407217',
  '0060219440',
  '0064471462',
  '0060219432',
  '0673801497',
  '0060540958'),
 ('0060987561', '0061097640', '0965605914', '0694519405', '0061098124'),
 ('0440164842', '0440064562', '0385287291'),
 ('0140019502', '2070360180', '3125974003'),
 ('0812970101',
  '0425129616',
  '0646418432',
  '0396071910',
  '0553240935',
  '0486296954',
  '158287171X'),
 ('0345288823', '0345012518', '0345334299', '0345274687'),
 ('0804108862', '04251036

In [107]:
len(duplicate_isbns)

12240

In [108]:
df = pd.read_csv('../backend/BX-Books.csv',  encoding='cp1251', sep=';', dtype=dtype_books, on_bad_lines='skip')

data = {}

for dup in duplicate_isbns:
    rows = df[df['ISBN'].isin(dup)]
    
    # Find the most common title among these duplicates
    all_titles = rows['Book-Title'].tolist()
    print("all_titles:", all_titles)
    most_common_title = max(set(all_titles), key=all_titles.count)
    
    keep = rows[rows['Book-Title'] == most_common_title].iloc[0]
    remove = rows[rows['ISBN'] != keep['ISBN']]
    data[keep['ISBN']] = remove
    print("keep:", keep['Book-Title'])
    books_copy = df.drop(remove.index)

df

all_titles: ['Here on Earth', "Here on Earth (Oprah's Book Club (Hardcover))", 'Here On Earth', 'Here on Earth']
keep: Here on Earth
all_titles: ['The Twenty-One Balloons', 'The Twenty-One Balloons']
keep: The Twenty-One Balloons
all_titles: ['The Aviators (Brotherhood of War)', 'Brotherhood of War']
keep: Brotherhood of War
all_titles: ['Flowers in the Attic', 'FLOWERS IN ATTIC']
keep: Flowers in the Attic
all_titles: ['The Professor and the Madman', 'Professor and The Madman, The']
keep: Professor and The Madman, The
all_titles: ['Selected Poems from Les Fleurs Du Mal', 'Les Fleurs Du Mal (World Classics)']
keep: Selected Poems from Les Fleurs Du Mal
all_titles: ['The Stainless Steel Rat', 'The Adventures of the Stainless Steel Rat', 'Adventures of Stainless Steel Rat', 'Adventures of Stainless Steel Rat', 'The Adventures of the Stainless Steel Rat', 'Adventures of Stainless Steel Rat']
keep: Adventures of Stainless Steel Rat
all_titles: ['Justine, Philosophy in the Bedroom and Other

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
0,0195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...
1,0002005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...
2,0060973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...
3,0374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...
4,0393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...
...,...,...,...,...,...,...,...,...
271355,0440400988,There's a Bat in Bunk Five,Paula Danziger,1988,Random House Childrens Pub (Mm),http://images.amazon.com/images/P/0440400988.0...,http://images.amazon.com/images/P/0440400988.0...,http://images.amazon.com/images/P/0440400988.0...
271356,0525447644,From One to One Hundred,Teri Sloat,1991,Dutton Books,http://images.amazon.com/images/P/0525447644.0...,http://images.amazon.com/images/P/0525447644.0...,http://images.amazon.com/images/P/0525447644.0...
271357,006008667X,Lily Dale : The True Story of the Town that Ta...,Christine Wicker,2004,HarperSanFrancisco,http://images.amazon.com/images/P/006008667X.0...,http://images.amazon.com/images/P/006008667X.0...,http://images.amazon.com/images/P/006008667X.0...
271358,0192126040,Republic (World's Classics),Plato,1996,Oxford University Press,http://images.amazon.com/images/P/0192126040.0...,http://images.amazon.com/images/P/0192126040.0...,http://images.amazon.com/images/P/0192126040.0...


In [109]:
df.to_csv('../backend/BX-Books-Cleaned.csv', sep=';', index=False, encoding='cp1251')

In [110]:
dtype_ratings = {
    'User-ID': str,
    'ISBN': str,
    'Book-Rating': int,
}

ratings = pd.read_csv('../backend/BX-Book-Ratings.csv', encoding='cp1251', sep=';', dtype=dtype_ratings)
ratings

Unnamed: 0,User-ID,ISBN,Book-Rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6
...,...,...,...
1149775,276704,1563526298,9
1149776,276706,0679447156,0
1149777,276709,0515107662,10
1149778,276721,0590442449,10


In [111]:
for key, value in data.items():
    ratings.loc[ratings['ISBN'].isin(value['ISBN']), 'ISBN'] = key

df.to_csv('../backend/BX-Book-Ratings-Cleaned.csv', sep=';', index=False, encoding='cp1251')