In [29]:
import pandas as pd

In [30]:
dtype_books = {
    'User-ID': str,
    'Book-Title': str,
    'Book-Author': str,
    'Year-Of-Publication': str,
    'Publisher': str,
    'Image-URL-S': str,
    'Image-URL-M': str,
    'Image-URL-L': str,
}

# load books
books = pd.read_csv('../backend/BX-Books.csv',  encoding='cp1251', sep=';', dtype=dtype_books, on_bad_lines='skip')

# Normalize the text data for more effective comparison
books['Normalized-Title'] = books['Book-Title'].str.lower()
books['Normalized-Title'] = books['Normalized-Title'].str.replace('[^a-z ]', '', regex=True)
books['Normalized-Author'] = books['Book-Author'].str.lower()
books['Normalized-Author'] = books['Normalized-Author'].str.replace('[^a-z]', '', regex=True)
books

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L,Normalized-Title,Normalized-Author
0,0195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,classical mythology,markpomorford
1,0002005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,clara callan,richardbrucewright
2,0060973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,decision in normandy,carlodeste
3,0374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...,flu the story of the great influenza pandemic ...,ginabarikolata
4,0393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...,the mummies of urumchi,ejwbarber
...,...,...,...,...,...,...,...,...,...,...
271355,0440400988,There's a Bat in Bunk Five,Paula Danziger,1988,Random House Childrens Pub (Mm),http://images.amazon.com/images/P/0440400988.0...,http://images.amazon.com/images/P/0440400988.0...,http://images.amazon.com/images/P/0440400988.0...,theres a bat in bunk five,pauladanziger
271356,0525447644,From One to One Hundred,Teri Sloat,1991,Dutton Books,http://images.amazon.com/images/P/0525447644.0...,http://images.amazon.com/images/P/0525447644.0...,http://images.amazon.com/images/P/0525447644.0...,from one to one hundred,terisloat
271357,006008667X,Lily Dale : The True Story of the Town that Ta...,Christine Wicker,2004,HarperSanFrancisco,http://images.amazon.com/images/P/006008667X.0...,http://images.amazon.com/images/P/006008667X.0...,http://images.amazon.com/images/P/006008667X.0...,lily dale the true story of the town that tal...,christinewicker
271358,0192126040,Republic (World's Classics),Plato,1996,Oxford University Press,http://images.amazon.com/images/P/0192126040.0...,http://images.amazon.com/images/P/0192126040.0...,http://images.amazon.com/images/P/0192126040.0...,republic worlds classics,plato


In [31]:
books_by_author = books.groupby('Normalized-Author')

def are_titles_similar(title1, title2):
    """Check if titles are similar based on tokenization and element count variance."""
    tokens1 = set(title1.split())
    tokens2 = set(title2.split())

    # Check for at least one title containing all elements of the other
    if not tokens1.issubset(tokens2) and not tokens2.issubset(tokens1):
        return False

    # Allow for +/- 30% more or less elements in one of the titles
    len1, len2 = len(tokens1), len(tokens2)
    if len1 >= len2 * 0.8 and len1 <= len2 * 1.2:
        return True

    return False

def find_similar_books(row):
    final_isbns = []
    
    if row['Normalized-Author'] in books_by_author.groups:
        author_books = books.loc[books_by_author.groups[row['Normalized-Author']]]
        titles = author_books['Normalized-Title'].tolist()
        isbns = author_books['ISBN'].tolist()

        for index, compare_title in enumerate(titles):
            # Skip comparison with itself
            if compare_title == row['Normalized-Title']:
                continue
            
            if are_titles_similar(row['Normalized-Title'], compare_title):
                final_isbns.append(isbns[index])

    return tuple(final_isbns)

# Apply the modified function
books['Similar-ISBN'] = books.apply(find_similar_books, axis=1)
books

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L,Normalized-Title,Normalized-Author,Similar-ISBN
0,0195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,classical mythology,markpomorford,()
1,0002005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,clara callan,richardbrucewright,()
2,0060973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,decision in normandy,carlodeste,()
3,0374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...,flu the story of the great influenza pandemic ...,ginabarikolata,()
4,0393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...,the mummies of urumchi,ejwbarber,()
...,...,...,...,...,...,...,...,...,...,...,...
271355,0440400988,There's a Bat in Bunk Five,Paula Danziger,1988,Random House Childrens Pub (Mm),http://images.amazon.com/images/P/0440400988.0...,http://images.amazon.com/images/P/0440400988.0...,http://images.amazon.com/images/P/0440400988.0...,theres a bat in bunk five,pauladanziger,()
271356,0525447644,From One to One Hundred,Teri Sloat,1991,Dutton Books,http://images.amazon.com/images/P/0525447644.0...,http://images.amazon.com/images/P/0525447644.0...,http://images.amazon.com/images/P/0525447644.0...,from one to one hundred,terisloat,()
271357,006008667X,Lily Dale : The True Story of the Town that Ta...,Christine Wicker,2004,HarperSanFrancisco,http://images.amazon.com/images/P/006008667X.0...,http://images.amazon.com/images/P/006008667X.0...,http://images.amazon.com/images/P/006008667X.0...,lily dale the true story of the town that tal...,christinewicker,"(0060086661,)"
271358,0192126040,Republic (World's Classics),Plato,1996,Oxford University Press,http://images.amazon.com/images/P/0192126040.0...,http://images.amazon.com/images/P/0192126040.0...,http://images.amazon.com/images/P/0192126040.0...,republic worlds classics,plato,()


In [32]:
books

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L,Normalized-Title,Normalized-Author,Similar-ISBN
0,0195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,classical mythology,markpomorford,()
1,0002005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,clara callan,richardbrucewright,()
2,0060973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,decision in normandy,carlodeste,()
3,0374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...,flu the story of the great influenza pandemic ...,ginabarikolata,()
4,0393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...,the mummies of urumchi,ejwbarber,()
...,...,...,...,...,...,...,...,...,...,...,...
271355,0440400988,There's a Bat in Bunk Five,Paula Danziger,1988,Random House Childrens Pub (Mm),http://images.amazon.com/images/P/0440400988.0...,http://images.amazon.com/images/P/0440400988.0...,http://images.amazon.com/images/P/0440400988.0...,theres a bat in bunk five,pauladanziger,()
271356,0525447644,From One to One Hundred,Teri Sloat,1991,Dutton Books,http://images.amazon.com/images/P/0525447644.0...,http://images.amazon.com/images/P/0525447644.0...,http://images.amazon.com/images/P/0525447644.0...,from one to one hundred,terisloat,()
271357,006008667X,Lily Dale : The True Story of the Town that Ta...,Christine Wicker,2004,HarperSanFrancisco,http://images.amazon.com/images/P/006008667X.0...,http://images.amazon.com/images/P/006008667X.0...,http://images.amazon.com/images/P/006008667X.0...,lily dale the true story of the town that tal...,christinewicker,"(0060086661,)"
271358,0192126040,Republic (World's Classics),Plato,1996,Oxford University Press,http://images.amazon.com/images/P/0192126040.0...,http://images.amazon.com/images/P/0192126040.0...,http://images.amazon.com/images/P/0192126040.0...,republic worlds classics,plato,()


In [33]:
duplicates = books[books['Similar-ISBN'].apply(len) > 1]
duplicates

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L,Normalized-Title,Normalized-Author,Similar-ISBN
51,0842342702,Left Behind: A Novel of the Earth's Last Days ...,Tim Lahaye,2000,Tyndale House Publishers,http://images.amazon.com/images/P/0842342702.0...,http://images.amazon.com/images/P/0842342702.0...,http://images.amazon.com/images/P/0842342702.0...,left behind a novel of the earths last days le...,timlahaye,"(0842329129, 0842329110, 0842354204)"
102,0451166892,The Pillars of the Earth,Ken Follett,1996,Signet Book,http://images.amazon.com/images/P/0451166892.0...,http://images.amazon.com/images/P/0451166892.0...,http://images.amazon.com/images/P/0451166892.0...,the pillars of the earth,kenfollett,"(0688046592, 0671690841, 0451207149)"
158,0060801263,Tree Grows In Brooklyn,Betty Smith,1988,Harpercollins Publisher,http://images.amazon.com/images/P/0060801263.0...,http://images.amazon.com/images/P/0060801263.0...,http://images.amazon.com/images/P/0060801263.0...,tree grows in brooklyn,bettysmith,"(006092988X, 0060001941, 0895773287, 069452582..."
162,0671461494,The Hitchhiker's Guide to the Galaxy,Douglas Adams,1982,Pocket,http://images.amazon.com/images/P/0671461494.0...,http://images.amazon.com/images/P/0671461494.0...,http://images.amazon.com/images/P/0671461494.0...,the hitchhikers guide to the galaxy,douglasadams,"(0345453743, 0517599244)"
222,0385730586,Sisterhood of the Traveling Pants,ANN BRASHARES,2003,Delacorte Books for Young Readers,http://images.amazon.com/images/P/0385730586.0...,http://images.amazon.com/images/P/0385730586.0...,http://images.amazon.com/images/P/0385730586.0...,sisterhood of the traveling pants,annbrashares,"(0385729332, 0807205893)"
...,...,...,...,...,...,...,...,...,...,...,...
270648,0439128668,"Broken Sky #4 (Broken Sky, 4)",Chris Wooding,2000,Apple,http://images.amazon.com/images/P/0439128668.0...,http://images.amazon.com/images/P/0439128668.0...,http://images.amazon.com/images/P/0439128668.0...,broken sky broken sky,chriswooding,"(0439128684, 0439128641, 0439128692)"
270649,0439128692,Broken Sky (Broken Sky),Chris Wooding,2001,Apple,http://images.amazon.com/images/P/0439128692.0...,http://images.amazon.com/images/P/0439128692.0...,http://images.amazon.com/images/P/0439128692.0...,broken sky broken sky,chriswooding,"(0439139988, 0439128684, 0439128641, 043912865..."
270691,0060919639,The Autumn of the Patriarch,Gabriel Garcia Marquez,1991,Perennial,http://images.amazon.com/images/P/0060919639.0...,http://images.amazon.com/images/P/0060919639.0...,http://images.amazon.com/images/P/0060919639.0...,the autumn of the patriarch,gabrielgarciamarquez,"(0380017741, 0060114193)"
270891,0590974092,"Sarah, Plain and Tall",Patricia MacLachlan,1996,Scholastic,http://images.amazon.com/images/P/0590974092.0...,http://images.amazon.com/images/P/0590974092.0...,http://images.amazon.com/images/P/0590974092.0...,sarah plain and tall,patriciamaclachlan,"(0064402053, 0064406229, 0060241012, 0060241020)"


In [34]:
duplicate_isbns = set(duplicates['Similar-ISBN'])
duplicate_isbns

{('0812908341', '0449900878'),
 ('0553351702', '0879510188'),
 ('0060293152', '006440966X'),
 ('0380978326', 'B0000AA9JU'),
 ('0812566637', '0345456440', '0765300354'),
 ('0434009210', '0671678523'),
 ('0345409671', '0679438327'),
 ('0345334299', '0345351843'),
 ('0553141740', '0553276786'),
 ('0722150954',
  '0446348325',
  '0446314366',
  '0445002972',
  '0446313556',
  '0446310085'),
 ('0451166582', '067081458X'),
 ('0671218336', '0671741926', '067149175X'),
 ('039441392X', '0345352483'),
 ('0312148267',
  '0805002138',
  '1587172046',
  '0590447742',
  '0440495555',
  '068971310X',
  '0440403855',
  '0448124114',
  '0448060280',
  '0307122182',
  '0582526523',
  '0812505107',
  '1590071476',
  '072701854X',
  '0448110288',
  '093459368X',
  '0312136242',
  '0684180251',
  '1568651155',
  '0590412949',
  '0590434047'),
 ('0590542095', '0590975145'),
 ('0439064872', '0439064864', '0439420105', '0439554896'),
 ('0140389652', '0525442766'),
 ('0440504716', '0440105595', '0800714083'),


In [35]:
len(duplicate_isbns)

1502

In [36]:
df = pd.read_csv('../backend/BX-Books.csv',  encoding='cp1251', sep=';', dtype=dtype_books, on_bad_lines='skip')

data = {}
toRemove = []

for dup in duplicate_isbns:
    rows = df[df['ISBN'].isin(dup)]
    
    # Find the most common title among these duplicates
    all_titles = rows['Book-Title'].tolist()
    #print("all_titles:", all_titles)
    most_common_title = max(set(all_titles), key=all_titles.count)
    
    keep = rows[rows['Book-Title'] == most_common_title].iloc[0]
    remove = rows[rows['ISBN'] != keep['ISBN']]
    toRemove.extend(remove.index)
    data[keep['ISBN']] = remove['ISBN'].tolist()
    #print("keep:", keep['Book-Title'])

df.drop(toRemove, inplace=True)
len(toRemove)

2803

In [37]:
df.to_csv('../backend/BX-Books-Cleaned.csv', sep=';', index=False, encoding='cp1251')

In [38]:
data

{'0812908341': ['0449900878'],
 '0553351702': ['0879510188'],
 '0060293152': ['006440966X'],
 '0380978326': ['B0000AA9JU'],
 '0765300354': ['0312867875'],
 '0671678523': ['0434009210'],
 '0679438327': ['0679441018'],
 '0345351843': ['0345334299'],
 '0553276786': ['0553255134'],
 '0722150954': ['0446348325',
  '0446314366',
  '0445002972',
  '0446313556',
  '0446310085'],
 '0451166582': ['067081458X'],
 '0671218336': ['0671741926', '067149175X'],
 '039441392X': ['0345352483'],
 '0312148267': ['0805002138',
  '1587172046',
  '0590447742',
  '068971310X',
  '0684128195',
  '0440403855',
  '0448124114',
  '0448060280',
  '0307122182',
  '0582526523',
  '0812505107',
  '1590071476',
  '072701854X',
  '0448110288',
  '093459368X',
  '0312136242',
  '0684180251',
  '1568651155',
  '0590412949',
  '0590434047'],
 '0590542095': ['0590975145'],
 '0439064872': ['0439064864', '0439420105', '0747545774', '1551922444'],
 '0140389652': ['0525442766'],
 '0440504716': ['0440105595', '0800714083'],
 '18

In [39]:
dtype_ratings = {
    'User-ID': str,
    'ISBN': str,
    'Book-Rating': int,
}

ratings = pd.read_csv('../backend/BX-Book-Ratings.csv', encoding='cp1251', sep=';', dtype=dtype_ratings)
ratings

Unnamed: 0,User-ID,ISBN,Book-Rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6
...,...,...,...
1149775,276704,1563526298,9
1149776,276706,0679447156,0
1149777,276709,0515107662,10
1149778,276721,0590442449,10


In [40]:
for key, value in data.items():
    ratings.loc[ratings['ISBN'].isin(value), 'ISBN'] = key

ratings.to_csv('../backend/BX-Book-Ratings-Cleaned.csv', sep=';', index=False, encoding='cp1251')