In [26]:
import pandas as pd

amazon = pd.read_csv('data\main_dataset.csv')
goodreads = pd.read_csv("data\goodreads_books.csv")
covers = pd.read_csv("data\Books.csv")

  covers = pd.read_csv("data\Books.csv")


In [27]:
# Check for duplicates in title column
a = goodreads.isbn.duplicated().sum()
b = amazon.isbn.duplicated().sum()
c = covers["Book-Title"].duplicated().sum()

print("There are {} duplicates in the goodreads dataset".format(a))
print("There are {} duplicates in the amazon dataset".format(b))
print("There are {} duplicates in the covers dataset".format(c))

There are 11882 duplicates in the goodreads dataset
There are 9873 duplicates in the amazon dataset
There are 29225 duplicates in the covers dataset


In [28]:
# Drop duplicates in the goodreads dataset
goodreads = goodreads.dropna(subset=['isbn'])
# Drop rows that have the same isbn and keep only the first in the amazon dataset
amazon = amazon.drop_duplicates(subset=['isbn'], keep='first')
# Drop duplicates in the goodreads dataset
goodreads = goodreads.dropna(subset=['isbn'])
# Drop rows that have the same isbn and keep only the first in the amazon dataset
amazon = amazon.drop_duplicates(subset=['isbn'], keep='first')
# Drop rows that have the same title and keep only those that have a corresponding isbn in the goodreads dataset otherwise keep the first in the covers dataset
# Identify duplicate titles in the covers dataset
# Step 1: Identify duplicates by title in covers
dup_titles_covers = covers[covers["Book-Title"].duplicated(keep=False)]

# Step 2 & 3: Check for matching ISBNs and prioritize entries
filtered_covers = pd.DataFrame()
for title, group in dup_titles_covers.groupby('Book-Title'):
    # Check if any ISBN in this group matches those in goodreads
    matching_isbns = group[group['ISBN'].isin(goodreads['isbn'])]
    
    if not matching_isbns.empty:
        # Keep entries with matching ISBNs
        filtered_covers = pd.concat([filtered_covers, matching_isbns], ignore_index=True)
    else:
        # Keep the first entry based on index if no matching ISBNs
        filtered_covers = pd.concat([filtered_covers, group.head(1)], ignore_index=True)

# Step 4: Remove all duplicates from covers
covers_df_no_duplicates = covers.drop(dup_titles_covers.index)

# Step 5: Concatenate the kept entries back to the covers dataset
covers = pd.concat([covers_df_no_duplicates, filtered_covers], ignore_index=True)

# Verify the result
covers["Book-Title"].duplicated().sum()

22

In [29]:
import pandas as pd

def valid_isbn10(isbn):
    """Check if the provided string is a valid ISBN-10."""
    if len(isbn) != 10:
        return False

    try:
        total = sum((10 - i) * (int(x) if x != 'X' else 10) for i, x in enumerate(isbn))
        return total % 11 == 0
    except ValueError:
        # A ValueError will be raised if a character other than a digit or 'X' is encountered.
        return False
    
def valid_isbn13(isbn):
    """Check if the provided string is a valid ISBN-13."""
    if len(isbn) != 13 or not isbn.isdigit():
        return False

    total = 0
    for i in range(12):
        if i % 2 == 0:
            total += int(isbn[i])
        else:
            total += 3 * int(isbn[i])

    check_digit = 10 - (total % 10)
    if check_digit == 10:
        check_digit = 0

    return str(check_digit) == isbn[-1]

def cleanse_dataframe(df, isbn_column):
    """Return a DataFrame containing only valid ISBN-10s in the specified column."""
    # Convert the ISBN column to strings
    df[isbn_column] = df[isbn_column].astype(str)
    # Apply the valid_isbn10 function
    return df[df[isbn_column].apply(valid_isbn10)]

def cleanse_dataframe_isbn13(df, isbn_column):
    """Return a DataFrame containing only valid ISBN-13s in the specified column."""
    df[isbn_column] = df[isbn_column].astype(str)
    return df[df[isbn_column].apply(valid_isbn13)]

def isbn10_to_isbn13(isbn10):
    if len(isbn10) != 10:
        return None

    # Add the 978 prefix
    isbn13 = "978" + isbn10[:-1]

    # Calculate the checksum for ISBN-13
    checksum = 0
    for i, char in enumerate(isbn13):
        if i % 2 == 0:
            checksum += int(char)
        else:
            checksum += 3 * int(char)

    checksum = 10 - (checksum % 10)
    if checksum == 10:
        checksum = 0

    return isbn13 + str(checksum)

In [30]:
# Print sum of invalid ISBN-10s in the goodreads dataset
print("There are {} invalid ISBN-10s in the goodreads dataset".format(len(goodreads) - len(cleanse_dataframe(goodreads, 'isbn'))))
print("There are {} invalid ISBN-13s in the goodreads dataset".format(len(goodreads) - len(cleanse_dataframe_isbn13(goodreads, 'isbn13'))))
# Print sum of invalid ISBN-10s in the amazon dataset
print("There are {} invalid ISBN-13s in the amazon dataset".format(len(amazon) - len(cleanse_dataframe_isbn13(amazon, 'isbn'))))
# Print sum of invalid ISBN-10s and 13s in the covers dataset
print("There are {} invalid ISBN-10s in the covers dataset".format(len(covers) - len(cleanse_dataframe(covers, 'ISBN'))))

There are 307 invalid ISBN-10s in the goodreads dataset
There are 866 invalid ISBN-13s in the goodreads dataset
There are 0 invalid ISBN-13s in the amazon dataset
There are 185 invalid ISBN-10s in the covers dataset


In [31]:
# Drop invalid ISBN-10s in the goodreads dataset
goodreads = cleanse_dataframe(goodreads, 'isbn')
# Drop invalid ISBN-13s in the goodreads dataset
goodreads = cleanse_dataframe_isbn13(goodreads, 'isbn13')
# Drop invalid ISBN-13s in the amazon dataset
amazon = cleanse_dataframe_isbn13(amazon, 'isbn')
# Drop invalid ISBN-10s in the covers dataset
covers = cleanse_dataframe(covers, 'ISBN')
# Transform covers ISBN to isbn13
covers['isbn13'] = covers['ISBN'].apply(isbn10_to_isbn13)
covers['isbn13'] = covers['isbn13'].astype('int64')

# Print lengths of the datasets
print("The goodreads dataset now has {} rows".format(len(goodreads)))
print("The amazon dataset now has {} rows".format(len(amazon)))
print("The covers dataset now has {} rows".format(len(covers)))

The goodreads dataset now has 39360 rows
The amazon dataset now has 22708 rows
The covers dataset now has 241972 rows


In [32]:
# Export the cleansed datasets to csv
goodreads.to_csv('data/results/goodreads_clean.csv', index=False)
amazon.to_csv('data/results/amazon_clean.csv', index=False)
covers.to_csv('data/results/covers_clean.csv', index=False)

In [33]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import pandas as pd

# Load the cleansed datasets
amazon_df = pd.read_csv('data/results/amazon_clean.csv')
goodreads_df = pd.read_csv('data/results/goodreads_clean.csv')
amazon_df["id"] = 'Amazon_' + amazon_df.index.astype(str)
goodreads_df['id'] = 'Goodreads_books_' + goodreads_df.index.astype(str)


# Preprocessing the title columns
amazon_titles = amazon_df['name']
goodreads_titles = goodreads_df['title']

# Combining the titles from both datasets for TF-IDF vectorization
combined_titles = pd.concat([amazon_titles, goodreads_titles])

# TF-IDF Vectorization
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(combined_titles)

# Splitting the TF-IDF matrix back into two separate matrices for Amazon and Goodreads
split_index = len(amazon_titles)
tfidf_amazon = tfidf_matrix[:split_index]
tfidf_goodreads = tfidf_matrix[split_index:]

# Calculating the cosine similarity between every Amazon title and Goodreads title
cosine_similarities = cosine_similarity(tfidf_amazon, tfidf_goodreads)

# Function to find the best match for each Amazon title in the Goodreads dataset
def find_best_matches(cosine_similarities, threshold=0.0):
    match_results = []

    for i, row in enumerate(cosine_similarities):
        best_match_index = np.argmax(row)
        best_match_score = row[best_match_index]

        if best_match_score > threshold:
            amazon_id = amazon_df.iloc[i]['id']  # Get Amazon ID
            goodreads_id = goodreads_df.iloc[best_match_index]['id']  # Get Goodreads ID
            amazon_isbn = amazon_df.iloc[i]['isbn']
            goodreads_isbn = goodreads_df.iloc[best_match_index]['isbn13']
            is_match = amazon_isbn == goodreads_isbn
            match_results.append({
                'amazon_id': amazon_id,
                'goodreads_id': goodreads_id,
                'amazon_title': amazon_df.iloc[i]['name'],
                'goodreads_title': goodreads_df.iloc[best_match_index]['title'],
                'similarity_score': best_match_score,
                'amazon_isbn': amazon_isbn,
                'goodreads_isbn': goodreads_isbn,
                'is_match': is_match,
            })

    return pd.DataFrame(match_results)

# Finding the best matches
matched_df = find_best_matches(cosine_similarities)

# Filtering to keep only the highest similarity score match for each ISBN
final_matched_df = matched_df.sort_values(by='similarity_score', ascending=False)\
                            .drop_duplicates(subset=['amazon_isbn', 'goodreads_isbn'])\
                            .reset_index(drop=True)

final_matched_df.head()

Unnamed: 0,amazon_id,goodreads_id,amazon_title,goodreads_title,similarity_score,amazon_isbn,goodreads_isbn,is_match
0,Amazon_17525,Goodreads_books_32365,The Book of Life,The Book of Life,1.0,9780670025596,9780670025596,True
1,Amazon_14667,Goodreads_books_31157,Winterdance: the Fine Madness of Running the I...,Winterdance: The Fine Madness of Running the I...,1.0,9780156001458,9780156001458,True
2,Amazon_21077,Goodreads_books_32845,Stalking Jack the Ripper,Stalking Jack the Ripper,1.0,9780316273497,9780316273497,True
3,Amazon_10466,Goodreads_books_4196,"Blue Exorcist, Vol. 3","Blue Exorcist, Vol. 1",1.0,9781421540344,9781421540320,False
4,Amazon_7679,Goodreads_books_28881,Reunion in Death,Reunion in Death,1.0,9780425183977,9780749934408,False


In [34]:
# Load the cleansed datasets
amazon_sample = pd.read_csv('data/results/amazon_clean.csv')
goodreads_sample = pd.read_csv('data/results/goodreads_clean.csv')
covers_sample = pd.read_csv('data/results/covers_clean.csv')

# Step 1: Identify Overlapping ISBNs
overlap_with_goodreads = pd.merge(covers_sample, goodreads_sample, on='isbn13', how='inner')
# Assuming a direct comparison is possible here for simplicity:
overlap_with_amazon = pd.merge(covers_sample, amazon_sample, left_on='isbn13', right_on='isbn', how='inner')
# Combine the overlapping ISBNs
overlapping_isbns = pd.concat([overlap_with_goodreads, overlap_with_amazon]).drop_duplicates()
# Step 2: Separate Overlapping and Non-Overlapping Data
non_overlapping_covers = covers_sample[~covers_sample['isbn13'].isin(overlapping_isbns['isbn13'])]
# Step 3: Randomly Sample from Non-Overlapping Data
required_sample_size = 50000 - len(overlapping_isbns)
additional_samples = non_overlapping_covers.sample(n=required_sample_size, random_state=1)
# Combine the overlapping data with the randomly sampled non-overlapping data
reduced_covers_sample = pd.concat([overlapping_isbns, additional_samples]).drop_duplicates()
# Drop Duplicates of Titles
reduced_covers_sample = reduced_covers_sample.drop_duplicates(subset='Book-Title', keep='first')
# Check the final size of reduced_covers_sample
print(f'Size of the reduced covers_sample: {len(reduced_covers_sample)}')

# Checking overlap with goodreads_sample
overlap_with_goodreads = pd.merge(reduced_covers_sample, goodreads_sample, on='isbn13', how='inner')
# Checking overlap with amazon_sample
overlap_with_amazon = pd.merge(reduced_covers_sample, amazon_sample, left_on='isbn13', right_on='isbn', how='inner')
# Printing the number of overlapping entries
print(f'Number of overlapping entries with Goodreads: {len(overlap_with_goodreads)}')
print(f'Number of overlapping entries with Amazon: {len(overlap_with_amazon)}')

reduced_covers_sample.to_csv('data/results/covers_clean_filtered.csv', index=False)

  covers_sample = pd.read_csv('data/results/covers_clean.csv')


Size of the reduced covers_sample: 49564
Number of overlapping entries with Goodreads: 8071
Number of overlapping entries with Amazon: 1246


In [35]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import pandas as pd

# Load the cleansed datasets
covers_df = pd.read_csv("data/results/covers_clean_filtered.csv")
goodreads_df = pd.read_csv("data/results/goodreads_clean.csv")
covers_df["id"] = 'Covers_' + covers_df.index.astype(str)
goodreads_df['id'] = 'Goodreads_books_' + goodreads_df.index.astype(str)

# Preprocessing the title columns
covers_titles = covers_df['Book-Title']
goodreads_titles = goodreads_df['title']

# Combining the titles from both datasets for TF-IDF vectorization
combined_titles = pd.concat([covers_titles, goodreads_titles])

# TF-IDF Vectorization
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(combined_titles)

# Splitting the TF-IDF matrix back into two separate matrices for Amazon and Goodreads
split_index = len(covers_titles)
tfidf_covers = tfidf_matrix[:split_index]
tfidf_goodreads = tfidf_matrix[split_index:]

# Calculating the cosine similarity between every Amazon title and Goodreads title
cosine_similarities = cosine_similarity(tfidf_covers, tfidf_goodreads)

# Function to find the best match for each Amazon title in the Goodreads dataset
def find_best_matches(cosine_similarities, threshold=0.0):
    match_results = []

    for i, row in enumerate(cosine_similarities):
        best_match_index = np.argmax(row)
        best_match_score = row[best_match_index]

        if best_match_score > threshold:
            covers_id = covers_df.iloc[i]['id']  # Get Covers ID
            goodreads_id = goodreads_df.iloc[best_match_index]['id']  # Get Goodreads ID
            covers_isbn = covers_df.iloc[i]['ISBN']
            goodreads_isbn = goodreads_df.iloc[best_match_index]['isbn']
            is_match = covers_isbn == goodreads_isbn
            match_results.append({
                'covers_id': covers_id,
                'goodreads_id': goodreads_id,
                'covers_title': covers_df.iloc[i]['Book-Title'],
                'goodreads_title': goodreads_df.iloc[best_match_index]['title'],
                'similarity_score': best_match_score,
                'covers_isbn': covers_isbn,
                'goodreads_isbn': goodreads_isbn,
                'is_match': is_match
            })

    return pd.DataFrame(match_results)

# Finding the best matches
matched_df = find_best_matches(cosine_similarities)

# Filtering to keep only the highest similarity score match for each ISBN
final_matched_covers_goodreads = matched_df.sort_values(by='similarity_score', ascending=False)\
                            .drop_duplicates(subset=['covers_isbn', 'goodreads_isbn'])\
                            .reset_index(drop=True)

final_matched_covers_goodreads.head()

  covers_df = pd.read_csv("data/results/covers_clean_filtered.csv")


Unnamed: 0,covers_id,goodreads_id,covers_title,goodreads_title,similarity_score,covers_isbn,goodreads_isbn,is_match
0,Covers_4541,Goodreads_books_24907,The Last Empire: Essays 1992-2000,The Last Empire: Essays 1992-2000,1.0,037572639X,037572639X,True
1,Covers_5680,Goodreads_books_30504,All Quiet on the Western Front,All Quiet on the Western Front,1.0,0449213943,0449213943,True
2,Covers_7917,Goodreads_books_12253,To See You Again: A True Story of Love in a Ti...,To See You Again: A True Story of Love in a Ti...,1.0,0452280710,0452280710,True
3,Covers_8809,Goodreads_books_30883,Drums of Autumn,Drums of Autumn,1.0,044022425X,0525618732,False
4,Covers_7329,Goodreads_books_4049,The Castle in the Attic,The Castle in the Attic,1.0,0440409411,0440409411,True


In [36]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import pandas as pd

# Preprocessing the title columns
covers_titles = covers_df['Book-Title']
amazon_titles = amazon_df['name']

# Combining the titles from both datasets for TF-IDF vectorization
combined_titles = pd.concat([covers_titles, amazon_titles])

# TF-IDF Vectorization
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(combined_titles)

# Splitting the TF-IDF matrix back into two separate matrices for Amazon and Covers
split_index = len(covers_titles)
tfidf_covers = tfidf_matrix[:split_index]
tfidf_amazon = tfidf_matrix[split_index:]

# Calculating the cosine similarity between every Amazon title and Covers title
cosine_similarities = cosine_similarity(tfidf_covers, tfidf_amazon)

# Function to find the best match for each Amazon title in the Covers dataset
def find_best_matches(cosine_similarities, threshold=0.0):
    match_results = []

    for i, row in enumerate(cosine_similarities):
        best_match_index = np.argmax(row)
        best_match_score = row[best_match_index]

        if best_match_score > threshold:
            covers_id = covers_df.iloc[i]['id']  # Get Covers ID
            amazon_id = amazon_df.iloc[best_match_index]['id']  # Get Goodreads ID
            covers_isbn = covers_df.iloc[i]['isbn13']
            amazon_isbn = amazon_df.iloc[best_match_index]['isbn']
            is_match = covers_isbn == amazon_isbn
            match_results.append({
                'covers_id': covers_id,
                'amazon_id': amazon_id,
                'covers_title': covers_df.iloc[i]['Book-Title'],
                'amazon_title': amazon_df.iloc[best_match_index]['name'],
                'similarity_score': best_match_score,
                'covers_isbn': covers_isbn,
                'amazon_isbn': amazon_isbn,
                'is_match': is_match
            })

    return pd.DataFrame(match_results)

# Finding the best matches
matched_df = find_best_matches(cosine_similarities)

# Filtering to keep only the highest similarity score match for each ISBN
final_matched_covers_amazon = matched_df.sort_values(by='similarity_score', ascending=False)\
                            .drop_duplicates(subset=['covers_isbn', 'amazon_isbn'])\
                            .reset_index(drop=True)

final_matched_covers_amazon.head()

Unnamed: 0,covers_id,amazon_id,covers_title,amazon_title,similarity_score,covers_isbn,amazon_isbn,is_match
0,Covers_8280,Amazon_1115,THE ORIGIN OF SPECIES,The Origin of Species,1.0,9781853267802,9781853267802,True
1,Covers_7636,Amazon_13606,The Only Astrology Book You'll Ever Need,The Only Astrology Book You'll Ever Need,1.0,9781568332314,9781589796539,False
2,Covers_8841,Amazon_11768,On the Banks of Plum Creek,On the Banks of Plum Creek,1.0,9780064400046,9780064400046,True
3,Covers_33306,Amazon_1166,A Briefer History of Time,A Briefer History of Time,1.0,9780716733898,9780593056974,False
4,Covers_7685,Amazon_16408,The Ragamuffin Gospel,The Ragamuffin Gospel,1.0,9781576737163,9781590525029,False


In [37]:
final_matched_goodreads_amazon = final_matched_df
final_matched_covers_goodreads
final_matched_covers_amazon

Unnamed: 0,covers_id,amazon_id,covers_title,amazon_title,similarity_score,covers_isbn,amazon_isbn,is_match
0,Covers_8280,Amazon_1115,THE ORIGIN OF SPECIES,The Origin of Species,1.000000,9781853267802,9781853267802,True
1,Covers_7636,Amazon_13606,The Only Astrology Book You'll Ever Need,The Only Astrology Book You'll Ever Need,1.000000,9781568332314,9781589796539,False
2,Covers_8841,Amazon_11768,On the Banks of Plum Creek,On the Banks of Plum Creek,1.000000,9780064400046,9780064400046,True
3,Covers_33306,Amazon_1166,A Briefer History of Time,A Briefer History of Time,1.000000,9780716733898,9780593056974,False
4,Covers_7685,Amazon_16408,The Ragamuffin Gospel,The Ragamuffin Gospel,1.000000,9781576737163,9781590525029,False
...,...,...,...,...,...,...,...,...
47402,Covers_29036,Amazon_19792,The Necromancer (Necromancer),The Story of the World,0.042533,9781871438208,9781933339054,False
47403,Covers_2995,Amazon_19792,Lando : The Sacketts (Sacketts),The Story of the World,0.040197,9780553276763,9781933339054,False
47404,Covers_33993,Amazon_19792,Tarzan the Untamed (Tarzan),The Story of the World,0.039005,9780345288684,9781933339054,False
47405,Covers_3000,Amazon_19792,Jubal Sackett : The Sacketts (Sacketts),The Story of the World,0.036617,9780553277395,9781933339054,False


In [44]:
# 20 % Matches randomly chosen (100)
# 50% Non-Matches randomly chosen (250)
# 15% Corner Case Matches (75)
# 15% Corner Case Non-Matches (75)

# Filter the DataFrame to include only rows where is_match is True
matched_df = final_matched_covers_goodreads[(final_matched_covers_goodreads['is_match'] == True) & (final_matched_covers_goodreads['similarity_score'] == 1)]
# Filter the DataFrame to include only rows where is_match is False
non_matched_df = final_matched_covers_goodreads[(final_matched_covers_goodreads['is_match'] == False) & (final_matched_covers_goodreads['similarity_score'] <= 0.3)]
# Define corner cases: matches with a sim score between 0.5 and 0.75
corner_cases_match_df = final_matched_covers_goodreads[(final_matched_covers_goodreads['is_match'] == True) & (final_matched_covers_goodreads['similarity_score'] >= 0.65) & (final_matched_covers_goodreads['similarity_score'] <= 0.8)]
# Define corner cases for non-matches: assuming non-matches with sim score close to the match threshold
corner_cases_non_matches_df = final_matched_covers_goodreads[(final_matched_covers_goodreads['is_match'] == False) & (final_matched_covers_goodreads['similarity_score'] < 0.9) & (final_matched_covers_goodreads['similarity_score'] > 0.7)]


# Randomly select 75 corner case entries
gs_matched_corner = corner_cases_match_df.sample(n=75, random_state=0)
# # Remove corner cases from the matched_df to avoid overlap
# matched_df_remaining = matched_df.drop(gs_matched_corner.index)
# Randomly select 100 entries from the remaining matched DataFrame
gs_matched = matched_df.sample(n=100, random_state=0)

# Randomly select 75 corner case non-matched entries
gs_non_matched_corner = corner_cases_non_matches_df.sample(n=75, random_state=0)
# # Remove corner cases from the non_matched_df to avoid overlap
# non_matched_df_remaining = non_matched_df.drop(gs_non_matched_corner.index)
# Randomly select 250 entries from the remaining non-matched DataFrame
gs_non_matched = non_matched_df.sample(n=250, random_state=0)

# Concatenate all four groups into one DataFrame
gs_combined = pd.concat([gs_matched, gs_non_matched, gs_matched_corner, gs_non_matched_corner])

# Reset the index of the combined DataFrame
gs_combined = gs_combined.reset_index(drop=True)

gs_combined.to_csv("gs_covers_goodreads.csv")

# Verify the combined DataFrame
print(gs_combined.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   covers_id         500 non-null    object 
 1   goodreads_id      500 non-null    object 
 2   covers_title      500 non-null    object 
 3   goodreads_title   500 non-null    object 
 4   similarity_score  500 non-null    float64
 5   covers_isbn       500 non-null    object 
 6   goodreads_isbn    500 non-null    object 
 7   is_match          500 non-null    bool   
dtypes: bool(1), float64(1), object(6)
memory usage: 28.0+ KB
None


In [47]:
# 20 % Matches randomly chosen (100)
# 50% Non-Matches randomly chosen (250)
# 15% Corner Case Matches (75)
# 15% Corner Case Non-Matches (75)

# Filter the DataFrame to include only rows where is_match is True
matched_df = final_matched_goodreads_amazon[(final_matched_goodreads_amazon['is_match'] == True) & (final_matched_goodreads_amazon['similarity_score'] == 1)]
# Filter the DataFrame to include only rows where is_match is False
non_matched_df = final_matched_goodreads_amazon[(final_matched_goodreads_amazon['is_match'] == False) & (final_matched_goodreads_amazon['similarity_score'] <= 0.3)]
# Define corner cases: matches with a sim score between 0.5 and 0.75
corner_cases_match_df = final_matched_goodreads_amazon[(final_matched_goodreads_amazon['is_match'] == True) & (final_matched_goodreads_amazon['similarity_score'] >= 0.65) & (final_matched_goodreads_amazon['similarity_score'] <= 0.8)]
# Define corner cases for non-matches: assuming non-matches with sim score close to the match threshold
corner_cases_non_matches_df = final_matched_goodreads_amazon[(final_matched_goodreads_amazon['is_match'] == False) & (final_matched_goodreads_amazon['similarity_score'] < 0.95) & (final_matched_goodreads_amazon['similarity_score'] > 0.7)]


# Randomly select 75 corner case entries
gs_matched_corner = corner_cases_match_df.sample(n=75, random_state=0)
# # Remove corner cases from the matched_df to avoid overlap
# matched_df_remaining = matched_df.drop(gs_matched_corner.index)
# Randomly select 100 entries from the remaining matched DataFrame
gs_matched = matched_df.sample(n=100, random_state=0)

# Randomly select 75 corner case non-matched entries
gs_non_matched_corner = corner_cases_non_matches_df.sample(n=75, random_state=0)
# # Remove corner cases from the non_matched_df to avoid overlap
# non_matched_df_remaining = non_matched_df.drop(gs_non_matched_corner.index)
# Randomly select 250 entries from the remaining non-matched DataFrame
gs_non_matched = non_matched_df.sample(n=250, random_state=0)

# Concatenate all four groups into one DataFrame
gs_combined = pd.concat([gs_matched, gs_non_matched, gs_matched_corner, gs_non_matched_corner])

# Reset the index of the combined DataFrame
gs_combined = gs_combined.reset_index(drop=True)

gs_combined.to_csv("gs_goodreads_amazon.csv")

# Verify the combined DataFrame
print(gs_combined.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   amazon_id         500 non-null    object 
 1   goodreads_id      500 non-null    object 
 2   amazon_title      500 non-null    object 
 3   goodreads_title   500 non-null    object 
 4   similarity_score  500 non-null    float64
 5   amazon_isbn       500 non-null    int64  
 6   goodreads_isbn    500 non-null    int64  
 7   is_match          500 non-null    bool   
dtypes: bool(1), float64(1), int64(2), object(4)
memory usage: 28.0+ KB
None


In [48]:
# 20 % Matches randomly chosen (100)
# 50% Non-Matches randomly chosen (250)
# 15% Corner Case Matches (75)
# 15% Corner Case Non-Matches (75)

# Filter the DataFrame to include only rows where is_match is True
matched_df = final_matched_covers_amazon[(final_matched_covers_amazon['is_match'] == True) & (final_matched_covers_amazon['similarity_score'] == 1)]
# Filter the DataFrame to include only rows where is_match is False
non_matched_df = final_matched_covers_amazon[(final_matched_covers_amazon['is_match'] == False) & (final_matched_covers_amazon['similarity_score'] <= 0.3)]
# Define corner cases: matches with a sim score between 0.5 and 0.75
corner_cases_match_df = final_matched_covers_amazon[(final_matched_covers_amazon['is_match'] == True) & (final_matched_covers_amazon['similarity_score'] >= 0.65) & (final_matched_covers_amazon['similarity_score'] <= 0.8)]
# Define corner cases for non-matches: assuming non-matches with sim score close to the match threshold
corner_cases_non_matches_df = final_matched_covers_amazon[(final_matched_covers_amazon['is_match'] == False) & (final_matched_covers_amazon['similarity_score'] < 0.95) & (final_matched_covers_amazon['similarity_score'] > 0.7)]


# Randomly select 75 corner case entries
gs_matched_corner = corner_cases_match_df.sample(n=75, random_state=0)
# # Remove corner cases from the matched_df to avoid overlap
# matched_df_remaining = matched_df.drop(gs_matched_corner.index)
# Randomly select 100 entries from the remaining matched DataFrame
gs_matched = matched_df.sample(n=100, random_state=0)

# Randomly select 75 corner case non-matched entries
gs_non_matched_corner = corner_cases_non_matches_df.sample(n=75, random_state=0)
# # Remove corner cases from the non_matched_df to avoid overlap
# non_matched_df_remaining = non_matched_df.drop(gs_non_matched_corner.index)
# Randomly select 250 entries from the remaining non-matched DataFrame
gs_non_matched = non_matched_df.sample(n=250, random_state=0)

# Concatenate all four groups into one DataFrame
gs_combined = pd.concat([gs_matched, gs_non_matched, gs_matched_corner, gs_non_matched_corner])

# Reset the index of the combined DataFrame
gs_combined = gs_combined.reset_index(drop=True)

gs_combined.to_csv("gs_covers_amazon.csv")

# Verify the combined DataFrame
print(gs_combined.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   covers_id         500 non-null    object 
 1   amazon_id         500 non-null    object 
 2   covers_title      500 non-null    object 
 3   amazon_title      500 non-null    object 
 4   similarity_score  500 non-null    float64
 5   covers_isbn       500 non-null    int64  
 6   amazon_isbn       500 non-null    int64  
 7   is_match          500 non-null    bool   
dtypes: bool(1), float64(1), int64(2), object(4)
memory usage: 28.0+ KB
None
