<a href="https://colab.research.google.com/github/SJinji/match-tables-with-fuzzy-matching/blob/main/Reveal_Tech_Case_Jinji_final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [27]:
! pip install fuzzymatcher



In [28]:
! pip install recordlinkage



In [29]:
import pandas as pd
import fuzzymatcher

In [30]:
import warnings
# Blanket filter: from this point on every warning category is ignored, which
# also hides pandas deprecation / chained-assignment notices raised by the
# cells below.  NOTE(review): consider narrowing this to specific categories.
warnings.filterwarnings("ignore")

In [31]:
# Function to load and preprocess the datasets
def load_and_preprocess_datasets(file_a, file_b):
    """Read both CSV exports, clean them and tag their columns per source.

    Each file is read without a header row, the column labelled ``3`` is
    discarded, the eight remaining columns are named, the frame is cleaned by
    ``preprocess_columns`` and finally every column name receives an ``_a``
    or ``_b`` suffix.

    Parameters
    ----------
    file_a, file_b
        Sources handed straight to ``pd.read_csv`` for dataset A and B.

    Returns
    -------
    tuple
        ``(dataset_a, dataset_b)`` with columns ``id_a`` ... ``country_a`` and
        ``id_b`` ... ``country_b`` respectively.
    """
    base_names = ['id', 'company_name', 'website', 'phone_number', 'address', 'postcode', 'region', 'country']

    # Read both files up front, then run the identical clean-up on each one.
    raw_frames = [pd.read_csv(source, header=None) for source in (file_a, file_b)]

    tagged_frames = []
    for raw, suffix in zip(raw_frames, ('a', 'b')):
        frame = raw.drop(columns=3)
        frame.columns = base_names
        # preprocess_columns works on the frame it is given, so the return
        # value is not needed here.
        preprocess_columns(frame)
        frame.columns = [f'{name}_{suffix}' for name in base_names]
        tagged_frames.append(frame)

    return tagged_frames[0], tagged_frames[1]

In [32]:
# Function to preprocess the columns
def preprocess_columns(df):
    """Normalise the company columns of ``df`` in place and return it.

    Expects the columns ``company_name``, ``website``, ``phone_number``,
    ``address``, ``postcode``, ``region`` and ``country``.  The text columns
    are accessed through ``.str`` and must therefore hold strings (missing
    values are passed through untouched).

    Every ``str.replace`` call states ``regex=`` explicitly: since pandas 2.0
    the default is ``regex=False``, under which the anchored patterns below
    would be treated as literal text and silently strip nothing.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean; it is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, cleaned and with duplicate rows removed.
    """
    # Company names: lowercase, apostrophes become spaces.
    df['company_name'] = df['company_name'].str.lower().str.replace("'", ' ', regex=False)

    # Websites: drop a leading "http://" / "https://", then a leading "www."
    # (both case-insensitive), then any trailing slashes.
    df['website'] = df['website'].str.replace(r'^https?://', '', case=False, regex=True)
    df['website'] = df['website'].str.replace(r'^www\.', '', case=False, regex=True)
    df['website'] = df['website'].str.rstrip('/')

    # Phone numbers: remove dots, then a leading "33" / "+33" prefix or
    # leading zeros (optionally preceded by "+"), then all whitespace.
    df['phone_number'] = df['phone_number'].str.replace('.', '', regex=False)
    df['phone_number'] = (
        df['phone_number']
        .str.replace(r'^\+?33|^\+?0*', '', regex=True)
        .str.replace(r'\s', '', regex=True)
    )

    # Addresses: remove commas and lowercase.
    df['address'] = df['address'].str.replace(',', '', regex=False).str.lower()

    # Postcodes: strip a trailing ".0" from the text form, then parse as a
    # number; values that cannot be parsed become NaN.
    df['postcode'] = df['postcode'].astype(str).str.replace(r'\.0$', '', regex=True)
    df['postcode'] = pd.to_numeric(df['postcode'], errors='coerce')

    # Region and country names: lowercase.
    df['region'] = df['region'].str.lower()
    df['country'] = df['country'].str.lower()

    # Hyphens are deleted (not replaced by a space) and the result is trimmed.
    columns_to_remove_dash = ['company_name', 'address', 'region', 'country']
    for col in columns_to_remove_dash:
        df[col] = df[col].str.replace('-', '', regex=False).str.strip()

    # Rows that became identical after normalisation are collapsed.
    df.drop_duplicates(inplace=True)

    return df


In [33]:
# Function to find common IDs based on different attributes
def find_common_ids(dataset_a, dataset_b):
    """Collect the A/B row pairs that agree exactly on id, website or phone.

    Three inner joins are run against ``dataset_b``: on ``id_a == id_b``, on
    ``website_a == website_b`` and on ``phone_number_a == phone_number_b``.
    For the website and phone joins, rows of ``dataset_a`` whose key is
    missing are left out beforehand.

    Returns
    -------
    pandas.DataFrame
        The three join results stacked on top of each other with fully
        identical rows removed; the row index is not reset.
    """
    # (left frame, left key, right key) for each exact-match criterion.
    criteria = (
        (dataset_a, 'id_a', 'id_b'),
        (dataset_a.dropna(subset=['website_a']), 'website_a', 'website_b'),
        (dataset_a.dropna(subset=['phone_number_a']), 'phone_number_a', 'phone_number_b'),
    )

    exact_matches = [
        left.merge(dataset_b, left_on=key_a, right_on=key_b, how='inner')
        for left, key_a, key_b in criteria
    ]

    # A pair found through several criteria yields identical rows; keep one.
    return pd.concat(exact_matches).drop_duplicates()

In [34]:
# Function to exclude common_ids and get unmatched rows
def exclude_common_ids(dataset_a, dataset_b, common_ids):
    """Return the rows of A and B that were not paired by an exact match.

    A row of ``dataset_a`` is kept when its ``id_a`` does not occur in
    ``common_ids['id_a']``; likewise for ``dataset_b`` and ``id_b``.  Missing
    values in the kept rows are replaced by empty strings.

    The fill is done with a non-in-place ``fillna`` so that new frames are
    produced instead of mutating the result of a boolean filter, which avoids
    chained-assignment warnings and the in-place dtype-upcast path when a
    string is written into a numeric column.

    Parameters
    ----------
    dataset_a, dataset_b : pandas.DataFrame
        Frames with an ``id_a`` / ``id_b`` column; neither is modified.
    common_ids : pandas.DataFrame
        Frame with ``id_a`` and ``id_b`` columns listing already matched ids.

    Returns
    -------
    tuple
        ``(dataset_a_only, dataset_b_only)``.
    """
    a_already_matched = dataset_a['id_a'].isin(common_ids['id_a'])
    b_already_matched = dataset_b['id_b'].isin(common_ids['id_b'])

    dataset_a_only = dataset_a[~a_already_matched].fillna('')
    dataset_b_only = dataset_b[~b_already_matched].fillna('')

    return dataset_a_only, dataset_b_only

In [35]:
# Function to perform fuzzy matching
def perform_fuzzy_matching(dataset_a_only, dataset_b_only):
    """Fuzzy-join the still unmatched rows of A against those of B.

    ``fuzzymatcher.fuzzy_left_join`` is called with the seven non-id
    attribute columns of each side as matching fields and ``id_a`` / ``id_b``
    as the record identifiers.  Rows whose ``best_match_score`` is NaN are
    then dropped and ``id_b`` is cast to ``int``.

    Parameters
    ----------
    dataset_a_only, dataset_b_only : pandas.DataFrame
        Frames carrying the ``*_a`` / ``*_b`` columns listed below.

    Returns
    -------
    pandas.DataFrame
        The join result restricted to rows that received a score.
    """
    left_on = ["company_name_a", 'website_a', 'phone_number_a', "address_a", 'postcode_a', 'region_a', 'country_a']
    right_on = ["company_name_b", 'website_b', 'phone_number_b', "address_b", 'postcode_b', 'region_b', 'country_b']

    # Original author's note: running time about 1 min on their data.
    matched_results = fuzzymatcher.fuzzy_left_join(dataset_a_only,
                                                   dataset_b_only,
                                                   left_on,
                                                   right_on,
                                                   left_id_col='id_a',
                                                   right_id_col='id_b')

    # Keep only scored rows.  The explicit copy makes the column assignment
    # below operate on an independent frame rather than on a filtered slice.
    has_score = matched_results['best_match_score'].notna()
    matched_results = matched_results[has_score].copy()

    # presumably id_b comes back as float because the left join leaves NaN in
    # unmatched rows — TODO confirm; the cast assumes the ids are integral.
    matched_results['id_b'] = matched_results['id_b'].astype(int)

    return matched_results

In [36]:
# Function to filter matched_results and concatenate with common_ids
def filter_and_concat(matched_results, common_ids, min_score=-0.01):
    """Combine sufficiently confident fuzzy matches with the exact matches.

    Parameters
    ----------
    matched_results : pandas.DataFrame
        Fuzzy-match output containing ``best_match_score`` and all ``*_a`` /
        ``*_b`` columns listed in ``cols``.
    common_ids : pandas.DataFrame
        Exact-match pairs; appended unchanged below the fuzzy matches.
    min_score : float, optional
        Lowest ``best_match_score`` (inclusive) a fuzzy match needs in order
        to be kept.  Defaults to ``-0.01``, the previously hard-coded cut-off.

    Returns
    -------
    pandas.DataFrame
        Kept fuzzy matches ordered by descending score, followed by the rows
        of ``common_ids``.  Columns absent from one of the two frames are
        filled with NaN by ``pd.concat``.
    """
    cols = [
        "best_match_score", 'id_a', "company_name_a", 'website_a', 'phone_number_a', "address_a", 'postcode_a', 'region_a',
        'country_a', 'id_b', "company_name_b", 'website_b', 'phone_number_b', "address_b", 'postcode_b', 'region_b', 'country_b'
    ]

    # Best matches first.
    ranked = matched_results[cols].sort_values(by=['best_match_score'], ascending=False)

    # Discard matches scoring below the threshold.
    confident = ranked[ranked['best_match_score'] >= min_score]

    return pd.concat([confident, common_ids], axis=0)


In [37]:
# Function to save the result as a CSV file
def save_to_csv(final_result, output_file):
    """Write the id, company-name and score columns of the result as CSV.

    Only ``id_a``, ``company_name_a``, ``id_b``, ``company_name_b`` and
    ``best_match_score`` are exported, in that order and without the row
    index.  ``output_file`` is passed on to ``DataFrame.to_csv``.
    """
    export_columns = ['id_a', 'company_name_a', 'id_b', 'company_name_b', 'best_match_score']
    final_result[export_columns].to_csv(output_file, index=False)

In [38]:
# Main function
def main(file_a='dataset_A.csv', file_b='dataset_B.csv', output_file='matched_results.csv'):
    """Run the whole matching pipeline from the two input CSVs to the output CSV.

    Steps: load and clean both datasets, pair rows that agree exactly on id,
    website or phone number, fuzzy-match the remaining rows, merge both sets
    of pairs and write them to ``output_file``.

    Parameters
    ----------
    file_a, file_b : str, optional
        Input CSV locations; default to the previously hard-coded
        ``'dataset_A.csv'`` and ``'dataset_B.csv'``.
    output_file : str, optional
        Destination CSV; defaults to ``'matched_results.csv'``.
    """
    # Load and preprocess datasets
    dataset_a, dataset_b = load_and_preprocess_datasets(file_a, file_b)

    # Exact matches on id, website or phone number
    common_ids = find_common_ids(dataset_a, dataset_b)

    # Rows not covered by an exact match
    dataset_a_only, dataset_b_only = exclude_common_ids(dataset_a, dataset_b, common_ids)

    # Fuzzy matching on the leftovers
    matched_results = perform_fuzzy_matching(dataset_a_only, dataset_b_only)

    # Keep confident fuzzy matches and append the exact matches
    final_result = filter_and_concat(matched_results, common_ids)

    # Save the result as a CSV file
    save_to_csv(final_result, output_file)

if __name__ == "__main__":
    main()
