In [92]:
import pandas as pd
# from thefuzz import fuzz, process
from rapidfuzz import process, fuzz
import matplotlib.pyplot as plt
import seaborn as sns

# AIM: COMPARE and INTEGRATE THE PRODUCTIONS OF 2 DATASET
## Get the overview of the products / market

In [93]:

# Reading the first CSV file and printing its head
df1 = pd.read_csv('./marketing_sample_for_amazon_com-ecommerce__20200101_20200131__10k_data.csv')

# Reading the second CSV file and printing its head
df2 = pd.read_csv('./marketing_sample_for_walmart_com-product_details__20200101_20200331__30k_data.csv')


In [94]:
# Display options for dataframe
pd.set_option('display.max_rows', 10)  # maximum number of rows to display
pd.set_option('display.max_columns', 10)  # maximum number of columns to display
pd.set_option('display.width', 200)  # width of the display in characters
pd.set_option('display.colheader_justify', 'center')  # center the column headers

print(df1.columns)
print(df2.columns)


Index(['Uniq Id', 'Product Name', 'Brand Name', 'Asin', 'Category', 'Upc Ean Code', 'List Price', 'Selling Price', 'Quantity', 'Model Number', 'About Product', 'Product Specification',
       'Technical Details', 'Shipping Weight', 'Product Dimensions', 'Image', 'Variants', 'Sku', 'Product Url', 'Stock', 'Product Details', 'Dimensions', 'Color', 'Ingredients', 'Direction To Use',
       'Is Amazon Seller', 'Size Quantity Variant', 'Product Description'],
      dtype='object')
Index(['Uniq Id', 'Crawl Timestamp', 'Product Url', 'Product Name', 'Description', 'List Price', 'Sale Price', 'Brand', 'Item Number', 'Gtin', 'Package Size', 'Category', 'Postal Code', 'Available'], dtype='object')


In [95]:
print(df2[['Product Name', 'Category']].head())
print(df2[['Brand', 'List Price', 'Sale Price', 'Gtin', 'Description']].head())

                     Product Name                                         Category                     
0  Allegiance Economy Dual-scale Digital Thermometer  Health | Medicine Cabinet | Thermometers | Dig...
1  Kenneth Cole Reaction Eau De Parfum Spray For ...  Premium Beauty | Premium Fragrance | Premium P...
2  Kid Tough Fitness Inflatable Free-Standing Pun...  Sports & Outdoors | Outdoor Sports | Hunting |...
3                                    THE FIRST YEARS                      Baby | Diapering | Baby Wipes
4  4 Pack - MD USA Seamless Toe-Wave-In Mesh Diab...            Health | Diabetes Care | Diabetic Socks
       Brand        List Price  Sale Price      Gtin                        Description                    
0  Cardinal Health     11.11       11.11    707389636164   We aim to show you accurate product informati...
1     Kenneth Cole     23.99       23.99    191565696101   We aim to show you accurate product informati...
2         BONK FIT     30.76       30.76    85552300

## Kiểm tra những features bị thiếu dữ liệu

In [None]:
with pd.option_context('display.max_rows', None, 'display.max_columns', 10):  # more options can be specified also
    print(df1.count())
    print()
    print(df2.count())

Uniq Id                  10002
Product Name             10002
Brand Name                   0
Asin                         0
Category                  9172
Upc Ean Code                34
List Price                   0
Selling Price             9895
Quantity                     0
Model Number              8230
About Product             9729
Product Specification     8370
Technical Details         9212
Shipping Weight           8864
Product Dimensions         479
Image                    10002
Variants                  2478
Sku                          0
Product Url              10002
Stock                        0
Product Details              0
Dimensions                   0
Color                        0
Ingredients                  0
Direction To Use             0
Is Amazon Seller         10002
Size Quantity Variant        0
Product Description          0
dtype: int64

Uniq Id            30000
Crawl Timestamp    30000
Product Url        30000
Product Name       30000
Description       

### Lọc bỏ những feature bị mất dữ liệu hơn 20%

In [97]:
def filter_features(df, threshold):
    threshold = threshold * len(df)

# Filter features with more than 80% non-NaN values
    selected_features = df.count()[df.count() > threshold].index.tolist()

    # Print the feature names
    print("Features with more than 80% values:")
    print("\n".join(selected_features))
    return selected_features

In [98]:
filter_features(df1, 0.8)
filter_features(df2, 0.8)

Features with more than 80% values:
Uniq Id
Product Name
Category
Selling Price
Model Number
About Product
Product Specification
Technical Details
Shipping Weight
Image
Product Url
Is Amazon Seller
Features with more than 80% values:
Uniq Id
Crawl Timestamp
Product Url
Product Name
Description
List Price
Sale Price
Brand
Gtin
Category
Available


['Uniq Id',
 'Crawl Timestamp',
 'Product Url',
 'Product Name',
 'Description',
 'List Price',
 'Sale Price',
 'Brand',
 'Gtin',
 'Category',
 'Available']

## Đồng bộ dữ liệu bằng cách đổi tên
Từ danh sách features dùng được ở trên, ta chọn những tên có nghĩa tương tự để đồng bộ.

In [100]:
column_mapping = {
    # Mapping for `df1`
    'Uniq Id': 'Unique ID',
    'Product Name': 'Product Name',
    'Category': 'Category',
    'Selling Price': 'Sale Price',
    'Model Number': 'Model Number',
    'About Product': 'Description',
    'Product Specification': 'Specifications',
    'Technical Details': 'Technical Details',
    'Shipping Weight': 'Shipping Weight',
    'Image': 'Image',
    'Product Url': 'Product URL',
    'Is Amazon Seller': 'Is Amazon Seller',
    
    # Mapping for `df2`
    'Crawl Timestamp': 'Crawl Timestamp',
    'Description': 'Description',
    'List Price': 'List Price',
    'Sale Price': 'Sale Price',
    'Brand': 'Brand Name',
    'Gtin': 'UPC/EAN Code',
    'Available': 'Availability'
}


In [101]:
# Rename columns for df1
df1_cleaned = df1.rename(columns=column_mapping)

# Rename columns for df2
df2_cleaned = df2.rename(columns=column_mapping)


In [102]:
filter_features(df1_cleaned, 0.8)
filter_features(df2_cleaned, 0.8)

Features with more than 80% values:
Unique ID
Product Name
Category
Sale Price
Model Number
Description
Specifications
Technical Details
Shipping Weight
Image
Product URL
Is Amazon Seller
Features with more than 80% values:
Unique ID
Crawl Timestamp
Product URL
Product Name
Description
List Price
Sale Price
Brand Name
UPC/EAN Code
Category
Availability


['Unique ID',
 'Crawl Timestamp',
 'Product URL',
 'Product Name',
 'Description',
 'List Price',
 'Sale Price',
 'Brand Name',
 'UPC/EAN Code',
 'Category',
 'Availability']

## Tạo tập features chung để thực hiện ghép

In [103]:

common_features = set(filter_features(df1_cleaned, 0.8)) & set(filter_features(df2_cleaned, 0.8))
print('Common features:', common_features)

Features with more than 80% values:
Unique ID
Product Name
Category
Sale Price
Model Number
Description
Specifications
Technical Details
Shipping Weight
Image
Product URL
Is Amazon Seller
Features with more than 80% values:
Unique ID
Crawl Timestamp
Product URL
Product Name
Description
List Price
Sale Price
Brand Name
UPC/EAN Code
Category
Availability
Common features: {'Product Name', 'Unique ID', 'Product URL', 'Sale Price', 'Category', 'Description'}


### Chọn một số features còn lại có ít cho việc ghép dữ liệu
Feature 'Category' được dùng để lọc trước nên không có trong usable_features

In [104]:
usable_features = common_features - {'Unique ID', 'Product URL', 'Sale Price', 'Category'}

usable_features = sorted(usable_features, reverse=True)


In [105]:
usable_features

['Product Name', 'Description']

## Chuẩn hóa dữ liệu đã lọc

### Feature 'Sale Price' 
Giá tiền có ký tự đặc biệt và một số giá trị dạng khoảng (ví dụ 74.99 - 249.99)

In [106]:
import re

def clean_price_column(price_column):
    def clean_price_range(price_str):
        # Regular expression to match price range (e.g., '74.99 - 249.99')
        match = re.match(r"(\d+(\.\d+)?)\s*-\s*(\d+(\.\d+)?)", price_str.strip())
        
        if match:
            # Extracting the lower and upper price bounds
            lower_price = float(match.group(1))
            upper_price = float(match.group(3))
            return lower_price, upper_price
        else:
            return None, None  # Return None if no valid range is found

    def clean_price(value):
        # Check for price range
        if isinstance(value, str) and ' - ' in value:
            lower_price, upper_price = clean_price_range(value)
            return lower_price, upper_price if lower_price is not None and upper_price is not None else None
        else:
            # Remove common currency symbols and commas
            value = str(value).replace('$', '').replace('€', '').replace('₹', '').replace('£', '').replace(',', '')
            try:
                return float(value)  # Convert to float directly
            except ValueError:
                return None  # In case the value cannot be converted to a float
    
    return price_column.apply(clean_price)


In [107]:
df1_cleaned['Sale Price'] = clean_price_column(df1_cleaned['Sale Price'])
df2_cleaned['Sale Price'] = clean_price_column(df2_cleaned['Sale Price'])

### Ghép dữ liệu ở feature 'Product Name' và 'Description' để chuẩn bị cho việc fuzzy matching

In [108]:
def preprocess_and_concat(df, common_features):
    df[common_features] = df[common_features].fillna('')
    df['concat'] = df[common_features].apply(lambda row: ' '.join(row.values.astype(str)), axis=1)

In [109]:
preprocess_and_concat(df1_cleaned, list(usable_features))
preprocess_and_concat(df2_cleaned, list(usable_features))
df1_cleaned.columns


Index(['Unique ID', 'Product Name', 'Brand Name', 'Asin', 'Category', 'Upc Ean Code', 'List Price', 'Sale Price', 'Quantity', 'Model Number', 'Description', 'Specifications', 'Technical Details',
       'Shipping Weight', 'Product Dimensions', 'Image', 'Variants', 'Sku', 'Product URL', 'Stock', 'Product Details', 'Dimensions', 'Color', 'Ingredients', 'Direction To Use', 'Is Amazon Seller',
       'Size Quantity Variant', 'Product Description', 'concat'],
      dtype='object')

### Tách feature 'Category' thành những category chuẩn
Category là feature gồm nhiều category khác nhau được ngăn bởi dấu '|' (ví dụ: 'Camping Gear | Sports and Outdoors')

In [113]:
# Tokenize category to a dict with index
def tokenize_category(df):
    return df['Category'].apply(lambda x: x.lower().split(' | '))

In [123]:

df1_cleaned['Category'].fillna('', inplace=True)
df2_cleaned['Category'].fillna('', inplace=True)


In [124]:
token2 =tokenize_category(df2_cleaned)
dict2 = dict()
for idx, token in enumerate(token2):
    for cate in token:
        if cate not in dict2:
            dict2[cate] = set()  # Initialize the list if the category is not in the dictionary
        dict2[cate].add(idx)

In [125]:
token1 =tokenize_category(df1_cleaned)
dict1 = dict()
for idx, token in enumerate(token1):
    for cate in token:
        if cate not in dict1:
            dict1[cate] = set()  # Initialize the list if the category is not in the dictionary
        dict1[cate].add(idx)

In [126]:
print(len(dict1))
print(len(dict2))

1163
4678


#### Tính toán trước điểm fuzzy của từng category để tăng tốc quá trình fuzzy matching

In [127]:

fuzzy_matching_list = {}
# Perform fuzzy matching with 2 token dict 1, 2
for token1 in dict1.keys():
    for token2 in dict2.keys():
        score = fuzz.token_set_ratio(token1, token2)
        
        fuzzy_matching_list[(token1, token2)] = score
            


In [128]:
fuzzy_matching_list

{('sports & outdoors', 'health'): 8.695652173913047,
 ('sports & outdoors', 'medicine cabinet'): 12.121212121212125,
 ('sports & outdoors', 'thermometers'): 34.48275862068965,
 ('sports & outdoors', 'digital thermometers'): 32.432432432432435,
 ('sports & outdoors', 'premium beauty'): 32.25806451612904,
 ('sports & outdoors', 'premium fragrance'): 23.529411764705884,
 ('sports & outdoors', 'premium perfume'): 25.0,
 ('sports & outdoors', 'sports & outdoors'): 100.0,
 ('sports & outdoors', 'outdoor sports'): 90.3225806451613,
 ('sports & outdoors', 'hunting'): 16.66666666666667,
 ('sports & outdoors', 'hunting clothing'): 24.24242424242425,
 ('sports & outdoors', 'all hunting clothing'): 27.02702702702703,
 ('sports & outdoors', 'baby'): 0.0,
 ('sports & outdoors', 'diapering'): 23.07692307692308,
 ('sports & outdoors', 'baby wipes'): 22.22222222222223,
 ('sports & outdoors', 'diabetes care'): 26.66666666666667,
 ('sports & outdoors', 'diabetic socks'): 32.25806451612904,
 ('sports & ou

## Thực hiện fuzzy matching

In [133]:
matching_product_list = []
# Matching Category to the best category in df2
for idx, row in df1_cleaned.iterrows():
    # Tokenize category for df2
    token1 = row['Category'].lower().split(' | ')
    if token1 == ['']:
        continue
    print(idx, token1)
    # Compare with dict1 using fuzzy matching
    common_list = None
    for _, cate in enumerate(token1):
        for dict_cate in dict2.keys():
            score = fuzzy_matching_list[(cate, dict_cate)]
            if score > 80:
                # print(f"{cate} -> {dict_cate} with score {score}")
                if common_list is None:
                    common_list = set(dict2[dict_cate])
                else:
                    common_list = common_list | set(dict2[dict_cate])
                # print("common_list:", common_list)
    if common_list is None:
        continue
    df2_strings = df2_cleaned.loc[list(common_list), 'concat'].tolist()
    top_match = process.extract(row['concat'], df2_strings, scorer=fuzz.token_set_ratio, score_cutoff=60)
    if top_match:
        for index, x in enumerate(top_match):
            top_match[index] = (x[0], x[1], list(common_list)[x[2]])
            # print(f"{row['Product Name']} -> {df2_cleaned['Product Name'][match_index]} with score {score}")
        # print(f"{row['Product Name']} -> {df1_cleaned['Product Name'][match_index]} with score {score}")
        matching_product_list.append((idx, top_match))
        print("===============================================")
        print(f"Matched {len(matching_product_list)} products")
        print("===============================================")

    

0 ['sports & outdoors', 'outdoor recreation', 'skates, skateboards & scooters', 'skateboarding', 'standard skateboards & longboards', 'longboards']
1 ['toys & games', 'learning & education', 'science kits & toys']
2 ['toys & games', 'arts & crafts', 'craft kits']
3 ['toys & games', 'hobbies', 'models & model kits', 'model kits', 'airplane & jet kits']
4 ['toys & games', 'puzzles', 'jigsaw puzzles']
6 ['clothing, shoes & jewelry', 'costumes & accessories', 'kids & baby', 'girls', 'costumes']
7 ['toys & games', 'arts & crafts', 'drawing & painting supplies', 'crayons']
8 ['home & kitchen', 'home décor', 'window treatments', 'window stickers & films', 'window films']
10 ['toys & games', 'baby & toddler toys']
11 ['toys & games', 'collectible toys', 'statues, bobbleheads & busts', 'statues']
12 ['baby products', 'nursery', 'décor', 'window treatments', 'valances']
13 ['toys & games', 'building toys', 'building sets']
14 ['toys & games', 'arts & crafts', 'craft kits']
15 ['toys & games', 'b

In [77]:
matching_product_list

[(0,
  [('Blank Pro Complete Skateboard Natural 7.75 Black Wheels Black Trucks We aim to show you accurate product information. Manufacturers, suppliers and others provide what you see here, and we have not verified it. See our disclaimer |This professional skateboard includes Flavor trucks which will push you towards only the tastiest tricks. Coupled with 52mm wheels and ABEC 7 bearings, this complete skateboard is youre first step to becoming a modern day skating legend. This board comes ready to ride. So just hop on and go. Great Shape and Awesome POP! - 7ply 100% Canadian Maple. Moose Decks are made by one of the premier manufacturers in the world. Mellow kicks in the nose and tail ensure better foot placement. The kicks also have major concave in the pocket area, which cups the back of the foot and adds stiffness to the flat area behind the trucks. There is a significant lateral concave and a very slight rocker for long lasting stiffness and pop. These decks are great for park and

In [134]:
len(matching_product_list)

211

### So sánh điểm và kết quả

In [136]:
for pair in matching_product_list:
    # print(pair)
    df1_idx = pair[0]
    df2_topmatch = pair[1]
    for _, score, df2_idx in df2_topmatch:
        if score < 60:
            continue
        print(score)
        print(df1_cleaned["Sale Price"][df1_idx], df1_cleaned["Product Name"][df1_idx])
        print(df2_cleaned["Sale Price"][df2_idx], df2_cleaned["Product Name"][df2_idx])



60.97560975609756
(None, None) Huffy Kids Bikes 16 & 20 inch with Streamers and BMX Pegs
129.99 RoyalBaby Buttons Green 12 inch Kids Bicycle
60.97560975609756
(None, None) Huffy Kids Bikes 16 & 20 inch with Streamers and BMX Pegs
129.96 RoyalBaby Buttons Matte Blue 16 inch Kids Bicycle
65.1063829787234
15.99 Swing-N-Slide Blue Child Seat
39.41 Swing-N-Slide Nest Swing, 40 in. Diameter, Green with Black Nylon Ropes
64.52991452991452
15.99 Swing-N-Slide Blue Child Seat
273.99 Swing-N-Slide 5 Foot Super Speed Wave Slide with Lifetime Warranty, Green
64.52991452991452
15.99 Swing-N-Slide Blue Child Seat
82.99 Swing-N-Slide Metal 360° See Saw Spinner - Red, Yellow, Blue
64.23982869379014
15.99 Swing-N-Slide Blue Child Seat
23.24 Swing-N-Slide Extra-Duty Green Swing Seat with Coated Chains
63.94849785407725
15.99 Swing-N-Slide Blue Child Seat
152.26 Swing-N-Slide Wrangler DIY Play Set Hardware Kit (Wood and Slide not included)
86.92660550458716
13.99 Skip Hop Bandana Buddies Baby Activity an