In [2]:
# 이름이 약간 다르더라도(대소문자, 스페이스, 오타 등) 가장 유사한 향수 이름을 매칭하기 위한 라이브러리
# !pip install thefuzz[speedup] pandas
!pip install "thefuzz[speedup]" # mac zsh version

Collecting thefuzz[speedup]
  Downloading thefuzz-0.22.1-py3-none-any.whl.metadata (3.9 kB)
Collecting rapidfuzz<4.0.0,>=3.0.0 (from thefuzz[speedup])
  Downloading rapidfuzz-3.13.0-cp39-cp39-macosx_11_0_arm64.whl.metadata (12 kB)
Downloading thefuzz-0.22.1-py3-none-any.whl (8.2 kB)
Downloading rapidfuzz-3.13.0-cp39-cp39-macosx_11_0_arm64.whl (1.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: rapidfuzz, thefuzz
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2/2[0m [thefuzz]
[1A[2KSuccessfully installed rapidfuzz-3.13.0 thefuzz-0.22.1

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.1.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [6]:
# csv 파일 인코딩 정보 확인 위함
!pip install chardet


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.1.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [7]:
import chardet

with open("./dataset/final_perfume_data.csv", "rb") as f:
    rawdata = f.read(10000)  # 파일 앞부분 10KB만 검사
    result = chardet.detect(rawdata)
    print(result)

{'encoding': 'ascii', 'confidence': 1.0, 'language': ''}


In [11]:
import pandas as pd

for enc in ['utf-8-sig', 'utf-8', 'cp949', 'euc-kr', 'latin1']:
    try:
        # df1 = pd.read_csv("./dataset/final_perfume_data.csv", encoding=enc)
        df2 = pd.read_csv("./dataset/parfumo_datos.csv", encoding=enc)
        print(f"Success with encoding: {enc}")
        break
    except Exception as e:
        print(f"Failed with encoding: {enc}, error: {e}")
        print("\n")
        
        
# df1 : Success with encoding: latin1
# df2 : Success with encoding: utf-8-sig
        

Success with encoding: utf-8-sig


In [13]:
import pandas as pd
import re
from thefuzz import process

# 1. CSV 불러오기 (+ 인코딩)
# description , 이미지 있는 데이터
# Perfume Recommendation Dataset => https://www.kaggle.com/datasets/nandini1999/perfume-recommendation-dataset?utm_source=chatgpt.com
df1 = pd.read_csv("./dataset/final_perfume_data.csv", encoding="latin1")   # Perfume Recommendation Dataset
# 농도 메인어코드 탑/미들/베이스노트 
# Parfumo Fragrance Dataset => https://www.kaggle.com/datasets/olgagmiufana1/parfumo-fragrance-dataset
df2 = pd.read_csv("./dataset/parfumo_datos.csv", encoding="utf-8-sig")   # Parfumo Fragrance Dataset
# 더 필요한 칼럼 : 가격 용량 부향률 성별

# 2. 번호/특수문자 제거 함수
def clean_name(name):
    if pd.isna(name):
        return ""
    # "#숫자 " 패턴 제거 + 소문자 변환 + 앞뒤 공백 제거
    name = re.sub(r"^#\d+\s*", "", str(name))
    name = name.lower().strip()
    return name

# 3. 전처리 컬럼 생성
df1["Name_clean"] = df1["Name"].apply(clean_name)
df2["Name_clean"] = df2["Name"].apply(clean_name)

# 4. fuzzy matching 함수 수정
def match_name(name, choices, threshold=85):
    result = process.extractOne(name, choices, score_cutoff=threshold)
    if result is None:
        return None
    match_name, score = result[0], result[1]
    return match_name

# 5. 매칭 수행
matches = []
for name in df1["Name_clean"]:
    matched_name = match_name(name, df2["Name_clean"], threshold=85)
    matches.append(matched_name)

df1["Matched_Name_clean"] = matches

# 6. 병합
df_merged = pd.merge(df1, df2, left_on="Matched_Name_clean", right_on="Name_clean", how="left", suffixes=("_kaggle", "_parfumo"))

# 7. 결과 저장
df_merged.to_csv("dataset/perfume_merged_fuzzy.csv", index=False)

print(f"병합된 데이터셋 크기: {df_merged.shape}") # 3.3MB
print(df_merged[["Name_kaggle", "Matched_Name_clean", "Name_parfumo"]].head(10))


병합된 데이터셋 크기: (2218, 20)
                            Name_kaggle  \
0                  Tihota Eau de Parfum   
1                           Sola Parfum   
2                        Kagiroi Parfum   
3          Velvet Fantasy Eau de Parfum   
4   A Blvd. Called Sunset Eau de Parfum   
5  Freckled and Beautiful Eau de Parfum   
6           Exit the King Eau de Parfum   
7                          Eshu Extrait   
8                    Saringkarn Extrait   
9                        Arsalan Parfum   

                                  Matched_Name_clean  \
0                   méditation de la lune le ré noir   
1                         parfum exaltant le ré noir   
2                         parfum exaltant le ré noir   
3   #flower power ramón monegal 2010 eau de toilette   
4  on a clear day you can see forever cb i hate p...   
5  *cough cough* i'm sick. sixteen92 2024 extrait...   
6  - havana, glass of vanilla cocktail on the bea...   
7  *cough cough* i'm sick. sixteen92 2024 extrait...  