## 데이터 전처리 과정

In [2]:
import pandas as pd

# Load the raw review table and report its row count.
df = pd.read_csv('./whisky_reviews.csv')
df_len = df.shape[0]

print("총 데이터 개수", df_len)

총 데이터 개수 985


In [3]:
# Group related columns so later cells can address them together.
# tags_col is a single column label; comment_cols / score_cols are parallel
# lists covering the nose, taste and finish fields.
tags_col = "Tags"
comment_cols = ["Nose Comment", "Taste Comment", "Finish Comment"]
score_cols = ["Nose Score", "Taste Score", "Finish Score"]

# Banner for the missing-value report printed by the loop that follows.
print("=== NA 결측치 체크 (NaN) ===")
def pct_nan(col, frame=None):
    """Return the percentage (0-100) of missing values in one column.

    Args:
        col: Label of the column to inspect.
        frame: DataFrame to inspect. When omitted, the notebook-level ``df``
            is used, so existing ``pct_nan(col)`` calls behave as before.

    Returns:
        The share of entries in ``frame[col]`` for which ``isna()`` is true,
        scaled to percent.
    """
    # Resolve the global lazily so the helper also works on any explicit frame.
    source = df if frame is None else frame
    return source[col].isna().mean() * 100

# Print the NaN share and NaN count for every grouped column present in df.
columns_to_check = [tags_col, *comment_cols, *score_cols]
for c in columns_to_check:
    if c not in df.columns:
        # Tolerate datasets that lack one of the expected columns.
        continue
    nan_count = df[c].isna().sum()
    print(f"{c:15s} NaN% = {pct_nan(c):6.2f}% | NaN count = {nan_count}")

=== NA 결측치 체크 (NaN) ===
Tags            NaN% =  62.44% | NaN count = 615
Nose Comment    NaN% =  27.41% | NaN count = 270
Taste Comment   NaN% =  27.31% | NaN count = 269
Finish Comment  NaN% =  27.41% | NaN count = 270
Nose Score      NaN% =  53.91% | NaN count = 531
Taste Score     NaN% =  53.91% | NaN count = 531
Finish Score    NaN% =  53.91% | NaN count = 531


In [4]:
# Per row, count how many comment columns hold a non-empty string
# (missing cells compare as False and therefore contribute 0).
comment_flags = [df[c].str.len().gt(0).astype(int) for c in comment_cols]
df["comment_count"] = sum(comment_flags)
keep = df["comment_count"].ge(3)

# Keep only rows where all three comments are present; the helper column is
# not carried into the filtered frame.
dropped_less_comments_df = df.loc[keep].copy().drop(columns='comment_count')
print("코멘트 3개 전부 포함 데이터 개수", len(dropped_less_comments_df))
dropped_less_comments_df.head()

코멘트 3개 전부 포함 데이터 개수 714


Unnamed: 0,Whisky Name,Link,Tags,Nose Score,Nose Comment,Taste Score,Taste Comment,Finish Score,Finish Comment
1,Springbank10-year-old,https://www.whiskybase.com/whiskies/whisky/416...,Green-House,94.0,The nose is full of aromatic power. We still h...,96.0,"On the palate it is surprisingly fresh and ""al...",95.0,"Long finish on liquorice, camphor, smoke, ash ..."
2,Ardbeg1967 Kb,https://www.whiskybase.com/whiskies/whisky/230...,,,"Peat, nuts, celery, well integrated, leather, ...",,"Leather, shoe polish, sherry, mints, cigar, sm...",,"Banana, smoke, citrus, black tea, long and nutty"
3,Springbank35-year-old,https://www.whiskybase.com/whiskies/whisky/110...,"Chocolate,Dried Fruit,Fresh Fruit,New Wood,Oil...",100.0,"Fresh, rich, intensive, complex, fruity, plums...",99.0,"Oh yes, smooth, warm, sweet oak wood, little v...",98.0,"Long, warm, very smooth oak lingering - nevere..."
4,Springbank1966,https://www.whiskybase.com/whiskies/whisky/143...,,,Like a chimera of Genting King and Honey Rum. ...,,"Oceanic fino Shirley. Rhubarb, ginkgo fruit, n...",,"Long-term, coffee, salt, fino, black pepper, c..."
5,Brora1972,https://www.whiskybase.com/whiskies/whisky/953...,"Leathery,Oily,Toasted",96.0,"Famyard smells galore, hay, cowbarns, animal s...",95.0,At first you get a lovely bag of fruit gums th...,95.0,"Good length, remaining composed, absolutely no..."


In [5]:
# Total character count over the comment columns of each row; falsy cells
# are skipped and add nothing to the sum.
dropped_less_comments_df["comment_len_sum"] = dropped_less_comments_df[comment_cols].apply(
    lambda r: sum(map(len, filter(None, r))),
    axis=1,
)

# Minimum combined comment length a row needs in order to be kept.
LEN_SUM = 200
keep = dropped_less_comments_df["comment_len_sum"].ge(LEN_SUM)

dropped_less_comments_and_length_df = (
    dropped_less_comments_df.loc[keep].copy().drop(columns='comment_len_sum')
)
print(f"코멘트 3개 전부 포함 및 전체 길이 합산 {LEN_SUM} 이상 데이터 개수", len(dropped_less_comments_and_length_df))
dropped_less_comments_and_length_df.head()

코멘트 3개 전부 포함 및 전체 길이 합산 200 이상 데이터 개수 676


Unnamed: 0,Whisky Name,Link,Tags,Nose Score,Nose Comment,Taste Score,Taste Comment,Finish Score,Finish Comment
1,Springbank10-year-old,https://www.whiskybase.com/whiskies/whisky/416...,Green-House,94.0,The nose is full of aromatic power. We still h...,96.0,"On the palate it is surprisingly fresh and ""al...",95.0,"Long finish on liquorice, camphor, smoke, ash ..."
2,Ardbeg1967 Kb,https://www.whiskybase.com/whiskies/whisky/230...,,,"Peat, nuts, celery, well integrated, leather, ...",,"Leather, shoe polish, sherry, mints, cigar, sm...",,"Banana, smoke, citrus, black tea, long and nutty"
3,Springbank35-year-old,https://www.whiskybase.com/whiskies/whisky/110...,"Chocolate,Dried Fruit,Fresh Fruit,New Wood,Oil...",100.0,"Fresh, rich, intensive, complex, fruity, plums...",99.0,"Oh yes, smooth, warm, sweet oak wood, little v...",98.0,"Long, warm, very smooth oak lingering - nevere..."
4,Springbank1966,https://www.whiskybase.com/whiskies/whisky/143...,,,Like a chimera of Genting King and Honey Rum. ...,,"Oceanic fino Shirley. Rhubarb, ginkgo fruit, n...",,"Long-term, coffee, salt, fino, black pepper, c..."
5,Brora1972,https://www.whiskybase.com/whiskies/whisky/953...,"Leathery,Oily,Toasted",96.0,"Famyard smells galore, hay, cowbarns, animal s...",95.0,At first you get a lovely bag of fruit gums th...,95.0,"Good length, remaining composed, absolutely no..."


In [7]:
import re

def clean_text(x: str) -> str:
    """Normalize one cell value into a single-line, ASCII-only string.

    Steps, in order:
      1. Missing values (``None`` or a float NaN) become ``""`` instead of
         the literal text ``"None"`` / ``"nan"``.
      2. Escaped ``\\n`` / ``\\t`` sequences (a backslash followed by a
         letter) are replaced with a space.
      3. Every non-ASCII character is removed.
      4. Each run of whitespace (real newlines, tabs, carriage returns and
         spaces) collapses to one space and the ends are trimmed.

    Args:
        x: Value to clean. Non-string values are converted with ``str()``.

    Returns:
        The cleaned text; ``""`` for missing or whitespace-only input.
    """
    # NaN is the only float that is not equal to itself.
    if x is None or (isinstance(x, float) and x != x):
        return ""
    s = str(x)

    # Escaped newline/tab sequences that survived as literal text.
    s = s.replace("\\n", " ").replace("\\t", " ")

    # Drop everything outside the ASCII range.
    s = re.sub(r'[^\x00-\x7F]+', '', s)

    # One pass handles \r, \n, \t and repeated spaces, so no separate
    # newline-specific substitution is needed.
    s = re.sub(r"\s+", " ", s).strip()
    return s

# Clean the comment columns on a copy so the length-filtered frame stays as is.
cleaned_text_df = dropped_less_comments_and_length_df.copy()
for col in comment_cols:
    # clean_text already takes a single value, so no wrapper lambda is needed.
    cleaned_text_df[col] = cleaned_text_df[col].apply(clean_text)

print("코멘트 3개 텍스트 클렌징 후 데이터 개수", len(cleaned_text_df))

코멘트 3개 텍스트 클렌징 후 데이터 개수 676


In [8]:
def merge_unique_text(series: pd.Series, sep=" | "):
    """Join the distinct non-blank values of ``series`` into one string.

    Missing entries are treated as empty, every value is converted to text
    and stripped, blanks are dropped, and duplicates are removed while the
    first-seen order is kept.

    Args:
        series: Values to merge.
        sep: Separator placed between the surviving values.

    Returns:
        The joined text, or ``""`` when nothing survives.
    """
    as_text = series.fillna("").astype(str).tolist()
    stripped = (item.strip() for item in as_text)
    # dict keys keep insertion order, which gives an ordered de-duplication.
    unique = dict.fromkeys(item for item in stripped if item)
    return sep.join(unique)

# Use the same text reducer for every column, then collapse rows that share
# a whisky name into one row each.
agg = dict.fromkeys(cleaned_text_df.columns, merge_unique_text)

merged = cleaned_text_df.groupby("Whisky Name", as_index=False).agg(agg)
print("Whisky Name 중복 제거 후 데이터 개수", len(merged))

Whisky Name 중복 제거 후 데이터 개수 449


## 데이터 임베딩 과정

In [35]:
def parse_tags(x: str) -> list[str]:
    """Split a comma-separated tag cell into a list of distinct tags.

    The raw value is first passed through ``clean_text``. Tags are compared
    case-insensitively; the first spelling encountered is the one returned,
    and the original order is preserved.

    Args:
        x: Raw tag cell.

    Returns:
        The distinct, stripped tags; an empty list when the cleaned text is
        empty.
    """
    s = clean_text(x)
    if not s:
        return []

    # Maps the lowercase key to the first spelling seen for it.
    first_seen = {}
    for piece in s.split(","):
        tag = piece.strip()
        if tag:
            first_seen.setdefault(tag.lower(), tag)
    return list(first_seen.values())

def build_embed_text(row: pd.Series) -> str:
    """Compose the newline-separated text that gets embedded for one whisky.

    The result always starts with a ``Whisky:`` line, followed by a ``Tags:``
    line when tags were parsed, then ``Nose:``, ``Taste:`` and ``Finish:``
    lines for each comment that is truthy.

    Args:
        row: One row holding "Whisky Name", "Tags" and the comment columns;
            an absent key is read as an empty string.

    Returns:
        The assembled text with surrounding whitespace stripped.
    """
    name = clean_text(row.get("Whisky Name", ""))
    tags = parse_tags(row.get("Tags", ""))
    nose = row.get("Nose Comment", "")
    taste = row.get("Taste Comment", "")
    finish = row.get("Finish Comment", "")

    # Scores are left out on purpose: too many of them are missing for the
    # values to be useful in the embedding. Add a "Scores:" entry here if
    # that changes.
    candidates = [
        f"Whisky: {name}",
        "Tags: " + ", ".join(tags) if tags else None,
        f"Nose: {nose}" if nose else None,
        f"Taste: {taste}" if taste else None,
        f"Finish: {finish}" if finish else None,
    ]
    return "\n".join(part for part in candidates if part is not None).strip()

# Build the embedding text row by row and store it as a new column on merged.
merged["embed_text"] = merged.apply(build_embed_text, axis=1)

In [36]:
import uuid
import os
from tqdm import tqdm

from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams, PointStruct
from sentence_transformers import SentenceTransformer

# =========================
# Settings
# =========================
# Qdrant endpoint and target collection are read from the environment;
# os.getenv returns None for either one when the variable is not set.
QDRANT_URL = os.getenv('QDRANT_URL')
COLLECTION_NAME = os.getenv('QDRANT_COLLECTION')

# Embedding model name, overridable through the EMBED_MODEL variable.
EMBED_MODEL = os.getenv("EMBED_MODEL", "sentence-transformers/all-MiniLM-L6-v2")
# Used as the number of rows per upsert and as the encoder batch size.
BATCH_SIZE = 128

def reset_collection(client: QdrantClient, collection: str, vector_size: int):
    """Recreate ``collection`` as an empty cosine-distance collection.

    Deletion is best-effort: if it raises, the error is reported as a
    ``RuntimeWarning`` rather than silently discarded, and creation is still
    attempted. Errors raised by ``create_collection`` propagate to the caller.

    Args:
        client: Qdrant client used for both calls.
        collection: Name of the collection to drop and recreate.
        vector_size: Dimensionality configured for the new collection.
    """
    import warnings

    try:
        client.delete_collection(collection_name=collection)
    except Exception as exc:
        # Keep going so a failed delete does not abort the run, but surface
        # the reason so connection or permission problems stay visible.
        warnings.warn(
            f"Could not delete collection {collection!r} before recreating it: {exc!r}",
            RuntimeWarning,
            stacklevel=2,
        )

    client.create_collection(
        collection_name=collection,
        vectors_config=VectorParams(size=vector_size, distance=Distance.COSINE),
    )

def stable_point_id(name: str, link: str = "") -> str:
    """Derive a UUID string for a point from its name and link.

    The stripped ``name`` and ``link`` are joined with ``|`` and hashed with
    UUIDv5 under the URL namespace, so equal inputs give equal ids. When the
    joined key holds nothing besides ``|`` and whitespace, a random UUID4 is
    hashed instead, so such ids differ between calls.

    Args:
        name: Name part of the key; a falsy value counts as empty.
        link: Link part of the key; a falsy value counts as empty.

    Returns:
        The UUID rendered as a string.
    """
    clean_name = (name or '').strip()
    clean_link = (link or '').strip()
    base = f"{clean_name}|{clean_link}"
    if not base.strip("|").strip():
        # Nothing usable to hash: fall back to a random seed value.
        base = str(uuid.uuid4())
    return str(uuid.uuid5(uuid.NAMESPACE_URL, base))

# Create the Qdrant client and work on a copy of the merged frame.
client = QdrantClient(url=QDRANT_URL)
merged_df = merged.copy()

# Load the embedding model; its output dimension sizes the collection vectors.
model = SentenceTransformer(EMBED_MODEL)
vector_size = model.get_sentence_embedding_dimension()

# Reset the collection before inserting (existing contents are dropped and
# the collection is recreated by reset_collection).
reset_collection(client, COLLECTION_NAME, vector_size)

# Walk the frame in slices of BATCH_SIZE rows: embed each slice, then upsert it.
total = len(merged_df)
for start in tqdm(range(0, total, BATCH_SIZE), desc="Upserting to Qdrant"):
    chunk = merged_df.iloc[start : start + BATCH_SIZE]

    texts = chunk["embed_text"].tolist()

    # Vectorize the texts of this slice.
    vectors = model.encode(
        texts,
        batch_size=BATCH_SIZE,
        convert_to_numpy=True,
        normalize_embeddings=True,
        show_progress_bar=False,
    )

    # Pair every row with its vector and source text to build one point each.
    points = []
    for (_, row), vec, text in zip(chunk.iterrows(), vectors, texts):
        name = str(row.get("Whisky Name", "") or "").strip()
        link = str(row.get("Link", "") or "").strip()
        # The id is derived from name + link by stable_point_id.
        pid = stable_point_id(name, link)

        payload = {
            "whisky_name": name,
            "link": link,
            # "tags": str(row.get("Tags", "") or "").strip(),
            "text": text,
        }

        points.append(PointStruct(id=pid, vector=vec.tolist(), payload=payload))
    # Insert this slice's points into the Qdrant collection.
    client.upsert(collection_name=COLLECTION_NAME, points=points)

# Ask the server for an exact point count and report it.
cnt = client.count(collection_name=COLLECTION_NAME, exact=True).count
print(f"Done. collection='{COLLECTION_NAME}', points={cnt}")

Upserting to Qdrant: 100%|██████████| 4/4 [00:02<00:00,  1.55it/s]

Done. collection='whisky_reviews', points=449





In [1]:
import os, sys

# A notebook has no __file__, so the kernel's working directory is the anchor.
# This matches what dirname(abspath("__file__")) resolved to, without passing a
# placeholder file name.
current_dir = os.getcwd()
parent_dir = os.path.abspath(os.path.join(current_dir, '..'))
# Guard the append so re-running the cell does not add duplicate entries.
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

from llm.embeddings import process_embedding_data
import pandas as pd
# Read the raw CSV again and pass it to the helper imported from llm.embeddings.
# NOTE(review): presumably process_embedding_data repeats the cleaning and
# embedding steps of the earlier cells (the displayed result carries
# comment_count and embed_text columns) - confirm against the module.
df = pd.read_csv('./whisky_reviews.csv')

process_embedding_data(df=df)

  from .autonotebook import tqdm as notebook_tqdm


Unnamed: 0,Whisky Name,Link,Tags,Nose Score,Nose Comment,Taste Score,Taste Comment,Finish Score,Finish Comment,comment_count,embed_text
0,A Secret Speyside Distillery1968 BR,https://www.whiskybase.com/whiskies/whisky/215...,"Citric,Cooked Fruit,Fresh Fruit,Honey,New Wood...",97.0,"Intense, tropical fruits galore, mango, pineap...",96.0,"Oily, creamy, fruity, little punchy, spicy, hi...",94.0,"Long, warm, very fruity, spicy, little zesty, ...",3,Whisky: A Secret Speyside Distillery1968 BR\nT...
1,Aberlour1958,https://www.whiskybase.com/whiskies/whisky/409...,"Leathery,Old Wood",95.0,Sangria sherry fruity nose. You can sense the ...,91.0,"oak citrus mild sour, honey, slight coffee, li...",92.0,"The sherry sour, old oak, more licorice twig, ...",3,"Whisky: Aberlour1958\nTags: Leathery, Old Wood..."
2,Ardbeg10-year-old Guaranteed,https://www.whiskybase.com/whiskies/whisky/368...,"Chocolate,Fragant,Honey,Leafy,Leathery,Malt Ex...",,"Herbal, minty, eucalyptus, along with fishing ...",,Very herbal. Chartreuse. Medicinal. Gentle pea...,,Short.,3,Whisky: Ardbeg10-year-old Guaranteed\nTags: Ch...
3,Ardbeg1815,https://www.whiskybase.com/whiskies/whisky/689...,"Coal-gas,Chocolate,Citric,Cooked Mash,Fragant,...",,unmoved: You smell Ardbeg gentle and mild sedu...,,like a warm liquid kiss he touches your tongue...,,long all the flavors that you tasted in the mo...,3,"Whisky: Ardbeg1815\nTags: Coal-gas, Chocolate,..."
4,Ardbeg1963 GM,https://www.whiskybase.com/whiskies/whisky/145...,"Chocolate,Fresh Fruit,Leathery,Mossy,Vanilla,V...",95.0,The distillery character is recognizable from ...,96.0,The texture is wonderfully oily. The first fla...,94.0,"Despite the low alcohol, the finish is relativ...",3,"Whisky: Ardbeg1963 GM\nTags: Chocolate, Fresh ..."
...,...,...,...,...,...,...,...,...,...,...,...
444,Tobermory1972 MI,https://www.whiskybase.com/whiskies/whisky/879...,"Citric,Coal-gas,Dried Fruit,Mossy,Smokey,Toast...",95.0,"A lot is happening here, lemons, minerality, s...",94.0,"Great sweetness and peat balance, layers of ch...",94.0,"Long, very dry yet complex, warm, complex, ext...",3,"Whisky: Tobermory1972 MI\nTags: Citric, Coal-g..."
445,Tomatin36-year-old,https://www.whiskybase.com/whiskies/whisky/829...,,94.0,"Intense, fruity, truck loads of apricot, peach...",94.0,"Oily, creamy, sirupy, good sweetness, super fr...",92.0,"Medium long, warm, fruity, spicy, little more ...",3,"Whisky: Tomatin36-year-old\nNose: Intense, fru..."
446,Tormore1966 RWD,https://www.whiskybase.com/whiskies/whisky/918...,"Chocolate,Citric,Cooked Fruit,Dried Fruit,Hay-...",95.0,"Big sherry, from fino to PX. Fruit basket: red...",95.0,"Powerful. Crystal clear sherry, instantly knoc...",94.0,"Long, spicy, fruity, peaty.",3,"Whisky: Tormore1966 RWD\nTags: Chocolate, Citr..."
447,Western Highland1965 TWA,https://www.whiskybase.com/whiskies/whisky/194...,Vanilla,91.0,"A nose above all marked by spices, without bei...",93.0,In the mouth it is incredibly soft and deep. T...,93.0,"Long and sweet finish, on multivitamin fruit j...",3,Whisky: Western Highland1965 TWA\nTags: Vanill...
