In [None]:
# %pip install scikit-learn

Collecting scikit-learn
  Downloading scikit_learn-1.7.1-cp313-cp313-win_amd64.whl.metadata (11 kB)
Collecting scipy>=1.8.0 (from scikit-learn)
  Downloading scipy-1.16.0-cp313-cp313-win_amd64.whl.metadata (60 kB)
Collecting joblib>=1.2.0 (from scikit-learn)
  Using cached joblib-1.5.1-py3-none-any.whl.metadata (5.6 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Using cached threadpoolctl-3.6.0-py3-none-any.whl.metadata (13 kB)
Downloading scikit_learn-1.7.1-cp313-cp313-win_amd64.whl (8.7 MB)
   ---------------------------------------- 0.0/8.7 MB ? eta -:--:--
   ---- ----------------------------------- 1.0/8.7 MB 18.8 MB/s eta 0:00:01
   --------------------------------- ------ 7.3/8.7 MB 18.9 MB/s eta 0:00:01
   ---------------------------------------- 8.7/8.7 MB 15.8 MB/s eta 0:00:00
Using cached joblib-1.5.1-py3-none-any.whl (307 kB)
Downloading scipy-1.16.0-cp313-cp313-win_amd64.whl (38.4 MB)
   ---------------------------------------- 0.0/38.4 MB ? eta -:--:--
   -----

In [2]:
import json
from sklearn.feature_extraction.text import CountVectorizer
# 키워드 후보 리스트
REVIEW_KEYWORDS = ["연비", "주행감", "실내공간", "디자인", "하이브리드", "정숙성", "가속", "승차감", "가격", "안전성"]
DESCRIPTION_KEYWORDS = ["sleek", "modern", "aerodynamic", "SUV", "sedan", "electric", "hybrid", "dynamic", "spacious", "luxury"]

# 한글 키워드 추출 함수
def extract_korean_keywords(text, keyword_list):
    return [kw for kw in keyword_list if kw in text]

# 영어 키워드 추출 함수
def extract_english_keywords(text, keyword_list):
    vectorizer = CountVectorizer(vocabulary=keyword_list, lowercase=True)
    X = vectorizer.fit_transform([text.lower()])
    return [kw for kw, count in zip(vectorizer.get_feature_names_out(), X.toarray()[0]) if count > 0]

# assign_metadata 함수
def assign_metadata(docs, doc_type='review'):
    result = []
    for item in docs:
        if doc_type == 'review':
            car_name_kr = item.get('car_name', '').strip()
            car_name_en = car_name_kr.replace(' ', '').replace('하이브리드', 'Hybrid')  # 간단 영문 변환
            review_text = item.get('review', '')
            review_keywords = extract_korean_keywords(review_text, REVIEW_KEYWORDS)
            review_length = len(review_text)
            item.update({
                'car_name_kr': car_name_kr,
                'car_name_en': car_name_en,
                'review_keywords': review_keywords,
                'review_length': review_length
            })
        elif doc_type == 'description':
            car_name_en = item.get('car_name', '').replace('_', ' ').strip()
            car_name_kr = car_name_en.replace('Hybrid', '하이브리드')  # 간단 한글 변환
            description_text = item.get('description', '')
            description_keywords = extract_english_keywords(description_text, DESCRIPTION_KEYWORDS)
            item.update({
                'car_name_en': car_name_en,
                'car_name_kr': car_name_kr,
                'description_keywords': description_keywords
            })
        result.append(item)
    return result

# JSON 데이터 로드
with open('hyundai_car_reviews.json', 'r', encoding='utf-8') as f:
    hyundai_car_reviews = json.load(f)

with open('hyundaicar_descript_merge_all.json', 'r', encoding='utf-8') as f:
    hyundaicar_descript = json.load(f)

# 메타데이터 태깅
result_docs = assign_metadata(hyundai_car_reviews, doc_type='review')
json_docs = assign_metadata(hyundaicar_descript, doc_type='description')



In [3]:
# JSON 데이터 로드
with open('hyundai_design_philosophy.json', 'r', encoding='utf-8') as f:
    hyundai_design_philosophy = json.load(f)

In [None]:
from langchain.schema import Document

# flatten_tags: tags 딕셔너리를 tags_성능, tags_공간 형태로 평탄화
def flatten_tags(tags_dict):
    return {f"tags_{k}": v for k, v in tags_dict.items()}

# 리뷰 Document 생성 (review_keywords 포함)
review_documents = [
    Document(
        page_content=f"data_id: {item.get('data_id')} | car_name: {item.get('car_name_kr')}, {item.get('car_name_en')}| review: {item.get('review')}",
        metadata={
            "review_length": item.get('review_length'),
            "review_keywords": item.get('review_keywords', []),
            **flatten_tags(item.get('tags', {}))
        }
    )
    for item in result_docs
]

# description Document 생성 (description_keywords 포함)
description_documents = [
    Document(
        page_content=f"car_name: {item.get('car_name_kr')}, {item.get('car_name_en')} | description: {item.get('description')}",
        metadata={
            "description_keywords": item.get('description_keywords', []),
            "url": item.get("image_path", [])
        }
    )
    for item in json_docs
]

philosophy_documents = [
    Document(
        page_content=item['content'],
        metadata={
            "id": item.get('id'),
            "source_document": item.get("source_document")
        }
    )
    for item in hyundai_design_philosophy
]

total_documents = philosophy_documents + review_documents

In [5]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=300,
    chunk_overlap=30
)
split_docs_1 = text_splitter.split_documents(total_documents)
print(f"✅ 총 {len(split_docs_1)}건의 청크된 문서 생성 완료")

✅ 총 881건의 청크된 문서 생성 완료


In [6]:
split_docs_2 = text_splitter.split_documents(description_documents)
print(f"✅ 총 {len(split_docs_2)}건의 청크된 문서 생성 완료")

✅ 총 139건의 청크된 문서 생성 완료


In [7]:
from qdrant_client import QdrantClient
from qdrant_client.models import VectorParams, Distance
import os

QDRANT_PATH = "./Qdrant_DB"
EMBEDDING_DIM = 3072
HOST = os.getenv("HOST_PUBLIC_IP")

client = QdrantClient(host=HOST, port=6333)

# description용 컬렉션 생성
client.recreate_collection(
    collection_name="description_vector_store",
    vectors_config=VectorParams(
        size=EMBEDDING_DIM,
        distance=Distance.COSINE
    )
)
print("✅ description_vector_store 컬렉션 생성 완료")

# review용 컬렉션 생성 (예: 임베딩 차원이 다르면 size 조정)
client.recreate_collection(
    collection_name="feedback_vector_store",
    vectors_config=VectorParams(
        size=EMBEDDING_DIM, 
        distance=Distance.COSINE
    )
)
print("✅ feedback_vector_store 컬렉션 생성 완료")


  client.recreate_collection(


✅ description_vector_store 컬렉션 생성 완료


  client.recreate_collection(


✅ feedback_vector_store 컬렉션 생성 완료


In [7]:
from dotenv import load_dotenv

load_dotenv()

True

In [8]:
from langchain_openai import OpenAIEmbeddings
from langchain.vectorstores import Qdrant

embedding_model = OpenAIEmbeddings(model="text-embedding-3-large")

vector_store = Qdrant(
    client=client,
    collection_name="description_vector_store",
    embeddings=embedding_model
)

vector_store.add_documents(split_docs_1)

print(f"✅ Qdrant에 벡터 저장 완료")

  vector_store = Qdrant(


✅ Qdrant에 벡터 저장 완료


In [None]:
len(embedding_model.embed_query("hello world"))

In [9]:
vector_store = Qdrant(
    client=client,
    collection_name="feedback_vector_store",
    embeddings=embedding_model
)

vector_store.add_documents(split_docs_2)

print(f"✅ Qdrant에 벡터 저장 완료")

✅ Qdrant에 벡터 저장 완료


In [15]:
# 문서 내용 확인
from langchain_openai import OpenAIEmbeddings
from langchain.vectorstores import Qdrant
from qdrant_client import QdrantClient

client = QdrantClient(host="3.35.81.92", port=6333)
embedding_model = OpenAIEmbeddings(model="text-embedding-3-large")
vector_store = Qdrant(
    client=client,
    collection_name="description_vector_store",
    embeddings=embedding_model
)

# 전체 문서 일부만 확인 (최대 5개)
docs = vector_store.similarity_search("아무 쿼리", k=5)
for doc in docs:
    print(doc.page_content)
    print(doc.metadata)

car_name: 그랜저, 그랜저 | description: This car features a sleek estate body type with a balanced, elongated proportion. The smooth, aerodynamic surface enhances its modern look. Subtle lighting accents are integrated into the design, with a minimalist LED strip at the rear. The grill is narrow and
{'description_keywords': ['sleek', 'modern', 'aerodynamic', 'dynamic'], 'url': 'car_images_all/3_1_그랜저.png', '_id': '008e453c-840b-45d0-add0-7446ff80a11e', '_collection_name': 'description_vector_store'}
car_name: 투싼, 투싼 | description: This car features a modern SUV body type with sleek, balanced proportions. The surface has sharp, angular lines and smooth curves, creating a dynamic look. The lighting includes narrow, LED accents that enhance its contemporary feel. The grill is narrow and
{'description_keywords': ['sleek', 'modern', 'dynamic'], 'url': 'car_images_all/4_6_투싼.png', '_id': '32cd2b06-4965-40fe-84c0-50d33cb29222', '_collection_name': 'description_vector_store'}
car_name: 코나, 코나 | desc

In [25]:
docs[0].metadata.get("url")
# type(docs)

'car_images_all/3_1_그랜저.png'

In [24]:
search_result = vector_store.search(
    collection_name="description_vector_store",
    query_vector=embedding_model,
    limit=1,  # top 1
    with_payload=True
    )
search_result

TypeError: VectorStore.search() missing 2 required positional arguments: 'query' and 'search_type'