In [173]:
import pandas as pd 

In [174]:
# Read the online retail dataset
data = pd.read_csv('amazon.csv')
df = data[:100].copy()
df.dropna(subset=['rating_count'], inplace=True)

df['sub_category'] = df['category'].astype(str).str.split('|').str[-1]
df['main_category'] = df['category'].astype(str).str.split('|').str[0]

In [175]:
df.columns

Index(['product_id', 'product_name', 'category', 'discounted_price',
       'actual_price', 'discount_percentage', 'rating', 'rating_count',
       'about_product', 'user_id', 'user_name', 'review_id', 'review_title',
       'review_content', 'img_link', 'product_link', 'sub_category',
       'main_category'],
      dtype='object')

In [176]:
df1 = df.copy()
df1['product_name'] = df1['product_name'].str.lower() 
df1 = df1.drop_duplicates(subset=['product_name'])    # Remove duplicates based on 'product_name'

In [177]:
print(df.shape)
print(df1.shape)

(100, 18)
(100, 18)


In [178]:
df1['product_name'][0]

'wayona nylon braided usb to lightning fast charging and data sync cable compatible for iphone 13, 12,11, x, 8, 7, 6, 5, ipad air, pro, mini (3 ft pack of 1, grey)'

In [179]:
df1['about_product']

0     High Compatibility : Compatible With iPhone 12...
1     Compatible with all Type C enabled devices, be...
2     【 Fast Charger& Data Sync】-With built-in safet...
3     The boAt Deuce USB 300 2 in 1 cable is compati...
4     [CHARGE & SYNC FUNCTION]- This cable comes wit...
                            ...                        
95    Supports 150Mbps Wireless data transmission ra...
96    Compatible with MI Smart TV 4A 32 inch LED TV ...
97    The cable comes with 3 Different pins allowing...
98    Fastest USB 3.0 and Gigabit solution ensure hi...
99    【Power Delivery Fast Charging】: Charge your iP...
Name: about_product, Length: 100, dtype: object

In [180]:
df2 = df1[['product_id','product_name', 'about_product','main_category','sub_category', 'actual_price','discount_percentage','rating','rating_count' ]]

In [181]:
df2.head()

Unnamed: 0,product_id,product_name,about_product,main_category,sub_category,actual_price,discount_percentage,rating,rating_count
0,B07JW9H4J1,wayona nylon braided usb to lightning fast cha...,High Compatibility : Compatible With iPhone 12...,Computers&Accessories,USBCables,"₹1,099",64%,4.2,24269
1,B098NS6PVG,ambrane unbreakable 60w / 3a fast charging 1.5...,"Compatible with all Type C enabled devices, be...",Computers&Accessories,USBCables,₹349,43%,4.0,43994
2,B096MSW6CT,sounce fast phone charging cable & data sync u...,【 Fast Charger& Data Sync】-With built-in safet...,Computers&Accessories,USBCables,"₹1,899",90%,3.9,7928
3,B08HDJ86NZ,boat deuce usb 300 2 in 1 type-c & micro usb s...,The boAt Deuce USB 300 2 in 1 cable is compati...,Computers&Accessories,USBCables,₹699,53%,4.2,94363
4,B08CF3B7N1,portronics konnect l 1.2m fast charging 3a 8 p...,[CHARGE & SYNC FUNCTION]- This cable comes wit...,Computers&Accessories,USBCables,₹399,61%,4.2,16905


In [182]:
df2.to_csv('amazon_rag.csv', index=False)

### 문서 불러오기
- 문서 데이터 유형
- CSVLoader CSV 파일
- 디렉토리로더 지정된 디렉토리의 모든 파일
- 비정형 많은 파일 유형(https://docs.unstructured.io/platform/supported-file-types) 참조)
- JSONLoader JSON 파일
- BSHTMLL로더 HTML 파일

In [183]:
# from langchain.document_loaders.csv_loader import CSVLoader
# from langchain_community.document_loaders import DirectoryLoader, TextLoader
# from langchain_community.document_loaders import NotionDirectoryLoader, NotionDBLoader

# loader = DirectoryLoader("../", glob="**/*.md", loader_cls=TextLoader)
# docs = loader.load()

In [184]:
from typing import List
from dotenv import load_dotenv
import os

from langchain.embeddings.huggingface_hub import HuggingFaceHubEmbeddings
from sentence_transformers import SentenceTransformer
from langchain.document_loaders.csv_loader import CSVLoader
from langchain.vectorstores import Chroma
from langchain.schema.document import Document


# This will expose your Langchain api token as an environment variable
load_dotenv()

# def read_csv(file_path: str, source_column: str = "about_product") -> List[Document]:
def read_csv(file_path: str, source_column: str = "product_name") -> List[Document]:
    """Reads a CSV file and returns a list of Documents.

    Args:
        file_path (str): The path to the CSV file to read.
        source_column (str, optional): The name of the column in the CSV file that contains the text data. Defaults to "Description".

    Returns:
        List[Document]: A list of Documents, where each Document contains the text data from the corresponding row in the CSV file.

    Raises:
        FileNotFoundError: If the CSV file does not exist.
        IOError: If there is an error reading the CSV file.
    """

    if not os.path.exists(file_path):
        raise FileNotFoundError(f"File does not exist: {file_path}")

    loader = CSVLoader(file_path=file_path, source_column=source_column)
    data = loader.load()

    return data

## 임베딩 모델 불러오기 

In [None]:
from langchain_huggingface import HuggingFaceEmbeddings
#reference: https://python.langchain.com/docs/integrations/providers/huggingface/#huggingfaceembeddings
# model_name = "intfloat/multilingual-e5-large-instruct"
model_name = 'intfloat/multilingual-e5-large' # intfloat/multilingual-e5-large

# Function to load embeddings model
def load_embeddings_model(model_name: str) -> HuggingFaceEmbeddings:
    """Loads a Hugging Face Transformer model and returns an Embeddings object.

    Args:
        model_name (str): The name of the Hugging Face Transformer model to load.

    Returns:
        HuggingFaceEmbeddings: An Embeddings object that can be used to encode text into embeddings.
    """
    embedding_function = HuggingFaceEmbeddings(
        model_name=model_name,
        huggingfacehub_api_token=os.environ["api_key"], #토큰업데이트
        model_kwargs={'device':'cpu'},
        encode_kwargs={'normalize_embeddings':True}
    )
    return embedding_function


### 토큰 접속
- reference: https://huggingface.co/docs/hub/security-tokens

# 임베딩 테스트

- intfloat/multilingual-e5-large
- 고성능 모델이라고 함 그래서 시간이 오래 걸리는 듯?

In [186]:
from sentence_transformers import SentenceTransformer

# 모델 선택
model_name = 'intfloat/multilingual-e5-large' # 
embeddings = SentenceTransformer(model_name)

# 임베딩 테스트
text = "This is a test document."
query_result = embeddings.encode(text)  
print(query_result[:3])  


[ 0.00283389 -0.00129713 -0.02809599]


# 문서 벡터화?

In [187]:
def vectorize_documents(data: List[Document], embedding_function: HuggingFaceEmbeddings) -> Chroma:
    """Vectorizes a list of Documents using a Hugging Face Transformer model.

    Args:
        data (List[Document]): A list of Documents to vectorize.
        embedding_function (HuggingFaceEmbeddings): An Embeddings object that can be used to encode text into embeddings.

    Returns:
        Chroma: A Chroma object that contains the vectorized documents.
    """

    ## Chroma, as a vector database, cosine similarity by default for searches.
    db = Chroma.from_documents(data, embedding=embedding_function, 
                            #    collection_metadata={"hnsw:space": "l2"}
                               collection_metadata={"hnsw:space": "cosine"}
                               )
    return db

#### 벡터 스토어 임베딩 데이터 저장

In [188]:
import pandas as pd
from langchain.schema import Document

def read_csv_with_macroman(file_path, source_column):
    """Reads a CSV file with MacRoman encoding and converts to Document objects."""
    try:
        df = pd.read_csv(file_path, encoding='MacRoman')
        if source_column not in df.columns:
            raise ValueError(f"Column '{source_column}' not found in the CSV file.")
        return [Document(page_content=row[source_column]) for _, row in df.iterrows()]
    except Exception as e:
        raise RuntimeError(f"Error reading CSV with MacRoman encoding: {e}")


In [189]:
def init_llm():
    """Initializes the LLM by reading the CSV file, loading the embeddings model, and vectorizing the documents.

    Returns:
        Chroma: A Chroma object that contains the vectorized documents.
    """
    file_path = 'amazon_rag.csv'
    source_column = 'product_name'
    try:
        # MacRoman 인코딩으로 데이터 로드
        data = read_csv_with_macroman(file_path, source_column)
        if not data:
            raise ValueError("No data found in the CSV file.")
    except Exception as e:
        raise RuntimeError(f"Error loading data: {e}")

    # 모델 로드
    model_name = 'sentence-transformers/all-mpnet-base-v2'
    embedding_function = load_embeddings_model(model_name)

    # 데이터 벡터화
    try:
        db = vectorize_documents(data, embedding_function)
    except Exception as e:
        raise RuntimeError(f"Error vectorizing documents: {e}")

    return db


In [None]:
import os
print("Hugging Face Token:", os.environ.get("api_key"))


Hugging Face Token: None


In [None]:
db = init_llm()

# 위 내용에서 api key에서 에러가 남 이유를 모르겠음.

# 비슷한 내용 검색

In [None]:
# Query the vector database
query = "iPhone USB charger and adapter"
found_docs = db.similarity_search_with_score(query, k=5)


In [None]:
# Load documents
found_docs

# 프롬프트 생성

In [None]:
from transformers import AutoTokenizer
import transformers 
import torch

model = "ArliAI/Gemma-2-2B-ArliAI-RPMax-v1.1"#'ArliAI/Mistral-Small-22B-ArliAI-RPMax-v1.1'
tokenizer = AutoTokenizer.from_pretrained(
    model,  
    use_auth_token= "api_key", 
)

# pipeline without device_map
pipeline = transformers.pipeline(
    "text-generation",
    model=model,
    torch_dtype=torch.float16 if torch.cuda.is_available() else None,  # GPU 사용 가능 여부에 따라 설정
    device=0 if torch.cuda.is_available() else -1,  # GPU(0) 또는 CPU(-1) 선택
)



Loading checkpoint shards: 100%|██████████| 2/2 [00:07<00:00,  3.61s/it]


In [199]:
from langchain_community.llms.huggingface_pipeline import HuggingFacePipeline

hf = HuggingFacePipeline(pipeline=pipeline)


In [203]:
from langchain.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain import PromptTemplate

promptTemplate_fstring = """
You are a customer service assistant, tasked with providing clear and concise answers based on the given context. 
Context:
{context}
Question:
{query}
Answer:
"""

# Example usage
context = "Your order is currently being processed and is expected to ship within 2-3 business days."
query = "My delivery is delayed. Can you check the status of my order?"


prompt = promptTemplate_fstring.format(context=context, query=query)
print(prompt)



You are a customer service assistant, tasked with providing clear and concise answers based on the given context. 
Context:
Your order is currently being processed and is expected to ship within 2-3 business days.
Question:
My delivery is delayed. Can you check the status of my order?
Answer:



In [209]:
from langchain_core.prompts import PromptTemplate

template = """Question: {question}
Answer:"""

prompt = PromptTemplate.from_template(template)
print(prompt)
chain = prompt | hf

question = "Where is my home?"

print(chain.invoke({"question": question}))

input_variables=['question'] input_types={} partial_variables={} template='Question: {question}\nAnswer:'




Question: Where is my home?
Answer: Well, you can look it up on Google


In [205]:
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate

# Query definition
query = "suggest cool iPhone USB charger and adapter"
# query = "what is the iphone cable?"
# query = "What is the caracteristic of iPhone USB charger and adapter"

# Perform vector search
doc_context = run_vector_search(query)

# Extract relevant columns
doc = doc_context[['product_name', 'about_product']]
# doc = doc_context[[ 'about_product']]

# print(doc)
# Convert context to string
context = doc.to_string(index=False)

#You are an assistant in customer service. Use the following context to answer the question:

# Define the prompt template
# promptTemplate_fstring = """
# Context:
# {context}
# Question:
# {query}
# Answer:
# """

promptTemplate_fstring = """
You are a customer service assistant, tasked with providing clear and concise answers based on the given context. 
Context:
{context}
Question:
{query}
Answer:
"""

# Initialize the prompt
prompt = PromptTemplate(
    # input_variables=["query", "context"],
    template=promptTemplate_fstring,
)

# print(prompt)
# Create the chain
chain = LLMChain(prompt=prompt, llm=hf)

# Run the chain and get the response
response = chain.run({"query": query, "context": context})

# Print the response
print(response)

NameError: name 'run_vector_search' is not defined