In [115]:
from langchain_community.document_loaders.recursive_url_loader import RecursiveUrlLoader
from bs4 import BeautifulSoup as Soup
from urllib.parse import urlparse, urljoin
import re

##dbs星展, sinopac永豐
bank_urls = {'dbs':'https://www.dbs.com.tw/personal-zh/cards/dbs-credit-cards/default.page', 'sinopac':'https://bank.sinopac.com/sinopacBT/personal/credit-card/introduction/list.html',
             'cathy': 'https://www.cathaybk.com.tw/cathaybk/personal/product/credit-card/cards/'}
credit_card_urls = {}
soup_documents = {}

for bank, url in bank_urls.items():
    loader = RecursiveUrlLoader(url=url)
    docs = loader.load()
    if bank == 'dbs':
        links = list(set([a['href'] for a in Soup(docs[0].page_content).find_all('a', href=True) if (a['href'].startswith('/personal-zh/cards')) and (a['href'].endswith('hyperlink') or a['href'].endswith('cta'))]))
        credit_card_urls[bank] = [urljoin(url, i) for i in links]
    elif bank == 'sinopac':
        links = list(set([a['href'] for a in Soup(docs[0].page_content).find_all('a', href=True) if a['href'].startswith('./')]))
        credit_card_urls[bank] = [urljoin(url, i) for i in links]
    elif bank == 'cathy':
        def parser(html):
            ## main page
            divs = Soup(html, 'html.parser').find_all('div', {'class':'cubre-m-compareCard -credit'})
            if divs:
                divs = [re.sub(r'\n+', '\n', i.text) for i in divs]
                txt = (' ').join(divs)
                txt = re.sub(r' +', ' ', txt)
                txt = [t.replace('\n立即申辦','', 1) for t in txt.split('詳細說明') if t.startswith('\n立即申辦')]
                txt = ('').join(txt).replace('\n \n', '\n')
            else:
                ## deeper page
                divs = Soup(html, 'html.parser').find_all('div', class_=["cubre-o-textContent", "cubre-m-colorBanner__title","cubre-m-iconEssay__title","cubre-m-horGraphic__title","cubre-m-remind__title","cubre-m-puzzle__title","cubre-a-kvTitle -card"])
                divs = [d.text for d in divs if '您將離開本行官網 前往外部網站' not in d.text]
                uni_divs = []
                for d in divs[:len(divs)-1]:
                    if d not in uni_divs:
                        uni_divs.append(d)
                txt = re.sub(r'\n+', '\n', ('\n').join(uni_divs))
                txt = re.sub(r' +', ' ', txt)
            return txt
            
        docs = RecursiveUrlLoader(url=url, extractor=parser).load()
        soup_documents['cathy'] = docs
        

In [3]:
# from langchain_community.document_loaders.url_selenium import SeleniumURLLoader
# documents = {}
# for bank, links in credit_card_urls.items():
#     documents[bank] = SeleniumURLLoader(urls=links).load()
        

In [120]:
def sinopac_extractor(html: str) -> str:
    soup = Soup(html, "html.parser")
    divs_txt = [s.text for s in soup.find_all('div', {'class':'tab-box'})]
    divs_txt.insert(0, re.sub(r'\n\n+', '\n', soup.find('div', {'class':'info'}).text))
    div_set = []
    for txt in divs_txt:
        if txt not in div_set:
            div_set.append(txt)
    txt = re.sub(r'[\t\r\xa0]', '', ('\n').join(div_set))
    txt = re.sub(r'  +', ' ', txt)
    txt = re.sub(r'\n+', '\n', txt)
    return txt
def dbs_extractor(html:str)->str:
    soup = Soup(html, "html.parser")
    divs_txt = [s.text for s in soup.find_all('div', {'class':'flpweb-legacy'})]
    if divs_txt:
        div_set = []
        for txt in divs_txt:
            if txt not in div_set:
                div_set.append(txt)
        return ('\n').join(div_set)
    else:
        txt = soup.text
        txt = re.sub(r'[\t\r\xa0]', '', txt)
        txt = re.sub(r'  +', ' ', txt)
        txt = re.sub(r'\n+', '\n', txt)
        return txt

for bank, links in credit_card_urls.items():
    docs = []
    if bank == 'sinopac':
        for url in links:
            doc = RecursiveUrlLoader(url=url, extractor=sinopac_extractor).load()
            doc[0].metadata['bank'] = 'sinopac'
            docs.extend(doc)
    elif bank == 'dbs':
        for url in links:
            doc = RecursiveUrlLoader(url=url, extractor=dbs_extractor).load()
            doc[0].page_content = doc[0].page_content.replace('個人網路銀行\nCard+ 信用卡數位服務\n企業網路銀行\n','')
            doc[0].metadata['bank'] = 'dbs'
            docs.extend(doc)
    soup_documents[bank] = docs

In [127]:
import mlflow
from langchain.chat_models import ChatOllama
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_qdrant import QdrantVectorStore
from langchain_huggingface import HuggingFaceEmbeddings

splitter = RecursiveCharacterTextSplitter(chunk_size=400,
                                        length_function=len,
                                        is_separator_regex=True,
                                        chunk_overlap=50,
                                        separators=['。'])
banks = list(bank_urls.keys())
chunks = splitter.split_documents(soup_documents[banks[0]])
for b in banks[1:]:
    chunks.extend(splitter.split_documents(soup_documents[b]))

embedding = HuggingFaceEmbeddings(model_name='BAAI/bge-base-zh-v1.5', model_kwargs={'device':'mps'})
collection_name = 'credit_card'
qdrant_url = 'http://localhost:6333'
vec_store = QdrantVectorStore.from_documents(chunks,
                              collection_name=collection_name,
                              embedding = embedding,
                              force_recreate=True,
                              url = qdrant_url)



In [128]:
retriever = vec_store.as_retriever(search_kwargs={'k':10, 'score_threshold':0.5})

In [143]:
from langchain_community.cross_encoders import HuggingFaceCrossEncoder 
from langchain.retrievers.document_compressors import CrossEncoderReranker
from langchain.retrievers import ContextualCompressionRetriever
from langchain_core.runnables import RunnablePassthrough
from opik.integrations.langchain import OpikTracer
import os
import opik
opik.configure(use_local=True)

# Create the Opik tracer
opik_tracer = OpikTracer(tags=["langchain", "ollama"])
os.environ["OPIK_PROJECT_NAME"] = "credit-card-helper"

reranker = HuggingFaceCrossEncoder(model_name='BAAI/bge-reranker-base')
compressor = CrossEncoderReranker(model=reranker, top_n=5)
compression_retriever = ContextualCompressionRetriever(base_compressor=compressor, base_retriever=retriever)

model = ChatOllama(model='qwen2:7b-instruct', temperature=0).with_config({"callbacks": [opik_tracer]})

QUERY_PROMPT = PromptTemplate(
    input_variables=["question"],
    template="""You are an AI question-answering assistant. Your task is to answer the question based on the provided documents. The documents are part of the text from the description of the credit cards.
    The documents are not all relevant to the question. Please filter and reply with an answer. No pre-amble or explanation.
    
    Documents: 
    {context}

    Question:
    {question}

    Answer:""",
)

def format_docs(docs):
    return "\n\n".join([f"DOCUMENT {index}\nTitle: {d.metadata['title']}\n{d.page_content}" for index, d in enumerate(docs)])


chain = (
    {"context": compression_retriever | format_docs, "question": RunnablePassthrough()}
    | QUERY_PROMPT
    | model
    | StrOutputParser()
)

question = "星展銀行有什麼信用卡？優惠分別到什麼時候？"
response = chain.invoke(question)
# compression_retriever.invoke(question)
print(response)

OPIK: Configuration saved to file: /Users/sarah/.opik.config
OPIK: Started logging traces to the "credit-card-helper" project at http://localhost:5173/default/redirect/projects?name=credit-card-helper.


星展銀行提供以下兩種信用卡：

1. **星展優仕商務卡**：此卡的活動期間為2024年1月1日至2024年12月31日。該卡有特定的停車優惠，僅限持卡人本人使用，每日限用一次，且跨日取車適用於當日的一次停車優惠。

2. **星展豐盛御璽卡**：關於此卡的信息未在提供的文檔中詳細說明其活動期間或特定優惠。但可以確認的是，該卡提供信用卡刷卡優惠和相關產品資訊查詢服務。

請注意，所有其他業者經營之網站均由業者自行負責，包括客戶隱私權保護及客戶資訊安全事項，不屬於星展銀行（台灣）控制或負責範疇。


In [149]:
from opik.evaluation.metrics import Hallucination
from opik.evaluation.models import LiteLLMChatModel

model = LiteLLMChatModel(
    name="ollama/llama3.2",
    base_url="http://localhost:11434"
)

hallucination_metric = Hallucination(
    model=model
)

In [150]:
hallucination_metric.score(
    input=question,
    output=response,
)


[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new[0m
LiteLLM.Info: If you need to debug this error, use `litellm.set_verbose=True'.



AuthenticationError: litellm.AuthenticationError: AuthenticationError: OpenAIException - The api_key client option must be set either by passing api_key to the client or by setting the OPENAI_API_KEY environment variable

In [148]:
from litellm import completion

response = completion(
    model="ollama/llama3.2", 
    messages=[{ "content": "respond in 20 words. who are you?","role": "user"}], 
    api_base="http://localhost:11434"
)
print(response)


ModelResponse(id='chatcmpl-d0d148d7-553c-4c37-870e-bf5ae60c931d', created=1734684381, model='ollama/llama3.2', object='chat.completion', system_fingerprint=None, choices=[Choices(finish_reason='stop', index=0, message=Message(content="I'm an AI designed to provide helpful and informative responses, answering your questions and assisting with various tasks effectively.", role='assistant', tool_calls=None, function_call=None))], usage=Usage(completion_tokens=23, prompt_tokens=38, total_tokens=61, completion_tokens_details=None, prompt_tokens_details=None))
