In [67]:
from langchain_community.document_loaders.recursive_url_loader import RecursiveUrlLoader
from bs4 import BeautifulSoup as Soup
from urllib.parse import urlparse, urljoin

##dbs星展, sinopac永豐
bank_urls = {'dbs':'https://www.dbs.com.tw/personal-zh/cards/dbs-credit-cards/default.page', 'sinopac':'https://bank.sinopac.com/sinopacBT/personal/credit-card/introduction/list.html'}
credit_card_urls = {}

for bank, url in bank_urls.items():
    loader = RecursiveUrlLoader(url=url)
    docs = loader.load()
    if bank == 'dbs':
        links = list(set([a['href'] for a in Soup(docs[0].page_content).find_all('a', href=True) if (a['href'].startswith('/personal-zh/cards')) and (a['href'].endswith('hyperlink') or a['href'].endswith('cta'))]))
        credit_card_urls[bank] = [urljoin(url, i) for i in links]
    elif bank == 'sinopac':
        links = list(set([a['href'] for a in Soup(docs[0].page_content).find_all('a', href=True) if a['href'].startswith('./')]))
        credit_card_urls[bank] = [urljoin(url, i) for i in links]

In [68]:
from langchain_community.document_loaders.url_selenium import SeleniumURLLoader
documents = {}
for bank, links in credit_card_urls.items():
    documents[bank] = SeleniumURLLoader(urls=links).load()
        

In [258]:
from bs4 import BeautifulSoup as Soup
import re
soup_documents = {}

def sinopac_extractor(html: str) -> str:
    soup = Soup(html, "html.parser")
    divs_txt = [s.text for s in soup.find_all('div', {'class':'tab-box'})]
    divs_txt.insert(0, re.sub(r'\n\n+', '\n', soup.find('div', {'class':'info'}).text))
    div_set = []
    for txt in divs_txt:
        if txt not in div_set:
            div_set.append(txt)
    txt = re.sub(r'[\t\r\xa0]', '', ('\n').join(div_set))
    txt = re.sub(r'  +', ' ', txt)
    txt = re.sub(r'\n+', '\n', txt)
    return txt
def dbs_extractor(html:str)->str:
    soup = Soup(html, "html.parser")
    divs_txt = [s.text for s in soup.find_all('div', {'class':'flpweb-legacy'})]
    if divs_txt:
        div_set = []
        for txt in divs_txt:
            if txt not in div_set:
                div_set.append(txt)
        return ('\n').join(div_set)
    else:
        txt = soup.text
        txt = re.sub(r'[\t\r\xa0]', '', txt)
        txt = re.sub(r'  +', ' ', txt)
        txt = re.sub(r'\n+', '\n', txt)
        return txt

for bank, links in credit_card_urls.items():
    docs = []
    if bank == 'sinopac':
        for url in links:
            doc = RecursiveUrlLoader(url=url, extractor=sinopac_extractor).load()
            doc[0].metadata['bank'] = 'sinopac'
            docs.extend(doc)
    elif bank == 'dbs':
        for url in links:
            doc = RecursiveUrlLoader(url=url, extractor=dbs_extractor).load()
            doc[0].metadata['bank'] = 'dbs'
            docs.extend(doc)
    soup_documents[bank] = docs

In [260]:
for doc in soup_documents['dbs']:
    doc.page_content = doc.page_content.replace('個人網路銀行\nCard+ 信用卡數位服務\n企業網路銀行\n','')

In [287]:
import mlflow
from langchain.chat_models import ChatOllama
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_qdrant import QdrantVectorStore
from langchain_huggingface import HuggingFaceEmbeddings

splitter = RecursiveCharacterTextSplitter(chunk_size=400,
                                        length_function=len,
                                        is_separator_regex=True,
                                        chunk_overlap=50,
                                        separators=['。'])

chunks = splitter.split_documents(soup_documents['dbs'])
chunks.extend(splitter.split_documents(soup_documents['sinopac']))
chunks
embedding = HuggingFaceEmbeddings(model_name='BAAI/bge-base-zh-v1.5', model_kwargs={'device':'mps'})
collection_name = 'credit_card'
qdrant_url = 'http://localhost:6333'
vec_store = QdrantVectorStore.from_documents(chunks,
                              collection_name=collection_name,
                              embedding = embedding,
                              force_recreate=True,
                              url = qdrant_url)



In [288]:
retriever = vec_store.as_retriever(search_kwargs={'k':10, 'score_threshold':0.5})


In [289]:
from langchain_community.cross_encoders import HuggingFaceCrossEncoder 
from langchain.retrievers.document_compressors import CrossEncoderReranker
from langchain.retrievers import ContextualCompressionRetriever
from langchain_core.runnables import RunnablePassthrough

reranker = HuggingFaceCrossEncoder(model_name='BAAI/bge-reranker-base')
compressor = CrossEncoderReranker(model=reranker, top_n=5)
compression_retriever = ContextualCompressionRetriever(base_compressor=compressor, base_retriever=retriever)

model = ChatOllama(model='qwen2:7b-instruct', temperature=0)

QUERY_PROMPT = PromptTemplate(
    input_variables=["question"],
    template="""You are an AI question-answering assistant. Your task is to answer the question based on the provided documents. The documents are part of the text from the description of the credit cards.
    The documentsa are not all relevant to the question. Please filter and reply with an answer. No pre-amble or explanation.
    
    Documents: 
    {context}

    Question:
    {question}

    Answer:""",
)

def format_docs(docs):
    return "\n\n".join([f"DOCUMENT {index}\nTitle: {d.metadata['title']}\n{d.page_content}" for index, d in enumerate(docs)])


chain = (
    {"context": compression_retriever | format_docs, "question": RunnablePassthrough()}
    | QUERY_PROMPT
    | model
    | StrOutputParser()
)


chain.invoke("星展eco永續世界商務卡回饋是幾%?")

'星展eco永續世界商務卡的回饋率如下：\n- 国内一般消费：笔笔回馈0.88%，无上限。\n- 国外一般消费：笔笔回馈1.5%，无上限。\n- 星展支持的社会企业及中小企业、Tesla充电资费、Gogoro电池资费享最高10%回饋（含原国内0.88%回饋 + 加码9.12%，每月上限300点）。'

In [291]:
compression_retriever.invoke('星展eco永續世界商務卡國內回饋是幾%?')

[Document(metadata={'source': 'https://www.dbs.com.tw/personal-zh/cards/dbs-credit-cards/eco-world?pid=tw-pweb-personal-zh_cards_dbs-credit-cards_default_page-hyperlink', 'content_type': 'text/html', 'title': '星展eco永續世界商務卡 | 星展銀行（台灣）', 'description': '國內/外一般消費最優回饋1.5%，eco消費享最優10%回饋', 'language': None, 'bank': 'dbs', '_id': 'af9a6e3d-525c-4208-abaa-b62003952131', '_collection_name': 'credit_card'}, page_content='。\n最高10%現金紅利回饋權益適用期間：新卡開卡後～2024/12/31國內一般消費，筆筆回饋0.88%，回饋無上限國外一般消費，筆筆回饋1.5%，回饋無上限星展支持的社會企業及中小企業／Tesla充電資費／Gogoro 電池資費享最高10%回饋\xa0（含原國內0.88%回饋 + 加碼9.12% 每月上限300點）\n 星展eco永續世界商務卡現金紅利回饋活動注意事項活動期間：新卡開卡後~2024/12/31。（換卡前已回饋點數仍持續有效，將併入累計）'),
 Document(metadata={'source': 'https://www.dbs.com.tw/personal-zh/cards/dbs-credit-cards/eco-world?pid=tw-pweb-personal-zh_cards_dbs-credit-cards_default_page-cta', 'content_type': 'text/html', 'title': '星展eco永續世界商務卡 | 星展銀行（台灣）', 'description': '國內/外一般消費最優回饋1.5%，eco消費享最優10%回饋', 'language': None, 'bank': 'dbs', '_id': 'fb1adcc2-6f75-46cf-968f-558f8d8