In [2]:
from langchain.document_loaders import AsyncChromiumLoader
from langchain.document_transformers import Html2TextTransformer
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PlaywrightURLLoader
from langchain_community.document_loaders import SeleniumURLLoader
from langchain_community.document_loaders import WebBaseLoader
from bs4 import BeautifulSoup
from tqdm import tqdm
from urllib.parse import urlparse
import re
import pandas as pd

USER_AGENT environment variable not set, consider setting it to identify your requests.


In [3]:
async def get_blog_urls(base_url: str) -> str:
  """Load a single page with the Chromium-based loader and return its content.

  Despite the name, no URL extraction happens here: the function hands back
  the page content of the loaded document and leaves link parsing to the
  caller.

  Args:
    base_url: Address of the page to load.

  Returns:
    The ``page_content`` of the first (and only requested) document.
  """
  # Only one URL is requested, so the first document is the page we asked for.
  # (An Html2TextTransformer used to be constructed here but was never used.)
  loader = AsyncChromiumLoader([base_url])
  docs = await loader.aload()
  return docs[0].page_content

#### Run within notebook

In [None]:
# Notebook variant: the coroutine is awaited directly at the top level of the cell.
# Note that `docs` receives whatever get_blog_urls returns (the page content
# string), not a list of document objects.
base_url = "https://www.mind.org.uk/information-support/types-of-mental-health-problems/"
docs = await get_blog_urls(base_url)



#### Run outside notebook

In [None]:
# Script variant of the fetch: asyncio.run drives the coroutine to completion
# instead of a top-level `await`.
import asyncio
base_url = "https://www.mind.org.uk/information-support/types-of-mental-health-problems/"
docs = asyncio.run(get_blog_urls(base_url))

## Mental health issues

In [None]:
# Fetch the listing page for types of mental health problems; the next cell
# parses `html_content` for article links.
base_url = "https://www.mind.org.uk/information-support/types-of-mental-health-problems/"
html_content = await get_blog_urls(base_url)



In [None]:
# Parse the fetched listing page and gather the target of every anchor that has an href.
soup = BeautifulSoup(html_content, 'html.parser')
hrefs = [anchor['href'] for anchor in soup.find_all('a', href=True)]

# Keep the site-relative links that live under this section, skipping the
# first match.
filtered_urls = [
    link
    for link in hrefs
    if link.startswith('/information-support/types-of-mental-health-problems/')
][1:]

# Turn the relative paths into absolute URLs.
base_url = "https://www.mind.org.uk"
mental_issue_urls = [f"{base_url}{path}" for path in filtered_urls]

## Mental health tips

In [None]:
# Fetch the "tips for everyday living" listing page; the next cell parses
# `html_content` for article links.
base_url="https://www.mind.org.uk/information-support/tips-for-everyday-living/"
html_content = await get_blog_urls(base_url)



In [None]:
# Parse the fetched listing page and gather the target of every anchor that has an href.
soup = BeautifulSoup(html_content, 'html.parser')
hrefs = [anchor['href'] for anchor in soup.find_all('a', href=True)]

# Keep the site-relative links that live under this section, skipping the
# first match.
filtered_urls = [
    link
    for link in hrefs
    if link.startswith('/information-support/tips-for-everyday-living/')
][1:]

# Turn the relative paths into absolute URLs.
base_url = "https://www.mind.org.uk"
tips_urls = [f"{base_url}{path}" for path in filtered_urls]

## Cleaning function

In [4]:
def clean_content(content):
    """Normalise scraped page text and drop the trailing "Post Views" footer.

    Args:
        content: Raw page text.

    Returns:
        The text with newlines removed, non-breaking spaces and runs of
        whitespace collapsed to single spaces, everything from
        ``'Post Views:'`` onwards removed, and no leading or trailing
        whitespace.
    """
    # Step 1: Drop newline characters (they are removed, not turned into
    # spaces) and turn non-breaking spaces into ordinary spaces.
    content = content.replace('\n', '').replace('\xa0', ' ')

    # Step 2: Collapse every run of whitespace into a single space.
    content = re.sub(r'\s+', ' ', content)

    # Step 3: Remove everything starting from 'Post Views:' until the end of the content.
    content = re.sub(r'Post Views:.*', '', content, flags=re.DOTALL)

    # Strip last: cutting the footer usually exposes the space that preceded
    # it, which an earlier strip would leave dangling at the end.
    return content.strip()

## Dating Thai

In [5]:
#Find last page
base_url=f"https://www.alljitblog.com/category/จิตวิทยาชีวิตคู่/"
html_content = await get_blog_urls(base_url)
soup = BeautifulSoup(html_content, 'html.parser')

# Find the pagination section
pagination = soup.find('div', class_='box-pagination')
if pagination:
    # Extract all page numbers from the pagination section
    pages = pagination.find_all('a', class_='page-numbers')

    # Convert to integers and find the largest number
    last_page = max([int(page.text) for page in pages if page.text.isdigit()])

    print(f"The last page number is: {last_page}")
else:
    print("Pagination not found.")


#Find all blog url
dating_th_urls=[]
for i in tqdm(range(1, last_page+1)):
  base_url=f"https://www.alljitblog.com/category/จิตวิทยาชีวิตคู่/page/{i}/"
  html_content = await get_blog_urls(base_url)
  soup = BeautifulSoup(html_content, 'html.parser')
  hrefs = [a['href'] for a in soup.find_all('a', href=True)]
  filtered_urls = [href for href in hrefs if href.startswith('https://www.alljitblog.com/')]
  filtered_urls=filtered_urls[1:]
  filtered_urls= list(set(filtered_urls))
  unwanted_substrings = ['?cat', '.com/#', '/author/admin-alljit/','/category/']
  filtered_urls=[url for url in filtered_urls if not any(substring in url for substring in unwanted_substrings)]
  dating_th_urls.extend(filtered_urls)


dating_th_urls = list({urlparse(url).scheme + '://' + urlparse(url).netloc + urlparse(url).path for url in dating_th_urls})
dating_th_urls.remove('https://www.alljitblog.com/')

USER_AGENT environment variable not set, consider setting it to identify your requests.


The last page number is: 3


  0%|          | 0/3 [00:00<?, ?it/s]USER_AGENT environment variable not set, consider setting it to identify your requests.
 33%|███▎      | 1/3 [00:01<00:02,  1.39s/it]USER_AGENT environment variable not set, consider setting it to identify your requests.
 67%|██████▋   | 2/3 [00:02<00:01,  1.18s/it]USER_AGENT environment variable not set, consider setting it to identify your requests.
100%|██████████| 3/3 [00:03<00:00,  1.13s/it]


In [6]:
# Download every collected article and normalise its text.
loader_multiple_pages = WebBaseLoader(dating_th_urls,encoding = 'utf-8')
dating_data = loader_multiple_pages.load()
for data in dating_data:
  # clean_content already removes newlines, replaces non-breaking spaces and
  # collapses whitespace, so repeating those steps here first was redundant.
  data.page_content = clean_content(data.page_content)

## Psychiatrist Thai

In [7]:
#Find last page
base_url=f"https://www.alljitblog.com/category/psychiatrist/"
html_content = await get_blog_urls(base_url)
soup = BeautifulSoup(html_content, 'html.parser')

# Find the pagination section
pagination = soup.find('div', class_='box-pagination')
if pagination:
    # Extract all page numbers from the pagination section
    pages = pagination.find_all('a', class_='page-numbers')

    # Convert to integers and find the largest number
    last_page = max([int(page.text) for page in pages if page.text.isdigit()])

    print(f"The last page number is: {last_page}")
else:
    print("Pagination not found.")


#Find all blog url
psy_th_urls=[]
for i in tqdm(range(1, last_page+1)):
  base_url=f"https://www.alljitblog.com/category/psychiatrist/page/{i}/"
  html_content = await get_blog_urls(base_url)
  soup = BeautifulSoup(html_content, 'html.parser')
  hrefs = [a['href'] for a in soup.find_all('a', href=True)]
  filtered_urls = [href for href in hrefs if href.startswith('https://www.alljitblog.com/')]
  filtered_urls=filtered_urls[1:]
  filtered_urls= list(set(filtered_urls))
  unwanted_substrings = ['?cat', '.com/#', '/author/admin-alljit/','/category/']
  filtered_urls=[url for url in filtered_urls if not any(substring in url for substring in unwanted_substrings)]
  psy_th_urls.extend(filtered_urls)

psy_th_urls = list({urlparse(url).scheme + '://' + urlparse(url).netloc + urlparse(url).path for url in psy_th_urls})
psy_th_urls.remove('https://www.alljitblog.com/')

USER_AGENT environment variable not set, consider setting it to identify your requests.


The last page number is: 2


  0%|          | 0/2 [00:00<?, ?it/s]USER_AGENT environment variable not set, consider setting it to identify your requests.
 50%|█████     | 1/2 [00:01<00:01,  1.63s/it]USER_AGENT environment variable not set, consider setting it to identify your requests.
100%|██████████| 2/2 [00:02<00:00,  1.41s/it]


In [8]:
# Download every collected article and normalise its text.
loader_multiple_pages = WebBaseLoader(psy_th_urls,encoding = 'utf-8')
psy_data = loader_multiple_pages.load()
for data in psy_data:
  # clean_content already removes newlines, replaces non-breaking spaces and
  # collapses whitespace, so repeating those steps here first was redundant.
  data.page_content = clean_content(data.page_content)

In [9]:
# Appends the psychiatrist documents to `dating_data` in place, so from here on
# `dating_data` holds both categories.
# NOTE(review): this cell is not idempotent - running it twice appends
# `psy_data` twice. Re-run the cell that builds `dating_data` first.
dating_data.extend(psy_data)

In [12]:
from langchain_community.vectorstores import Chroma
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain_text_splitters import CharacterTextSplitter
from langchain_community.document_loaders import DirectoryLoader
from langchain_community.document_loaders import TextLoader
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import create_retrieval_chain
from langchain_google_genai import (
    ChatGoogleGenerativeAI,
    HarmBlockThreshold,
    HarmCategory,
)
from langchain_core.prompts import ChatPromptTemplate
from langchain.chains import create_history_aware_retriever
from langchain_core.prompts import MessagesPlaceholder
from langchain_core.chat_history import BaseChatMessageHistory
from langchain_core.runnables.history import RunnableWithMessageHistory
from langchain_community.chat_message_histories import ChatMessageHistory
from langchain.chains.query_constructor.base import AttributeInfo
from langchain.retrievers.self_query.base import SelfQueryRetriever
from langchain.chains import HypotheticalDocumentEmbedder
from langchain_community.query_constructors.chroma import ChromaTranslator


import re
from tqdm import tqdm
from langchain_openai import ChatOpenAI

In [13]:
import getpass
import os

# Read the Google API key from the environment, or prompt for it, so the secret
# never has to be written into the notebook source.
api_key = os.environ.get("GOOGLE_API_KEY") or getpass.getpass("Enter your Google API key: ")

# Embedding model used for the documents stored in the vector store.
doc_embeddings = GoogleGenerativeAIEmbeddings(
    model="models/embedding-001", task_type="retrieval_document",google_api_key=api_key
)

# Gemini chat model; blocking is disabled for the "dangerous content" category only.
llm = ChatGoogleGenerativeAI(
    model="gemini-1.5-flash",
    safety_settings={
        HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_NONE,
    },
    google_api_key=api_key,
)

In [1]:
import getpass
import os

# Prompt for the OpenAI key only when the environment does not already provide
# a non-empty value; the key is stored in the process environment, not in the
# notebook source.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter your OpenAI API key: ")

In [16]:
# NOTE(review): this rebinds `llm`, replacing the Gemini chat model assigned to
# the same name earlier; every later cell that reads `llm` gets whichever
# assignment ran last in the kernel.
llm = ChatOpenAI(
    model="gpt-4o-mini",
    temperature=0,
    max_tokens=None,
    timeout=None,
    max_retries=2,
)

In [14]:
# Wrap the base document embeddings in a HypotheticalDocumentEmbedder driven by
# `llm`, selecting the "web_search" prompt.
# NOTE(review): which model backs `llm` here depends on cell execution order
# (Gemini vs. OpenAI) - confirm the intended one.
embeddings = HypotheticalDocumentEmbedder.from_llm(llm, doc_embeddings, "web_search")

In [16]:
# Index the scraped documents in a Chroma collection named "relationship",
# stored under the local "vector_stores" directory.
vector_store = Chroma.from_documents(
    dating_data,
    embeddings,
    collection_name="relationship",
    persist_directory='vector_stores',
)

In [19]:
# Retriever over the vector store using the 'mmr' search type, returning 2 documents per query.
retriever=vector_store.as_retriever(search_type='mmr', search_kwargs={"k": 2})

In [20]:
# System prompt for the answering chain. `{context}` is the template variable
# that receives the retrieved documents, so it must appear exactly once: every
# extra occurrence would paste the full document text into the prompt again.
system_prompt = '''
You are a relationship advisor chatbot. Your role is to provide thoughtful, empathetic, and actionable advice to users. Base your advice on the user's messages and on the reference material in the "Context" section at the end of this prompt. When responding:

1. **Empathy First**: Always approach each situation with understanding, compassion, and without judgment, considering the emotional tone of the user's messages.
2. **Tailor Responses**: Use the details the user shares, together with the context, to customize your advice. Whether the user is feeling hurt, confused, or happy, adjust your tone and suggestions accordingly.
3. **Balanced Advice**: Provide balanced perspectives, considering both emotional and practical aspects of relationship dynamics. Always factor in the specific details of the user's situation.
4. **Clarity**: Keep responses clear, concise, and free from jargon. Ensure your advice is actionable and suited to the user's situation.
5. **Non-Biased**: Avoid taking sides in conflicts; instead, focus on encouraging healthy communication, mutual respect, and personal growth. Be sensitive to any biases or specific issues the user mentions.
6. **Emotionally Supportive**: Be positive, uplifting, and sensitive to the emotions involved in the conversation.
7. **Encourage Communication**: When appropriate, remind users of the importance of open and honest communication with their partners, adjusting advice based on their specific needs.
8. **Resource Suggestion**: In cases where external help might be useful (such as therapy or professional consultation), gently suggest these resources, especially if the conversation suggests a need for deeper intervention.

You should be respectful of all types of relationships and inclusive of different genders, orientations, and cultural backgrounds. Always adapt your tone to the user's emotional state, providing comfort and support where necessary.

Context:
{context}
'''


In [21]:
# Single-turn prompt: the system instructions followed by the user's input.
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("human", "{input}"),
    ]
)

# History-free RAG chain: retrieve with `retriever`, then answer with `llm`.
# NOTE(review): `question_answer_chain` and `rag_chain` are both reassigned by
# the later history-aware cell, so these two objects end up unused.
question_answer_chain = create_stuff_documents_chain(llm, prompt)
rag_chain = create_retrieval_chain(retriever, question_answer_chain)

In [22]:
# Instruction for rewriting a follow-up question into a standalone one.
contextualize_q_system_prompt = (
    "Given a chat history and the latest user question "
    "which might reference context in the chat history, "
    "formulate a standalone question which can be understood "
    "without the chat history. Do NOT answer the question, "
    "just reformulate it if needed and otherwise return it as is."
)

# Prompt layout: instruction, then the prior messages, then the new user input.
contextualize_q_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", contextualize_q_system_prompt),
        MessagesPlaceholder("chat_history"),
        ("human", "{input}"),
    ]
)
# Retriever built from `llm`, the base `retriever` and the prompt above.
history_aware_retriever = create_history_aware_retriever(
    llm, retriever, contextualize_q_prompt
)

In [23]:
# Answering prompt: system instructions, prior messages, then the user input.
qa_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        MessagesPlaceholder("chat_history"),
        ("human", "{input}"),
    ]
)


# Rebinds `question_answer_chain` and `rag_chain` from the earlier single-turn
# cell to history-aware versions.
question_answer_chain = create_stuff_documents_chain(llm, qa_prompt)

rag_chain = create_retrieval_chain(history_aware_retriever, question_answer_chain)

In [24]:
# In-memory registry of chat histories, keyed by session id.
store = {}


def get_session_history(session_id: str) -> BaseChatMessageHistory:
    """Return the chat history for ``session_id``, creating it on first use.

    Args:
        session_id: Key identifying the conversation.

    Returns:
        The history object registered in ``store`` under ``session_id``.
    """
    try:
        return store[session_id]
    except KeyError:
        # First time this session is seen: register a fresh, empty history.
        history = ChatMessageHistory()
        store[session_id] = history
        return history


# Wrap the RAG chain with per-session message history, looked up through
# get_session_history. The user message is read from "input", prior messages
# are supplied as "chat_history", and the reply is taken from "answer".
conversational_rag_chain = RunnableWithMessageHistory(
    rag_chain,
    get_session_history,
    input_messages_key="input",
    history_messages_key="chat_history",
    output_messages_key="answer",
)

In [30]:
import getpass
import os

from langfuse.callback import CallbackHandler

# Langfuse credentials are read from the environment (or prompted for) instead
# of being written into the notebook source.
# SECURITY: the key pair that used to be hardcoded in this cell has been
# exposed to anyone with access to the file and should be revoked and rotated.
langfuse_handler = CallbackHandler(
    public_key=os.environ.get("LANGFUSE_PUBLIC_KEY") or getpass.getpass("Enter your Langfuse public key: "),
    secret_key=os.environ.get("LANGFUSE_SECRET_KEY") or getpass.getpass("Enter your Langfuse secret key: "),
    # Falls back to the local instance used previously when no host is configured.
    host=os.environ.get("LANGFUSE_HOST", "http://localhost:3000"),
)

In [31]:
# Run one conversational turn under session id "abc", with the Langfuse handler
# passed as a callback, and print only the "answer" field of the result.
print(conversational_rag_chain.invoke(
    {"input": "xxx"},
    config={
        "configurable": {"session_id": "abc"},
        "callbacks": [langfuse_handler]
    },
)['answer'])

ERROR:langfuse:Unexpected error occurred. Please check your request and contact support: https://langfuse.com/support.
ERROR:langfuse:Unexpected error occurred. Please check your request and contact support: https://langfuse.com/support.
ERROR:langfuse:Unexpected error occurred. Please check your request and contact support: https://langfuse.com/support.
ERROR:langfuse:Unexpected error occurred. Please check your request and contact support: https://langfuse.com/support.


ดูเหมือนว่าคุณอาจจะรู้สึกไม่สบายใจหรือไม่พอใจกับสถานการณ์นี้ ถ้าคุณต้องการพูดคุยหรือแชร์ความรู้สึกเพิ่มเติม ฉันยินดีที่จะฟังและช่วยเหลือค่ะ คุณสามารถบอกได้เลยว่าคุณรู้สึกอย่างไร หรือมีอะไรที่อยากจะพูดคุยเพิ่มเติมไหมคะ?
