In [1]:
from langchain.document_loaders import AsyncChromiumLoader
from langchain.document_transformers import Html2TextTransformer
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PlaywrightURLLoader
from langchain_community.document_loaders import SeleniumURLLoader
from langchain_community.document_loaders import WebBaseLoader
from bs4 import BeautifulSoup
from tqdm import tqdm
from urllib.parse import urlparse
import re

USER_AGENT environment variable not set, consider setting it to identify your requests.


In [2]:
async def get_blog_urls(base_url:str):
  loader=AsyncChromiumLoader([base_url])
  transform=Html2TextTransformer()
  docs=await loader.aload()
  html_content = docs[0].page_content
  return html_content

In [3]:
def clean_content(content):
    # Step 1: Replace newline characters and non-breaking spaces
    content = content.replace('\n', '').replace('\xa0', ' ')

    # Step 2: Use regex to remove multiple spaces and strip extra spaces at the beginning and end
    content = re.sub(r'\s+', ' ', content).strip()

    # Step 3: Remove everything starting from 'Post Views' until the end of the content
    content = re.sub(r'Post Views:.*', '', content, flags=re.DOTALL)

    return content

#### Run within notebook

In [None]:
base_url = "https://www.mind.org.uk/information-support/types-of-mental-health-problems/"
docs = await get_blog_urls(base_url)

#### Run outside notebook

In [None]:
import asyncio
base_url = "https://www.mind.org.uk/information-support/types-of-mental-health-problems/"
docs = asyncio.run(get_blog_urls(base_url))

## Mental Issue

In [None]:
base_url = "https://www.mind.org.uk/information-support/types-of-mental-health-problems/"
html_content = await get_blog_urls(base_url)

In [None]:
soup = BeautifulSoup(html_content, 'html.parser')
hrefs = [a['href'] for a in soup.find_all('a', href=True)]
filtered_urls = [href for href in hrefs if href.startswith('/information-support/types-of-mental-health-problems/')]
filtered_urls=filtered_urls[1:]
base_url = "https://www.mind.org.uk"
mental_issue_urls = [base_url + url for url in filtered_urls]

## Mental tips

In [None]:
base_url="https://www.mind.org.uk/information-support/tips-for-everyday-living/"
html_content = await get_blog_urls(base_url)

In [None]:
soup = BeautifulSoup(html_content, 'html.parser')
hrefs = [a['href'] for a in soup.find_all('a', href=True)]
filtered_urls = [href for href in hrefs if href.startswith('/information-support/tips-for-everyday-living/')]
filtered_urls=filtered_urls[1:]
base_url = "https://www.mind.org.uk"
tips_urls = [base_url + url for url in filtered_urls]

## Dating Thai

In [None]:
#Find last page
base_url=f"https://www.alljitblog.com/category/จิตวิทยาชีวิตคู่/"
html_content = await get_blog_urls(base_url)
soup = BeautifulSoup(html_content, 'html.parser')

# Find the pagination section
pagination = soup.find('div', class_='box-pagination')
if pagination:
    # Extract all page numbers from the pagination section
    pages = pagination.find_all('a', class_='page-numbers')

    # Convert to integers and find the largest number
    last_page = max([int(page.text) for page in pages if page.text.isdigit()])

    print(f"The last page number is: {last_page}")
else:
    print("Pagination not found.")


#Find all blog url
dating_th_urls=[]
for i in tqdm(range(1, last_page+1)):
  base_url=f"https://www.alljitblog.com/category/จิตวิทยาชีวิตคู่/page/{i}/"
  html_content = await get_blog_urls(base_url)
  soup = BeautifulSoup(html_content, 'html.parser')
  hrefs = [a['href'] for a in soup.find_all('a', href=True)]
  filtered_urls = [href for href in hrefs if href.startswith('https://www.alljitblog.com/')]
  filtered_urls=filtered_urls[1:]
  filtered_urls= list(set(filtered_urls))
  unwanted_substrings = ['?cat', '.com/#', '/author/admin-alljit/','/category/']
  filtered_urls=[url for url in filtered_urls if not any(substring in url for substring in unwanted_substrings)]
  dating_th_urls.extend(filtered_urls)


dating_th_urls = list({urlparse(url).scheme + '://' + urlparse(url).netloc + urlparse(url).path for url in dating_th_urls})
dating_th_urls.remove('https://www.alljitblog.com/')

In [5]:
loader_multiple_pages = WebBaseLoader(dating_th_urls,encoding = 'utf-8')
dating_data = loader_multiple_pages.load()
for data in dating_data:
  data.page_content = data.page_content.replace('\n', '')
  data.page_content = data.page_content.replace('\xa0', ' ')
  data.page_content = re.sub(r'\s+', ' ', data.page_content).strip()
  data.page_content = clean_content(data.page_content)

## Psychiatrist Thai

In [None]:
#Find last page
base_url=f"https://www.alljitblog.com/category/psychiatrist/"
html_content = await get_blog_urls(base_url)
soup = BeautifulSoup(html_content, 'html.parser')

# Find the pagination section
pagination = soup.find('div', class_='box-pagination')
if pagination:
    # Extract all page numbers from the pagination section
    pages = pagination.find_all('a', class_='page-numbers')

    # Convert to integers and find the largest number
    last_page = max([int(page.text) for page in pages if page.text.isdigit()])

    print(f"The last page number is: {last_page}")
else:
    print("Pagination not found.")


#Find all blog url
psy_th_urls=[]
for i in tqdm(range(1, last_page+1)):
  base_url=f"https://www.alljitblog.com/category/psychiatrist/page/{i}/"
  html_content = await get_blog_urls(base_url)
  soup = BeautifulSoup(html_content, 'html.parser')
  hrefs = [a['href'] for a in soup.find_all('a', href=True)]
  filtered_urls = [href for href in hrefs if href.startswith('https://www.alljitblog.com/')]
  filtered_urls=filtered_urls[1:]
  filtered_urls= list(set(filtered_urls))
  unwanted_substrings = ['?cat', '.com/#', '/author/admin-alljit/','/category/']
  filtered_urls=[url for url in filtered_urls if not any(substring in url for substring in unwanted_substrings)]
  psy_th_urls.extend(filtered_urls)

psy_th_urls = list({urlparse(url).scheme + '://' + urlparse(url).netloc + urlparse(url).path for url in psy_th_urls})
psy_th_urls.remove('https://www.alljitblog.com/')

In [7]:
loader_multiple_pages = WebBaseLoader(psy_th_urls,encoding = 'utf-8')
psy_data = loader_multiple_pages.load()
for data in psy_data:
  data.page_content = data.page_content.replace('\n', '')
  data.page_content = data.page_content.replace('\xa0', ' ')
  data.page_content = re.sub(r'\s+', ' ', data.page_content).strip()
  data.page_content = clean_content(data.page_content)

## Data prep to elastic

In [3]:
import json
import pandas as pd
from tqdm.auto import tqdm
from sentence_transformers import SentenceTransformer
from elasticsearch import Elasticsearch

from langchain_elasticsearch import ElasticsearchRetriever
from langchain_google_genai import ChatGoogleGenerativeAI
from typing import Dict
from langchain_huggingface import HuggingFaceEmbeddings

from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_google_genai import (
    ChatGoogleGenerativeAI,
    HarmBlockThreshold,
    HarmCategory,
)
from langchain_core.prompts import PromptTemplate
from langchain_openai import ChatOpenAI
from langchain.embeddings import SentenceTransformerEmbeddings

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
es_client = Elasticsearch('http://localhost:9200') 

index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "source": {"type": "text"},
            "title": {"type": "text"},
            "description": {"type": "text"},
            "section": {"type": "keyword"},
            "content": {"type": "text"},
            "content_vector": {
                "type": "dense_vector",
                "dims": 768,
                "index": True,
                "similarity": "cosine"
            },
            "title_description_vector": {
                "type": "dense_vector",
                "dims": 768,
                "index": True,
                "similarity": "cosine"
            },
        }
    }
}

index_name = "relationship_consult"

es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(index=index_name, body=index_settings)

In [12]:
relationship_df = pd.DataFrame([{'source': d.metadata['source'], 'title': d.metadata.get('title'), 'description': d.metadata.get('description',''),'content': d.page_content,'section':'relationship'} for d in dating_data])

In [13]:
psy_df = pd.DataFrame([{'source': d.metadata['source'], 'title': d.metadata.get('title'), 'description': d.metadata.get('description',''),'content': d.page_content,'section':'psycology'} for d in psy_data])

In [14]:
all_df = pd.concat([relationship_df, psy_df], ignore_index=True)

In [15]:
all_df = all_df.drop_duplicates(subset=['title'])

In [None]:
documents=[]
for index, row in tqdm(all_df.iterrows()):
    documents.append({"source": row['source'], 
                      "title": row["title"],
                      "description": row["description"],
                      "content": row["content"],
                      "section": row["section"]})

In [None]:
model_name = 'sentence-transformers/paraphrase-multilingual-mpnet-base-v2'
model = SentenceTransformer(model_name)

In [None]:
for doc in tqdm(documents):
    content = doc['content']
    title = doc['title']
    description = doc['description']
    td = f"Title:{title}\nDescription:{description}"

    doc['content_vector'] = model.encode(content)
    doc['title_description_vector'] = model.encode(td)

In [None]:
for doc in tqdm(documents):
    es_client.index(index=index_name, document=doc)

## Testing query

In [18]:
query = "I broke up with my girlfriend yesterday"
v_q = model.encode(query)
knn_query = {
    "field": "content_vector",
    "query_vector": v_q,
    "k": 3,
    "num_candidates": 10000,
    "boost": 0.5,
}
keyword_query = {
    "bool": {
        "must": {
            "multi_match": {
                "query": query,
                "fields": ["content", "title","description"],
                "type": "best_fields",
                "boost": 0.5,
            }
        },
    }
}
response = es_client.search(
    index=index_name,
    query=keyword_query,
    knn=knn_query,
    size=3
)

In [None]:
for hit in response['hits']['hits']:
    id = hit['_id']
    score = hit['_score']
    title = hit['_source']['title']
    text = hit['_source']['content']
    pretty_output = (f"\nID: {id}\nTitle: {title}\nContent: {text}\nScore: {score}")
    print(pretty_output)

## Testing with Langchain

In [4]:
model_name = 'sentence-transformers/paraphrase-multilingual-mpnet-base-v2'
embeddings = SentenceTransformerEmbeddings(model_name=model_name)

  embeddings = SentenceTransformerEmbeddings(model_name=model_name)


In [5]:
es_url = 'http://localhost:9200'

In [6]:
def hybrid_query(query: str) -> Dict:
    vector = embeddings.embed_query(query)  # same embeddings as for indexing
    return {
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        "fields": ["content", "title","description"],
                        "type": "best_fields",
                        "boost": 0.5,
                    }
                },
            }
        },
        "knn": {
            "field": "content_vector",
            "query_vector": vector,
            "k": 3,
            "num_candidates": 10000,
            "boost": 0.5,
        },
        "size": 3,
    }

In [8]:
index_name = "relationship_consult"
hybrid_retriever = ElasticsearchRetriever.from_es_params(
    index_name=index_name,
    body_func=hybrid_query,
    content_field='content',
    url=es_url,
)

In [9]:
query = "เศร้า"
hybrid_results = hybrid_retriever.invoke(query)
for result in hybrid_results:
    print(result.metadata['_source']['title'], result.metadata['_score'])

ทำความรู้จักกับ โรคซึมเศร้า โดยจิตแพทย์ - Alljit Blog 2.001563
แฟนป่วยโรคซึมเศร้า ส่วนเราเอาไงดี? รับมืออย่างไร - Alljit Blog 1.9367619
มีแฟนแต่รู้สึกเหงา นักจิตวิทยามองว่าอย่างไร? - Alljit Blog 0.36788744


In [24]:
def elastic_search(query):
    return hybrid_retriever.invoke(query)

In [25]:
def build_prompt(query, search_results):
    prompt_template = """
You are female relationship counselor name Saddie. Answer the QUESTION based on the CONTEXT with empathy.
Use only the facts from the CONTEXT when answering the QUESTION.

QUESTION: {question}

CONTEXT: 
{context}
""".strip()

    context = ""
    
    for doc in search_results:
        context = context + f"Title: {doc.metadata['_source']['title']}\nDescription: {doc.metadata['_source']['description']}\nContent: {doc.page_content}\n\n"
    
    prompt = prompt_template.format(question=query, context=context).strip()
    return prompt

## With LLM

In [26]:
import getpass
import os

os.environ["GOOGLE_API_KEY"] = getpass.getpass("Enter your Google AI API key: ")

In [36]:
from langfuse.decorators import observe
from langfuse.callback import CallbackHandler
from langfuse import Langfuse

In [37]:
langfuse_handler = CallbackHandler(
  secret_key="sk-lf-566804f8-f5cb-48d7-a4b0-65a9bc0b8c83",
  public_key="pk-lf-f1703f17-7916-4954-8907-ec3de4a17ca9",
  host="http://localhost:3000"
)

In [26]:
api_key='xxxx'

In [27]:
llm = ChatGoogleGenerativeAI(
    model="gemini-1.5-flash",
    safety_settings={
        HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_NONE,
    },
    top_p=0.9,           
    top_k=50,          
    temperature=0.7,      
    google_api_key=api_key,
)

In [70]:
@observe()
def rag(query):
    search_results = elastic_search(query)
    prompt = build_prompt(query, search_results)
    answer = llm(prompt)
    return answer

In [26]:
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import create_retrieval_chain
from langchain.chains import create_history_aware_retriever
from langchain_core.prompts import MessagesPlaceholder
from langchain_core.chat_history import BaseChatMessageHistory
from langchain_core.runnables.history import RunnableWithMessageHistory
from langchain_community.chat_message_histories import ChatMessageHistory

In [28]:
import getpass
import os

if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter your OpenAI API key: ")

In [29]:
llm = ChatOpenAI(
    model="gpt-4o-mini",
    temperature=0.5,
    max_tokens=None,
    timeout=None,
    max_retries=2,
)

In [30]:
system_prompt = '''
You are Saddie a relationship advisor chatbot. Your role is to provide thoughtful, empathetic, and actionable advice to users based on the given context. You will receive a variable `{context}` that provides important information about the user's situation or emotional state. When responding:

1. **Empathy First**: Always approach each situation with understanding, compassion, and without judgment, considering the emotional tone provided in `{context}`.
2. **Tailor Responses**: Use the details in `{context}` to customize your advice. Whether the user is feeling hurt, confused, or happy, adjust your tone and suggestions accordingly.
3. **Balanced Advice**: Provide balanced perspectives, considering both emotional and practical aspects of relationship dynamics. Always factor in the specific details of `{context}`.
4. **Clarity**: Keep responses clear, concise, and free from jargon. Ensure your advice is actionable and suited to the user's situation as described in `{context}`.
5. **Non-Biased**: Avoid taking sides in conflicts; instead, focus on encouraging healthy communication, mutual respect, and personal growth. Be sensitive to any biases or specific issues mentioned in `{context}`.
6. **Emotionally Supportive**: Be positive, uplifting, and sensitive to the emotions involved in the conversation, as described in `{context}`.
7. **Encourage Communication**: When appropriate, remind users of the importance of open and honest communication with their partners, adjusting advice based on their specific needs from `{context}`.
8. **Resource Suggestion**: In cases where external help might be useful (such as therapy or professional consultation), gently suggest these resources, especially if `{context}` suggests a need for deeper intervention.

You should be respectful of all types of relationships and inclusive of different genders, orientations, and cultural backgrounds. Always adapt your tone to the user's emotional state, as indicated by `{context}`, providing comfort and support where necessary.
'''

In [31]:
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("human", "{input}"),
    ]
)

question_answer_chain = create_stuff_documents_chain(llm, prompt)
rag_chain = create_retrieval_chain(hybrid_retriever, question_answer_chain)

In [32]:
contextualize_q_system_prompt = (
    "Given a chat history and the latest user question "
    "which might reference context in the chat history, "
    "formulate a standalone question which can be understood "
    "without the chat history. Do NOT answer the question, "
    "just reformulate it if needed and otherwise return it as is."
)

contextualize_q_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", contextualize_q_system_prompt),
        MessagesPlaceholder("chat_history"),
        ("human", "{input}"),
    ]
)
history_aware_retriever = create_history_aware_retriever(
    llm, hybrid_retriever, contextualize_q_prompt
)

In [33]:
qa_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        MessagesPlaceholder("chat_history"),
        ("human", "{input}"),
    ]
)


question_answer_chain = create_stuff_documents_chain(llm, qa_prompt)

rag_chain = create_retrieval_chain(history_aware_retriever, question_answer_chain)

In [34]:
store = {}


def get_session_history(session_id: str) -> BaseChatMessageHistory:
    if session_id not in store:
        store[session_id] = ChatMessageHistory()
    return store[session_id]


conversational_rag_chain = RunnableWithMessageHistory(
    rag_chain,
    get_session_history,
    input_messages_key="input",
    history_messages_key="chat_history",
    output_messages_key="answer",
)

In [None]:
print(conversational_rag_chain.invoke(
    {"input": "ฉันเศร้า"},
    config={
        "configurable": {"session_id": "abc"},
        "callbacks": [langfuse_handler]
    },  
)["answer"])