In [2]:
from bs4 import BeautifulSoup
import requests
import re
import certifi
import json

In [3]:
# Browser-like request headers: passed as `headers=` to requests.get so the
# request carries a desktop Chrome User-Agent string.
headers = {
    "User-Agent": " ".join(
        [
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
            "AppleWebKit/537.36 (KHTML, like Gecko)",
            "Chrome/122.0.0.0 Safari/537.36",
        ]
    ),
}

In [20]:
def craw_text_data(website_url, timeout=30):
    """Fetch a web page and return the cleaned text of all its ``<p>`` elements.

    Args:
        website_url: URL of the page to download.
        timeout: Seconds ``requests`` waits for the server before giving up.
            Without a timeout a stalled connection would block the notebook
            indefinitely.

    Returns:
        One string: the stripped text of every ``<p>`` element joined by a
        single space, with ``[number]`` citation markers removed and every
        run of whitespace collapsed to one space.

    Raises:
        requests.exceptions.RequestException: If the request fails or times out.
    """
    response = requests.get(website_url, headers=headers, timeout=timeout)
    soup = BeautifulSoup(response.text, "html.parser")

    # Only <p> elements are collected; other tags (e.g. <li>) are ignored.
    text = " ".join(p.get_text(strip=True) for p in soup.find_all("p"))

    # Strings of the form "[number]" are removed first ...
    text = re.sub(r"\[\d+\]", "", text)

    # ... and whitespace is normalised afterwards, so that removing a marker
    # surrounded by spaces ("a [1] b") cannot leave a double space behind.
    return re.sub(r"\s+", " ", text).strip()


In [76]:
def craw_meta_content_data(website_url, timeout=30):
    """Fetch a web page and return the text of its descriptive ``<meta>`` tags.

    Every ``<meta>`` element that has a ``content`` attribute is considered.
    A tag is skipped when it carries an ``http-equiv`` attribute, or when its
    ``name``/``property`` value (lower-cased) contains one of the technical
    keywords listed in ``ui_keywords``.

    Args:
        website_url: URL of the page to download.
        timeout: Seconds ``requests`` waits for the server before giving up.

    Returns:
        The remaining ``content`` values joined by a single space, with
        ``[number]`` markers removed and whitespace collapsed.

    Raises:
        requests.exceptions.RequestException: If the request fails or times out.
    """
    # Substrings identifying metas that configure rendering, crawling or
    # social-card behaviour rather than describing the page.
    ui_keywords = [
        "viewport", "robots", "theme-color", "charset", "generator",
        "referrer", "google", "apple", "facebook", "twitter", "msapplication", "http-equiv",
        "format-detection", "image", "og:type"
    ]

    response = requests.get(website_url, headers=headers, timeout=timeout)
    soup = BeautifulSoup(response.text, "html.parser")

    contents = []
    for meta in soup.find_all("meta", attrs={"content": True}):
        # "http-equiv" is an attribute *name*, so the keyword of that name in
        # ui_keywords never matched an attribute *value*; such tags slipped
        # through and added directives like "IE=edge" to the text. Skip them
        # explicitly instead.
        if meta.get("http-equiv") is not None:
            continue

        # Identifier of the tag, if any: `name` first, then `property`.
        meta_name = (meta.get("name") or meta.get("property") or "").lower()

        # Skip the tag when its identifier contains a UI/technical keyword.
        if any(keyword in meta_name for keyword in ui_keywords):
            continue

        contents.append(meta["content"])

    # Clean up: remove "[number]" markers first, then collapse whitespace so
    # the removal cannot leave double spaces behind.
    text = re.sub(r"\[\d+\]", "", " ".join(contents))
    return re.sub(r"\s+", " ", text).strip()

In [None]:
def _parse_event_json_ld(json_script):
    """Return ``(start_date, location, event_url)`` read from a JSON-LD script tag.

    All three values are empty strings when the tag is missing, has no text,
    does not contain valid JSON, or the JSON is not an object.
    """
    if json_script is None or not json_script.string:
        return "", "", ""
    try:
        data = json.loads(json_script.string)
    except json.JSONDecodeError:
        return "", "", ""
    if not isinstance(data, dict):
        return "", "", ""

    location = data.get("location", {})
    # Only an object-valued "location" can carry an "address" entry.
    address = location.get("address", "") if isinstance(location, dict) else ""
    return data.get("startDate", ""), address, data.get("url", "")


def craw_special_text_data(website_url, timeout=30):
    """Scrape an events listing page into one text paragraph per event.

    An event is an ``<h1>`` containing a link whose ``href`` includes
    ``/events/event/`` and which sits inside a ``<div class="copyContent">``.
    For each one the function collects the title, the text of the
    ``eventdate`` div, the first non-blank text node following that div (used
    as the description) and, when available, ``startDate``, ``location.address``
    and ``url`` from the next ``application/ld+json`` script after the
    container.

    Args:
        website_url: URL of the listing page.
        timeout: Seconds ``requests`` waits for the server before giving up.

    Returns:
        The event paragraphs joined by a blank line. Each paragraph has the
        form ``Event: ...;Time: ...;Location: ...;Description: ...;More info: ...``.
        An empty string is returned when the page contains no matching event.

    Raises:
        requests.exceptions.RequestException: If the request fails or times out.
    """
    # Same browser-like headers as the other scraping helpers in this notebook.
    response = requests.get(website_url, headers=headers, timeout=timeout)
    soup = BeautifulSoup(response.text, "html.parser")

    event_paragraphs = []

    for h1 in soup.find_all("h1"):
        a_tag = h1.find("a", href=True)
        if not a_tag or "/events/event/" not in a_tag["href"]:
            continue

        title = a_tag.get_text(strip=True)
        link = a_tag["href"]

        parent_div = h1.find_parent("div", class_="copyContent")
        if not parent_div:
            continue

        date_div = parent_div.find("div", class_="eventdate")
        # Named event_time (not `datetime`) to avoid shadowing the stdlib module name.
        event_time = date_div.get_text(strip=True) if date_div else ""

        # The description is the first non-blank text node after the date div.
        description = ""
        if date_div:
            sibling = date_div.next_sibling
            while sibling and isinstance(sibling, str) and sibling.strip() == "":
                sibling = sibling.next_sibling
            if sibling and isinstance(sibling, str):
                description = sibling.strip()

        start_date, location, event_url = _parse_event_json_ld(
            parent_div.find_next("script", type="application/ld+json")
        )

        # Build a complete English paragraph for RAG.
        paragraph = "\n".join(
            [
                f"Event: {title}",
                f"Time: {event_time or start_date}",
                f"Location: {location if location else 'Unknown'}",
                f"Description: {description}",
                f"More info: {event_url if event_url else link}",
            ]
        ).strip()
        paragraph = re.sub(r"\t+", " ", paragraph).strip()
        paragraph = re.sub(r"\n+", ";", paragraph).strip()
        event_paragraphs.append(paragraph)

    # Joined once, after the loop: the result is defined even when no event
    # matched (previously an UnboundLocalError) and is not rebuilt per event.
    return "\n\n".join(event_paragraphs)

In [61]:
def get_table_web(soup):
    """Flatten the page's ``wikitable sortable`` table into one text string.

    Args:
        soup: Parsed page; it is queried with
            ``soup.find("table", {"class": "wikitable sortable"})``.

    Returns:
        One ``Name: ..., Neighborhood: ..., Type: ..., Summary: ...`` record
        per data row, joined by ``"."``. The first row is treated as the
        header and skipped, and rows that do not have exactly four ``<td>``
        cells are ignored. Returns ``""`` when the page has no such table.
    """
    # Find the first table with the wikitable class.
    table = soup.find("table", {"class": "wikitable sortable"})
    if table is None:
        # No matching table on this page: nothing to extract.
        return ""

    records = []
    for row in table.find_all("tr")[1:]:  # skip the header row
        cols = row.find_all("td")
        if len(cols) != 4:
            continue
        name, neighborhood, typ, summary = (col.get_text(strip=True) for col in cols)
        records.append(
            f"Name: {name}, Neighborhood: {neighborhood}, Type: {typ}, Summary: {summary}"
        )
    return ".".join(records)

def craw_table_text_data(website_url, timeout=30):
    """Fetch a page and return its paragraph text followed by its table text.

    Args:
        website_url: URL of the page to download.
        timeout: Seconds ``requests`` waits for the server before giving up.

    Returns:
        Two parts joined by a blank line: first the cleaned text of all
        ``<p>`` elements (``[number]`` markers removed, whitespace collapsed),
        then the string produced by ``get_table_web`` for the same page.

    Raises:
        requests.exceptions.RequestException: If the request fails or times out.
    """
    response = requests.get(website_url, headers=headers, timeout=timeout)
    soup = BeautifulSoup(response.text, "html.parser")

    # Join the paragraph texts with a single space.
    paragraph_text = " ".join(p.get_text(strip=True) for p in soup.find_all("p"))

    # Strings of the form "[number]" are removed first ...
    paragraph_text = re.sub(r"\[\d+\]", "", paragraph_text)

    # ... then repeated whitespace (\t, \n, ...) is replaced by one space, so
    # that removing a marker cannot leave a double space behind.
    paragraph_text = re.sub(r"\s+", " ", paragraph_text).strip()

    table_text = get_table_web(soup)

    return "\n\n".join([paragraph_text, table_text])



In [21]:
# Try craw_text_data on a single page; the commented-out assignments are
# alternative URLs to test against.
# website_url = "https://www.picklesburgh.com/"
website_url = "https://www.cmu.edu/about/"
# website_url = "https://en.wikipedia.org/wiki/History_of_Pittsburgh"
text = craw_text_data(website_url)
# Bare last expression so the notebook displays the scraped text.
text

'A private, global research university, Carnegie Mellon stands among the world\'s most renowned educational institutions, and sets its own course.Start the journey here. Over the past 10 years, more than 400 startups linked to CMU have raised more than $7 billion in follow-on funding. Those investment numbers are especially high because of the sheer size of Pittsburgh’s growing autonomous vehicles cluster – including Uber, Aurora, Waymo and Motional – all of which are here because of their strong ties to CMU. With cutting-edge brain science, path-breaking performances, innovative startups, driverless cars, big data, big ambitions, Nobel and Turing prizes, hands-on learning, and a whole lot of robots, CMU doesn\'t imagine the future, we create it. Many seek Pittsburgh for being ahot spot for entrepreneurshipand amodel for future cities. Others come for the city\'sburgeoning food scene. Visit us » You’ll find CMU locations nationwide — and worldwide. Silicon Valley. Qatar. Africa. Washin

In [None]:
def create_db_from_web_pdf(website_url, filename="data/2024_operating_budget.pdf", verify=False, timeout=60):
    """Download the file at ``website_url`` and save it to ``filename``.

    Despite its name, this function only downloads and writes the file; it
    does not build any database itself.

    Args:
        website_url: URL of the PDF to download.
        filename: Destination path. Missing parent directories are created.
        verify: Passed to ``requests.get`` as ``verify``.
            NOTE(review): the default ``False`` disables TLS certificate
            verification and is kept only so existing calls behave as before;
            pass ``True`` (or a CA bundle path) whenever the server's
            certificate chain validates.
        timeout: Seconds ``requests`` waits for the server before giving up.

    Raises:
        requests.exceptions.HTTPError: If the server answers with an error
            status; nothing is written in that case, so an HTML error page is
            never saved under a ``.pdf`` name.
        requests.exceptions.RequestException: If the request fails or times out.
    """
    from pathlib import Path

    response = requests.get(website_url, verify=verify, timeout=timeout)
    response.raise_for_status()

    target = Path(filename)
    # open(..., "wb") fails if the directory does not exist yet.
    target.parent.mkdir(parents=True, exist_ok=True)
    with open(target, "wb") as f:
        f.write(response.content)
# Download the 2024 operating budget PDF; the function saves it as
# data/2024_operating_budget.pdf.
create_db_from_web_pdf("https://apps.pittsburghpa.gov/redtail/images/23255_2024_Operating_Budget.pdf")

: 

In [18]:
from langchain_community.document_loaders import PyPDFLoader, DirectoryLoader
# Load every *.pdf file found in the data/ directory, using PyPDFLoader for each file.
loader = DirectoryLoader(path = "data", glob="*.pdf", loader_cls = PyPDFLoader)
documents = loader.load()
# Keep only the page_content of each loaded document; from here on
# `documents` is a list of strings rather than document objects.
documents = [doc.model_dump()["page_content"] for doc in documents]
# Bare last expression: displays all page texts joined by newlines (the
# joined string is not stored in a variable).
"\n".join(documents)

'\n\nCITY OF PITTSBURGH\nEd Gainey, Mayor\nDeputy Mayor Jake Pawlak,\nDirector of the Office of Management and Budget\nJake Wheatley, Chief of Staff\nKyle Chintalapalli, Chief Economic Development Officer\nPatrick Cornell, Chief Financial Officer\nLisa Frank, Chief Operating and Administrative Officer\nJennifer Gula, Director of Finance\n3\nCITY OF PITTSBURGH\nOffice of Management and Budget\nLinnea Lincoln, Assistant Director - Operating and Special Revenue\nSheri Rolewski, Senior Budget Analyst\nKiersten Walmsley, Senior Budget Analyst\nAdam Clevenger, Budget Analyst\nElizabeth Schellin, Budget Analyst\nDavid Hutchinson, Assistant Director - Capital and Asset Management\nBrendan Coticchia, Manager, Capital Budget\nEric Shultz, Budget Analyst\nLaurie Loper, Budget Technician\nSpecial thanks to Danelle Jones for cover design\n4\nTable of Contents\nBudget Authorizing Legislation  ............................................................................................................

In [22]:
# Source URLs grouped by topic; these lists are the arguments passed to create_db.
# NOTE(review): the _pb / _cmu suffixes presumably stand for Pittsburgh / CMU — confirm.

# General information and history pages, plus one PDF document.
website_url_info_history_pb = ["https://en.wikipedia.org/wiki/Pittsburgh", "https://en.wikipedia.org/wiki/History_of_Pittsburgh",
                    "https://www.britannica.com/place/Pittsburgh", "https://www.visitpittsburgh.com/",
                    "https://www.pittsburghpa.gov/City-Government/Finances-Budget/Taxes/Tax-Forms"]
pdf_url_info_history_pb = ["https://apps.pittsburghpa.gov/redtail/images/23255_2024_Operating_Budget.pdf"]

special_website_url_event_pb = ["https://downtownpittsburgh.com/events/"]
# Fetched with the "special" function (presumably craw_special_text_data — confirm).

website_url_event_pb = ["https://www.pghcitypaper.com/pittsburgh/EventSearch?v=d"]

website_url_music_pb = ["https://pittsburghopera.org/"]
website_url_museums_pb = ["https://carnegiemuseums.org/", "https://www.heinzhistorycenter.org/",
                          "https://www.thefrickpittsburgh.org/", "https://en.wikipedia.org/wiki/List_of_museums_in_Pittsburgh"]
# The last link in the list above contains a table and must be fetched with
# the table-aware function (presumably craw_table_text_data — confirm).

website_url_food_pb = ["https://www.visitpittsburgh.com/events-festivals/food-festivals/", "https://www.picklesburgh.com/",
                       "https://www.pghtacofest.com/", "https://pittsburghrestaurantweek.com/", "https://littleitalydays.com/",
                       "https://bananasplitfest.com/"]
# (0-based indexing) links 1, 2 and 3 are fetched via meta content
# (presumably craw_meta_content_data — confirm).

website_url_sports_pb = ["https://www.visitpittsburgh.com/things-to-do/pittsburgh-sports-teams/", "https://www.steelers.com/", "https://www.nhl.com/penguins/"]


website_url_info_history_cmu = ["https://www.cmu.edu/about/"]

In [23]:
from prepare_vector_db import create_db

# create_db comes from the prepare_vector_db module (not shown in this
# notebook); its result is unpacked into four variables.
# NOTE(review): all nine URL lists are passed positionally, so their order
# must match create_db's parameter order — confirm against its signature.
general_info_history_text, event_pittsburgh, music_culture, sport = create_db(website_url_info_history_pb, pdf_url_info_history_pb,
              special_website_url_event_pb, website_url_event_pb, website_url_music_pb,
              website_url_museums_pb, website_url_food_pb, website_url_sports_pb, website_url_info_history_cmu)

In [28]:
import os

# Relative path to the local GGUF model file.
model_path = "models/Meta-Llama-3-8B-Instruct-Q3_K_L.gguf"
# Prints the label "Tồn tại file:" ("File exists:") followed by True or False.
print("Tồn tại file:", os.path.exists(model_path))

Tồn tại file: True
