In [1]:
import os
import requests
import json
from hashlib import sha256
from urllib.parse import urlparse, quote
import bs4
from bs4 import BeautifulSoup
from datetime import datetime
from time import sleep

from langchain_core.documents import Document
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.messages import HumanMessage

from src.prompts import IMAGE_EXTRACTOR_PROMPT

from dotenv import load_dotenv
load_dotenv()

True

## Utils

In [2]:
class InvalidAPIKey(Exception):
    """Error signalling that no usable API key was found.

    The exception takes no arguments; its text is always ``message``.
    """

    message = 'Invalid API Key'

    def __init__(self):
        super(InvalidAPIKey, self).__init__(self.message)
        

class InvalidURL(Exception):
    """Error signalling that a URL cannot be used in the requested way.

    The exception takes no arguments; its text is always ``message``.
    """

    message = 'Invalid URL'

    def __init__(self):
        super(InvalidURL, self).__init__(self.message)

In [3]:
class GeminiImageExtractor:
    """Ask a Gemini chat model to describe an image using a fixed prompt.

    Class attributes:
        prompt: Instruction text sent with every image (``IMAGE_EXTRACTOR_PROMPT``).
        model_list: Model names listed for reference; the constructor does not
            validate ``model_name`` against it.
    """

    prompt = IMAGE_EXTRACTOR_PROMPT
    model_list = ['gemini-1.5-flash-latest',
                  'gemini-1.5-pro-latest']

    def __init__(self, model_name='gemini-1.5-pro-latest'):
        """Build the chat client.

        Args:
            model_name: Model identifier forwarded to ``ChatGoogleGenerativeAI``.

        Raises:
            InvalidAPIKey: If ``GOOGLE_API_KEY`` is unset or empty.
        """
        self.api = self._get_api_()
        self.chat_model = ChatGoogleGenerativeAI(model=model_name, google_api_key=self.api)
        # The text part of the message never changes, so build it once.
        self.text_content = dict(type="text", text=self.prompt)

    def extract(self, image_path, sleep_time=0):
        """Send the prompt plus one image reference to the model.

        Args:
            image_path: Value placed in the ``image_url`` field of the message.
            sleep_time: Seconds to pause after the model call returns.

        Returns:
            The ``content`` attribute of the model's reply.
        """
        message_parts = [
            self.text_content,
            {"type": "image_url", "image_url": image_path},
        ]
        response = self.chat_model.invoke([HumanMessage(content=message_parts)])

        # Pause after the request (callers use this to space out calls).
        sleep(sleep_time)
        return response.content

    def _get_api_(self):
        """Return the ``GOOGLE_API_KEY`` environment variable or raise ``InvalidAPIKey``."""
        key = os.getenv('GOOGLE_API_KEY')
        if key:
            return key

        raise InvalidAPIKey

In [4]:
def write_json(path, items):
    """Serialize ``items`` as 4-space-indented JSON and write it to ``path``.

    The file is opened as UTF-8 explicitly so the writer does not depend on
    the platform's default encoding and matches code that reads the file back
    with ``encoding='utf-8'``.

    Args:
        path: Destination file path; an existing file is overwritten.
        items: Any JSON-serializable object.
    """
    with open(path, 'w', encoding='utf-8') as file:
        json.dump(items, file, indent=4)

In [5]:
def is_subdirectory(href):
    """Tell whether ``href`` begins with a forward slash.

    The rest of this file treats such links as paths relative to the site
    root. Note that any leading slash qualifies, so ``//host/path`` also
    returns True.
    """
    leading_slash = '/'
    return href.startswith(leading_slash)

In [6]:
def is_attachment(url, extensions=('.pdf',)):
    """Tell whether ``url`` points at a downloadable attachment.

    Only the path component of the URL is inspected, so a query string or
    fragment after the file name (``a.pdf?download=1``) does not hide the
    extension. The comparison ignores case (``.PDF`` matches).

    Args:
        url: Absolute or relative link to test.
        extensions: File extensions (including the dot) that count as
            attachments. A single string is accepted as well.

    Returns:
        True when the URL path ends with one of ``extensions``.
    """
    if isinstance(extensions, str):
        extensions = (extensions,)
    suffixes = tuple(ext.lower() for ext in extensions)

    try:
        path = urlparse(url).path
    except ValueError:
        # Malformed URL: fall back to checking the raw string.
        path = url

    return path.lower().endswith(suffixes)

In [7]:
def preprocess_website(soup):
    """Return the main-content subtree of a page with pager navigation removed.

    The main content is the ``<section id="sp-main-body">`` element. When a
    page has no such section, the whole document is used instead of failing
    with an ``AttributeError`` on ``None``.

    Args:
        soup: Parsed page (BeautifulSoup object or tag).

    Returns:
        The main-content element (or ``soup`` itself as a fallback). The
        ``<ul class="pager pagenav">`` element, if present, is removed from
        the tree in place.
    """
    # get the main content
    main_content = soup.find('section', {'id': 'sp-main-body'})
    if main_content is None:
        main_content = soup

    # decompose navigate elements (next or previous page navigators)
    navigator_element = main_content.find('ul', {'class': 'pager pagenav'})
    if navigator_element is not None:
        navigator_element.decompose()

    return main_content

In [8]:
def get_web_soup(url, timeout=30):
    """Download ``url`` and parse it with BeautifulSoup's ``html.parser``.

    Args:
        url: Address to fetch with an HTTP GET.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` can block indefinitely on an unresponsive host.

    Returns:
        The parsed document when the response status is 200; ``None`` for any
        other status or when the request itself fails (connection error,
        timeout, invalid URL, ...).
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException:
        # Callers already treat None as "could not fetch".
        return None

    if response.status_code != 200:
        return None

    return bs4.BeautifulSoup(response.content, 'html.parser')

In [9]:
def encode_url(url):
    """Return ``url`` reduced to scheme, host and a percent-encoded path.

    Only the scheme, network location and path are kept; query string and
    fragment are not part of the result. ``%`` is left untouched while
    quoting, so a path that is already percent-encoded is not encoded a
    second time (``%C3%B4`` stays ``%C3%B4`` instead of becoming ``%25C3%25B4``).

    Args:
        url: Absolute URL.

    Returns:
        The encoded URL string.

    Raises:
        InvalidURL: If ``url`` starts with ``/`` (a site-relative path has no
            scheme or host to rebuild from).
    """
    if is_subdirectory(url):
        raise InvalidURL

    parsed_url = urlparse(url)
    # '/' keeps path separators; '%' keeps existing escapes intact.
    encoded_subdirectory = quote(parsed_url.path, safe='/%')

    return f'{parsed_url.scheme}://{parsed_url.netloc}{encoded_subdirectory}'

In [10]:
def website_is_updated(url, hash_value):
    """Report whether the page at ``url`` differs from a stored digest.

    The page is fetched, reduced to its main content, and the SHA-256 hex
    digest of that content is compared with ``hash_value``.

    Args:
        url: Page to fetch.
        hash_value: Previously recorded hex digest (may be ``None``).

    Returns:
        True when the freshly computed digest is different from ``hash_value``.

    Raises:
        Exception: If the page could not be fetched.
    """
    soup = get_web_soup(url)
    if not soup:
        raise Exception('Can not connect to destination URL')

    current_digest = sha256(preprocess_website(soup).encode()).hexdigest()
    return current_digest != hash_value

In [11]:
def get_links(soup, internal_link=True, external_link=False, attachment=True, start_url=None):
    """Collect the distinct link targets found in ``soup``.

    Every ``<a href=...>`` is classified as follows:

    * attachment (see ``is_attachment``) - kept when ``attachment`` is true.
      A site-relative attachment is prefixed with ``start_url`` when one is
      given, so attachment links are absolute like the other internal links.
    * internal link (href starts with ``/``) - kept when ``internal_link`` is
      true, as ``encode_url(start_url + href)``.
    * anything else - kept when ``external_link`` is true, as ``encode_url(href)``.

    Args:
        soup: Parsed page or subtree to scan.
        internal_link: Include site-relative links.
        external_link: Include links that do not start with ``/``.
        attachment: Include attachment links.
        start_url: Site root (scheme + host) used to resolve relative links.

    Returns:
        A set of URL strings.

    Raises:
        InvalidURL: If an internal link must be resolved but ``start_url`` is
            ``None``.
    """
    links = set()
    for a in soup.find_all('a', href=True):
        href = a['href']
        relative = is_subdirectory(href)

        if attachment and is_attachment(href):
            if relative and start_url is not None:
                href = start_url + href
            links.add(href)
        elif relative:
            if internal_link:
                if start_url is None:
                    # A relative link cannot be made absolute without a site root.
                    raise InvalidURL
                links.add(encode_url(start_url + href))
        elif external_link:
            links.add(encode_url(href))

    return links

In [12]:
def parse_website_image(soup, extractor, start_url=None, sleep_time=5):
    """Insert extracted descriptions after the images in ``soup``.

    Each ``<img src=...>`` is passed to ``extractor.extract``. Unless the
    returned text starts with ``'nothing'``, it is inserted into the tree
    immediately after the image tag.

    Args:
        soup: Parsed page or subtree; modified in place.
        extractor: Object with an ``extract(url, sleep_time=...)`` method that
            returns text.
        start_url: Site root prepended to image sources that start with ``/``.
        sleep_time: Seconds the extractor should pause after each call.

    Returns:
        The same ``soup`` object.

    Raises:
        InvalidURL: If a relative image source cannot be joined with
            ``start_url`` (for example ``start_url`` is ``None``).
    """
    for img in soup.find_all('img', src=True):
        url = img['src']
        if is_subdirectory(url):
            try:
                url = start_url + url
            except TypeError as exc:
                # start_url is missing or not a string.
                raise InvalidURL() from exc

        parse_content = extractor.extract(url, sleep_time=sleep_time)
        if not parse_content.startswith('nothing'):
            img.insert_after(parse_content)

    return soup

In [13]:
def parse_website_url(soup, start_url):
    """Append each link's target to its visible text, as ``text (url)``.

    Only anchors with a single string child whose stripped text differs from
    the href are changed. Site-relative hrefs are prefixed with ``start_url``;
    when ``start_url`` is ``None`` the href is appended as written instead of
    failing on ``None + str``.

    Args:
        soup: Parsed page or subtree; modified in place.
        start_url: Site root (scheme + host), or ``None``.

    Returns:
        The same ``soup`` object.
    """
    for a in soup.find_all('a', href=True):
        href = a['href']
        if a.string and a.string.strip() != href:
            if is_subdirectory(href) and start_url is not None:
                original_url = start_url + href
            else:
                original_url = href
            a.string += f' ({original_url})'

    return soup

In [14]:
def parse_website(soup, parse_reference=True, parse_image=False, start_url=None):
    """Turn a parsed page into plain text, one non-empty fragment per line.

    Steps, in order: remove ``<script>``/``<style>`` elements, optionally add
    image descriptions, optionally annotate links with their targets, then
    extract the text and normalize its whitespace.

    Args:
        soup: Parsed page or subtree; modified in place.
        parse_reference: Annotate links via ``parse_website_url``.
        parse_image: Describe images via ``parse_website_image`` using a
            ``GeminiImageExtractor``.
        start_url: Site root passed on to the helpers.

    Returns:
        The cleaned text.
    """
    # drop script and style elements
    for unwanted in soup(['script', 'style']):
        unwanted.extract()

    # images
    if parse_image:
        soup = parse_website_image(
            soup,
            GeminiImageExtractor(model_name='gemini-1.5-flash-latest'),
            start_url=start_url,
        )

    # references
    if parse_reference:
        soup = parse_website_url(soup, start_url)

    # text: split every line on double spaces and keep the non-empty pieces
    fragments = []
    for raw_line in soup.get_text().splitlines():
        for piece in raw_line.strip().split('  '):
            piece = piece.strip()
            if piece:
                fragments.append(piece)

    return '\n'.join(fragments)

In [15]:
def crawl_website(url, parse_reference=True, parse_image=False, hash=False):
    """Fetch one page and return its text, outgoing links and optional digest.

    Args:
        url: Absolute URL of the page.
        parse_reference: Forwarded to ``parse_website``.
        parse_image: Forwarded to ``parse_website``.
        hash: When true, compute the SHA-256 hex digest of the main content
            (taken before the content is modified by parsing).

    Returns:
        A tuple ``(page_content, reference_urls, hash_value)``:

        * ``page_content`` - ``(text, title)`` or ``None`` if the page could
          not be fetched. ``title`` is ``None`` when the page has no
          ``<title>`` element.
        * ``reference_urls`` - set of internal and attachment links found in
          the main content (empty if the fetch failed).
        * ``hash_value`` - hex digest, or ``None`` when not requested or the
          fetch failed.
    """
    reference_urls = set()
    page_content = None
    hash_value = None

    soup = get_web_soup(url)
    if soup:
        main_soup = preprocess_website(soup)

        # hash
        if hash:
            hash_value = sha256(main_soup.encode()).hexdigest()

        # extract start url
        parsed_url = urlparse(url)
        start_url = f"{parsed_url.scheme}://{parsed_url.netloc}"

        # get references
        reference_urls.update(get_links(main_soup, internal_link=True, external_link=False, attachment=True, start_url=start_url))

        # parse content
        main_content = parse_website(main_soup, parse_reference, parse_image, start_url)

        # A page without <title> must not abort the crawl.
        title_tag = soup.title
        title = title_tag.string if title_tag is not None else None
        page_content = (main_content, title)

    return page_content, reference_urls, hash_value

In [16]:
def webpage_to_documents(url, page_content, title):
    """Wrap one crawled page in a ``Document``.

    Args:
        url: Page address, stored under the ``'source'`` metadata key.
        page_content: Text of the page.
        title: Page title, stored under the ``'title'`` metadata key.

    Returns:
        A single ``Document`` instance.
    """
    metadata = {'source': url, 'title': title}
    return Document(page_content=page_content, metadata=metadata)

## Crawling

**Rule:** No two stored articles have different release years (all stored articles share the same release year).

In [17]:
def _release_year(release_date):
    """Return the last four characters of ``release_date`` as an int, or None if they are not a number."""
    try:
        return int(release_date[-4:])
    except ValueError:
        return None


def _listed_articles(main_soup, start_url):
    """Return ``(encoded_url, release_date)`` for each article entry of the listing, in page order."""
    articles = []
    for tag in main_soup.find_all('li'):
        link = tag.find('a', class_="mod-articles-category-title")
        date = tag.find('span', class_="mod-articles-category-date")
        # Skip list items that are not article entries.
        if link is None or date is None or not link.get('href'):
            continue

        url = link['href']
        if is_subdirectory(url):
            url = start_url + url
        articles.append((encode_url(url), date.text.strip()))

    return articles


def main():
    """Crawl new articles and changed base pages, then update the sitemap file.

    Reads ``sitemap.json`` (expected keys: ``'base_url'`` and ``'article_url'``,
    both mappings from URL to a record), and then:

    1. Fetches the news listing. If its first article is not yet stored, all
       listed articles of the valid year that are not stored are crawled and
       recorded with their release date and content hash.
    2. Re-crawls every base URL whose main content hash has changed.
    3. Writes the updated sitemap back to ``sitemap.json``.

    Stored articles all share one release year. When the newest listed
    article belongs to a different year than the stored ones, the stored
    articles are discarded and only articles of the current calendar year are
    collected.

    Returns:
        A list of ``Document`` objects for every page crawled in this run.
        The list is empty when the news listing cannot be fetched (in that
        case the sitemap file is left untouched).

    Raises:
        Exception: Propagated from ``website_is_updated`` when a base URL
            cannot be reached.
    """
    # read crawled file
    sitemap_path = 'sitemap.json'
    with open(sitemap_path, 'r', encoding='utf-8') as f:
        storage_urls = json.load(f)

    start_url = 'https://tuyensinh.hcmus.edu.vn'
    news_url = 'https://tuyensinh.hcmus.edu.vn/th%C3%B4ng-tin-tuy%E1%BB%83n-sinh-%C4%91%E1%BA%A1i-h%E1%BB%8Dc'
    # difference() expects an iterable of elements; a bare string would be
    # consumed character by character, so wrap the URL in a set.
    need2crawl_url = set(storage_urls['base_url']).difference({news_url})
    crawled_url = set()
    documents = []

    soup = get_web_soup(news_url)
    if not soup:
        return documents

    main_soup = preprocess_website(soup)
    stored_articles = storage_urls['article_url']
    listed_articles = _listed_articles(main_soup, start_url)

    # Stored keys are encoded absolute URLs, so compare in that same form.
    if listed_articles and listed_articles[0][0] not in stored_articles:
        latest_year = _release_year(listed_articles[0][1])
        valid_year = datetime.now().year

        stored_year = None
        if stored_articles:
            # Every stored article has the same release year, so any record will do.
            any_stored = next(iter(stored_articles.values()))
            stored_year = _release_year(any_stored['release_date'])

        if stored_year is not None and stored_year == latest_year:
            # case: more articles of the year already stored - keep them
            valid_year = stored_year
        else:
            # case: articles of a new year (or nothing stored yet) - start over
            stored_articles.clear()

        # get new articles (dict keeps page order and removes duplicates)
        new_articles = {}
        for url, release_date in listed_articles:
            if url not in stored_articles and _release_year(release_date) == valid_year:
                new_articles[url] = release_date

        need2crawl_url.update(new_articles)

        # crawl new article
        for url, release_date in new_articles.items():
            page_content, references, hash_value = crawl_website(url, hash=True, parse_image=True)
            if page_content is None:
                # Fetch failed: leave the article unrecorded so a later run retries it.
                continue

            crawled_url.add(url)
            stored_articles[url] = {
                'release_date': release_date,
                'hash_value': hash_value, # TODO: whether article need to be hashed?
            }
            documents.append(webpage_to_documents(url, page_content[0], page_content[1]))
            need2crawl_url.update(references)

    # crawl base url
    for url in list(storage_urls['base_url']):
        hash_value = storage_urls['base_url'][url].get('hash_value')
        if website_is_updated(url, hash_value):
            page_content, references, hash_value = crawl_website(url, hash=True, parse_image=True)
            if page_content is None:
                # Second fetch failed; keep the old hash so the page is retried.
                continue

            storage_urls['base_url'][url] = {
                'hash_value': hash_value,
            }

            need2crawl_url.update(references)
            documents.append(webpage_to_documents(url, page_content[0], page_content[1]))

    crawled_url.update(storage_urls['base_url'].keys())

    # check if complete all references
    # TODO: Handle remaining URLs
    need2crawl_url = need2crawl_url.difference(crawled_url)

    # update sitemap file
    write_json(sitemap_path, storage_urls)

    return documents

In [18]:
# Run the crawl; `docs` holds whatever main() returns (the crawled documents).
docs = main()

In [19]:
# Convert every document to a plain dict by round-tripping through its JSON
# form, then dump the list to a UTF-8 file with non-ASCII text kept readable.
serialized_docs = (doc.json(ensure_ascii=False) for doc in docs)
json_docs = list(map(json.loads, serialized_docs))
with open('tmp_docs.json', 'w', encoding='utf-8') as f:
    json.dump(json_docs, f, ensure_ascii=False, indent=4)