In [4]:
import os
import requests
import bs4
from bs4 import BeautifulSoup
from datetime import datetime
from dotenv import load_dotenv

from langchain_core.documents import Document
from langchain_google_genai import ChatGoogleGenerativeAI

from src.prompts import IMAGE_EXTRACTOR_PROMPT

In [7]:
from langchain_google_genai import G

DefaultCredentialsError: Your default credentials were not found. To set up Application Default Credentials, see https://cloud.google.com/docs/authentication/external/set-up-adc for more information.

## Global variables

In [2]:
documents = []
urls = set()
start_domain = [r'https://tuyensinh.hcmus.edu.vn']

## Utils

In [None]:
class InvalidAPIKey(Exception):
    message = 'Invalid API Key'
    def __init__(self):
        super().__init__(self.message)
        

class GeminiImageExtractor:
    prompt = IMAGE_EXTRACTOR_PROMPT
    model_list = ['gemini-1.5-flash-latest',
                  'gemini-1.5-pro-latest',
                  'gemini-1.0-pro-latest']
    def __init__(self, model_name='gemini-1.0-pro-latest'):
        self.api = self._get_api_()
        self.chat_model = ChatGoogleGenerativeAI(model=model_name, google_api_key=self.api)
        self.text_content = {
            "type": "text",
            "text": self.prompt
        }
        
    def extract(self, image_path):
        image_content = {
            "type": "image_url",
            "image_url": image_path
        }
        
        result = self.chat_model.invoke([[self.text_content, image_content]])
        return result.content
        
    def _get_api_(self):
        key = os.getenv('GOOGLE_API_KEY')
        if not key:
            raise InvalidAPIKey
        
        return key

In [3]:
def is_subdirectory(href):
    return href.startswith('/')

In [4]:
def is_attachment(url):
    # TODO: generalize of various file types
    return url.endswith('.pdf')

In [5]:
def get_web_soup(url):
    response = requests.get(url)
    if response.status_code == 200:
        soup = bs4.BeautifulSoup(response.content, 'html.parser')
        return soup
    
    return None

In [6]:
def get_links(soup, internal_link=True, external_link=False, attachment=True):
    # decompose navigate elements (next or previous page navigators)
    navigator_element = soup.find('ul', {'class': 'pager pagenav'})
    if navigator_element:
        navigator_element.decompose()
    
    # get links
    links = set()
    for a in soup.find_all('a', href=True):
        if attachment and is_attachment(a['href']):
            links.add(a['href'])
        
        else:
            is_internal_link = is_subdirectory(a['href'])
            if is_internal_link and internal_link:
                links.add(a['href'])
            elif not is_internal_link and external_link:
                links.add(a['href'])
    
    return links

In [None]:
def parse_image(soup, extractor=None):
    for img in soup.find_all('img', src=True):
        

In [None]:
def parse_website_url(soup):
    for a in soup.find_all('a', href=True):
        if a.string and a.string.strip() != a['href']:
            original_url = a['href']
            a.string += f' ({original_url}) '
        
        elif a.string is None:
            a.string = a['href']

In [7]:
def parse_website_content(soup):
    # get the main content
    main_content = soup.find('section', {'id': 'sp-main-body'})
    
    # decompose navigate elements (next or previous page navigators)
    navigator_element = main_content.find('ul', {'class': 'pager pagenav'})
    if navigator_element:
        navigator_element.decompose()
    
    
    
    # kill all script and style elements
    for script in main_content(['script', 'style']):
        script.extract()
        
    # parse content
    content = main_content.get_text()
    lines = (line.strip() for line in content.splitlines())
    chunks = (phrase.strip() for line in lines for phrase in line.split('  '))
    text = '\n'.join(chunk for chunk in chunks if chunk)
    return text

In [8]:
def crawl_article(url):
    reference_urls = set()
    page_content = None
    soup = get_web_soup(url)
    if soup:
        main_body = soup.find('section', {'id': 'sp-main-body'})
        reference_urls.update(get_links(main_body, internal_link=True, external_link=False, attachment=True))
        main_content = parse_website_content(soup)
        page_content = (main_content, soup.title.string)
        
    return page_content, reference_urls

In [9]:
def webpage_to_documents(url, page_content, title):
    return Document(
        page_content=page_content,
        metadata={
            'source': url,
            'title': title
        }
    )

## Get admission data (Thông tin tuyển sinh)

### Get latest article URL 

In [10]:
start_url = r'https://tuyensinh.hcmus.edu.vn'
url = r'https://tuyensinh.hcmus.edu.vn/th%C3%B4ng-tin-tuy%E1%BB%83n-sinh-%C4%91%E1%BA%A1i-h%E1%BB%8Dc'

In [11]:
response = requests.get(url)
if response.status_code == 200:
    # get content
    soup = BeautifulSoup(response.content, 'html.parser')
    main_page = soup.get_text()
    article_list = soup.find_all('li')
    for article in article_list:
        release_date = article.find('span', class_="mod-articles-category-date")
        if release_date:
            article_release_year = release_date.text.strip()[-4:]
            current_year = str(datetime.now().year)
            if article_release_year == current_year:
                article_url = start_url + article.find('a', class_="mod-articles-category-title")['href']
                urls.add(article_url)

### Get article content

In [12]:
crawled_urls = []
reference_urls = set()
article_contents = []
for url in urls:
    crawled_urls.append(url)
    
    soup = get_web_soup(url)
    if soup:
        page_content, references = crawl_article(url)
        references = [start_url + reference for reference in references]
        reference_urls.update(references)
        article_contents.append(page_content)

In [19]:
urls.update(reference_urls)
urls = urls.difference(crawled_urls)

In [22]:
urls

{'https://tuyensinh.hcmus.edu.vn/th%C3%B4ng-tin-tuy%E1%BB%83n-sinh/phuongthuctuyensinh',
 'https://tuyensinh.hcmus.edu.vnhttps://tuyensinh.hcmus.edu.vn/attachments/article/137/TB01_TB XTT_UTXT_Bo 2024_PT1.pdf',
 'https://tuyensinh.hcmus.edu.vnhttps://tuyensinh.hcmus.edu.vn/attachments/article/138/TB02_TB UTXTT_UTXT_DHQG 2024_PT2.pdf',
 'https://tuyensinh.hcmus.edu.vnhttps://tuyensinh.hcmus.edu.vn/attachments/article/139/TB04_TB xet tuyen DH DGNL 2024_PT4.pdf',
 'https://tuyensinh.hcmus.edu.vnhttps://tuyensinh.hcmus.edu.vn/attachments/article/140/TB05_TB xet tuyen KQHT  THPT QT 2024_PT5.pdf',
 'https://tuyensinh.hcmus.edu.vnhttps://tuyensinh.hcmus.edu.vn/attachments/article/141/TB03_TB xet tuyen CC NNQT 2024_PT6_update.pdf',
 'https://tuyensinh.hcmus.edu.vnhttps://tuyensinh.hcmus.edu.vn/attachments/article/142/ĐỀ ÁN TUYỂN SINH TRÌNH ĐỘ ĐẠI HỌC NĂM 2024.pdf'}