In [60]:
import requests
from bs4 import BeautifulSoup
import re
import os
import mimetypes
from slugify import slugify

In [61]:
def create_directory(directory_path):
    """Create ``directory_path`` (including missing parents) if it is absent.

    A one-line status message is printed in both cases.

    Args:
        directory_path: Path of the directory to create.
    """
    # Attempt the creation and react to the failure instead of checking
    # first: a separate exists() check leaves a window in which another
    # process can create the path, making makedirs() raise unexpectedly.
    try:
        os.makedirs(directory_path)
    except FileExistsError:
        print(f"Directory '{directory_path}' already exists.")
    else:
        print(f"Directory '{directory_path}' created.")

In [62]:
url = "https://www.customs.gov.np/page/fts-fy-208081"

# Fiscal years appear in the link titles as 4 digits, some separator
# characters and then 2 digits, e.g. "वैदेशिक व्यापारको तथ्यांक आ. ब. २०७७/७८".
# For str patterns ``\d`` matches any Unicode decimal digit and ``int()``
# accepts them, so Devanagari numerals are handled without extra work.
_FISCAL_YEAR_RE = re.compile(r'(\d{4}).*(\d{2})')

# Fetch the index page that lists one link per fiscal year.
# NOTE(review): verify=False disables TLS certificate verification; it is
# kept as in the original -- confirm the site still requires it.
# The timeout stops a stalled connection from hanging the notebook forever.
res = requests.get(url, verify=False, timeout=30)
# raise exception if status is not 200, i.e. OK
res.raise_for_status()

# use bs4 to parse the html page
soup = BeautifulSoup(res.text, 'html.parser')
# select the element that contains the list of fiscal year links
ul_element = soup.select_one('.news')
if ul_element is None:
    # select_one() returns None when nothing matches; fail with a clear
    # message instead of an AttributeError on the next line.
    raise RuntimeError(f"no element matching '.news' found on {url}")

# map "fy-<yyyy>-<yy>" -> url of the page listing that fiscal year's files
links = {}
for li in ul_element.find_all('li'):
    # the title text is used to extract the fiscal year
    title = li.text
    match = _FISCAL_YEAR_RE.search(title)
    if match is None:
        raise ValueError(f"unable to parse fiscal year from title: {title!r}")
    start_year, end_year = match.groups()
    year = f"fy-{int(start_year)}-{int(end_year)}"

    # link to the page containing all files for the fiscal year
    anchor = li.find('a')
    link = anchor.get('href') if anchor is not None else None
    if not link:
        raise ValueError(f"no link found for fiscal year entry: {title!r}")
    links[year] = link




In [None]:
# path to base dir to store all fiscal year data
base_dir = "data"
# seconds to wait on each request before giving up, so a stalled connection
# cannot hang the whole download run
DOWNLOAD_TIMEOUT = 60


def _extension_for(content_type):
    """Return a file extension for a Content-Type header value.

    Parameters such as ``; charset=utf-8`` are stripped before the lookup,
    and a missing or unrecognised type yields ``'.unknown'``.
    """
    if not content_type:
        # mimetypes.guess_extension() cannot handle None
        return '.unknown'
    mime_type = content_type.split(';', 1)[0].strip().lower()
    return mimetypes.guess_extension(mime_type) or '.unknown'


for fy, link in links.items():
    # dir to store all data for that fiscal year
    fy_dir = os.path.join(base_dir, fy)
    # if the fiscal year folder exists, assume data exists so skip download
    if os.path.exists(fy_dir):
        print(f'directory for {fy} already exists, skipping downloads')
        continue

    # Download into a temporary sibling directory and rename it only once
    # every file has been saved. Otherwise an interrupted run would leave a
    # half-filled fy_dir that the check above treats as complete.
    partial_dir = fy_dir + ".partial"
    create_directory(partial_dir)

    # request the page listing the files of this fiscal year
    # NOTE(review): verify=False disables TLS certificate verification; it is
    # kept as in the original -- confirm the site still requires it.
    res = requests.get(link, verify=False, timeout=DOWNLOAD_TIMEOUT)
    res.raise_for_status()

    # parse the html text
    soup = BeautifulSoup(res.text, 'html.parser')

    # select the ul tag that contains the list of file links
    ul_element = soup.select_one('.style1 > ul:nth-child(2)')
    if ul_element is None:
        # select_one() returns None when nothing matches
        raise RuntimeError(f"no file list found on page for {fy}: {link}")

    for li in ul_element.find_all('li'):
        # use this text as the file name as it contains year and month of data
        title = slugify(li.text)
        # link of the excel/pdf file
        anchor = li.find('a')
        file_link = anchor.get('href') if anchor is not None else None
        if not file_link:
            raise ValueError(f"no file link found for entry {title!r} of {fy}")

        # make request for the file
        file_res = requests.get(file_link, verify=False, timeout=DOWNLOAD_TIMEOUT)
        file_res.raise_for_status()

        # determine the extension from the mime type reported by the server
        extension = _extension_for(file_res.headers.get('Content-Type'))

        # save file with filename from title and extension as determined
        with open(os.path.join(partial_dir, title + extension), 'wb') as f:
            f.write(file_res.content)

    # all files saved: publish the directory under its final name
    os.rename(partial_dir, fy_dir)
