# In [None]:
#--Importing Selenium packages--#
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from scrapy import Selector
from selenium_stealth import stealth

#--Importing other required packages--#
import os
import csv
import time
import requests

# import concurrent.futures


# Function to download the pdf file
def downlaodPdfs(data, timeout=30):
    """Download one PDF into the ``downloads`` folder.

    Args:
        data: A string of the form ``"<url> | <file name>"``. Only the first
            ``" | "`` is treated as the separator, so a file name that itself
            contains ``" | "`` is kept intact.
        timeout: Seconds passed to ``requests.get`` as its ``timeout`` so a
            stalled server cannot block the crawl forever.

    Returns:
        True when the response started with PDF data and was saved,
        False when it was skipped (empty body or no ``%PDF`` marker in the
        first chunk). No file is left on disk in the False case.

    Raises:
        ValueError: If ``data`` does not contain the ``" | "`` separator.
        Any exception raised by ``requests`` or by file I/O is propagated.
    """
    fileUrl, separator, fileName = data.partition(" | ")
    if not separator:
        raise ValueError(f"Expected '<url> | <file name>', got: {data!r}")

    filePath = f"downloads/{fileName}"
    msg = False

    #--Stream the body so large files are never held in memory as a whole--#
    #--The with-block closes the connection even when we stop reading early--#
    with requests.get(fileUrl, stream=True, timeout=timeout) as r:
        #--Read the body in pieces of at most 1 KB--#
        chunks = r.iter_content(chunk_size=1024)
        #--An empty body yields no chunk at all; treat it like a non-PDF--#
        firstChunk = next(chunks, b"")

        #--Only create the file once the first chunk carries the PDF marker--#
        if b'%PDF' in firstChunk:
            try:
                with open(filePath, "wb") as pdf:
                    pdf.write(firstChunk)
                    for chunk in chunks:
                        pdf.write(chunk)
            except Exception:
                #--Do not leave a truncated PDF behind when the transfer fails--#
                if os.path.exists(filePath):
                    os.remove(filePath)
                raise
            msg = True

    if msg:
        print(f"{fileName} downloaded successfully!\n")
    else:
        print("Skipping the file since it is empty!\n")
    return msg

#--Function to write each row data(which is in the form of a dictionary) to a csv file--#
def writeCSV(data, fieldName, FILE_NAME):
    """Append one row to a CSV file, writing the header row first if needed.

    Args:
        data: Dictionary holding the row; its keys must be in ``fieldName``.
        fieldName: Ordered list of column names.
        FILE_NAME: Path of the CSV file to append to (created if missing).
    """
    #--The header is needed when the file is missing OR exists but is still empty--#
    #--(checking only for existence would leave an empty file without column names)--#
    needsHeader = not os.path.isfile(FILE_NAME) or os.path.getsize(FILE_NAME) == 0
    with open(FILE_NAME, 'a', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldName, lineterminator='\n')
        #--Column names are written only once, before the very first row--#
        if needsHeader:
            writer.writeheader()
        #--Insert the dictionary data to a new line in the csv--#
        writer.writerow(data)

# Function to fetch the Image code for column 7 - Standard
def getImageCode(imgUrlLi):
    """Translate icon image URLs into the codes stored in the 'Standard' column.

    Args:
        imgUrlLi: Iterable of image URL strings.

    Returns:
        A list of code strings ('1'..'4'), one per URL that contains a known
        icon file name, in input order. URLs with no known icon are dropped.
    """
    # Tried top to bottom; the first icon name found in a URL decides its code.
    icon_codes = (
        ("ico_use2.gif", '1'),
        ("ico_use7.gif", '2'),
        ("ico_use3.gif", '3'),
        ("ico_use8.gif", '4'),
    )
    codes = []
    for url in imgUrlLi:
        matched = next((code for icon, code in icon_codes if icon in url), None)
        if matched is not None:
            codes.append(matched)
    return codes


# Make sure you are using Python 3.5 and above
# First install the packages listed in requirements.txt. Open the terminal & run the below command
# (On Windows): pip install -r requirements.txt
# (On Mac/Linux): pip3 install -r requirements.txt
# Check your google chrome version & download the respective chromedriver from here: https://chromedriver.chromium.org/downloads
# If your chromedriver is in the same folder as that of this .ipynb file, then use the path => "./chromedriver"

CHROMEDRIVER_PATH = "./chromedriver"
# CHROMEDRIVER_PATH = "chromedriver"

#--Column names, in the order they are written to both CSV files--#
FIELD_NAMES = [
    'Sl No',
    'Company Name',
    'Start Time',
    'End Time',
    'Published Date',
    'Industry',
    'Standard',
    'fileNameWeb',
    'fileNamePdf',
    'fileNameText',
]

#--Output file that lists every row, including rows whose PDF is blank--#
FILE_NAME_WITH_BLANKS = "data1.csv"
#--Output file that lists only rows whose PDF was actually downloaded--#
FILE_NAME_WITHOUT_BLANKS = "data2.csv"

page_num = 1
#--Serial number for the "with blanks" data; incremented before each row is used--#
slNoWB = 0
#--Serial number for the "without blanks" data; incremented after each saved PDF--#
slNoWOB = 1
# pdfDwnldUrlLi = []

#--Make sure the downloads folder for the PDF files exists--#
os.makedirs("./downloads", exist_ok=True)

#--Initializing the chrome driver--#
chrome_options = Options()
#--Run Chrome without opening a visible window--#
chrome_options.add_argument('--headless')
chrome_options.add_argument("start-maximized")

#  --Uncomment the below two lines if running on linux / Mac--  #
# chrome_options.add_argument('--no-sandbox')
# chrome_options.add_argument('--disable-dev-shm-usage')

#--Drop Chrome's "enable-automation" switch and its automation extension--#
chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
chrome_options.add_experimental_option('useAutomationExtension', False)
#--`options=` replaces the deprecated `chrome_options=` keyword--#
driver = webdriver.Chrome(CHROMEDRIVER_PATH, options=chrome_options)

#--Apply the selenium_stealth settings to this driver session--#
stealth(driver,
        languages=["en-US", "en"],
        vendor="Google Inc.",
        platform="Win32",
        webgl_vendor="Intel Inc.",
        renderer="Intel Iris OpenGL Engine",
        fix_hairline=True,
        )

# XPath of the result rows (read both by Selenium and by the Scrapy selector)
ROWS_XPATH = "//tbody[last()]/tr"
# XPath of the link that is tried when the next page number has no link of its own
NEXT_GROUP_XPATH = "//a[@class='_next']"

# The browser is shut down in the `finally` clause so that a failed wait,
# element lookup, or download does not leave a headless Chrome process behind.
try:
    # Open chrome browser in the background and visit the url
    driver.get("https://www.ksa.or.kr/ksi/4982/subview.do?enc=Zm5jdDF8QEB8JTJGa3JjYSUyRmtzaSUyRjElMkZ2aWV3LmRvJTNGcGFnZSUzRDElMjZzcmNoQ29sdW1uJTNEJTI2c3JjaFdyZCUzRCUyNnNvcnRDb2x1bW4lM0QlMjY%3D")
    # Wait till the first result row is visible
    WebDriverWait(driver, 5).until(EC.visibility_of_element_located((By.XPATH, ROWS_XPATH)))

    # Looping over the result pages until no further page link is found
    while True:
        # After this increment, page_num is the number of the page to open next
        page_num += 1
        # Extracting the HTML DOM
        html = driver.page_source
        respObj = Selector(text=html)

        # Looping over the data in each page
        dwnldFiles = respObj.xpath(ROWS_XPATH)
        for file in dwnldFiles:
            #--Xpath expressions to extract data from the HTML DOM--#
            companyName = file.xpath("normalize-space(.//td[1]/text())").get()
            startEndDate = file.xpath("normalize-space(.//td[2]/text())").get()

            #--A row without a "start~end" period cannot be processed; skip it--#
            #--instead of crashing, and do not spend a serial number on it--#
            startDate, separator, endDate = startEndDate.partition("~")
            if not separator:
                print(f"Skipping a row without a start~end period: {startEndDate!r}\n")
                continue

            slNoWB += 1
            fileNamePdf_WB = f'''{slNoWB}_{companyName}_{startDate.split(".")[0]}_{endDate.split(".")[0]}'''
            fileNamePdf_WOB = f'''{slNoWOB}_{companyName}_{startDate.split(".")[0]}_{endDate.split(".")[0]}'''
            pdfHref = file.xpath(".//td/a/@href").get()
            # pdfDwnldUrlLi.append(f'''{pdfUrl} | {fileNamePdf}.pdf''')

            # Saving each row data as a dictionary
            dataDict = {
                FIELD_NAMES[0]: slNoWB,
                FIELD_NAMES[1]: companyName,
                FIELD_NAMES[2]: startDate,
                FIELD_NAMES[3]: endDate,
                FIELD_NAMES[4]: file.xpath("normalize-space(.//td[3]/text())").get(),
                FIELD_NAMES[5]: file.xpath("normalize-space(.//td[4]/text())").get(),
                FIELD_NAMES[6]: ",".join(getImageCode(file.xpath(".//td[5]/img/@src").getall())),
                FIELD_NAMES[7]: file.xpath("normalize-space(.//td/a/text())").get(),
                FIELD_NAMES[8]: fileNamePdf_WB,
                FIELD_NAMES[9]: fileNamePdf_WB
            }
            print(dataDict)

            #--Write the dictionary to the csv file--#
            writeCSV(dataDict, FIELD_NAMES, FILE_NAME_WITH_BLANKS)

            #--Without a link there is nothing to download; the row stays only--#
            #--in the "with blanks" file, as it would for a blank PDF--#
            if pdfHref is None:
                print("Skipping the download since the row has no file link!\n")
                continue
            pdfUrl = f'''https://www.ksa.or.kr{pdfHref}'''

            #--Logic to skip the rows / pdf files that don't contain any data--#
            if downlaodPdfs(f'''{pdfUrl} | {fileNamePdf_WOB}.pdf'''):
                dataDict['Sl No'] = slNoWOB
                dataDict['fileNamePdf'] = fileNamePdf_WOB
                dataDict['fileNameText'] = fileNamePdf_WOB
                writeCSV(dataDict, FIELD_NAMES, FILE_NAME_WITHOUT_BLANKS)
                slNoWOB += 1

        #--Handling the pagination--#
        #--Prefer the link labelled with the next page number; otherwise fall--#
        #--back to the "_next" link; stop when neither exists--#
        numberedXpath = f"//div[contains(@class, 'paging')]//ul/li/a[text()='{page_num}']"
        if respObj.xpath(numberedXpath):
            nextXpath = numberedXpath
        elif respObj.xpath(NEXT_GROUP_XPATH):
            nextXpath = NEXT_GROUP_XPATH
        else:
            break

        #--find_element(By.XPATH, ...) replaces find_element_by_xpath, which--#
        #--newer Selenium releases no longer provide--#
        nextPageElem = driver.find_element(By.XPATH, nextXpath)
        driver.execute_script("arguments[0].click()", nextPageElem)
        time.sleep(2)
        WebDriverWait(driver, 5).until(EC.visibility_of_element_located((By.XPATH, ROWS_XPATH)))
finally:
    #--Shutting down chrome once scraping finishes or fails--#
    driver.quit()
  
# print("\n\nDownloading PDFs...\n\n")

# with concurrent.futures.ProcessPoolExecutor() as executor:
#   executor.map(downlaodPdfs, pdfDwnldUrlLi)