# Importing libraries

In [12]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import inspect
import time
import pandas as pd

In [2]:
inspect.ismodule(webdriver)

True

# Setting up

In [3]:
# Setting up the driver
options = Options()
#options.add_argument('--headless')
driver = webdriver.Chrome(options = options)

In [4]:
# Opening NQR website
driver.get("https://www.nqr.gov.in/archive")

In [5]:
# Initializing dictionary
data = {key: [] for key in ['sector', 'qf_code', 'title', 'download_link']}

# Initializing wait instance
wait = WebDriverWait(driver, timeout=20)


# Iterating through pages
iteration = 0
while True:
    try:
        iteration +=1
        print(f"Starting page {iteration}")
        
        # Wait for archive table to be visible
        wait.until(
                EC.visibility_of_all_elements_located((By.ID, "archivetable"))
                    )
        
        # Extract data from archive table
        archive_table = driver.find_element(By.ID, "archivetable")
        rows = archive_table.find_elements (By.TAG_NAME, "tr")[1:]
        for row in rows:
            cols = row.find_elements(By.TAG_NAME, "td")
            
            # Extracting sector name
            data['sector'].append(cols[1].text)

            # Extracting QF code
            data['qf_code'].append(cols[2].text)

            # Extracting title
            data['title'].append(cols[3].text)

            # Extracting download_link
            try:
                data['download_link'].append(cols[4].find_element(By.TAG_NAME, "a").get_attribute("href"))
            except Exception as e:
                print(e)
                data['download_link'].append("")
                continue

        # First row of the table
        old_row = ""
        for col in rows[0].find_elements(By.TAG_NAME, "td"):
            old_row+= col.text + "|"
        
        # Click next button
        pagination_button = wait.until(
            EC.element_to_be_clickable((By.ID, 'archivetable_next'))
        )
        print("clicked next button\n")
        driver.execute_script("arguments[0].click();", pagination_button)

        # Check if new table's first row is diff from previous table
        def check (driver):
            # Waiting for table content to change
            archive_table = driver.find_element(By.ID, "archivetable")
            rows_new = archive_table.find_elements (By.TAG_NAME, "tr")[1:]
            new_row = ""
            for col in rows_new[0].find_elements(By.TAG_NAME, "td"):
                new_row+= col.text + "|"
            return old_row !=new_row

        wait.until(check)

    except Exception as e:
        print(e)
        break


Starting page 1
clicked next button

Starting page 2
clicked next button

Starting page 3
Message: no such element: Unable to locate element: {"method":"tag name","selector":"a"}
  (Session info: chrome=138.0.7204.157); For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#nosuchelementexception
Stacktrace:
#0 0x5bda0ad1a87a <unknown>
#1 0x5bda0a7bf2e0 <unknown>
#2 0x5bda0a810e00 <unknown>
#3 0x5bda0a810ff1 <unknown>
#4 0x5bda0a804b76 <unknown>
#5 0x5bda0a8368dd <unknown>
#6 0x5bda0a804a6a <unknown>
#7 0x5bda0a836a7e <unknown>
#8 0x5bda0a85c71c <unknown>
#9 0x5bda0a836683 <unknown>
#10 0x5bda0a802b5b <unknown>
#11 0x5bda0a803f31 <unknown>
#12 0x5bda0acdf7cb <unknown>
#13 0x5bda0ace35d4 <unknown>
#14 0x5bda0acc62c9 <unknown>
#15 0x5bda0ace4178 <unknown>
#16 0x5bda0acaa6bf <unknown>
#17 0x5bda0ad07e78 <unknown>
#18 0x5bda0ad08056 <unknown>
#19 0x5bda0ad19b96 <unknown>
#20 0x7f991069caa4 <unknown>
#21 0x7f9910729c3c <unknown

In [6]:
# Quitting driver
driver.quit()

# Saving extracted data

In [13]:
df = pd.DataFrame(data)

In [14]:
df.head()

Unnamed: 0,sector,qf_code,title,download_link
0,Agriculture,AGR/ Q0503,Coconut Grower,https://nqr.gov.in/sites/default/files/NSQF%20...
1,Telecom,2015/TEL/TSSC/00983,Installation Engineer- SDH& DWDM,https://nqr.gov.in/sites/default/files/NSQF%20...
2,Tourism & Hospitality,THC/Q2701,Dishwasher-Manual and Machine,https://nqr.gov.in/sites/default/files/NSQF-Di...
3,Tourism & Hospitality,THC/Q7601,Boat Jetty Incharge,https://nqr.gov.in/sites/default/files/NSQF-Bo...
4,Tourism & Hospitality,THC/Q0405,Commi 1,https://nqr.gov.in/sites/default/files/NSQF-Co...


In [15]:
# Exporting
df.to_csv('Archived_QFs_26Jul26.csv')