# Importing libraries

In [1]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.chrome.options import Options
import pyperclip
import ast
import pandas as pd

# Setting up

In [21]:
# List of IDs for qualification files
# The file is read as one comma-separated string of IDs.
with open("PDF_suffix_webscraped.txt", 'r') as f:
    content = f.read()
# Strip whitespace/newlines around every ID and drop blank entries (e.g. from a
# doubled or trailing comma). An unstripped or empty ID would otherwise be
# appended to the base URL as-is and fetch the wrong page.
# NOTE: dropping blank entries shifts the positions of later IDs, so positional
# batch slices (qf_list[a:b]) taken before this clean-up may not line up.
qf_list = [entry.strip() for entry in content.split(",") if entry.strip()]
print(f"Length of qf list: {len(qf_list)}")

Length of qf list: 2227


In [None]:
# Base URL of a qualification page: each ID from qf_list is appended to this
# prefix (base_url + id) to build the page URL that gets scraped.
base_url = "https://www.nqr.gov.in/qualifications/"

In [None]:
# Initializing the empty collector for scraped data: one independent list per
# output column, in the order the columns should appear.
data_dict = {
    column: []
    for column in ("Job name", "NQR code", "Job description", "Qf link")
}

In [None]:
# Extracting data (in batches)
# NOTE: this cell appends to data_dict. Re-create data_dict before re-running
# the same batch, otherwise its rows are added a second time.
batch = qf_list[2000:]

# Absolute XPath of the job-title heading; also used to decide when the page
# has finished rendering.
JOB_NAME_XPATH = "/html/body/div/div[3]/div[2]/div/div/h1"

# One entry per scraped field:
# (data_dict column, label used in failure messages, XPath, read href instead of text)
FIELD_SPECS = [
    ("Job name", "job name", JOB_NAME_XPATH, False),
    ("NQR code", "job nqr code",
     "/html/body/div[1]/div[3]/div[2]/div/div/div/span", False),
    ("Job description", "job description",
     "/html/body/div[1]/section/div/div[1]/div/div[2]/div[2]/p", False),
    ("Qf link", "qf link",
     "/html/body/div[1]/section/div/div[2]/aside/div[4]/ul[1]/li", True),
]


def _title_has_text(driver):
    """Return True once the job-title heading exists and its text is not blank."""
    heading = driver.find_element(By.XPATH, JOB_NAME_XPATH)
    return heading.text.strip() != ""


def _read_field(driver, xpath, wants_href):
    """Return the text of the element at `xpath`, or, when `wants_href` is
    true, the href attribute of the first <a> element inside it."""
    element = driver.find_element(By.XPATH, xpath)
    if wants_href:
        return element.find_element(By.TAG_NAME, "a").get_attribute("href")
    return element.text


for i, raw_id in enumerate(batch):
    # Clean the ID and skip blank entries: a blank ID would request the bare
    # base URL and record a meaningless row.
    qf_id = str(raw_id).strip()
    if not qf_id:
        print(f"Skipping {i+1}/{len(batch)}: blank id")
        continue

    # Status
    print(f"Fetching {i+1}/{len(batch)} file with id {qf_id}")

    # Fetch url
    url = base_url + qf_id
    # Start from None so the finally clause can tell whether a browser was
    # actually started for THIS iteration (webdriver.Chrome itself may raise).
    driver = None
    try:
        options = Options()
        options.add_argument("--headless=new")
        driver = webdriver.Chrome(options=options)
        driver.get(url)

        # Wait (up to 10 s) for the job title to be rendered. A failure here is
        # reported but not fatal: each field below is still attempted and
        # recorded as None if it cannot be read.
        try:
            WebDriverWait(driver, 10).until(_title_has_text)
        except Exception as e:
            print(f"Waiting for the job title failed for sno {i+1} with id {qf_id}: {e!r}")

        # Extract relevant portions into a local row first, so that data_dict
        # only ever receives complete rows and its columns stay the same length
        # even if the run is interrupted part-way through a page.
        row = {}
        for column, label, xpath, wants_href in FIELD_SPECS:
            try:
                row[column] = _read_field(driver, xpath, wants_href)
            except Exception as e:
                print(f"Failed to fetch {label} for sno {i+1} with id {qf_id}")
                print(e)
                row[column] = None

        for column, value in row.items():
            data_dict[column].append(value)

    except Exception as e:
        # The browser could not be started or the page could not be opened:
        # report it and move on without recording a row for this ID.
        print(e)

    finally:
        # Always release the browser, but only if one was started.
        if driver is not None:
            driver.quit()
    


Fetching 1/227 file with id 13499
Fetching 2/227 file with id 13503
Fetching 3/227 file with id 13508
Fetching 4/227 file with id 13510
Fetching 5/227 file with id 13511
Fetching 6/227 file with id 13512
Fetching 7/227 file with id 13514
Fetching 8/227 file with id 13517
Fetching 9/227 file with id 13518
Fetching 10/227 file with id 13519
Fetching 11/227 file with id 13525
Fetching 12/227 file with id 13526
Fetching 13/227 file with id 13527
Fetching 14/227 file with id 13529
Fetching 15/227 file with id 13531
Fetching 16/227 file with id 13532
Fetching 17/227 file with id 13534
Fetching 18/227 file with id 13536
Fetching 19/227 file with id 13538
Fetching 20/227 file with id 13540
Fetching 21/227 file with id 13543
Fetching 22/227 file with id 13545
Fetching 23/227 file with id 13547
Fetching 24/227 file with id 13548
Fetching 25/227 file with id 13550
Fetching 26/227 file with id 13556
Fetching 27/227 file with id 13563
Fetching 28/227 file with id 13564
Fetching 29/227 file with id 

In [None]:
# Build the results table: every data_dict key becomes a column and the
# collected lists become the column values.
df = pd.DataFrame.from_dict(data_dict)
df

# Exporting dataframe

In [None]:
# Write the scraped table to disk. With to_csv's default index=True the row
# index is written as an extra, unnamed first column.
df.to_csv(path_or_buf="Active QF list with links 2Sept25.csv")