# Importing libraries

In [2]:
import pandas as pd
import requests
import fitz
import re
from tqdm import tqdm
import os
import numpy as np

# Importing dataset

In [None]:
# CSV with scraped data: read only the columns at positions 1-4, discard
# every row that has a missing value in any of them, then renumber the
# index from 0 so it is contiguous again.
df_scraped = (
    pd.read_csv('Active QF list with links2Sept25.csv', usecols=[1, 2, 3, 4])
    .dropna(axis=0, how='any')
    .reset_index(drop=True)
)

# Quick sanity check: first rows and (rows, columns)
display(df_scraped.head())
print(df_scraped.shape)

Unnamed: 0,Job name,NQR code,Job description,Qf link
0,Line Patrolling Man (Oil Gas),2022/HYC/HSSCI/06782,The primary role of Line Patrolling Man (Oil &...,https://www.nqr.gov.in/qualification/file/STT-...
1,Process Instrument Technician (Oil & Gas),QG-04-HC-01019-2023-V2-HSSCI,The individual in this position is responsible...,https://www.nqr.gov.in/qualification/file/STT-...
2,Excavator-Pipeline,2022/HYC/HSSCI/06781,The individual in this job is responsible for ...,https://www.nqr.gov.in/qualification/file/STT-...
3,Fire Safety Technician (Oil & Gas),2020/HYC/HSSCI/3611,The main responsibility of the fire safety tec...,https://www.nqr.gov.in/qualification/file/STT-...
4,Hindi Typist,2020/OAFM/MEPSC/03792,"The Hindi Typist, is responsible for formattin...",https://www.nqr.gov.in/qualification/file/QFil...


(2225, 4)


In [4]:
# Excel from NQR website.
# header=2 tells pandas to take the column names from the row at
# 0-based position 2 of the sheet; rows above it are not used as data.
df_website = pd.read_excel(io='NQR website QF list 2Sept25.xlsx', header=2)

# Preview the first five rows
df_website.head(5)

Unnamed: 0,S No.,Title,Code,Description,Sector Name,Level,Maximum Notational Hours,Minimum Notational Hours,Version,Originally Approved,Valid Till,Awarding Body,Certifying Bodies,Proposed Occupation,Progression Pathway,Qualifcation Type,Adopted Qualifcation,Training Delivery Hours
0,1,Line Patrolling Man (Oil Gas),2020/HYC/HSSCI/3770,The primary role of Line Patrolling Man (Oil &...,Hydrocarbon,Level 3,330 Hours,330 Hours,Version,17 Nov 2022,16 Nov 2025,Hydrocarbon Sector Skill Council (HSSCI),Hydrocarbon Sector Skill Council,Pipeline Maintenance,Senior Line Patrolling Man,General Qualification,N.A.,"{""Theory"":""90"",""Practical"":""150"",""Employabilit..."
1,2,Process Instrument Technician (Oil & Gas),2020/HYC/HSSCI/3769,The individual in this position is responsible...,Hydrocarbon,Level 4,480 Hours,480 Hours,Version,29 Sep 2023,28 Sep 2026,Hydrocarbon Sector Skill Council (HSSCI),Hydrocarbon Sector Skill Council,"Operations - Oil & Gas Pipeline, Calibration a...",Vertical Progression - Level 4.5: Senior Proce...,General Qualification,N.A.,"{""Theory"":""105"",""Practical"":""285"",""Employabili..."
2,3,Excavator-Pipeline,2020/HYC/HSSCI/3612,The individual in this job is responsible for ...,Hydrocarbon,Level 2,240 Hours,240 Hours,Version,17 Nov 2022,16 Nov 2025,Hydrocarbon Sector Skill Council (HSSCI),Hydrocarbon Sector Skill Council,Operations-Oil & Gas pipeline,Helper-Hydrocarbon Pipeline,General Qualification,N.A.,"{""Theory"":""60"",""Practical"":""120"",""Employabilit..."
3,4,Fire Safety Technician (Oil & Gas),2020/HYC/HSSCI/3611,The main responsibility of the fire safety tec...,Hydrocarbon,Level 4,450 Hours,450 Hours,Version,17 Nov 2022,16 Nov 2025,Hydrocarbon Sector Skill Council (HSSCI),Hydrocarbon Sector Skill Council,"Management of Health, Safety and Environment (...",Senior Fire Safety Technician,General Qualification,N.A.,"{""Theory"":""120"",""Practical"":""240"",""Employabili..."
4,5,Hindi Typist,2020/OAFM/MEPSC/03792,"The Hindi Typist, is responsible for formattin...",Management,Level 4,450 Hours,390 Hours,Version,17 Nov 2022,17 Nov 2025,Management & Entrepreneurship and Professional...,Management Entrepreneurship and Professional S...,Office Support,Multi-functional Office Executive,General Qualification,N.A.,"{""Theory"":""150"",""Practical"":""180"",""Employabili..."


# Updating excel sheet

In [None]:
# Merging links with website excel

## Dropping sno 557 (index 556) because it has duplicate NQR code with another row.
## errors='ignore' keeps this cell re-runnable: df_website is overwritten here,
## so on a second execution label 556 is already gone and a plain drop would
## raise KeyError. If the duplicate is ever NOT removed, the validate="1:1"
## check in the merge below still fails loudly instead of producing extra rows.
df_website = df_website.drop(labels=556, axis=0, errors='ignore')

## Merging
## Inner join: only qualifications present in BOTH the website export and the
## scraped list (matched on their NQR code) are kept. validate="1:1" makes
## pandas raise if either side has a repeated key.
df_merged = pd.merge(
    df_website,
    df_scraped[['NQR code', 'Qf link']],
    left_on='Code',
    right_on='NQR code',
    validate="1:1",
    how='inner',
)

In [None]:
# Cleaning merged file: remove the serial-number column and store each row's
# index label in a new 'pdf_number' column.
df_merged = (
    df_merged
    .drop(columns=['S No.'])
    .assign(pdf_number=lambda frame: frame.index)
)

# Reading pdfs

In [None]:
# Loop through each row in df and download the pdf.
# Each file is saved as "<row index>.pdf" inside base_path, and the outcome is
# recorded per row in the 'working_qflink' column ('yes' / 'no').
base_path = './v1_Downloaded active QF pdfs/'

# Upper bound (seconds) for connecting to / waiting on the server. Without a
# timeout requests.get can block indefinitely on a stalled connection.
REQUEST_TIMEOUT = 30

# Make sure the target folder exists; otherwise every open() below would fail
# and every link would be marked as not working.
os.makedirs(base_path, exist_ok=True)

# Iterate over the actual index labels rather than range(n): .loc is
# label-based, so this stays correct even if the index is not 0..n-1 and it
# cannot accidentally append new rows through .loc assignment.
for i in tqdm(df_merged.index):
    try:
        # Extract URL from dataframe
        file_path = base_path + str(i) + ".pdf"
        url = df_merged.loc[i, 'Qf link']

        # Make GET request on the url
        response = requests.get(url, timeout=REQUEST_TIMEOUT)

        # Store content if request successful
        if response.status_code == 200:
            df_merged.loc[i, 'working_qflink'] = 'yes'
            with open(file_path, 'wb') as file:
                file.write(response.content)
        else:
            print(f"File with index {i} not downloaded")
            df_merged.loc[i, 'working_qflink'] = 'no'

    # Log any exception (network error, timeout, file write failure, ...) and
    # mark the row as failed so the run continues with the next link
    except Exception as e:
        print(f"Error at index {i}:\n{e}")
        df_merged.loc[i, 'working_qflink'] = 'no'

  8%|▊         | 168/2094 [00:52<05:36,  5.73it/s]  

File with index 167 not downloaded


 18%|█▊        | 387/2094 [01:45<06:32,  4.35it/s]

File with index 386 not downloaded
File with index 387 not downloaded


 19%|█▉        | 397/2094 [01:48<08:15,  3.42it/s]

File with index 396 not downloaded


 19%|█▉        | 398/2094 [01:48<08:13,  3.44it/s]

File with index 398 not downloaded


 20%|█▉        | 411/2094 [01:51<04:53,  5.73it/s]

File with index 409 not downloaded
File with index 410 not downloaded


 26%|██▌       | 538/2094 [03:23<2:52:21,  6.65s/it]

File with index 537 not downloaded


 31%|███       | 642/2094 [05:28<2:39:17,  6.58s/it]

File with index 641 not downloaded


 91%|█████████▏| 1916/2094 [13:01<19:35,  6.60s/it] 

File with index 1915 not downloaded


100%|██████████| 2094/2094 [14:34<00:00,  2.39it/s]


In [None]:
# Filtering for merged df with working QF links.
# .copy() gives an independent frame so the column assignments below do not
# write into a slice of the previous frame (avoids SettingWithCopyWarning).
df_merged = df_merged[df_merged['working_qflink'] == 'yes'].copy()

# Files smaller than this many bytes are treated as invalid downloads
MIN_PDF_BYTES = 100
pdf_folder = "v1_Downloaded active QF pdfs/"

# Start from 'no' so the column always exists (even for an empty frame) and
# only rows whose file passes the size check are flipped to 'yes'.
df_merged['valid_pdf'] = 'no'

# Marking rows with pdf size of less than MIN_PDF_BYTES bytes
for i in df_merged.index:
    pdf_number = df_merged.loc[i, 'pdf_number']
    pdf_location = pdf_folder + str(pdf_number) + '.pdf'

    # A file that is missing on disk cannot be valid; leave it marked 'no'
    # instead of crashing in os.stat.
    if not os.path.exists(pdf_location):
        continue

    if os.stat(pdf_location).st_size < MIN_PDF_BYTES:
        # Deletes pdf from folder if it has a size of less than MIN_PDF_BYTES bytes
        os.remove(pdf_location)
    else:
        df_merged.loc[i, 'valid_pdf'] = 'yes'

# Filtering for QFs with valid pdfs
df_merged = df_merged[df_merged['valid_pdf'] == 'yes']

In [91]:
# Deleting new columns: the two yes/no bookkeeping flags were only needed for
# the filtering steps and are removed from the frame here.
helper_columns = ['working_qflink', 'valid_pdf']
df_merged = df_merged.drop(helper_columns, axis=1)

In [92]:
# Show df_merged as this cell's output for a visual check before exporting
df_merged

Unnamed: 0,Title,Code,Description,Sector Name,Level,Maximum Notational Hours,Minimum Notational Hours,Version,Originally Approved,Valid Till,Awarding Body,Certifying Bodies,Proposed Occupation,Progression Pathway,Qualifcation Type,Adopted Qualifcation,Training Delivery Hours,NQR code,Qf link,pdf_number
0,Fire Safety Technician (Oil & Gas),2020/HYC/HSSCI/3611,The main responsibility of the fire safety tec...,Hydrocarbon,Level 4,450 Hours,450 Hours,Version,17 Nov 2022,16 Nov 2025,Hydrocarbon Sector Skill Council (HSSCI),Hydrocarbon Sector Skill Council,"Management of Health, Safety and Environment (...",Senior Fire Safety Technician,General Qualification,N.A.,"{""Theory"":""120"",""Practical"":""240"",""Employabili...",2020/HYC/HSSCI/3611,https://www.nqr.gov.in/qualification/file/STT-...,0
1,Hindi Typist,2020/OAFM/MEPSC/03792,"The Hindi Typist, is responsible for formattin...",Management,Level 4,450 Hours,390 Hours,Version,17 Nov 2022,17 Nov 2025,Management & Entrepreneurship and Professional...,Management Entrepreneurship and Professional S...,Office Support,Multi-functional Office Executive,General Qualification,N.A.,"{""Theory"":""150"",""Practical"":""180"",""Employabili...",2020/OAFM/MEPSC/03792,https://www.nqr.gov.in/qualification/file/QFil...,1
2,Certificate Course in Coding Skills,2020/ITES/ASAP/03802,Individuals at this job are responsible for de...,IT-ITeS,Level 5,270 Hours,270 Hours,Version,25 Jun 2020,01 Mar 2026,"Additional Skill Acquisition Programme, Govern...","Additional Skill Acquisition Programme, Govern...",Software Engineer /Project Engineer,"VERTICAL PROGRESSION \nEngineer Trainee, Proje...","Future Skills Qualification,General Qualification",N.A.,"{""Theory"":""36"",""Practical"":""204"",""Employabilit...",2020/ITES/ASAP/03802,https://www.nqr.gov.in/qualification/file/Q%20...,2
3,Transit and Self-Loading Mixer Operator,2020/CON/IESC/3881,Transit and Self-Loading Mixer operator drives...,Infrastructure,Level 4,390 Hours,390 Hours,Version,17 Oct 2019,17 Oct 2022,Infrastructure Equipment Sector Skill Council,Infrastructure Equipment Sector Council,Transit and Self-Loading Mixer Operator,Senior Transit and Self-Loading mixer operator,General Qualification,N.A.,"{""Theory"":""90"",""Practical"":""150"",""Employabilit...",2020/CON/IESC/3881,https://nqr.gov.in/sites/default/files/QF%20-I...,3
4,AI – Data Architect,2020/ITES/ITSSC/04327,Individuals at this job must be responsible fo...,IT-ITeS,Level 7,750 Hours,660 Hours,Version,19 Dec 2018,22 Sep 2025,IT-ITeS Sector Skills Council NASSCOM (SSC NAS...,IT-ITeS SSC NASSCOM,Artificial Intelligence and Big Data Analytics,"Solutions Architect, Senior Database Administr...",Upskilling Qualification,N.A.,"{""Theory"":""180"",""Practical"":""330"",""Employabili...",2020/ITES/ITSSC/04327,https://www.nqr.gov.in/qualification/file/SSC%...,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2089,Mechanical Automation & Maintenance Engineer,QG-06-IT-04198-2025-V1-CGSC,A Mechanical Automation & Maintenance Engineer...,Capital Goods & Manufacturing,Level 6,660 Hours,660 Hours,Version,08 May 2025,08 May 2028,Capital Goods and Strategic Skill Council (CGSSC),Capital Goods and Strategic Skills Council,Maintenance,Mechanical Automation & Maintenance Manager-Le...,"Future Skills Qualification,General Qualification",N.A.,"{""Theory"":""140"",""Practical"":""310"",""Employabili...",QG-06-IT-04198-2025-V1-CGSC,https://www.nqr.gov.in/qualification/file/QF-M...,2089
2090,Central Sterile Service Department (CSSD) Assi...,QG-4.5-HE-04220-2025-V2-HSSC,The individual at this job are responsible for...,Healthcare,Level 4.5,1200 Hours,1200 Hours,Version,08 May 2025,08 May 2028,Healthcare Sector Skill Council (HSSC),Healthcare sector skill council,Non-Direct Care,NSQF Level 5: Patient Relation Associate,"Apprenticeship Qualification,General Qualifica...",N.A.,"{""Theory"":""330"",""Practical"":""660"",""Employabili...",QG-4.5-HE-04220-2025-V2-HSSC,https://www.nqr.gov.in/qualification/file/Qual...,2090
2091,Healthcare Quality Assurance Manager,QG-06-HE-04225-2025-V2-HSSC,The individual’s main job is to ensure that he...,Healthcare,Level 6,570 Hours,570 Hours,Version,08 May 2025,08 May 2028,Healthcare Sector Skill Council (HSSC),Healthcare sector skill council,Healthcare Quality Management,Vertical Mobility:\nNSQF Level 4: General Duty...,"Apprenticeship Qualification,General Qualifica...",N.A.,"{""Theory"":""240"",""Practical"":""150"",""Employabili...",QG-06-HE-04225-2025-V2-HSSC,https://www.nqr.gov.in/qualification/file/Qual...,2091
2092,Full Stack Development Associate,QG-04-IT-04172-2025-V2-NIELIT,Nature: \nThe programme encompasses people’s s...,IT-ITeS,Level 4,390 Hours,390 Hours,Version,08 May 2025,08 May 2028,National Institute of Electronics and Informat...,National Institute of Electronics and Informat...,Web Development,Full Stack Developer,General Qualification,N.A.,"{""Theory"":""120"",""Practical"":""180"",""Employabili...",QG-04-IT-04172-2025-V2-NIELIT,https://www.nqr.gov.in/qualification/file/QF_F...,2092


# Exporting

In [94]:
# Write the frame to CSV. index=True is the pandas default, spelled out here:
# the row index is written as the first column, with an empty header.
df_merged.to_csv(path_or_buf='Final qf list_cleaned 2Sept25.csv', index=True)