# Scraper for collecting all the data about SHL assessments


Author-Vansh Garg
vanshgarg2580@gmail.com

## Installation of required libraries

In [None]:
pip install requests beautifulsoup4 pandas



In [None]:
!pip install httpx
!pip install nest_asyncio
import nest_asyncio
nest_asyncio.apply()



## Utility function for fetching all assessment links across the paginated catalog

Added a sleep timer between page requests, which resolves the issue of the same data being loaded from cache for different pages.

In [None]:
import aiohttp
import asyncio
import random
import hashlib
from bs4 import BeautifulSoup

# Root of the SHL site; catalog page URLs are built by appending a path to it.
BASE_URL = "https://www.shl.com"

async def fetch(session, url):
    """Request *url* through *session* and return the response body as text.

    Args:
        session: Object whose ``get(url)`` returns an async context manager
            yielding a response with an awaitable ``text()`` (an
            ``aiohttp.ClientSession`` in this notebook).
        url: Address to request.

    Returns:
        The decoded body of the response.
    """
    request_ctx = session.get(url)
    async with request_ctx as resp:
        body = await resp.text()
    return body

async def get_all_catalog_pages():
    """Walk the paginated product catalog and gather every assessment link.

    Pages are requested with an increasing ``start`` offset (12 entries per
    page). Crawling stops at the first page from which no assessment links
    are extracted. Entries whose URL was already collected are skipped.

    Returns:
        List of the dicts produced by ``extract_assessment_links``, one per
        distinct URL, in the order they were first seen.
    """
    page_size = 12
    offset = 0
    collected = []
    known_urls = set()

    async with aiohttp.ClientSession() as session:
        while True:
            url = f"{BASE_URL}/solutions/products/product-catalog/?start={offset}"
            print(f"Fetching: {url}")
            markup = await fetch(session, url)
            page_items = extract_assessment_links(BeautifulSoup(markup, 'html.parser'))
            print(len(page_items))
            print(page_items)
            if not page_items:
                print("Stopping: empty batch.")
                break

            # Keep only entries that have not been collected from an earlier page.
            for item in page_items:
                if item["url"] in known_urls:
                    continue
                collected.append(item)
                known_urls.add(item["url"])

            offset += page_size

            # Randomised pause between pages: avoids cached repeats of the
            # previous page and keeps the request rate human-like.
            pause = random.uniform(1.5, 3.5)
            print(f"Sleeping for {pause:.2f} seconds...\n")
            await asyncio.sleep(pause)

    return collected


## Utility function for extracting the links from a single page

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Site root, declared again here with the same value as in the crawler cell.
BASE_URL = "https://www.shl.com"
# Product catalog listing URL without a pagination offset.
CATALOG_URL = f"{BASE_URL}/solutions/products/product-catalog/"

def get_catalog_page(timeout=30):
    """Download the page at ``CATALOG_URL`` and return it parsed.

    Args:
        timeout: Seconds to wait for the server before giving up. requests
            applies no timeout unless one is passed, so without it a stalled
            connection would block indefinitely.

    Returns:
        BeautifulSoup tree of the downloaded page.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not respond within *timeout*.
    """
    response = requests.get(CATALOG_URL, timeout=timeout)
    response.raise_for_status()
    return BeautifulSoup(response.text, "html.parser")
def extract_assessment_links(soup):
    """Collect the assessment detail links found on one parsed catalog page.

    Every ``<tr>`` carrying a ``data-entity-id`` attribute is inspected. A row
    is kept when its first link points at a catalog "view" page.

    Args:
        soup: Parsed catalog page (BeautifulSoup tree).

    Returns:
        List of dicts, one per distinct URL, each with:
            "url": absolute URL of the assessment page.
            "adaptive_irt_support": 1 if the row's third cell holds a
                ``catalogue__circle`` span flagged ``-yes``, otherwise 0.
    """
    view_path = "/solutions/products/product-catalog/view/"
    # Index of the <td> that holds the Adaptive/IRT indicator (third cell).
    adaptive_col = 2

    # Keyed by URL so a repeated link collapses into a single entry.
    unique_assessments = {}

    for row in soup.find_all("tr", attrs={"data-entity-id": True}):
        a_tag = row.find("a", href=True)
        if not a_tag or view_path not in a_tag["href"]:
            continue

        href = BASE_URL + a_tag["href"]

        adaptive_support = 0
        tds = row.find_all("td")
        # Guard on the index actually read; rows with fewer cells simply
        # count as "no adaptive support" instead of raising IndexError.
        if len(tds) > adaptive_col:
            adaptive_span = tds[adaptive_col].find("span", class_="catalogue__circle")
            if adaptive_span and "-yes" in adaptive_span.get("class", []):
                adaptive_support = 1

        unique_assessments[href] = {
            "url": href,
            "adaptive_irt_support": adaptive_support,
        }

    return list(unique_assessments.values())


## Utility function for scraping each job assessment page from its respective link


In [None]:
def parse_assessment_page(assessment):
    """Scrape one assessment detail page into a flat record.

    Args:
        assessment: Dict with the keys "url" and "adaptive_irt_support", as
            produced by ``extract_assessment_links``.

    Returns:
        Dict with the keys "Title", "Description", "Job Levels", "Languages",
        "Assessment Length", "Test Type", "Remote Testing",
        "Remote Testing (Yes)", "Adaptive/IRT Support", "URL", plus one 0/1
        column per test-type label. Text fields that are not found on the
        page are "N/A". Returns None (after printing a message) when the page
        cannot be fetched or parsed.
    """
    # Test Type Mapping (SHL code to label)
    test_type_map = {
        "A": "Ability & Aptitude",
        "B": "Biodata & Situational Judgement",
        "C": "Competencies",
        "D": "Development & 360",
        "E": "Assessment Exercises",
        "K": "Knowledge & Skills",
        "P": "Personality & Behavior",
        "S": "Simulations"
    }

    # Bound before the try so the failure message below can always be
    # formatted, even when the "url" lookup itself is what fails.
    url = "<unknown>"
    try:
        url = assessment["url"]
        adaptive_irt_support = assessment["adaptive_irt_support"]
        page = requests.get(url, timeout=30)
        # Treat an HTTP error as a failed scrape rather than parsing the
        # error page into a record full of "N/A" values.
        page.raise_for_status()
        soup = BeautifulSoup(page.text, 'html.parser')

        title_div = soup.find("div", class_="row content__container typ")
        title = title_div.text.strip() if title_div else "N/A"

        # Initialize fields
        description = job_levels = languages = assessment_length = remote_testing = "N/A"
        test_type_codes = []

        # Initialize binary indicators for each test type
        test_type_indicators = {label: 0 for label in test_type_map.values()}

        rows = soup.find_all("div", class_="product-catalogue-training-calendar__row typ")
        for row in rows:
            heading = row.find("h4")
            content = row.find("p")

            if heading and content:
                h = heading.text.strip().lower()

                if "description" in h:
                    description = content.text.strip()
                elif "job level" in h:
                    job_levels = content.text.strip()
                elif "languages" in h:
                    languages = content.text.strip()
                elif "assessment length" in h:
                    assessment_length = content.text.strip()

            # Look for Test Type and Remote Testing explicitly from p tags
            for p_tag in row.find_all("p"):
                text = p_tag.text.lower()

                # Test Type Section
                if "test type" in text:
                    # find_all(class_=...) filters by class; select() takes a
                    # CSS selector and does not, so it also returned wrapper
                    # spans whose text is all codes joined by newlines.
                    key_spans = p_tag.find_all("span", class_="product-catalogue__key")
                    # dict.fromkeys de-duplicates while keeping page order,
                    # so the "Test Type" column is stable between runs.
                    test_type_codes = list(
                        dict.fromkeys(span.text.strip() for span in key_spans)
                    )
                    for code in test_type_codes:
                        label = test_type_map.get(code)
                        if label:
                            test_type_indicators[label] = 1

                # Remote Testing Section
                if "remote testing" in text:
                    remote_span = p_tag.find("span", class_="catalogue__circle")
                    if remote_span:
                        classes = remote_span.get("class", [])
                        remote_testing = "Yes" if "-yes" in classes else "No"

        # Final dictionary
        result = {
            "Title": title,
            "Description": description,
            "Job Levels": job_levels,
            "Languages": languages,
            "Assessment Length": assessment_length,
            "Test Type": ", ".join(test_type_map.get(code, code) for code in test_type_codes),
            "Remote Testing": remote_testing,
            "Remote Testing (Yes)": 1 if remote_testing == "Yes" else 0,
            "Adaptive/IRT Support": adaptive_irt_support,
            "URL": url
        }

        # Add binary indicators for test types
        result.update(test_type_indicators)

        return result

    except Exception as e:
        # Best-effort scrape: report the page and move on so that one bad
        # page does not abort the whole run.
        print(f"Failed to parse {url}: {e}")
        return None

## Main driver function


In [None]:

if __name__ == "__main__":
    # asyncio.run() rather than a bare top-level `await`: the latter is only
    # accepted by IPython and is a SyntaxError in a plain Python script.
    # Inside the notebook this relies on nest_asyncio.apply() having been
    # executed in the setup cell, because an event loop is already running.
    assessment_links = asyncio.run(get_all_catalog_pages())
    print(f"Found {len(assessment_links)} assessments.")

    all_data = []
    for assessment in assessment_links:
        # Each entry is a dict with "url" and "adaptive_irt_support".
        data = parse_assessment_page(assessment)
        # parse_assessment_page returns None for pages it could not scrape.
        if data:
            all_data.append(data)

    df = pd.DataFrame(all_data)
    df.to_csv("shl_assessments.csv", index=False)

Fetching: https://www.shl.com/solutions/products/product-catalog/?start=0
12
[{'url': 'https://www.shl.com/solutions/products/product-catalog/view/global-skills-development-report/', 'adaptive_irt_support': 0}, {'url': 'https://www.shl.com/solutions/products/product-catalog/view/net-framework-4-5/', 'adaptive_irt_support': 1}, {'url': 'https://www.shl.com/solutions/products/product-catalog/view/net-mvc-new/', 'adaptive_irt_support': 0}, {'url': 'https://www.shl.com/solutions/products/product-catalog/view/net-mvvm-new/', 'adaptive_irt_support': 0}, {'url': 'https://www.shl.com/solutions/products/product-catalog/view/net-wcf-new/', 'adaptive_irt_support': 0}, {'url': 'https://www.shl.com/solutions/products/product-catalog/view/net-wpf-new/', 'adaptive_irt_support': 0}, {'url': 'https://www.shl.com/solutions/products/product-catalog/view/net-xaml-new/', 'adaptive_irt_support': 0}, {'url': 'https://www.shl.com/solutions/products/product-catalog/view/accounts-payable-new/', 'adaptive_irt_su

In [None]:
# Reload the CSV written by the driver cell and preview it.
data=pd.read_csv("shl_assessments.csv")
print(data.shape)
# Last expression of the cell, so the notebook renders the first 50 rows.
data.head(50)

(365, 18)


Unnamed: 0,Title,Description,Job Levels,Languages,Assessment Length,Test Type,Remote Testing,Remote Testing (Yes),Adaptive/IRT Support,URL,Ability & Aptitude,Biodata & Situational Judgement,Competencies,Development & 360,Assessment Exercises,Knowledge & Skills,Personality & Behavior,Simulations
0,Global Skills Development Report,This report is designed to be given to individ...,"Director, Entry-Level, Executive, General Popu...",,,"A\nE\nB\nC\nD\nP, Competencies, Assessment Exe...",Yes,1,0,https://www.shl.com/solutions/products/product...,1,1,1,1,1,0,1,0
1,.NET Framework 4.5,The.NET Framework 4.5 test measures knowledge ...,"Professional Individual Contributor, Mid-Profe...","English (USA),",Approximate Completion Time in minutes = 30,Knowledge & Skills,Yes,1,1,https://www.shl.com/solutions/products/product...,0,0,0,0,0,1,0,0
2,.NET MVC (New),Multi-choice test that measures the knowledge ...,"Mid-Professional, Professional Individual Cont...","English (USA),",Approximate Completion Time in minutes = 17,Knowledge & Skills,Yes,1,0,https://www.shl.com/solutions/products/product...,0,0,0,0,0,1,0,0
3,.NET MVVM (New),Multi-choice test that measures the knowledge ...,"Mid-Professional, Professional Individual Cont...","English (USA),",Approximate Completion Time in minutes = 5,Knowledge & Skills,Yes,1,0,https://www.shl.com/solutions/products/product...,0,0,0,0,0,1,0,0
4,.NET WCF (New),Multi-choice test that measures the knowledge ...,"Mid-Professional, Professional Individual Cont...","English (USA),",Approximate Completion Time in minutes = 11,Knowledge & Skills,Yes,1,0,https://www.shl.com/solutions/products/product...,0,0,0,0,0,1,0,0
5,.NET WPF (New),Multi-choice test that measures the knowledge ...,"Mid-Professional, Professional Individual Cont...","English (USA),",Approximate Completion Time in minutes = 9,Knowledge & Skills,Yes,1,0,https://www.shl.com/solutions/products/product...,0,0,0,0,0,1,0,0
6,.NET XAML (New),Multi-choice test that measures the knowledge ...,"Mid-Professional, Professional Individual Cont...","English (USA),",Approximate Completion Time in minutes = 5,Knowledge & Skills,Yes,1,0,https://www.shl.com/solutions/products/product...,0,0,0,0,0,1,0,0
7,Accounts Payable (New),Multiple-choice test that measures the knowled...,"Entry-Level, Graduate, Mid-Professional, Profe...","English (USA),",Approximate Completion Time in minutes = 9,Knowledge & Skills,Yes,1,0,https://www.shl.com/solutions/products/product...,0,0,0,0,0,1,0,0
8,Accounts Payable Simulation (New),Simulated data entry test that measures the ab...,"Entry-Level, Graduate, Mid-Professional, Profe...","English (USA),",Approximate Completion Time in minutes = 8,Simulations,Yes,1,0,https://www.shl.com/solutions/products/product...,0,0,0,0,0,0,0,1
9,Accounts Receivable (New),Multiple-choice test that measures the knowled...,"Entry-Level, Graduate, Mid-Professional, Profe...","English (USA),",Approximate Completion Time in minutes = 13,Knowledge & Skills,Yes,1,0,https://www.shl.com/solutions/products/product...,0,0,0,0,0,1,0,0


In [None]:
# prompt: in the CSV, whenever you find NaN, replace it with the string "Not defined"

import pandas as pd

# Read the scraped catalog, put the placeholder "Not defined" in every cell
# that pandas loaded as NaN, and overwrite the same CSV with the result.
# (pandas reads the scraper's "N/A" placeholders as NaN by default.)
data = pd.read_csv("shl_assessments.csv").fillna("Not defined")
data.to_csv("shl_assessments.csv", index=False)


In [None]:
# Read the CSV again (after the NaN replacement cell) and preview it.
data=pd.read_csv("shl_assessments.csv")
print(data.shape)
# Last expression of the cell, so the notebook renders the first 50 rows.
data.head(50)

(365, 18)


Unnamed: 0,Title,Description,Job Levels,Languages,Assessment Length,Test Type,Remote Testing,Remote Testing (Yes),Adaptive/IRT Support,URL,Ability & Aptitude,Biodata & Situational Judgement,Competencies,Development & 360,Assessment Exercises,Knowledge & Skills,Personality & Behavior,Simulations
0,Global Skills Development Report,This report is designed to be given to individ...,"Director, Entry-Level, Executive, General Popu...",Not defined,Not defined,"A\nE\nB\nC\nD\nP, Competencies, Assessment Exe...",Yes,1,0,https://www.shl.com/solutions/products/product...,1,1,1,1,1,0,1,0
1,.NET Framework 4.5,The.NET Framework 4.5 test measures knowledge ...,"Professional Individual Contributor, Mid-Profe...","English (USA),",Approximate Completion Time in minutes = 30,Knowledge & Skills,Yes,1,1,https://www.shl.com/solutions/products/product...,0,0,0,0,0,1,0,0
2,.NET MVC (New),Multi-choice test that measures the knowledge ...,"Mid-Professional, Professional Individual Cont...","English (USA),",Approximate Completion Time in minutes = 17,Knowledge & Skills,Yes,1,0,https://www.shl.com/solutions/products/product...,0,0,0,0,0,1,0,0
3,.NET MVVM (New),Multi-choice test that measures the knowledge ...,"Mid-Professional, Professional Individual Cont...","English (USA),",Approximate Completion Time in minutes = 5,Knowledge & Skills,Yes,1,0,https://www.shl.com/solutions/products/product...,0,0,0,0,0,1,0,0
4,.NET WCF (New),Multi-choice test that measures the knowledge ...,"Mid-Professional, Professional Individual Cont...","English (USA),",Approximate Completion Time in minutes = 11,Knowledge & Skills,Yes,1,0,https://www.shl.com/solutions/products/product...,0,0,0,0,0,1,0,0
5,.NET WPF (New),Multi-choice test that measures the knowledge ...,"Mid-Professional, Professional Individual Cont...","English (USA),",Approximate Completion Time in minutes = 9,Knowledge & Skills,Yes,1,0,https://www.shl.com/solutions/products/product...,0,0,0,0,0,1,0,0
6,.NET XAML (New),Multi-choice test that measures the knowledge ...,"Mid-Professional, Professional Individual Cont...","English (USA),",Approximate Completion Time in minutes = 5,Knowledge & Skills,Yes,1,0,https://www.shl.com/solutions/products/product...,0,0,0,0,0,1,0,0
7,Accounts Payable (New),Multiple-choice test that measures the knowled...,"Entry-Level, Graduate, Mid-Professional, Profe...","English (USA),",Approximate Completion Time in minutes = 9,Knowledge & Skills,Yes,1,0,https://www.shl.com/solutions/products/product...,0,0,0,0,0,1,0,0
8,Accounts Payable Simulation (New),Simulated data entry test that measures the ab...,"Entry-Level, Graduate, Mid-Professional, Profe...","English (USA),",Approximate Completion Time in minutes = 8,Simulations,Yes,1,0,https://www.shl.com/solutions/products/product...,0,0,0,0,0,0,0,1
9,Accounts Receivable (New),Multiple-choice test that measures the knowled...,"Entry-Level, Graduate, Mid-Professional, Profe...","English (USA),",Approximate Completion Time in minutes = 13,Knowledge & Skills,Yes,1,0,https://www.shl.com/solutions/products/product...,0,0,0,0,0,1,0,0
