In [None]:
import requests
from bs4 import BeautifulSoup
import re
import time
import pandas as pd

In [None]:
# Generate all catalog URLs. Pagination is driven by the `start` query
# parameter; range(0, 144, 12) requests offsets 0, 12, ..., 132
# (presumably 12 products per page -- confirm against the live site).
urls = [f"https://www.shl.com/solutions/products/product-catalog/?start={start}&type=2&type=2" for start in range(0, 144, 12)]

# Columns of the catalog frame. Passing them explicitly to pd.DataFrame means
# an empty scrape still produces the columns that later cells index into.
catalog_columns = ["Name", "URL", "Remote Testing", "IRT", "Test Types"]

results = []

for url in urls:
    try:
        # The timeout keeps a stalled connection from hanging the notebook;
        # raise_for_status() stops an HTTP error page from being parsed as if
        # it were a catalog page that simply has no rows.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
    except requests.RequestException as e:
        # Best-effort, like the row-level handling below: report and move on.
        print(f"Error fetching {url}: {e}")
        time.sleep(1.5)
        continue

    soup = BeautifulSoup(response.text, "html.parser")

    # Product rows are the <tr> elements that carry a data-course-id attribute.
    rows = soup.find_all("tr", attrs={"data-course-id": True})

    for row in rows:
        try:
            name_tag = row.select_one("td.custom__table-heading__title a")
            name = name_tag.text.strip()

            # Links may be site-relative; make them absolute.
            relative_link = name_tag['href']
            link = relative_link if relative_link.startswith("http") else "https://www.shl.com" + relative_link

            # The three "general" cells are read positionally:
            # [0] remote testing, [1] IRT, [2] test-type keys.
            general_cells = row.select("td.custom__table-heading__general")

            # Remote Testing
            remote_testing = "Yes Remote Testing" if general_cells[0].find("span", class_="catalogue__circle -yes") else "No Remote Testing"

            # IRT
            irt = "Yes IRT" if general_cells[1].find("span", class_="catalogue__circle -yes") else "No IRT"

            # Test Types
            test_type_cells = general_cells[2].find_all("span", class_="product-catalogue__key")
            test_types = [cell.text.strip() for cell in test_type_cells]
            test_types_str = ", ".join(test_types)

            results.append({
                "Name": name,
                "URL": link,
                "Remote Testing": remote_testing,
                "IRT": irt,
                "Test Types": test_types_str
            })

        except Exception as e:
            # A malformed row (missing link, fewer cells than expected, ...)
            # is reported and skipped so one bad row does not abort the scrape.
            print(f"Error parsing row: {e}")
            continue

    # Pause between page requests.
    time.sleep(1.5)

# Convert to df
df_tests = pd.DataFrame(results, columns=catalog_columns)


In [None]:
def extract_details(test_url, delay=1.5, timeout=30):
    """Fetch one product page and extract its labelled detail sections.

    The page is scanned for section <div>s that contain both an <h4> heading
    and a <p> paragraph; the heading text decides which field the paragraph
    text is stored in. If several sections match the same field, the last
    one wins.

    Args:
        test_url: Absolute URL of the product detail page.
        delay: Seconds to sleep before issuing the request.
        timeout: Seconds to wait for the server before giving up, so that a
            stalled connection cannot hang the caller indefinitely.

    Returns:
        A 4-tuple ``(job_level, completion_time, language, description)``;
        each element is the stripped paragraph text, or ``None`` when no
        matching section was found.

    Raises:
        requests.RequestException: On connection failure, timeout, or a
            non-success HTTP status (via ``raise_for_status``), so an error
            page is not mistaken for a product page with no details.
    """
    time.sleep(delay)
    response = requests.get(test_url, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    job_level, completion_time, language, description = None, None, None, None

    for section in soup.find_all("div", class_="product-catalogue-training-calendar__row typ"):
        heading = section.find("h4")
        paragraph = section.find("p")

        if heading and paragraph:
            # Headings are matched case-insensitively by substring.
            title = heading.text.strip().lower()
            text = paragraph.text.strip()

            if "job levels" in title:
                job_level = text
            elif "assessment length" in title or "completion time" in title:
                completion_time = text
            elif "language" in title:
                language = text
            elif "description" in title:
                description = text

    return job_level, completion_time, language, description


# Collect one record per catalog entry: [Name, URL, <four detail fields>].
data = []

for _, row in df_tests.iterrows():
    record = [row["Name"], row["URL"]]
    try:
        job_level, completion_time, language, description = extract_details(row["URL"])
    except Exception as e:
        # Keep the entry even when its detail page could not be processed;
        # the four detail fields are left empty.
        print(f"Error at {row['URL']}:", e)
        record += [None, None, None, None]
    else:
        record += [job_level, completion_time, language, description]
    data.append(record)

df_additional_info = pd.DataFrame(
    data,
    columns=["Name", "URL", "Job Level", "Completion Time", "Languages", "Description"],
)


In [None]:
# Lookup table: one-letter test-type code -> descriptive label
type_mapping = {
    'A': 'Ability and aptitude',
    'B': 'Biodata and situational Judgement',
    'C': 'Competencies',
    'D': 'Development and 360',
    'E': 'Assessment Exercises',
    'K': 'Knowledge and Skills',
    'P': 'Personalities and behavior',
    'S': 'Simulations'
}

def map_test_types(codes):
    """Translate a comma-separated string of type codes into labels.

    Each code is stripped of surrounding whitespace and looked up in
    ``type_mapping``; codes that are not in the table are dropped. The
    labels are returned joined by ", " in the order the codes appeared.
    """
    labels = []
    for raw_code in codes.split(','):
        key = raw_code.strip()
        if key in type_mapping:
            labels.append(type_mapping[key])
    return ', '.join(labels)

# Expand each row's comma-separated type codes into readable labels (new column).
df_tests['Test Types Full'] = df_tests['Test Types'].map(map_test_types)


In [None]:
# Strip surrounding whitespace from the join keys so values that differ only
# by stray spaces still match. (Casing is left untouched.)
df_tests["Name"] = df_tests["Name"].str.strip()
df_additional_info["Name"] = df_additional_info["Name"].str.strip()
df_tests["URL"] = df_tests["URL"].str.strip()
df_additional_info["URL"] = df_additional_info["URL"].str.strip()

# Merge the two dataframes on Name and URL. The right-hand side is
# de-duplicated on the join keys first: if the same (Name, URL) pair appears
# more than once in both frames, a left merge would otherwise emit one row per
# pairing and inflate the result beyond len(df_tests).
merged_df = pd.merge(
    df_tests,
    df_additional_info.drop_duplicates(subset=["Name", "URL"]),
    on=["Name", "URL"],
    how="left"
)

# Reorder columns as requested. The explicit .copy() makes final_df an
# independent frame, so adding columns to it later does not raise
# SettingWithCopyWarning.
final_df = merged_df[
    [
        "Name",
        "URL",
        "Remote Testing",
        "IRT",
        "Test Types Full",
        "Job Level",
        "Completion Time",
        "Languages",
        "Description"
    ]
].copy()

final_df


Unnamed: 0,Name,URL,Remote Testing,IRT,Test Types Full,Job Level,Completion Time,Languages,Description
0,Account Manager Solution,https://www.shl.com/solutions/products/product...,Yes Remote Testing,Yes IRT,"Competencies, Personalities and behavior, Abil...","Mid-Professional,",Approximate Completion Time in minutes = 49,"English (USA),",The Account Manager solution is an assessment ...
1,Administrative Professional - Short Form,https://www.shl.com/solutions/products/product...,Yes Remote Testing,Yes IRT,"Ability and aptitude, Knowledge and Skills, Pe...","Entry-Level,",Approximate Completion Time in minutes = 36,"English (USA),",The Administrative Professional solution is fo...
2,Agency Manager Solution,https://www.shl.com/solutions/products/product...,Yes Remote Testing,Yes IRT,"Ability and aptitude, Biodata and situational ...","Front Line Manager, Manager, Supervisor,",Approximate Completion Time in minutes = 51,"English (USA),",The Agency Manager solution is for mid-level s...
3,Apprentice + 8.0 Job Focused Assessment,https://www.shl.com/solutions/products/product...,Yes Remote Testing,No IRT,"Biodata and situational Judgement, Personaliti...","General Population, Graduate, Entry-Level,",Approximate Completion Time in minutes = 30,"English International, German,",The Apprentice + 8.0 Job-Focused Assessment is...
4,Apprentice 8.0 Job Focused Assessment,https://www.shl.com/solutions/products/product...,Yes Remote Testing,No IRT,"Biodata and situational Judgement, Personaliti...","Entry-Level, General Population, Graduate,",Approximate Completion Time in minutes = 20,"English International, German, French,",The Apprentice 8.0 Job-Focused Assessment is a...
...,...,...,...,...,...,...,...,...,...
136,Workplace Safety - Individual 7.1 (Americas),https://www.shl.com/solutions/products/product...,Yes Remote Testing,No IRT,Biodata and situational Judgement,"Entry-Level,",Approximate Completion Time in minutes = 16,"Portuguese (Brazil), English (USA), French (Ca...",Our Workplace Safety - Individual 7.1 solution...
137,Workplace Safety - Team 7.0 Solution,https://www.shl.com/solutions/products/product...,Yes Remote Testing,No IRT,Biodata and situational Judgement,"Director,",Approximate Completion Time in minutes = 20,"English International, English (USA), French (...",The Workplace Safety – Team 7.0 solution is de...
138,Workplace Safety - Team 7.1 (Americas),https://www.shl.com/solutions/products/product...,Yes Remote Testing,No IRT,"Biodata and situational Judgement, Competencie...","Entry-Level, General Population, Professional ...",Approximate Completion Time in minutes = 20,"English (USA), Latin American Spanish, French ...",The Workplace Safety – Team 7.1 solution is de...
139,Workplace Safety - Team 7.1 (International),https://www.shl.com/solutions/products/product...,Yes Remote Testing,No IRT,"Biodata and situational Judgement, Competencie...","Entry-Level, General Population, Professional ...",Approximate Completion Time in minutes = 20,"English International, Finnish, French, Dutch,...",The Workplace Safety – Team 7.1 solution is de...


In [None]:
def _format_context(row):
    """Render one final_df row as a single ' | '-separated text passage.

    Values are read from the columns final_df actually has: 'IRT' and
    'Test Types Full'. The labels written into the text remain
    'Adaptive/IRT' and 'Test Types'.
    """
    return (
        f"passage: {row['Name']}"
        f" | Remote Testing: {row['Remote Testing']}"
        f" | Adaptive/IRT: {row['IRT']}"
        f" | Test Types: {row['Test Types Full']}"
        f" | Job Level: {row['Job Level']}"
        f" | Completion Time: {row['Completion Time']}"
        f" | Languages: {row['Languages']}"
        f" | Description: {row['Description']}"
    )


# assign() returns a new frame, which avoids writing a column onto what may be
# a slice of merged_df; building the values as a list also works when
# final_df has no rows.
final_df = final_df.assign(
    context=[_format_context(row) for _, row in final_df.iterrows()]
)


In [None]:
# Bind the finished table to the name used by the export cell. This is a
# second reference to the same DataFrame object, not a copy.
SHL_scraped=final_df

In [None]:
# Write the scraped table to SHL_Scraped_Data.csv (relative path, so it lands
# in the current working directory); index=False omits the row index.
SHL_scraped.to_csv("SHL_Scraped_Data.csv", index=False)