In [1]:
# Cell 1: Imports
import requests
from bs4 import BeautifulSoup
import pandas as pd


In [2]:
# Cell 2: Define URL, make request, and parse
# 1. Define the URL to scrape
# This URL is for "IT and Software" jobs in "India".
url = "https://www.linkedin.com/jobs/search/?keywords=IT%20and%20Software&location=India&geoId=102713980"

# 2. Send a GET request to the URL. Note: LinkedIn often blocks simple requests.
# A full browser automation (like in scraper.py) is more reliable.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}
response = requests.get(url, headers=headers)

# 3. Parse the HTML content of the page with BeautifulSoup
soup = BeautifulSoup(response.text, 'html.parser')



In [22]:
# --- Code to find and display all unique CSS classes, their tags, and content ---
from collections import defaultdict

print("--- All Unique CSS Classes, Their Tags, and Content Snippets ---")
# This dictionary will map a class name to a list of dictionaries,
# where each dictionary holds the tag and content of an element with that class.
class_to_info = defaultdict(list)

# soup.find_all(True) gets every single HTML tag in the document
for element in soup.find_all(True):
    # Get the text content of the element, stripped of extra whitespace
    content = element.get_text(strip=True)

    # We only care about elements that have both a 'class' attribute and some text content
    if 'class' in element.attrs and content:
        tag_name = element.name
        # An element can have multiple classes. We'll associate the content with each one.
        for class_name in element.attrs['class']:
            class_to_info[class_name].append({
                'tag': tag_name,
                'content': content
            })

# Now, print the collected information in a readable format.
if class_to_info:
    for class_name, info_list in sorted(class_to_info.items()):
        print(f"\n[CLASS]: {class_name}")
        # To keep the output from being overwhelming, we'll show up to 3 examples for each class.
        for info in info_list[:3]:
            tag = info['tag']
            # Truncate long content to keep the output clean
            content_snippet = (info['content'][:150] + '...') if len(info['content']) > 150 else info['content']
            print(f"  - TAG: <{tag}> | CONTENT: \"{content_snippet}\"")
else:
    print("No elements with both a class and text content were found.")

print("\n--- End of Class List ---")

--- All Unique CSS Classes, Their Tags, and Content Snippets ---

[CLASS]: !bg-[#F1F8FA]
  - TAG: <div> | CONTENT: "LinkedInLinkedIn is better on the appDon’t have the app? Get it in the Microsoft Store.Open the app"
  - TAG: <div> | CONTENT: "LinkedInKnow when new jobs open upNever miss a job alert with the new LinkedIn app for Windows.Get the app"

[CLASS]: !font-regular
  - TAG: <button> | CONTENT: "العربية (Arabic)"
  - TAG: <button> | CONTENT: "বাংলা (Bangla)"
  - TAG: <button> | CONTENT: "Čeština (Czech)"

[CLASS]: !min-h-0
  - TAG: <button> | CONTENT: "Any time"
  - TAG: <button> | CONTENT: "Company"
  - TAG: <button> | CONTENT: "Job type"

[CLASS]: !pl-[14px]
  - TAG: <button> | CONTENT: "IT And Software in IndiaExpand search"

[CLASS]: !pr-3
  - TAG: <section> | CONTENT: "Clear text"
  - TAG: <section> | CONTENT: "Clear text"
  - TAG: <section> | CONTENT: "Clear text"

[CLASS]: !text-color-text
  - TAG: <p> | CONTENT: "By clicking Continue to join or sign in, you agree to Link

In [3]:
# Cell 3: Find all job listing cards
# 4. Find all the job listings
jobs = soup.find_all('div', class_='base-card relative w-full hover:no-underline focus:no-underline base-card--link base-search-card base-search-card--link job-search-card')
print(f"Found {len(jobs)} job cards on the page.")



Found 59 job cards on the page.


In [24]:
#Finalised code for scraping job details

import time
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup

# --- Configuration ---
# URL of the LinkedIn job search results page
SEARCH_URL = "https://www.linkedin.com/jobs/search/?keywords=Python%20Developer&location=India"
# File path to save the results
CSV_FILE_PATH = 'linkedin_job_descriptions.csv'

# --- Initialize Selenium WebDriver ---
# Ensure you have the correct WebDriver for your browser installed and in your PATH
driver = webdriver.Chrome()
driver.get(SEARCH_URL)

print("Waiting for job cards to load...")
# Wait for the initial job cards to be present on the page
try:
    WebDriverWait(driver, 20).until(
        EC.presence_of_element_located((By.CLASS_NAME, "base-card"))
    )
except Exception as e:
    print(f"Initial job cards not found. Exiting. Error: {e}")
    driver.quit()
    exit()

print("Scrolling to load all jobs...")
# Scroll down to the bottom of the page to load all job listings
last_height = driver.execute_script("return document.body.scrollHeight")
while True:
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(2)  # Wait for page to load
    new_height = driver.execute_script("return document.body.scrollHeight")
    if new_height == last_height:
        break
    last_height = new_height

# --- Step 1: Get all job links from the search page ---
print("Extracting job links...")
soup = BeautifulSoup(driver.page_source, 'html.parser')
link_elements = soup.find_all('a', class_='base-card__full-link')
job_links = [link['href'] for link in link_elements if 'href' in link.attrs]
job_links = list(dict.fromkeys(job_links)) # Remove duplicates

print(f"Found {len(job_links)} unique job links.")

# --- Step 2: Visit each link and scrape the details ---
all_jobs_data = []

for link in job_links:
    print(f"Scraping job: {link}")
    driver.get(link)

    try:
        # Wait for the main description container to load
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CLASS_NAME, "show-more-less-html"))
        )

        # Click the "show more" button if it exists
        try:
            see_more_button = driver.find_element(By.CLASS_NAME, 'show-more-less-html__button')
            driver.execute_script("arguments[0].click();", see_more_button)
            time.sleep(1) # Wait for content to expand
        except Exception:
            # If button not found, it means the description is short, so we just pass
            pass

        # Parse the page with the full description
        job_soup = BeautifulSoup(driver.page_source, 'html.parser')

        # --- Extract data ---
        job_title = job_soup.find('h1', class_='top-card-layout__title').get_text(strip=True)
        company_name = job_soup.find('a', class_='topcard__org-name-link').get_text(strip=True)
        location = job_soup.find('span', class_='topcard__flavor--bullet').get_text(strip=True)
        description_div = job_soup.find('div', class_='show-more-less-html__markup')
        job_description = description_div.get_text(separator='\n', strip=True) if description_div else "Not found"

        # Store data
        all_jobs_data.append({
            "Title": job_title,
            "Company": company_name,
            "Location": location,
            "Description": job_description,
            "Link": link
        })

    except Exception as e:
        print(f"Could not scrape {link}. Error: {e}")

    time.sleep(1) # Be respectful to LinkedIn's servers

# --- Clean up and save to CSV ---
print("Scraping complete. Saving data to CSV...")
driver.quit()

if all_jobs_data:
    df = pd.DataFrame(all_jobs_data)
    df.to_csv(CSV_FILE_PATH, index=False, encoding='utf-8')
    print(f"Data successfully saved to {CSV_FILE_PATH}")
else:
    print("No job data was collected.")

Waiting for job cards to load...
Scrolling to load all jobs...


InvalidSessionIdException: Message: invalid session id: session deleted as the browser has closed the connection
from disconnected: not connected to DevTools
  (Session info: chrome=138.0.7204.92); For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#invalidsessionidexception
Stacktrace:
#0 0x58db629cb7da <unknown>
#1 0x58db624702e0 <unknown>
#2 0x58db62456eee <unknown>
#3 0x58db6247edc0 <unknown>
#4 0x58db624efd9f <unknown>
#5 0x58db6250d0bc <unknown>
#6 0x58db624e7683 <unknown>
#7 0x58db624b3b5b <unknown>
#8 0x58db624b4f31 <unknown>
#9 0x58db6299072b <unknown>
#10 0x58db62994534 <unknown>
#11 0x58db62977229 <unknown>
#12 0x58db629950d8 <unknown>
#13 0x58db6295b61f <unknown>
#14 0x58db629b8dd8 <unknown>
#15 0x58db629b8fb6 <unknown>
#16 0x58db629caaf6 <unknown>
#17 0x77cd5fc9caa4 <unknown>
#18 0x77cd5fd29c3c <unknown>


In [20]:
# Cell 4: Loop through jobs and extract data
# 5. Create a list to store the job data
job_list = []

# 6. Loop through each job listing to extract the details
for job in jobs:
    try:
        # 7. Extract the job title
        job_title = job.find('h3', class_='base-search-card__title').text.strip()

        # 8. Extract the company name
        company_name = job.find('h4', class_='base-search-card__subtitle').text.strip()

        # NOTE: The full job description is not available on the search results page.
        # This part of the original script would fail. We'll mark it as not available.
        job_desc = job.find('div', class_='details-pane__content').text.strip()

        # 9. Extract the job location
        job_location = job.find('span', class_='job-search-card__location').text.strip()

        # 10. Extract the link to the job posting
        job_link = job.find('a', class_='base-card__full-link')['href']

        # 11. Store the extracted data in a dictionary
        job_data = {
            'Title': job_title,
            'Company': company_name,
            'Description': job_desc,
            'Location': job_location,
            'Link': job_link
        }

        # 12. Append the dictionary to the list
        job_list.append(job_data)
    except Exception as e:
        # This will catch errors if a job card has a different structure
        print(f"Could not process a job card, skipping. Error: {e}")
        continue

print(f"Successfully extracted data for {len(job_list)} jobs.")



Could not process a job card, skipping. Error: 'NoneType' object has no attribute 'text'
Could not process a job card, skipping. Error: 'NoneType' object has no attribute 'text'
Could not process a job card, skipping. Error: 'NoneType' object has no attribute 'text'
Could not process a job card, skipping. Error: 'NoneType' object has no attribute 'text'
Could not process a job card, skipping. Error: 'NoneType' object has no attribute 'text'
Could not process a job card, skipping. Error: 'NoneType' object has no attribute 'text'
Could not process a job card, skipping. Error: 'NoneType' object has no attribute 'text'
Could not process a job card, skipping. Error: 'NoneType' object has no attribute 'text'
Could not process a job card, skipping. Error: 'NoneType' object has no attribute 'text'
Could not process a job card, skipping. Error: 'NoneType' object has no attribute 'text'
Could not process a job card, skipping. Error: 'NoneType' object has no attribute 'text'
Could not process a j

In [16]:
# Cell 5: Create a pandas DataFrame and display the first few rows
# 13. Create a pandas DataFrame from the list of dictionaries
df = pd.DataFrame(job_list)
df.head()


Unnamed: 0,Title,Company,Description,Location,Link
0,Software Developer -1,NxtWave,Software Developer -1\n \n \n\n\...,"Hyderabad, Telangana, India",https://in.linkedin.com/jobs/view/software-dev...
1,Desktop Support Engineer,Easebuzz,Desktop Support Engineer\n \n \n...,"Pune, Maharashtra, India",https://in.linkedin.com/jobs/view/desktop-supp...
2,Software Engineer,Notion,Software Engineer\n \n \n\n\n ...,"Hyderabad, Telangana, India",https://in.linkedin.com/jobs/view/software-eng...
3,Data Engineer,BOX8,Data Engineer\n \n \n\n\n ...,"Bengaluru, Karnataka, India",https://in.linkedin.com/jobs/view/data-enginee...
4,Software Developer/ Engineer -1,NxtWave,Software Developer/ Engineer -1\n \n ...,"Hyderabad, Telangana, India",https://in.linkedin.com/jobs/view/software-dev...


In [17]:
# Cell 6: Save the DataFrame to a CSV file
# 14. Save the DataFrame to a CSV file
if not df.empty:
    df.to_csv('linkedin_jobs.csv', index=False)
    print("Scraping complete. Data saved to linkedin_jobs.csv")
else:
    print("No job data was scraped, CSV file not created.")

Scraping complete. Data saved to linkedin_jobs.csv
