# LinkedIn Jobs Scraper

A Python scraper for LinkedIn job postings using BeautifulSoup.

## Features
- Scrapes job listings from LinkedIn
- Configurable search parameters (keywords, location, max jobs)
- Rate limiting and retry logic
- Exports results to JSON and CSV
- Clean, structured data output

## Install Dependencies

In [None]:
# Install required packages
!pip install requests==2.32.3 beautifulsoup4==4.12.3 tenacity==9.0.0 -q

## Import Libraries

In [None]:
from dataclasses import dataclass
from typing import List, Optional
import requests
from bs4 import BeautifulSoup
import time
import random
import json
from urllib.parse import quote
from requests.adapters import HTTPAdapter
from urllib3.util import Retry
import pandas as pd
from datetime import datetime

## Configuration & Data Classes

In [None]:
@dataclass
class JobData:
    """One job posting as parsed from a single search-result card.

    All fields are plain strings so that ``vars(job)`` can be written
    directly to JSON or loaded into a DataFrame.
    """

    title: str  # text of the card's title heading, whitespace-stripped
    company: str  # text of the card's subtitle heading, whitespace-stripped
    location: str  # text of the card's location span, whitespace-stripped
    job_link: str  # posting URL with any query string removed
    posted_date: str  # text of the card's <time> element, or "N/A" when absent


class ScraperConfig:
    """Static settings shared by the scraper: endpoint, pacing and headers."""

    # Guest (no login) endpoint that returns job cards as an HTML fragment.
    BASE_URL = "https://www.linkedin.com/jobs-guest/jobs/api/seeMoreJobPostings/search"
    # Amount added to the ``start`` query parameter after each fetched page.
    JOBS_PER_PAGE = 25
    # Bounds, in seconds, of the random pause between consecutive page requests.
    MIN_DELAY = 2
    MAX_DELAY = 5
    # NOTE(review): the two values below are not referenced by the scraper code
    # in this notebook; presumably a longer pause (seconds) after a number of
    # requests -- confirm the intended semantics before relying on them.
    RATE_LIMIT_DELAY = 30
    RATE_LIMIT_THRESHOLD = 10

    HEADERS = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.5",
        # "br" is deliberately not advertised: requests only decodes Brotli
        # when an optional Brotli package is installed, and this notebook does
        # not install one. A Brotli-compressed reply would then reach the HTML
        # parser undecoded and yield zero job cards.
        "Accept-Encoding": "gzip, deflate",
        "Connection": "keep-alive",
        "DNT": "1",
        "Cache-Control": "no-cache",
    }

## LinkedIn Scraper Class

In [None]:
class LinkedInJobsScraper:
    """Scrape job postings from LinkedIn's guest job-search endpoint.

    Pages are fetched one at a time through a retry-enabled session, each
    result card is parsed into a ``JobData`` record, and a random pause is
    inserted between page requests.
    """

    def __init__(self, timeout: float = 15.0):
        """Create the scraper and its HTTP session.

        Args:
            timeout: Seconds to wait on each HTTP request before it is
                abandoned. Without a timeout a stalled connection would
                block the notebook indefinitely.
        """
        self.timeout = timeout
        self.session = self._setup_session()

    def _setup_session(self) -> requests.Session:
        """Return a session that retries failed HTTPS requests with backoff."""
        session = requests.Session()
        retries = Retry(
            total=5, backoff_factor=0.5, status_forcelist=[429, 500, 502, 503, 504]
        )
        # Only https:// is mounted; the configured endpoint is HTTPS.
        session.mount("https://", HTTPAdapter(max_retries=retries))
        return session

    def _build_search_url(self, keywords: str, location: str, start: int = 0) -> str:
        """Return the search URL for the given keywords, location and offset."""
        params = {
            "keywords": keywords,
            "location": location,
            "start": start,
        }
        query = "&".join(f"{key}={quote(str(value))}" for key, value in params.items())
        return f"{ScraperConfig.BASE_URL}?{query}"

    def _clean_job_url(self, url: str) -> str:
        """Return ``url`` without its query string."""
        # split() returns the whole string when there is no "?", so no
        # separate membership check is needed.
        return url.split("?", 1)[0]

    def _extract_job_data(self, job_card: BeautifulSoup) -> Optional[JobData]:
        """Parse one result card into a ``JobData``.

        Returns:
            The parsed record, or ``None`` when a required element (title,
            company, location or link) is missing from the card. A missing
            posting date is tolerated and reported as ``"N/A"``.
        """
        try:
            title = job_card.find("h3", class_="base-search-card__title").text.strip()
            company = job_card.find(
                "h4", class_="base-search-card__subtitle"
            ).text.strip()
            location = job_card.find(
                "span", class_="job-search-card__location"
            ).text.strip()
            job_link = self._clean_job_url(
                job_card.find("a", class_="base-card__full-link")["href"]
            )
        except (AttributeError, KeyError, TypeError) as e:
            # find() returned None (AttributeError on .text, TypeError on
            # ["href"]) or the anchor has no href (KeyError): skip this card.
            print(f"Failed to extract job data: {str(e)}")
            return None

        date_tag = job_card.find("time", class_="job-search-card__listdate")
        posted_date = date_tag.text.strip() if date_tag else "N/A"

        return JobData(
            title=title,
            company=company,
            location=location,
            job_link=job_link,
            posted_date=posted_date,
        )

    def _fetch_job_page(self, url: str) -> BeautifulSoup:
        """Download ``url`` and return the parsed HTML.

        Raises:
            RuntimeError: If the request fails (including timeouts and
                exhausted retries) or the final status code is not 200.
        """
        try:
            response = self.session.get(
                url, headers=ScraperConfig.HEADERS, timeout=self.timeout
            )
        except requests.RequestException as e:
            # Chain the cause so the original network error stays visible.
            raise RuntimeError(f"Request failed: {str(e)}") from e
        if response.status_code != 200:
            raise RuntimeError(
                f"Failed to fetch data: Status code {response.status_code}"
            )
        return BeautifulSoup(response.text, "html.parser")

    def scrape_jobs(
        self, keywords: str, location: str, max_jobs: int = 100
    ) -> List[JobData]:
        """Collect up to ``max_jobs`` postings for the given search.

        Scraping stops when enough jobs are collected, when a page contains
        no result cards, or when a page fails to load; whatever was gathered
        up to that point is returned.

        Args:
            keywords: Search keywords or job title.
            location: Search location.
            max_jobs: Upper bound on the number of records returned.

        Returns:
            The collected records, at most ``max_jobs`` long.
        """
        all_jobs: List[JobData] = []
        start = 0

        while len(all_jobs) < max_jobs:
            try:
                url = self._build_search_url(keywords, location, start)
                soup = self._fetch_job_page(url)
                job_cards = soup.find_all("div", class_="base-card")

                if not job_cards:
                    break
                for card in job_cards:
                    job_data = self._extract_job_data(card)
                    if job_data:
                        all_jobs.append(job_data)
                        if len(all_jobs) >= max_jobs:
                            break
                print(f"Scraped {len(all_jobs)} jobs...")
                if len(all_jobs) >= max_jobs:
                    # Target reached: no further request follows, so skip
                    # the politeness delay.
                    break
                # NOTE(review): assumes every page holds JOBS_PER_PAGE cards;
                # confirm against the endpoint's actual page size.
                start += ScraperConfig.JOBS_PER_PAGE
                time.sleep(
                    random.uniform(ScraperConfig.MIN_DELAY, ScraperConfig.MAX_DELAY)
                )
            except Exception as e:
                # Deliberate best-effort boundary: report the failure and
                # return the jobs gathered so far instead of losing them.
                print(f"Scraping error: {str(e)}")
                break
        return all_jobs[:max_jobs]

    def save_results(
        self, jobs: List[JobData], filename: str = "linkedin_jobs.json"
    ) -> None:
        """Write ``jobs`` to ``filename`` as a UTF-8 JSON array.

        Nothing is written when ``jobs`` is empty.
        """
        if not jobs:
            return
        with open(filename, "w", encoding="utf-8") as f:
            json.dump([vars(job) for job in jobs], f, indent=2, ensure_ascii=False)
        print(f"Saved {len(jobs)} jobs to {filename}")

## Configure Search Parameters

Edit these values to customize your search:

In [None]:
# User-editable search settings for the scraper run.
SEARCH_KEYWORDS = "AI/ML Engineer"  # job title or free-text keywords
SEARCH_LOCATION = "London"  # city, state, country, or "Remote"
MAX_JOBS = 50  # upper bound on the number of jobs collected

# Echo the active settings so the run log shows what was searched for.
_summary_lines = [
    "Search Configuration:",
    f"  Keywords: {SEARCH_KEYWORDS}",
    f"  Location: {SEARCH_LOCATION}",
    f"  Max Jobs: {MAX_JOBS}",
]
print("\n".join(_summary_lines))
print()

## Run Scraper

In [None]:
# Create the scraper (this also builds its retry-enabled HTTP session).
scraper = LinkedInJobsScraper()

# Announce the run before the first request is made.
print("Starting LinkedIn job scrape...")
print(f"Searching for: '{SEARCH_KEYWORDS}' in '{SEARCH_LOCATION}'")
print(f"Target: {MAX_JOBS} jobs\n")

# Time the whole scrape, including the pauses between page requests.
start_time = datetime.now()
jobs = scraper.scrape_jobs(
    keywords=SEARCH_KEYWORDS,
    location=SEARCH_LOCATION,
    max_jobs=MAX_JOBS,
)
end_time = datetime.now()

duration = (end_time - start_time).total_seconds()
print("\n✅ Scraping complete!")
print(f"   Found: {len(jobs)} jobs")
print(f"   Duration: {duration:.2f} seconds")

## Display Results

In [None]:
# Print the first few scraped jobs as a readable sample.
if jobs:
    print("\n📊 Sample Results (showing first 5):\n")
    for i, job in enumerate(jobs[:5], 1):
        print(f"{i}. {job.title}")
        print(f"   Company: {job.company}")
        print(f"   Location: {job.location}")
        print(f"   Posted: {job.posted_date}")
        print(f"   Link: {job.job_link}")
        print()

    # Tell the reader how many results were left out of the sample.
    if len(jobs) > 5:
        print(f"... and {len(jobs) - 5} more jobs")
else:
    print("⚠️  No jobs found. Try different keywords or location.")

## View as DataFrame

In [None]:
# Load the results into a pandas DataFrame for tabular viewing and summaries.
if jobs:
    df = pd.DataFrame([vars(job) for job in jobs])
    print(f"\n📋 All {len(df)} Jobs:\n")
    display(df)  # notebook built-in; renders the table as rich output

    # Summary statistics
    print("\n📈 Summary:")
    print(f"   Total jobs: {len(df)}")
    print(f"   Unique companies: {df['company'].nunique()}")
    print(f"   Unique locations: {df['location'].nunique()}")

    # Top companies by number of postings. `jobs` is non-empty in this
    # branch, so the frame always has at least one row to count.
    print("\n🏢 Top Companies:")
    top_companies = df['company'].value_counts().head(5)
    for company, count in top_companies.items():
        print(f"   {company}: {count} jobs")
else:
    print("No data to display.")

## Save Results

In [None]:
# Save the results to JSON and CSV files that share one timestamp.
if jobs:
    # Take the timestamp once so both files get the same name stem even if
    # the clock ticks over to the next second between the two writes.
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')

    json_filename = f"linkedin_jobs_{timestamp}.json"
    scraper.save_results(jobs, json_filename)

    # Also save to CSV
    csv_filename = f"linkedin_jobs_{timestamp}.csv"
    df = pd.DataFrame([vars(job) for job in jobs])
    df.to_csv(csv_filename, index=False, encoding='utf-8')
    print(f"\n💾 Also saved to CSV: {csv_filename}")
else:
    print("No jobs to save.")

## Quick Test (Small Sample)

Run this cell for a quick test with just 5 jobs:

In [None]:
# Smoke test: fetch a handful of jobs to confirm the scraper still works.
test_scraper = LinkedInJobsScraper()
test_jobs = test_scraper.scrape_jobs(
    keywords="Data Analyst",
    location="San Francisco",
    max_jobs=5,
)

if test_jobs:
    print(f"\n✅ Quick test successful! Found {len(test_jobs)} jobs:\n")
    for job in test_jobs:
        print(f"  • {job.title} at {job.company} ({job.location})")
else:
    # An empty result is ambiguous, so list the likely causes.
    print("⚠️  Quick test found no jobs. This might indicate:")
    print("   - LinkedIn API changes")
    print("   - Rate limiting")
    print("   - Network issues")