# Web Scraping LinkedIn for Lead Generation
#### - Scraping LinkedIn Job posting to find target companies and marking growing companies as target lead.
#### - See Hiring Trends and scrape accordingly

#### Workflow
- Using Selinium to handle scrolls
- Using chrome driver
- Using Beautiful Soup for parsing html
- Storing data in csv format using pandas

## Importing Dependencies

In [26]:
import time
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait

## Without Login
- Limits search results

In [15]:

url = 'https://www.linkedin.com/jobs/search?keywords=Accountant&location=United%20States&geoId=103644278&trk=public_jobs_jobs-search-bar_search-submit&position=1&pageNum=0'
# location - usa (requirements)
driver = webdriver.Chrome()
driver.get(url)

## With Login
- Works better as Linkedin restricts search results without Login

In [None]:
# LinkedIn credentials
USERNAME = "sample@gmail.com"
PASSWORD = "password"


In [33]:

# Initialize WebDriver
driver = webdriver.Chrome()
driver.get("https://www.linkedin.com/login")
time.sleep(2)

# Locate and fill the username field
username_input = driver.find_element(By.ID, "username")
username_input.send_keys(USERNAME)

# Locate and fill the password field
password_input = driver.find_element(By.ID, "password")
password_input.send_keys(PASSWORD)

# Submit the login form
password_input.send_keys(Keys.RETURN)

# Wait to ensure login completes
time.sleep(5)

print("✅ Logged into LinkedIn successfully!")

# Now you can go to a job search URL
#search_url = 'https://www.linkedin.com/jobs/search?keywords=Accountant&location=United%20States&geoId=103644278'
search_url = 'https://www.linkedin.com/jobs/search?keywords=Accountant&location=United%20States&geoId=103644278&trk=public_jobs_jobs-search-bar_search-submit&position=1&pageNum=0'
driver.get(search_url)
time.sleep(5)

# Continue with your scrolling/scraping logic here...


✅ Logged into LinkedIn successfully!


In [32]:
# scraping 100 times
# Scroll multiple times to load more jobs

for i in range(2):
    driver.execute_script('window.scrollTo(0, document.body.scrollHeight);')
    print(f'Scroll #{i+1} done')
    time.sleep(2) 

    '''
    Now in Linkedin no See more jobs button
    try:
        x = driver.find_element('xpath', "//button[@aria-label='See more jobs']")
        x.click()
        time.sleep(3)
    except:
        pass
        time.sleep(3)
    '''
        
soupy = BeautifulSoup(driver.page_source, "html.parser")
print(soupy)
driver.quit()

Scroll #1 done
Scroll #2 done
<html class="theme theme--mercado app-loader--default artdeco windows" lang="en"><head>
<script nonce="">!function(i,n){void 0!==i.addEventListener&&void 0!==i.hidden&&(n.liVisibilityChangeListener=function(){i.hidden&&(n.liHasWindowHidden=!0)},i.addEventListener("visibilitychange",n.liVisibilityChangeListener))}(document,window);</script>
<meta content="script-src-attr 'none'; require-trusted-types-for 'script'; trusted-types 'allow-duplicates' default jSecure highcharts dompurify goog#html" data-disposition="enforce" data-report-to="https://www.linkedin.com/security/csp?a=voyager-web&amp;f=tt" data-sanitizer="jSecure" http-equiv="Content-Security-Policy" name="trusted-types"/>
<title>(1) Accountant Jobs in United States | LinkedIn</title>
<meta charset="utf-8"/>
<meta content="IE=edge" http-equiv="X-UA-Compatible"/>
<meta class="mercado-icons-sprite" content="https://static.licdn.com/aero-v1/sc/h/2brfqc2ezdcl00afxrykl5mid" id="artdeco-icons/static/images

In [30]:
# Find all job title <a> tags (they include title and link)
link_tags = soupy.find_all('a', class_=lambda c: c and 'job-card-list__title--link' in c)

# Extract fields into a list of dicts
data = []
for a in link_tags:
    job_title = a.get_text(strip=True)
    job_link = a['href']
    
    # The parent <div> holds company and location
    parent = a.find_parent('div', class_='job-card-list__entity-lockup')
    
    company_tag = parent.find('div', class_='artdeco-entity-lockup__subtitle') if parent else None
    company = company_tag.get_text(strip=True) if company_tag else None
    
    location_tag = parent.find('div', class_='artdeco-entity-lockup__caption') if parent else None
    location = location_tag.get_text(strip=True) if location_tag else None
    
    data.append({
        'Job Title': job_title,
        'Company': company,
        'Location': location,
        'Link': job_link
    })

# Convert to DataFrame
df = pd.DataFrame(data)

# Optional: save to CSV
df.to_csv('linkedin_jobs_extracted.csv', index=False)

# Print the first few rows to verify
print(df.head())


                                  Job Title              Company  \
0                      AccountantAccountant               Mercor   
1    AccountantAccountant with verification    AlphaX RE Capital   
2          Staff AccountantStaff Accountant    Rhodes Associates   
3  Staff Tax AccountantStaff Tax Accountant  The Reed Corporaton   
4              Tax AccountantTax Accountant             SoTalent   

                  Location                                               Link  
0   United States (Remote)  /jobs/view/4243044933/?eBP=NOT_ELIGIBLE_FOR_CH...  
1  Cupertino, CA (On-site)  /jobs/view/4233978844/?eBP=NOT_ELIGIBLE_FOR_CH...  
2   United States (Remote)  /jobs/view/4231836181/?eBP=NOT_ELIGIBLE_FOR_CH...  
3   United States (Remote)  /jobs/view/4240344721/?eBP=NOT_ELIGIBLE_FOR_CH...  
4   United States (Remote)  /jobs/view/4238643211/?eBP=NOT_ELIGIBLE_FOR_CH...  


In [21]:
# Extract job cards
BASE_CARDS = soupy.find_all('div', class_='base-card')
numPostings = len(BASE_CARDS)

# Extract titles and links
JOB_TITLES = [tag.get_text(strip=True) for tag in soupy.find_all('h3', class_="base-search-card__title")]
LINKS = [a['href'] for a in soupy.find_all('a', class_='base-card__full-link')]

# Extract company names
COMPANIES = []
for card in BASE_CARDS:
    company = card.find('a', class_="hidden-nested-link")
    COMPANIES.append(company.get_text(strip=True) if company else "N/A")

# Combine all data
job_data = []
for i in range(min(len(JOB_TITLES), len(COMPANIES), len(LINKS))):
    entry = {
        'Job Title': JOB_TITLES[i],
        'Company': COMPANIES[i],
        'Link': LINKS[i]
    }
print(entry)


{'Job Title': '********** (******)', 'Company': '*****', 'Link': 'https://www.linkedin.com/jobs/view/accountant-remote-at-lensa-4244934468?position=7&pageNum=0&refId=H4Ugqrct5XNFlxMmPTRFHg%3D%3D&trackingId=v6ziEDU07wZnheFw6CUQiA%3D%3D'}


In [18]:
# Convert to DataFrame
df = pd.DataFrame(job_data)
df.to_csv("linkedin_jobs.csv", index=False)
print("✅ Job data saved to linkedin_jobs.csv")


✅ Job data saved to linkedin_jobs.csv


In [25]:
from bs4 import BeautifulSoup
import pandas as pd

# Load the saved LinkedIn HTML
with open('./linkedin_jobs_page.html', 'r', encoding='utf-8') as f:
    soupy = BeautifulSoup(f, 'html.parser')

# Find all job title <a> tags (they include title and link)
link_tags = soupy.find_all('a', class_=lambda c: c and 'job-card-list__title--link' in c)

# Extract fields into a list of dicts
data = []
for a in link_tags:
    job_title = a.get_text(strip=True)
    job_link = a['href']
    
    # The parent <div> holds company and location
    parent = a.find_parent('div', class_='job-card-list__entity-lockup')
    
    company_tag = parent.find('div', class_='artdeco-entity-lockup__subtitle') if parent else None
    company = company_tag.get_text(strip=True) if company_tag else None
    
    location_tag = parent.find('div', class_='artdeco-entity-lockup__caption') if parent else None
    location = location_tag.get_text(strip=True) if location_tag else None
    
    data.append({
        'Job Title': job_title,
        'Company': company,
        'Location': location,
        'Link': job_link
    })

# Convert to DataFrame
df = pd.DataFrame(data)

# Optional: save to CSV
df.to_csv('linkedin_jobs_extracted.csv', index=False)

# Print the first few rows to verify
print(df.head())


                                  Job Title              Company  \
0                      AccountantAccountant               Mercor   
1    AccountantAccountant with verification    AlphaX RE Capital   
2          Staff AccountantStaff Accountant    Rhodes Associates   
3  Staff Tax AccountantStaff Tax Accountant  The Reed Corporaton   
4              Tax AccountantTax Accountant             SoTalent   

                  Location                                               Link  
0   United States (Remote)  /jobs/view/4243044933/?eBP=NOT_ELIGIBLE_FOR_CH...  
1  Cupertino, CA (On-site)  /jobs/view/4233978844/?eBP=NOT_ELIGIBLE_FOR_CH...  
2   United States (Remote)  /jobs/view/4231836181/?eBP=NOT_ELIGIBLE_FOR_CH...  
3   United States (Remote)  /jobs/view/4240344721/?eBP=NOT_ELIGIBLE_FOR_CH...  
4   United States (Remote)  /jobs/view/4238643211/?eBP=NOT_ELIGIBLE_FOR_CH...  
