In [None]:
import os
import shutil
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
# from webdriver_manager.chrome import ChromiumService
from selenium.webdriver.chrome.service import Service as ChromiumService
import time
import json
from selenium.common.exceptions import (
    TimeoutException,
    NoSuchElementException,
    WebDriverException,
    ElementNotInteractableException,
)

# LinkedIn content search to scrape: posts from the past week matching
# the keywords "hiring data scientist".
url = "https://www.linkedin.com/search/results/content/?datePosted=%22past-week%22&keywords=hiring%20data%20scientist&origin=FACETED_SEARCH&searchId=f47da7ed-4acd-4098-841a-090ea2244d63&sid=!_Q"

# Directory in which Chrome keeps the persistent profile (cookies, cache).
user_data_dir = "C:/Linkedin"

# Chrome is launched against that profile so cookies and cache are reused
# from one run to the next.
chrome_options = Options()
for chrome_flag in (
    f"--user-data-dir={user_data_dir}",  # where the user data is stored
    "--profile-directory=Profile 1",     # which profile inside that directory
):
    chrome_options.add_argument(chrome_flag)

In [2]:
# Folder (relative to the working directory) that receives one JSON file
# per scraped post; it is created on first use and reused afterwards.
JobDetails = 'JobDetails'
os.makedirs(name=JobDetails, exist_ok=True)

In [None]:
# Resolve a chromedriver binary through webdriver_manager, then launch
# Chrome with the options prepared earlier and open the search page.
chromedriver_path = ChromeDriverManager().install()
driver = webdriver.Chrome(
    service=ChromiumService(chromedriver_path),
    options=chrome_options,
)
driver.get(url)

# Posts collected so far (one dict per post).
all_posts = list()

# Time budget for the extraction loop: 60 minutes, expressed in seconds,
# measured from this point.
timeout = 60 * 60
start_time = time.time()

def scroll_page():
    """Scroll the browser window down by one page and pause for two seconds.

    The PAGE_DOWN key is sent to the document's ``<body>`` element through
    the module-level ``driver``; the sleep that follows presumably gives
    newly requested content time to render before the caller looks again.
    """
    driver.find_element(By.TAG_NAME, "body").send_keys(Keys.PAGE_DOWN)
    time.sleep(2)

# Substrings that disqualify a link from the 'urls' list (add more as needed).
_EXCLUDED_URL_KEYWORDS = ('keywords',)


def _extract_post(post):
    """Scrape one post container into a dict.

    Args:
        post: WebElement for a single ``fie-impression-container`` node.

    Returns:
        dict with keys ``'metadata'`` (str), ``'text'`` (str),
        ``'emails'`` (list of str) and ``'urls'`` (list of str). Extraction
        is best-effort: when a piece cannot be read (element missing, stale,
        ...) the corresponding placeholder / empty list is stored instead of
        raising.
    """
    post_data = {}

    # Author block (person name, time posted).
    try:
        post_data['metadata'] = post.find_element(
            By.CLASS_NAME, "update-components-actor__container").text
    except WebDriverException:
        post_data['metadata'] = "No metadata found"

    # Body text of the post.
    try:
        post_data['text'] = post.find_element(By.CSS_SELECTOR, 'span.break-words').text
    except WebDriverException:
        post_data['text'] = "No text found"

    # E-mail addresses: mailto links inside this specific post.
    try:
        mailto_links = post.find_elements(By.CSS_SELECTOR, "a[href^='mailto']")
        post_data['emails'] = [
            (link.get_attribute('href') or '').replace('mailto:', '')
            for link in mailto_links
        ]
    except WebDriverException:
        post_data['emails'] = []

    # External URLs: absolute http(s) links, minus mailto links and links
    # containing an excluded keyword.
    try:
        # Read each href exactly once -- every get_attribute call is a
        # round-trip to the browser. A missing href becomes '' so that it is
        # simply filtered out instead of discarding the whole list.
        hrefs = [
            link.get_attribute('href') or ''
            for link in post.find_elements(By.CSS_SELECTOR, "a[href]")
        ]
        post_data['urls'] = [
            href for href in hrefs
            if href.startswith('http')
            and 'mailto:' not in href
            and not any(keyword in href for keyword in _EXCLUDED_URL_KEYWORDS)
        ]
    except WebDriverException:
        post_data['urls'] = []

    return post_data


# Run the extraction loop until the time budget is used up. The browser is
# closed in the ``finally`` clause so that an interruption (Ctrl-C, a dead
# browser during error recovery, ...) does not leave Chrome running.
try:
    while time.time() - start_time < timeout:
        try:
            # Wait for post containers to load
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CLASS_NAME, "fie-impression-container")))

            # Process every post currently present on the page
            for post in driver.find_elements(By.CLASS_NAME, "fie-impression-container"):
                post_data = _extract_post(post)

                # Posts seen on an earlier pass are skipped entirely.
                # NOTE(review): the metadata is said to include the time
                # posted; if that is a relative string it may change during a
                # long run and defeat this equality check -- confirm.
                if post_data in all_posts:
                    continue
                all_posts.append(post_data)

                # One file per unique post, numbered by its position in
                # all_posts, so a later pass can never overwrite a file that
                # belongs to a different post.
                out_path = os.path.join(JobDetails, f"DS_{len(all_posts) - 1}.json")
                with open(out_path, 'w', encoding='utf-8') as f:
                    json.dump(post_data, f, indent=4)

            # Scroll to load more posts
            scroll_page()

        except Exception as e:
            print(f"Error: {e}")
            # If there's an error, wait a bit and try scrolling again
            time.sleep(5)
            scroll_page()

    # Print summary
    print(f"Extraction complete. Total posts extracted: {len(all_posts)}")
finally:
    # Close WebDriver
    driver.quit()

### 