In [101]:
import time  # Import time module for delays
import json  # Import JSON module for handling cookies
import random  # Import random module to select user agents
import asyncio  # Import asyncio for async execution
import nest_asyncio  # Import nest_asyncio to allow nested async loops
from selenium import webdriver  # Import Selenium WebDriver
from selenium.webdriver.common.by import By  # Import By class for locating elements
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.chrome.service import Service  # Import Service for ChromeDriver
from selenium.webdriver.chrome.options import Options  # Import Options to configure WebDriver
from selenium.webdriver.support.ui import WebDriverWait  # Import WebDriverWait for explicit waits
from selenium.webdriver.support import expected_conditions as EC  # Import expected_conditions for conditions
import pandas as pd  # Import pandas for handling and saving extracted data

# Apply nest_asyncio to allow nested async loops
nest_asyncio.apply()

In [102]:
def setup_driver():
    """
    Initializes and configures Selenium WebDriver with performance optimizations.
    """
    user_agents = [  # List of user agents to avoid detection
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)",
        "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:94.0)",
    ]
    
    chrome_options = Options()  # Create Chrome options object
     # Running in headless mode for efficiency (Uncomment if needed)
    # chrome_options.add_argument('--headless')  # Run browser in headless mode
    chrome_options.add_argument('--disable-blink-features=AutomationControlled') # Bypass automation detection
    chrome_options.add_argument(f'--user-agent={random.choice(user_agents)}')  # Randomize user-agent
    chrome_options.add_argument('--start-maximized') # Open browser in maximized mode
    chrome_options.add_argument('--disable-gpu')  # Disable GPU acceleration for stability
    chrome_options.add_argument('--log-level=3')  # Suppress unnecessary logs
    chrome_options.add_argument('--ignore-certificate-errors')  # Ignore SSL errors
    chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])  # Disable automation flags
    
    chrome_service = Service(executable_path="C:/Users/parni/OneDrive/Desktop/web_scraping_projects/linkedin_scraper/drivers/chromedriver.exe")
    driver = webdriver.Chrome(service=chrome_service, options=chrome_options) # Initialize WebDriver with options
    return driver   # Return the configured WebDriver instance


In [103]:
def scroll_page(driver):
    """
    Scrolls down the LinkedIn page to load more results.
    
    Args:
        driver: Selenium WebDriver instance.
        
    Returns:
        None
    """
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")  # Scroll to the bottom of the page
    time.sleep(3)  # Wait for the page to load new content

In [104]:
def open_linkedin(driver):
    """
    Opens LinkedIn and waits for user interaction.
    User needs to solve CAPTCHA or sign in.
    
    Args:
        driver: Selenium WebDriver instance.
        
    Returns:
        None
    """
    driver.get("https://www.linkedin.com/")
    input("Solve CAPTCHA or accept cookies or sign in, then press Enter to continue...")
    time.sleep(3)

In [105]:
def search_manually(driver):
    """
    Allows the user to manually perform a LinkedIn search.
    The user should navigate to the search page and enter the search term manually.
    
    Args:
        driver: Selenium WebDriver instance.
        
    Returns:
        None
    """
    input("Navigate to LinkedIn search and perform your search manually. Press Enter when you're ready to continue...")
    time.sleep(3)

In [106]:
def extract_data_related_users(driver, num_users=10):
    """
    Extracts a list of users related to the search from the LinkedIn search results page.
    
    Args:
        driver: Selenium WebDriver instance.
        num_users: Number of users to extract.
        
    Returns:
        A list of dictionaries containing profile URLs of the users.
    """
    users = []
    user_elements = driver.find_elements(By.XPATH, "//a[contains(@href, '/in/')]")
    
    print(f"Extracting {num_users} users...")  # Print message in English
    for user_element in user_elements[:num_users]:
        user_url = user_element.get_attribute("href")
        users.append({"Profile URL": user_url})
    
    print(f"Extracted {len(users)} users.")  # Print the number of extracted users
    return users


In [107]:
def extract_contact_info(driver, user_url, visited_profiles):
    """
    Extracts contact information from a LinkedIn user profile, including the username.
    
    Args:
        driver: Selenium WebDriver instance.
        user_url: URL of the user's LinkedIn profile.
        visited_profiles: Set to track visited profiles and avoid duplicates.
        
    Returns:
        A dictionary containing the username, profile URL, and email if found.
    """
    if user_url in visited_profiles:  # Check if the profile has already been visited
        return {}  # Skip the profile if already visited
    
    visited_profiles.add(user_url)  # Mark the profile as visited
    
    print(f"Visiting profile: {user_url}")  # Print message in English
    driver.get(user_url)  # Visit the user's profile
    time.sleep(2)  # Adjust sleep time if necessary to avoid being blocked
    
    username = user_url.split("/in/")[-1].split("/")[0]  # Extract username from the URL
    
    contact_info = {'Username': username, 'User’s Profile': user_url}
    
    try:
        contact_info_button = driver.find_element(By.XPATH, "//a[contains(@href, 'contact-info')]")
        contact_info_button.click()  # Click on the 'Contact Info' button
        time.sleep(2)
        
        try:
            email_element = driver.find_element(By.XPATH, "//a[contains(@href, 'mailto:')]")
            contact_info['Email'] = email_element.get_attribute("href").replace("mailto:", "")
        except Exception:
            contact_info['Email'] = "Not Available"
        
        print(f"Contact info for {username}: {contact_info}")  # Print contact info in English
        return contact_info
    except Exception as e:
        print(f"Error with {user_url}: {e}")
        contact_info['Email'] = "Not Available"
        return contact_info

In [108]:
async def extract_multiple_contacts(driver, users):
    """
    Extracts contact information from multiple LinkedIn profiles asynchronously.
    
    Args:
        driver: Selenium WebDriver instance.
        users: A list of dictionaries containing user profile URLs.
        
    Returns:
        A list of dictionaries with extracted contact information.
    """
    contact_info_list = []
    visited_profiles = set()  # Initialize the visited_profiles set to track visited profiles
    
    for user in users:
        user_url = user.get("Profile URL")
        if user_url:
            contact_info = extract_contact_info(driver, user_url, visited_profiles)
            contact_info_list.append(contact_info)
            time.sleep(random.uniform(3, 20))  # Add random delay to reduce blocking chances
    
    return contact_info_list

In [109]:
def go_to_next_page(driver):
    """
    Clicks the 'Next' button to navigate to the next page of LinkedIn search results.
    
    Args:
        driver: Selenium WebDriver instance.
        
    Returns:
        None
    """
    try:
        next_button = driver.find_element(By.XPATH, "//button[@aria-label='Next']")
        next_button.click()  # Click 'Next' button to go to the next page
        time.sleep(3)  # Wait for the page to load
    except Exception as e:
        print(f"Error while clicking the 'Next' button: {e}")

In [110]:
def main():
    """
    Main function to execute the LinkedIn scraper.
    
    Initializes the WebDriver, navigates LinkedIn, performs searches, extracts user data, 
    and saves the contact information to a CSV file.
    """
    driver = setup_driver()
    open_linkedin(driver)
    
    # Perform manual search on LinkedIn
    search_manually(driver)
    
    users = []  # List to store user data
    num_users = 10  # Total number of users to extract
    extracted_users = 0  # Counter for extracted users
    
    print("Starting extraction...")  # Start of extraction
    
    while extracted_users < num_users:
        # Extract users on the current page
        users_on_page = extract_data_related_users(driver, num_users=(num_users - extracted_users))
        users.extend(users_on_page)  # Add the users from this page to the list
        extracted_users += len(users_on_page)  # Update the extracted users count
        
        if extracted_users < num_users:
            scroll_page(driver)  # Scroll to load more users
            go_to_next_page(driver)  # Go to the next page
    
    # Extract contact info asynchronously for all users
    loop = asyncio.get_event_loop()
    contact_info_list = loop.run_until_complete(extract_multiple_contacts(driver, users))
    
    # Print or save the extracted contact information
    contact_info_df = pd.DataFrame(contact_info_list)
    print("Extraction completed. Displaying first few rows of the extracted data:")  # Message in English
    print(contact_info_df.head())  # Print first few rows for preview
    contact_info_df.to_csv("data_related_users_contact_info.csv", index=False)  # Save to CSV
    
    print(f"Extracted {len(contact_info_list)} contact information entries.")  # Print number of contact entries extracted
    driver.quit()

if __name__ == "__main__":
    main()

Starting extraction...
Extracting 10 users...
Extracted 10 users.
Visiting profile: https://www.linkedin.com/in/sara-abossedgh-81554243?miniProfileUrn=urn%3Ali%3Afs_miniProfile%3AACoAAAkoAhEBrDUEHN1xPCbvkzGGdikCr81AUKQ
Contact info for sara-abossedgh-81554243?miniProfileUrn=urn%3Ali%3Afs_miniProfile%3AACoAAAkoAhEBrDUEHN1xPCbvkzGGdikCr81AUKQ: {'Username': 'sara-abossedgh-81554243?miniProfileUrn=urn%3Ali%3Afs_miniProfile%3AACoAAAkoAhEBrDUEHN1xPCbvkzGGdikCr81AUKQ', 'User’s Profile': 'https://www.linkedin.com/in/sara-abossedgh-81554243?miniProfileUrn=urn%3Ali%3Afs_miniProfile%3AACoAAAkoAhEBrDUEHN1xPCbvkzGGdikCr81AUKQ', 'Email': 'siahlooei@gmail.com'}
Visiting profile: https://www.linkedin.com/in/faezeh-ghaderi-37a913147?miniProfileUrn=urn%3Ali%3Afs_miniProfile%3AACoAACONg04BsvERrP31gPQNP0c8sam8crRme0w
Contact info for faezeh-ghaderi-37a913147?miniProfileUrn=urn%3Ali%3Afs_miniProfile%3AACoAACONg04BsvERrP31gPQNP0c8sam8crRme0w: {'Username': 'faezeh-ghaderi-37a913147?miniProfileUrn=urn%3Ali%3A