# In [11]:
# importing required packages
import time
import gspread
from gspread_dataframe import set_with_dataframe
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException

# ---- Configuration -------------------------------------------------------
# LinkedIn job search page the browser opens first
url = 'https://www.linkedin.com/jobs/search?trk=guest_homepage-basic_guest_nav_menu_jobs&position=1&pageNum=0'
# Text typed into the search bar's location box (edit to search elsewhere)
desired_location = 'England'
# Local chromedriver binary used to launch Chrome
chrome_path = r'C:\Users\Interviewbit\OneDrive\Desktop\chromedriver.exe'

# ---- Browser start-up ----------------------------------------------------
chrome_service = ChromeService(executable_path=chrome_path)
driver = webdriver.Chrome(service=chrome_service)

# Load the search page, then pause a fixed five seconds so it can render
# (lengthen the pause if later steps cannot find their elements)
driver.get(url)
time.sleep(5)

# Type the desired location into the search bar and submit it.
#
# The field is looked up with an explicit wait, the same way the filter
# buttons further down are, because `driver.find_element` never returns a
# falsy value: it raises when the element is absent, which made the
# "not found" branch unreachable and crashed the script instead.
try:
    location_input = WebDriverWait(driver, 10).until(
        EC.element_to_be_clickable((By.ID, 'job-search-bar-location'))
    )
except TimeoutException:
    # Keep the name defined so later code can still test it
    location_input = None
    print("Location input not found. Skipping location update.")
else:
    location_input.clear()
    location_input.send_keys(desired_location)
    location_input.send_keys(Keys.RETURN)
    print(f"Location set to: {desired_location}")

    # Wait for the page to load after applying the location filter
    time.sleep(5)

# Apply the Job Type filter: open its dropdown, tick the wanted checkboxes
# (Full-time and Contract, per the original note), then confirm with "Done".
job_type_wait = WebDriverWait(driver, 10)  # each until() call gets its own 10 s budget
job_type_button_locator = (By.XPATH, '//button[contains(@aria-label, "Job type filter")]')
job_type_checkbox_locator = (By.XPATH, '//input[@id="f_JT-0"] | //input[@id="f_JT-2"]')
job_type_done_locator = (
    By.XPATH,
    '//button[@class="filter__submit-button" and @data-tracking-control-name="public_jobs_f_JT"]',
)

try:
    # Open the dropdown
    job_type_button = job_type_wait.until(EC.element_to_be_clickable(job_type_button_locator))
    job_type_button.click()

    # Short pause so the dropdown is fully drawn before interacting with it
    time.sleep(2)

    # Move the pointer onto every matched checkbox and click it
    job_type_checkboxes = job_type_wait.until(
        EC.presence_of_all_elements_located(job_type_checkbox_locator)
    )
    for checkbox in job_type_checkboxes:
        pointer = ActionChains(driver)
        pointer.move_to_element(checkbox)
        pointer.click()
        pointer.perform()

    # Confirm the selection
    done_button_job_type = job_type_wait.until(EC.element_to_be_clickable(job_type_done_locator))
    done_button_job_type.click()

    # Give the results time to reload with the filter applied
    time.sleep(5)

except TimeoutException:
    # Any of the waits above timing out lands here; the script carries on unfiltered
    print("Job Type filter button did not appear within the expected time. Continuing without filtering by Job Type.")

# Apply the On-site/Remote filter: open its dropdown, tick the Remote
# checkbox (per the original note), then confirm with "Done".
remote_wait = WebDriverWait(driver, 10)  # each until() call gets its own 10 s budget
remote_button_locator = (By.XPATH, '//button[contains(@aria-label, "On-site/remote filter")]')
remote_checkbox_locator = (By.XPATH, '//input[@id="f_WT-2"]')
remote_done_locator = (
    By.XPATH,
    '//button[@class="filter__submit-button" and @data-tracking-control-name="public_jobs_f_WT"]',
)

try:
    # Open the dropdown
    on_site_remote_button = remote_wait.until(EC.element_to_be_clickable(remote_button_locator))
    on_site_remote_button.click()

    # Short pause so the dropdown is fully drawn before interacting with it
    time.sleep(2)

    # Move the pointer onto the checkbox and click it
    remote_checkbox = remote_wait.until(EC.element_to_be_clickable(remote_checkbox_locator))
    pointer = ActionChains(driver)
    pointer.move_to_element(remote_checkbox)
    pointer.click()
    pointer.perform()

    # Confirm the selection
    done_button_remote = remote_wait.until(EC.element_to_be_clickable(remote_done_locator))
    done_button_remote.click()

    # Give the results time to reload with the filter applied
    time.sleep(5)

except TimeoutException:
    # Any of the waits above timing out lands here; the script carries on unfiltered
    print("On-site/Remote filter button did not appear within the expected time. Continuing without filtering by On-site/Remote.")

# Scroll to the bottom of the page repeatedly so more job listings are
# loaded, then capture the rendered HTML.
#
# The browser is closed in a `finally` clause so that a failure while
# scrolling or reading the page does not leave a Chrome process behind;
# the exception itself still propagates.
try:
    for _ in range(10):  # Adjust the number of scrolls as needed
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(2)  # Adjust the wait time between scrolls as needed

    # Get the updated page source
    html_content = driver.page_source
finally:
    # Close the browser whether or not the capture succeeded
    driver.quit()

def _text_or_none(element):
    """Return the element's text with surrounding whitespace stripped.

    Returns None when the lookup found nothing (``element`` is None), so
    missing fields show up as empty values instead of raising.
    """
    return element.text.strip() if element is not None else None


# Parse the captured HTML and collect one record per job card
soup = BeautifulSoup(html_content, 'html.parser')
job_entries = soup.find_all('div', class_='base-search-card--link')

job_data = []
# (Job Title, Company Name) pairs already recorded; a set keeps the
# duplicate check constant-time instead of rescanning job_data per card
seen_jobs = set()

for job_entry in job_entries:
    try:
        job_title = _text_or_none(job_entry.find('h3', class_='base-search-card__title'))
        company_name = _text_or_none(job_entry.find('h4', class_='base-search-card__subtitle'))
        location = _text_or_none(job_entry.find('span', class_='job-search-card__location'))
        salary_info = _text_or_none(job_entry.find('span', class_='job-search-card__salary-info'))
        benefits_text = _text_or_none(job_entry.find('span', class_='result-benefits__text'))

        # Accept both the plain and the "--new" variant of the date class.
        # NOTE(review): the plain class is assumed to mark postings that are
        # not flagged as new -- confirm against the live markup.
        time_posted = _text_or_none(job_entry.find(
            'time',
            class_=['job-search-card__listdate', 'job-search-card__listdate--new'],
        ))

        # .get() yields None for an anchor without href instead of raising
        # and discarding the whole entry
        link_elem = job_entry.find('a', class_='base-card__full-link')
        link = link_elem.get('href') if link_elem is not None else None

        # NOTE(review): this class name looks like it belongs to the job
        # detail page rather than the search card, so this may always be
        # None here -- confirm.
        job_type = _text_or_none(
            job_entry.find('span', class_='description__job-criteria-text--criteria')
        )
    except Exception as e:
        # Best effort: report the problem and move on to the next card
        print(f"Error processing job entry: {e}")
        continue

    # Skip cards whose Job Title and Company Name combination was already recorded
    job_key = (job_title, company_name)
    if job_key in seen_jobs:
        continue
    seen_jobs.add(job_key)

    job_data.append({
        'Job Title': job_title,
        'Company Name': company_name,
        'Location': location,
        'Salary Info': salary_info,
        'Benefits': benefits_text,
        'Time Posted': time_posted,
        'Link': link,
        'Job Type': job_type,
    })

# Columns that together identify a job posting
DEDUP_KEYS = ['Job Title', 'Company Name']


def _drop_duplicate_jobs(frame):
    """Drop rows that repeat a (Job Title, Company Name) pair, keeping the first.

    The frame is returned unchanged when it lacks either key column (for
    example a frame built from a sheet with no records), because
    ``drop_duplicates(subset=...)`` raises KeyError in that case.
    """
    if not set(DEDUP_KEYS).issubset(frame.columns):
        return frame
    return frame.drop_duplicates(subset=DEDUP_KEYS, ignore_index=True)


# Convert the job data to a DataFrame
df = pd.DataFrame(job_data)

gc = gspread.service_account(filename="C:\\Users\\Interviewbit\\Downloads\\sheet-integration-408606-ecd03582303b.json")
# Open the 'Automation' Google Sheet
spreadsheet = gc.open('Automation')
# Select the worksheet you want to update
worksheet = spreadsheet.worksheet('Jobs')

# Read existing data from the Google Sheet
existing_data = _drop_duplicate_jobs(pd.DataFrame(worksheet.get_all_records()))

# Combine existing data with new data. Empty frames are left out of the
# concat, and the result is de-duplicated so that re-running the script does
# not append jobs the sheet already holds (existing rows win over new ones).
frames = [frame for frame in (existing_data, df) if not frame.empty]
df_combined = _drop_duplicate_jobs(pd.concat(frames, ignore_index=True)) if frames else pd.DataFrame()

if df_combined.empty:
    # Nothing scraped and nothing stored: leave the sheet as it is
    print('No job data to write. Google Sheet left unchanged.')
else:
    # resize=True fits the worksheet to the data, so rows left over from a
    # longer previous version do not linger below the new content
    set_with_dataframe(worksheet, df_combined, resize=True)
    print('Data written to Google Sheet')


# Output:
# Location set to: England
# Data written to Google Sheet
