### Code to extract data from glassdoor (June 2023)


Things to improve:
- create new file by itself, if does not exist in the dir.

In [1]:
# Author: Priti Gupta
# Date: June 8th, 2023
# Description: Scrapping data from glassdoor to analyse salaries of data science positions in India
# GitHub: https://github.com/PritiG1/DS-SalaryPredictor



# Import libraries
from selenium import webdriver
from selenium.webdriver.common.by import By
import time
import pandas as pd
import urllib.parse
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException
import csv
import os
import re

def text_clean(string):
    """
    Function to clean the text by removing a specific pattern using regular expressions.
    """
    pattern = r"\d+(\.\d+)? ★$"  # Regex pattern to match the substring "3.9 ★" at the end of the string
    stripped_string = re.sub(pattern, "", string)  # Remove the matched substring using regex substitution
    return stripped_string

def scraper_naukri(keyword_job):
    """
    Function to scrape job data from Glassdoor website based on a given job keyword.
    """
    # Encode the keyword for URL compatibility
    keyword_job_encoded = urllib.parse.quote(keyword_job, safe='-')

    # Construct the URL with the encoded keyword
    url = 'https://www.glassdoor.co.in/Job/india-' + keyword_job_encoded + '-jobs-SRCH_IL.0,5_IN115_KO6,20.htm?includeNoSalaryJobs=true'
    
    # Specify the path to the Chrome WebDriver
    path = '/Users/pritigupta/Desktop/chromedriver_mac64/chromedriver'
    
    # Create a Chrome WebDriver instance
    driver = webdriver.Chrome(path)
    
    # Open the URL in the WebDriver
    driver.get(url)
    
    # Wait for the page to load (sleep for 15 seconds)
    time.sleep(15)

    # Check if the CSV file is empty to determine column names
    with open('datascientist_salary_glassdoor.csv', 'r') as csvfile:
        reader = csv.reader(csvfile)
        file_empty = csvfile.tell() == 0
        if file_empty:
            column_names = ['Title', 'Company', 'Location', 'Salary']
        else:
            column_names = next(reader)

    # Open the CSV file in append mode to write job data
    with open('datascientist_salary_glassdoor.csv', 'a', newline='') as csvfile:
        writer = csv.writer(csvfile)

        # Write column names if the file is empty
        if file_empty:
            writer.writerow(column_names)

        # Iterate over job pages (up to 28 pages)
        for _ in range(1, 29):
            # Find all job elements on the page
            all_jobs = driver.find_elements(By.XPATH, '//article[@id="MainCol"]')

            # Iterate over each job element
            for job in all_jobs:
                job_elements = job.find_elements(By.XPATH, './/li[@class="react-job-listing css-1kjejvf eigr9kq3"]')
                for element in job_elements:
                    try:
                        # Extract job details (company, title, location, salary) and write to CSV file
                        company_name = element.find_element(By.XPATH, './/div[@class="d-flex align-items-center"][1]').text
                        writer.writerow([
                            element.find_element(By.XPATH, './/div[@class="job-title mt-xsm"]').text,
                            text_clean(company_name),
                            element.find_element(By.XPATH, './/div[@class="location mt-xxsm"]').text,
                            element.find_element(By.XPATH, './/div[@class="salary-estimate"]').text
                        ])
                    except NoSuchElementException:
                        continue
        
            # Wait for the "Next" button to be clickable
            wait = WebDriverWait(driver, 15)
            next_button = wait.until(EC.element_to_be_clickable((By.XPATH, './/button[@class="nextButton job-search-1iiwzeb e13qs2072"]')))
            
            # Click the "Next" button
            next_button.click()
            
            # Wait for the page to load (sleep for 10 seconds)
            time.sleep(10)
            
            # Check for and close any pop-up windows
            try:
                pop_up = driver.find_element(By.XPATH, './/span[@class="SVGInline modal_closeIcon"]')
                pop_up.click()
                time.sleep(2)
            except NoSuchElementException:
                continue

    # Quit the WebDriver
    driver.quit()

    # Return True to indicate successful execution
    return True

# Call the scraper function with the desired job keyword
data = scraper_naukri('Data Scientist')


  driver = webdriver.Chrome(path)
