# INDEED JOB SCRAPER

Let's start off by loading the libraries we will require.

In [1]:
from time import sleep
from selenium import webdriver
from selenium.common.exceptions import ElementNotVisibleException
from selenium.common.exceptions import NoSuchElementException
import pandas as pd
import random
from collections import Counter
import nltk

We could scrape the individual links from the job posts and use Beautiful Soup to extract the details and job description.
However, extracting data that way would be difficult, because each link redirects us to a unique page on the company's own website.
Instead we will use Selenium and click on each individual post. After a job post is clicked, JavaScript runs and we are taken to Indeed's version of the job post instead of the one on the company's page.

Let's define a function which will scrape individual job posts from indeed.com.

In [1]:
def scrap_indeed(search, max_search=100, save_csv=True):
    """Scrape job posts for a search term from indeed.com using Selenium.

    Each job card on a results page is clicked, the post that opens in a new
    browser tab is read, and one row per post is collected.

    Args:
        search: Keyword/designation to search for, e.g. ``'data scientist'``.
        max_search: Upper bound on the number of posts to visit. Results are
            fetched 50 per page, so ``max_search // 50`` pages are scraped.
        save_csv: When true, the result is written to ``'<search>.csv'`` in
            the current working directory.

    Returns:
        pandas.DataFrame with the columns Job_Title, Company,
        Estimated_Salary, Date_Posted, Link_Job_Company, Link_Job_Indeed,
        Location, Rating_In_Stars and Job_Description.
    """
    # Local imports keep this cell self-contained.
    from urllib.parse import quote_plus
    from selenium.common.exceptions import WebDriverException

    results_per_page = 50
    max_retries = 5
    columns = ['Job_Title', 'Company', 'Estimated_Salary', 'Date_Posted', 'Link_Job_Company',
               'Link_Job_Indeed', 'Location', 'Rating_In_Stars', 'Job_Description']

    # URL-encode the keyword: spaces become '+', and characters such as '+'
    # or '&' (e.g. in 'C++') are escaped instead of corrupting the query.
    search_term = quote_plus(search)
    url_template = 'https://www.indeed.com/jobs?q={}&limit={}&radius=25&start={}'

    # Rows are gathered in a plain list and turned into a dataframe once at
    # the end; growing a dataframe row by row is quadratic and
    # DataFrame.append no longer exists in pandas 2.x.
    records = []

    # Start the browser and load the first results page
    browser = webdriver.Chrome('C:/Program Files (x86)/chromedriver.exe')
    try:
        browser.get(url_template.format(search_term, results_per_page, 0))

        # A popup will appear when we finish loading the page. We cannot proceed without closing it.
        try:
            browser.find_element_by_id('prime-popover-close-button').click()
        except (ElementNotVisibleException, NoSuchElementException):
            print('No Popup')

        # Rating is specified as a width in the HTML. The width of the
        # 'ratings' element corresponds to the full five stars.
        try:
            max_rating = browser.find_element_by_class_name('ratings').size['width']
        except NoSuchElementException:
            # Without the reference width no rating can be computed.
            max_rating = 0

        n_pages = max_search // results_per_page

        # Loop through the pages
        for page in range(n_pages):

            # All the job posts have class 'row result clickcard' except the last one,
            # which has class 'lastRow row result clickcard'.
            job_elements = browser.find_elements_by_xpath("//div[@class='row result clickcard']")
            try:
                job_elements.append(
                    browser.find_element_by_xpath("//div[@class='lastRow row result clickcard']"))
            except NoSuchElementException:
                pass

            # Loop through the individual job posts
            for post_no, job in enumerate(job_elements, start=1):

                # The link to job post on company's website
                redirect_to_comp_site = job.find_element_by_tag_name('a').get_attribute('href')

                # Find estimated salary if mentioned, else assign NA
                try:
                    estimated_salary = job.find_element_by_class_name('no-wrap').text
                except NoSuchElementException:
                    estimated_salary = 'NA'

                # Get the date when job was posted. Usually in N days ago.
                try:
                    date = job.find_element_by_class_name('date').text
                except NoSuchElementException:
                    date = 'NA'

                # Click on the job post and switch to the tab it opened
                job.click()
                browser.switch_to.window(browser.window_handles[1])

                # Sleep for minimum 3 seconds because we dont want to create unnecessary load on Indeed's servers
                sleep(3 + random.randint(0, 3))

                # Link to the Indeed's version of job post
                indeed_link = browser.current_url

                # Selenium might start scraping before the page finishes loading, or we
                # might hit a '404 : Job not found' page. Retry a few times, waiting
                # between attempts so a slow page gets a chance to finish loading.
                # 'details' stays None when every attempt failed, so a failed post is
                # skipped instead of crashing or re-recording the previous post's values.
                details = None
                for attempt in range(1, max_retries + 1):
                    try:
                        details = {
                            'Job_Title': browser.find_element_by_class_name('jobtitle').text,
                            'Job_Description': browser.find_element_by_id('job_summary').text,
                            'Company': browser.find_element_by_class_name('company').text,
                            'Location': browser.find_element_by_class_name('location').text,
                        }
                        break
                    except NoSuchElementException:
                        if attempt == max_retries:
                            print('Unable to fetch data. Skipping Page.....')
                        else:
                            print('Unable to fetch data. Retrying....')
                            sleep(1)

                if details is not None:
                    # The 'rating' class text rounds the number of stars, so read the
                    # width from its inline style and convert it to stars ourselves.
                    try:
                        rtng_style_str = browser.find_element_by_class_name('rating').get_attribute('style')
                        rtng_width = float(rtng_style_str.split(' ')[1][:-3])
                        rating = (rtng_width * 5) / max_rating if max_rating else 'NA'
                    except (NoSuchElementException, IndexError, ValueError):
                        # No rating shown, or the style string is not in the expected form.
                        rating = 'NA'

                    # For debugging purposes lets log the job post scrapped
                    print('Completed Post {} of Page {} - {}'.format(post_no, page + 1, details['Job_Title']))

                    details.update({
                        'Link_Job_Company': redirect_to_comp_site,
                        'Estimated_Salary': estimated_salary,
                        'Date_Posted': date,
                        'Link_Job_Indeed': indeed_link,
                        'Rating_In_Stars': rating,
                    })
                    records.append(details)

                # Close the job tab and go back to the results page
                browser.close()
                browser.switch_to.window(browser.window_handles[0])

            # Move on to the next results page, unless this was the last one;
            # loading a page that is never scraped only adds load on Indeed.
            if page + 1 < n_pages:
                browser.get(url_template.format(search_term, results_per_page,
                                                (page + 1) * results_per_page))
                print('Moving on to page ' + str(page + 2))
                sleep(2)

                # A popover may appear when we go to the next page; closing it is best-effort.
                try:
                    browser.find_element_by_id('popover-x-button').click()
                except WebDriverException:
                    print('No Newsletter Popup Found')
    finally:
        # Always shut the browser and driver process down, even on errors.
        browser.quit()

    data = pd.DataFrame(records, columns=columns)

    # Save dataframe to csv
    if save_csv:
        data.to_csv(search + '.csv')

    return data
        

    
    

I scraped 1500 job posts for each of the following search terms.

In [None]:
# Scrape up to 1500 posts per search term; with the default save_csv=True
# each run writes '<term>.csv'.
for term in (
    'machine learning',         # Scrapped on 27th Jan
    'artificial intelligence',  # Scrapped on 27th Jan
    'data scientist',           # Scrapped on 28th Jan
):
    scrap_indeed(term, 1500)

In [None]:
# Load the three scraped result sets.
# scrap_indeed() saves to '<search>.csv' using the search term exactly as
# typed (lower case here), so the names must match that spelling to be found
# on case-sensitive file systems. The first CSV column is the dataframe index
# written by to_csv(), so read it back as the index rather than as a stray
# 'Unnamed: 0' data column.
ds = pd.read_csv('data scientist.csv', index_col=0)
ml = pd.read_csv('machine learning.csv', index_col=0)
ai = pd.read_csv('artificial intelligence.csv', index_col=0)

## Cleaning the data

In [None]:
# Stack the three result sets into one frame with a fresh 0..n-1 index.
ds_main = pd.concat(
    objs=[ds, ml, ai],
    ignore_index=True,
)

In [None]:
# Flag every row whose title, company and description repeat an earlier row
# (the first occurrence is not flagged), then tally flagged vs. unflagged rows.
duplicated_rows_bool = ds_main.duplicated(
    subset=['Job_Title', 'Company', 'Job_Description'],
    keep='first',
)
duplicated_rows_bool.value_counts()

In [None]:
# Remove the repeated posts in place, keeping the first occurrence of each
# (title, company, description) combination.
ds_main.drop_duplicates(
    subset=['Job_Title', 'Company', 'Job_Description'],
    keep='first',
    inplace=True,
)


Let's start by removing the stop words from the Job Description column.


In [None]:
# English stop words from NLTK, joined into a single alternation that matches
# any of them as a whole word.
stopwords = nltk.corpus.stopwords.words('english')
RE_stopwords = r'\b(?:' + '|'.join(stopwords) + r')\b'

# Lower-casing is currently switched off:
#ds['Job_Description'] = ds['Job_Description'].str.lower()

# NOTE(review): this cleans `ds` only, not the de-duplicated `ds_main` built
# above - confirm which frame is intended.
# Replacements, applied as regular expressions:
#   '|' and '/' -> space ('/' usually stands for "or")
#   bullet      -> removed
#   stop words  -> space
#   two spaces  -> one space
#   newline     -> ' NextLineHere ' marker
ds['Job_Description'] = ds['Job_Description'].replace(
    to_replace=[r'\|', '/', '•', RE_stopwords, '  ', '\n'],
    value=[' ', ' ', '', ' ', ' ', ' NextLineHere '],
    regex=True,
)


WORK IN PROGRESS

In [None]:
# Regex patterns for the languages to count in the job descriptions.
# Raw strings are used so that regex escapes such as \w and \+ reach the
# regex engine as written; in a normal string literal '\w' is an invalid
# escape sequence that Python only passes through with a warning.
#   [^\w]      = any single non-word character (^ = not, \w = word character),
#                so R and SAS only match when delimited on both sides.
#   java[^\w]  = "java" followed by a non-word character, i.e. Java but not JavaScript.
languages_list = [r'C\+\+', r'[^\w]R[^\w]', r'java[^\w]', 'Python', r'[^\w]SAS[^\w]', 'SQL']
# ds['R'] = ds['Job_Description'].str.contains(' R[^\w]', case=False).astype(int)


# Count the descriptions mentioning each language (case-insensitive).
# na=False counts missing descriptions as "no match" instead of producing NaN,
# which cannot be converted to an integer.
for pattern in languages_list:
    total_count = ds['Job_Description'].str.contains(pattern, case=False, na=False).sum()
    print(pattern + '  ' + str(total_count))