# Indeed Job Scraper

In [1]:
# Learned this from Izzy Analytics's YouTube channel; the video link is below. Cheers!
#https://www.youtube.com/watch?v=eN_3d4JrL_w&list=LL&index=4

import csv
from datetime import datetime
import requests
from bs4 import BeautifulSoup


def get_url(position, location, days):
    """Build the Indeed job-search URL for a query.

    Each value is percent-encoded with ``quote_plus`` so that spaces become
    ``+`` and reserved characters (``&``, ``+``, ``,``, ``#`` ...) cannot
    break or alter the query string.

    Args:
        position: Job title or keywords to search for.
        location: Location text to search in.
        days: Value sent as the ``fromage`` query parameter (str or int).

    Returns:
        The search URL as a string.
    """
    from urllib.parse import quote_plus

    template = 'https://www.indeed.com/jobs?q={}&l={}&fromage={}'
    return template.format(
        quote_plus(position),
        quote_plus(location),
        quote_plus(str(days)),
    )


def _stripped_text(tag):
    """Return the whitespace-stripped text of *tag*, or '' if the tag is missing."""
    return tag.text.strip() if tag is not None else ''


def get_record(card):
    """Extract the job data from a single search-result card.

    Any element that is absent from the card yields an empty string for its
    field, so one incomplete card does not abort the whole scrape.

    Args:
        card: Parsed result card. It must expose ``.h2`` and
            ``.find(name, css_class)`` the way a BeautifulSoup tag does.

    Returns:
        A tuple ``(job_title, company, job_location, post_date, today,
        summary, salary, job_url)``. ``today`` is the extraction date
        formatted as ``YYYY-MM-DD``.
    """
    # The title and the relative link both live on the anchor inside <h2>.
    heading = card.h2
    link = heading.a if heading is not None else None
    job_title = (link.get('title') or '') if link is not None else ''
    href = link.get('href') if link is not None else None
    job_url = 'https://www.indeed.com' + href if href else ''

    company = _stripped_text(card.find('span', 'company'))

    location_tag = card.find('div', 'recJobLoc')
    job_location = ''
    if location_tag is not None:
        job_location = location_tag.get('data-rc-loc') or ''

    # The posting date is kept exactly as shown on the page (not stripped).
    date_tag = card.find('span', 'date')
    post_date = date_tag.text if date_tag is not None else ''

    today = datetime.today().strftime('%Y-%m-%d')
    summary = _stripped_text(card.find('div', 'summary')).replace('\n', ' ')

    # Salary is not listed for every job.
    salary = _stripped_text(card.find('span', 'salaryText'))

    return (job_title, company, job_location, post_date, today, summary, salary, job_url)


def main(position, location, days, output_path='jobs.csv'):
    """Scrape every result page for a search and save the records to CSV.

    Starting from the URL built by ``get_url``, each page is downloaded, all
    ``jobsearch-SerpJobCard`` cards are converted with ``get_record``, and the
    link labelled "Next" is followed until there is none. Whatever has been
    collected is then written to ``output_path`` together with a header row.

    Args:
        position: Job title or keywords to search for.
        location: Location text to search in.
        days: Value passed through to ``get_url`` for the ``fromage`` parameter.
        output_path: Destination CSV file. Defaults to ``'jobs.csv'``, the
            file name the rest of the notebook reads.

    Raises:
        requests.RequestException: If a request fails at the network level
            or exceeds the timeout.
    """
    records = []
    visited = set()
    url = get_url(position, location, days)

    # extract the job data
    # Stop when there is no next page, or when a page links back to one
    # already fetched (which would otherwise loop forever).
    while url and url not in visited:
        visited.add(url)
        # Without a timeout a stalled connection would hang indefinitely.
        response = requests.get(url, timeout=30)
        if not response.ok:
            # Report the failure instead of silently parsing an error page,
            # but still save the records gathered from earlier pages.
            print('Stopping: {} returned HTTP {}'.format(url, response.status_code))
            break

        soup = BeautifulSoup(response.text, 'html.parser')
        records.extend(
            get_record(card)
            for card in soup.find_all('div', 'jobsearch-SerpJobCard')
        )

        next_link = soup.find('a', {'aria-label': 'Next'})
        next_href = next_link.get('href') if next_link is not None else None
        url = 'https://www.indeed.com' + next_href if next_href else None

    # save the job data
    with open(output_path, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['JobTitle', 'Company', 'Location', 'PostDate', 'ExtractDate', 'Summary', 'Salary', 'JobUrl'])
        writer.writerows(records)

# Execution Code

#### Try different job titles, locations, and posting windows (in days) yourself

In [2]:
# Search for business intelligence analyst jobs in Massachusetts posted within 7 days.
# Running this cell saves the results as jobs.csv in your local folder.
main(position='business intelligence analyst', location='Massachusetts', days='7')

# Getting Detailed Job Descriptions

#### The job summaries in the previous file are shortened. If you want the full job descriptions, the code below fetches them. Note that it does not use Indeed's API, so Indeed may block it if it makes too many requests.

In [3]:
# Importing everything again in case you want to move this part into a separate notebook
import csv
from datetime import datetime
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import time
import random

In [4]:
# Load the scraped results and keep only the posting URLs for the next step.
# This reads the jobs.csv file you just scraped.
df = pd.read_csv('jobs.csv')
urldf = df.loc[:, 'JobUrl']
urldf.head()

0    https://www.indeed.com/pagead/clk?mo=r&ad=-6NY...
1    https://www.indeed.com/pagead/clk?mo=r&ad=-6NY...
2    https://www.indeed.com/pagead/clk?mo=r&ad=-6NY...
3    https://www.indeed.com/pagead/clk?mo=r&ad=-6NY...
4    https://www.indeed.com/rc/clk?jk=225bdb0759ee1...
Name: JobUrl, dtype: object

In [5]:
# Count how many job URLs were loaded from the CSV file
len(urldf)

44

In [6]:
# Try a single posting first: download the page, locate the description
# container, and show its text with surrounding whitespace removed.
sample_url = 'https://www.indeed.com/rc/clk?jk=225bdb0759ee17be&fccid=2f6f2e23480ffded&vjs=3'
response_fulljd = requests.get(sample_url)
soup_fulljd = BeautifulSoup(response_fulljd.text, 'html.parser')
jd = soup_fulljd.find('div', class_='jobsearch-jobDescriptionText')
fjd = jd.get_text().strip()
fjd

'FINANCE\nANALYST, BUSINESS INTELLIGENCE\nWestford, Massachusetts, United States of America\n\nSPEED & SPIRIT is what we look for in our candidates, defined by some simple values that inspire us to BE DRIVEN in our performance, BE VIBRANT in our sporting legacy, BE TOGETHER in our team spirit, and BE YOU to let our individual talent and experience shine. Applying for a job at PUMA is easy and all genders are welcome. Simply click APPLY ONLINE and follow the steps to upload your application.\nYOUR MISSION\nConceptualize, develop, implement, and maintain reporting solutions for monitoring and analyzing business performance for wide audiences\nMaintain Power BI visualization suite including report access, building new reports, dashboards, and visualization solutions, and app creation while adhering to reporting standards and best practices\nWork with end users by conducting ad hoc reporting, supporting special projects, leveraging data to identify and tell a story around the root cause of

In [7]:
# Fetch every posting and collect its full description as one line of text.
# joblist must stay the same length and order as urldf, because the two are
# later combined row by row. A posting whose description block is missing
# (blocked, expired, or different page layout) therefore contributes ''
# instead of raising or being skipped.
joblist = []
for u in urldf:
    # Without a timeout a stalled connection would hang the whole loop.
    response_fulljd = requests.get(u, timeout=30)
    soup_fulljd = BeautifulSoup(response_fulljd.text, 'html.parser')
    jd = soup_fulljd.find('div', 'jobsearch-jobDescriptionText')
    if jd is None:
        fjd_clean = ''
    else:
        fjd = jd.text.strip()
        fjd_clean = fjd.replace('\n', ' ')
    joblist.append(fjd_clean)
    # Pause a random short interval between requests to reduce the chance
    # of being blocked for accessing the site too often.
    time.sleep(random.uniform(0.5, 1.5))

In [8]:
# Count how many job descriptions were collected
len(joblist)

44

In [9]:
# Put the descriptions in a DataFrame, attach the matching URLs as a second
# column, and write both columns to a CSV file (without the index).
joblist_df = pd.DataFrame({'Full Job Description': joblist}).assign(JobUrl=urldf)
joblist_df.to_csv('Full Job Descriptions.csv', index=False)

In [10]:
# "Full Job Descriptions.csv" has been saved to your local folder.
# Use Excel's Wrap Text feature to read the descriptions clearly.