# Indeed Web Scraping

### Learning + Preliminary Testing

In [1]:
from bs4 import BeautifulSoup
import requests

In [2]:
# Search endpoint: every job (empty query) around Bishops Stortford, Hertfordshire.
url = 'https://uk.indeed.com/jobs?q=&l=Bishops+Stortford%2C+Hertfordshire'
# Query-string fragments appended to the base search URL.
sort_page = '&sort=date'
page_number = '&start='  # defined here but not used by this first request

# Download the date-sorted first page and parse it with the lxml backend.
response = requests.get(url + sort_page)
html_text = response.text

bs = BeautifulSoup(html_text, 'lxml')

In [3]:
# Each result card on the parsed page is a <table class="jobCard_mainContent">.
jobs = bs.find_all('table', class_='jobCard_mainContent')

job_titles = []
for job in jobs:
    # find() returns None when no <h2> carries this exact class string
    # (presumably cards that are not flagged as new - TODO confirm against
    # the live markup); skip those instead of crashing on `.text`.
    heading = job.find('h2', class_='jobTitle jobTitle-color-purple jobTitle-newJob')
    if heading is None:
        continue

    job_title = heading.text
    # The heading text starts with a "new" badge label. Remove only that
    # leading label: replace('new', '') would also eat "new" inside a title
    # (e.g. "Renewals Advisor" -> "Reals Advisor").
    if job_title.startswith('new'):
        job_title = job_title[len('new'):]

    job_titles.append(job_title)
job_titles

['Library Assistant',
 'Cargo Operative',
 'Retail Assistant',
 'Junior Legal Secretary',
 'Waiting Staff',
 'Assistant in Training',
 'Trade Counter / Yard Person',
 'Customer Assistant Cheshunt Womenswear',
 'Night Care Worker',
 'Store Assistant - Days',
 'Customer Assistant Welwyn',
 'Delivery Assistant - Permanent (Delivery/Stockroom Departmen...',
 'Customer Assistant Welwyn Garden City',
 'Sales Assistant (12 Hours) - Westfield Stratford',
 'General Labourer']

In [5]:
# Collect job titles from the first ten result pages (offsets 0, 10, ..., 90).
html = url+sort_page
job_titles = []
page_sort = '&sort=date'  # not used by this cell; kept in case other cells read it
page_number = '&start='
for step in range(0, 100, 10):
    # Use a separate name for the per-page address: rebinding the global `url`
    # here would make `html` grow a stale '&start=' suffix when the cell is re-run.
    page_url = html + page_number + str(step)
    html_text = requests.get(page_url).text
    bs = BeautifulSoup(html_text, 'lxml')
    jobs = bs.find_all('table', class_='jobCard_mainContent')
    for job in jobs:
        # find() returns None when no <h2> has this exact class string; skip
        # such cards rather than raising AttributeError on `.text`.
        heading = job.find('h2', class_='jobTitle jobTitle-color-purple jobTitle-newJob')
        if heading is None:
            continue
        job_title = heading.text
        # Strip only the leading "new" badge label; replace('new', '') would
        # also delete "new" occurring inside the title itself.
        if job_title.startswith('new'):
            job_title = job_title[len('new'):]
        job_titles.append(job_title)

job_titles

['Barista - Chelmsford Meadows - 20 Hours - PT',
 'Kitchen Assistant - Apprenticeships availible',
 'Scrum Master - 24 months FTC',
 'Office Administrator',
 'Van Drivers',
 'Senior Care Assistant (Days)',
 'Cleaner',
 'Catering Assistant - Essex',
 'Customer Service Associate',
 'Warranty Administrator',
 '1 - 2 - 1 Behaviour Support Assistant',
 'Marketing Executive - Events',
 'Customer Sales Advisor - 20 hours per week',
 'Internship Opportunities: Cloud Infrastructure – Machine Lea...',
 'First Aid Trainer – Hertford',
 '1 - 2 - 1 Behaviour Support Assistant',
 'Marketing Executive - Events',
 'Customer Sales Advisor - 20 hours per week',
 'Internship Opportunities: Cloud Infrastructure – Machine Lea...',
 'First Aid Trainer – Hertford',
 'Internal Sales and Admin Support',
 'Tersus Administrator',
 'Duty Worker',
 'Head of Modern Foreign Languages',
 'One Stop - Customer Service Assistant',
 'Administration Assistant',
 'General Assistant',
 'Fork Lift Driver',
 'Customer Assista

## Multiple Page Web Scraper with Keyword Identifier and Score

In [1]:
from bs4 import BeautifulSoup
import requests
import re
import pandas as pd
from datetime import datetime

class JobFinder:
    """Scrape Indeed UK search results, score each listing by keyword, and export to Excel.

    Intended call order: ``webscrape()`` -> ``deepscrape()`` -> ``score()`` ->
    ``displayinfo()``. Results are stored in parallel lists on the instance
    (``job_titles``, ``companies``, ``job_urls``, ``job_snippets``,
    ``job_descriptions``, ``scores``, ``keywords``, ``descriptions``).
    """

    # Prefix for the relative hrefs found on the results page.
    BASE_URL = 'https://uk.indeed.com'
    # Substrings identifying the anchors that are collected as job links.
    JOB_LINK_MARKERS = ('/rc', '/company', '/pagead')
    # At most this many result pages are requested, each offset by RESULTS_PER_PAGE.
    MAX_PAGES = 10
    RESULTS_PER_PAGE = 10
    # Seconds before an HTTP request is abandoned, so a stalled connection cannot hang the run.
    REQUEST_TIMEOUT = 30
    # Keyword lists used when the caller does not supply their own.
    DEFAULT_FILTER_IN = ('part time', 'artist', 'artistic', 'bookshop', 'garden', 'creative', 'books')
    DEFAULT_FILTER_OUT = ('receptionist', 'restaurant')

    def __init__(self, url='https://uk.indeed.com/jobs?q=', location='&l=Bishop%27s+Stortford',
                 radius='&radius=10', age='&fromage=7', sort='&sort=date', page='&start=',
                 filter_in=None, filter_out=None, threshold=2):
        """Store the search URL fragments and the scoring configuration.

        Args:
            url: Base search URL, ending in the (empty) query parameter.
            location, radius, age, sort, page: Query-string fragments that are
                concatenated onto ``url`` in that order.
            filter_in: Keywords that each add 1 to a listing's score.
                ``None`` selects ``DEFAULT_FILTER_IN``.
            filter_out: Keywords that each subtract 1 from a listing's score.
                ``None`` selects ``DEFAULT_FILTER_OUT``.
            threshold: Minimum score for a listing to get a description in the export.
        """
        self.url = url
        self.location = location
        self.radius = radius
        self.age = age
        self.sort = sort
        self.page = page
        # Copy into fresh lists so instances never share (or mutate) a default list.
        self.filter_in = list(self.DEFAULT_FILTER_IN if filter_in is None else filter_in)
        self.filter_out = list(self.DEFAULT_FILTER_OUT if filter_out is None else filter_out)
        self.scores = []
        self.threshold = threshold
        self.job_titles = []
        self.companies = []
        self.job_urls = []
        self.job_descriptions = []
        self.descriptions = []
        self.job_snippets = []
        self.keywords = []

    @staticmethod
    def _clean_title(raw_title):
        """Return the heading text without its leading "new" badge label."""
        # str.strip('new') would remove any of the characters n/e/w from BOTH
        # ends (e.g. "Warehouse Operative" -> "Warehouse Operativ"), so remove
        # the literal prefix instead.
        if raw_title.startswith('new'):
            raw_title = raw_title[len('new'):]
        return raw_title.strip()

    @staticmethod
    def _current_page(bs):
        """Return the highlighted page number from the pagination list, or None if absent."""
        pagination = bs.find('ul', {'class': 'pagination-list'})
        current = pagination.find('b') if pagination is not None else None
        if current is None:
            return None
        try:
            return int(current.text)
        except ValueError:
            return None

    def webscrape(self):
        """Walk the search result pages and collect titles, companies, links and snippets.

        Appends to ``job_titles``, ``companies``, ``job_urls`` and
        ``job_snippets``. Stops after ``MAX_PAGES`` pages, when the site serves
        the same page number twice (the offset ran past the last page), or when
        a page has no pagination list or no job-card container.
        """
        previous_page = None
        # Normalise the page fragment to end in '=' so the offset can be appended.
        page_param = self.page.split('=')[0] + '='
        search_url = self.url + self.location + self.radius + self.age + self.sort + page_param

        for step in range(0, self.MAX_PAGES * self.RESULTS_PER_PAGE, self.RESULTS_PER_PAGE):
            html_text = requests.get(search_url + str(step), timeout=self.REQUEST_TIMEOUT).text
            bs = BeautifulSoup(html_text, 'lxml')

            page_number = self._current_page(bs)
            if page_number is not None and page_number == previous_page:
                break

            job_cards = bs.find('div', {'id': 'mosaic-provider-jobcards'})
            if job_cards is None:
                break

            for job in job_cards.find_all('table', class_='jobCard_mainContent'):
                # Match on the single 'jobTitle' class: the full class string
                # 'jobTitle jobTitle-color-purple jobTitle-newJob' only matches
                # headings carrying exactly those classes, and find() returning
                # None made `.text` raise. Placeholders keep the lists aligned.
                heading = job.find('h2', class_='jobTitle')
                self.job_titles.append(self._clean_title(heading.text) if heading is not None else '')
                company = job.find('span', class_='companyName')
                self.companies.append(company.text if company is not None else '')

            for card in job_cards.find_all('a', href=True):
                href = card['href']
                if any(marker in href for marker in self.JOB_LINK_MARKERS):
                    self.job_urls.append(self.BASE_URL + href)

            for snippet in job_cards.find_all('div', class_='job-snippet'):
                self.job_snippets.append(snippet.text.replace('\n', ' '))

            if page_number is None:
                # No pagination list: treat this as the only page of results.
                break
            previous_page = page_number

    def deepscrape(self):
        """Fetch every collected job URL and store its full description text.

        Rebuilds ``job_descriptions`` with one entry per ``job_urls`` entry; a
        page without a ``jobDescriptionText`` element contributes ``''`` so the
        list stays aligned with ``job_urls``.
        """
        descriptions = []
        for url in self.job_urls:
            html_text = requests.get(url, timeout=self.REQUEST_TIMEOUT).text
            bs = BeautifulSoup(html_text, 'lxml')
            description = bs.find('div', {'id': 'jobDescriptionText'})
            descriptions.append(description.text if description is not None else '')
        self.job_descriptions = descriptions

    def score(self):
        """Score each job description against the include/exclude keyword lists.

        Each ``filter_in`` keyword found (case-insensitively) adds 1; each
        ``filter_out`` keyword found subtracts 1. Keywords are matched as
        literal text. Rebuilds ``scores``, ``keywords`` and ``descriptions``
        from scratch, so calling the method again does not duplicate entries.
        """
        # Escape the keywords so characters such as '+' or '(' are matched
        # literally instead of being interpreted as regex syntax, and compile
        # once rather than per description.
        include = [(kw, re.compile(re.escape(kw), re.IGNORECASE)) for kw in self.filter_in]
        exclude = [re.compile(re.escape(kw), re.IGNORECASE) for kw in self.filter_out]

        self.scores = []
        self.keywords = []
        self.descriptions = []

        for index, desc in enumerate(self.job_descriptions):
            found = [kw for kw, pattern in include if pattern.search(desc)]
            penalty = sum(1 for pattern in exclude if pattern.search(desc))
            score = len(found) - penalty

            self.scores.append(score)
            self.keywords.append(found)

            if score >= self.threshold:
                # Snippets are collected separately from descriptions, so guard
                # against there being fewer snippets than descriptions.
                snippet = self.job_snippets[index] if index < len(self.job_snippets) else ''
                self.descriptions.append(f'{snippet}  Keywords found: {found}')
            else:
                self.descriptions.append('')

    def displayinfo(self):
        """Export the collected listings, sorted by score, to a dated Excel file.

        Writes ``Job Listings DD-MM-YYYY.xlsx`` (today's date) relative to the
        current working directory. The column lists must all have the same
        length, otherwise ``pandas.DataFrame`` raises ``ValueError``.
        """
        description_column = f'Description for Score >= {self.threshold}'
        data = {
            'Job Title': self.job_titles,
            'Company': self.companies,  # header was previously misspelt 'Comapany'
            'Filter Score': self.scores,
            description_column: self.descriptions,
            'Link': self.job_urls,
        }
        summary = pd.DataFrame(data)

        summary['Filter Score'] = summary['Filter Score'].astype(int)
        summary = summary.sort_values('Filter Score', ascending=False)
        # Styler carrying a CSS width of 500px for the description column.
        summary = summary.style.set_properties(subset=[description_column], **{'width': '500px'})

        today = datetime.today().strftime('%d-%m-%Y')
        summary.to_excel(f'Job Listings {today}.xlsx', index=False)
            

In [2]:
# Run the whole pipeline with the default search settings: collect the result
# pages, fetch each job's full description, score them, then write the Excel file.
job_search = JobFinder()
for run_stage in (job_search.webscrape, job_search.deepscrape,
                  job_search.score, job_search.displayinfo):
    run_stage()