# Web Scraping and NLP Project

### Importing packages

In [49]:
import json
import os
import pprint
import re

import requests
from bs4 import BeautifulSoup

In [25]:
# Shared pretty-printer configured with a 4-space indent per nesting level.
INDENT_WIDTH = 4
pp = pprint.PrettyPrinter(indent=INDENT_WIDTH)

## Pages to be scraped: object of the analysis
Each results page contains 25 job ads plus 2 advertised (promoted) ones. I have decided to keep the advertised ones only from the first page, since the same ones repeat on the following pages.

In [26]:
# First results page of the search (full-time only, proximity=30).
URL = 'https://www.reed.co.uk/jobs/data-scientist-jobs-in-london?fulltime=True&proximity=30'
# A timeout keeps the notebook from hanging indefinitely if the site never answers.
page = requests.get(URL, timeout=30)

In [27]:
# Parse the first results page with the built-in HTML parser.
soup1 = BeautifulSoup(markup=page.content, features='html.parser')

In [28]:
# Parsed result pages, in page order; starts with the first page only.
pages_list = []
pages_list.append(soup1)

In [29]:
# The page-counter text is split on whitespace; token 2 is read as the index of
# the last job shown on this page and token 4 as the total number of jobs.
counter_div = soup1.findAll("div", {"class": "page-counter"})[0]
job_counter = counter_div.contents[0].split()
job_current, jobs_total = int(job_counter[2]), int(job_counter[4])

In [30]:
# Fetch the remaining result pages until the counter reaches `jobs_total`.
# The query must match the first-page request (proximity=30): `jobs_total` was
# read from that result set, so a different radius could leave the loop
# waiting for a total it never reaches.
n_page = 2
while job_current < jobs_total:
    URL_p = ('https://www.reed.co.uk/jobs/data-scientist-jobs-in-london?pageno='
             + str(n_page) + '&fulltime=True&proximity=30')
    page_p = requests.get(URL_p, timeout=30)
    soup_p = BeautifulSoup(page_p.content, 'html.parser')

    counter_p = soup_p.findAll("div", {"class": "page-counter"})
    if not counter_p:
        # No counter on the page (e.g. past the last page): stop instead of
        # raising IndexError.
        break

    job_last = int(counter_p[0].contents[0].split()[2])
    if job_last <= job_current:
        # The counter did not advance; bail out rather than loop forever.
        break

    pages_list.append(soup_p)
    job_current = job_last
    n_page += 1

## Loop acquiring job description, salary, location, type of contract

In [31]:
# One entry per results page in each list; every entry holds that page's hits.
descriptions_all, salaries_all, locations_all = [], [], []
contract_all, job_ref_all = [], []

for page_number, page_soup in enumerate(pages_list):
    hits_per_field = [
        page_soup.findAll("div", {"class": "description"}),
        page_soup.findAll("li", {"class": "salary"}),
        page_soup.findAll("li", {"class": "location"}),
        page_soup.findAll("li", {"class": "time"}),
        page_soup.findAll("a", {"data-id": True}),
    ]

    # On every page after the first, the first two hits of each selector are dropped.
    if page_number != 0:
        hits_per_field = [hits[2:] for hits in hits_per_field]

    buckets = (descriptions_all, salaries_all, locations_all, contract_all, job_ref_all)
    for bucket, hits in zip(buckets, hits_per_field):
        bucket.append(hits)

### Where to find the information in the scraped soup

salary : `<span data-qa="salaryLbl">£80,000 - £95,000 per annum</span>`  
description : `<span itemprop="description"> <p><strong>SENIOR MACHINE LEARNING ENGINEER/ FULL-STACK DA...`  
location : `<span data-qa="regionLbl" itemprop="addressLocality">London</span>`  
contract : `<span data-qa="jobTypeMobileLbl">Permanent, full-time</span>`

In [137]:
# Patterns for values embedded in the ad page's inline <script> text; the
# second capture group is the value (one to three words, or a single space).
RECRUITER_TYPE_RE = r"(jobRecruiterType: )'(\w+\s\w+\s\w+|\w+\s\w+|\w+|\s)"
KNOWLEDGE_DOMAIN_RE = r"(jobKnowledgeDomain: )'(\w+\s\w+\s\w+|\w+\s\w+|\w+|\s)"


def _first_text(soup, tag, attrs, default=None):
    """Return the text of the first `tag` element under `soup` matching `attrs`.

    `default` is returned when no element matches.
    """
    node = soup.find(tag, attrs)
    return default if node is None else node.text


def _script_value(scripts_text, pattern):
    """Return group 2 of the first `pattern` match in `scripts_text`, or None."""
    match = re.search(pattern, scripts_text)
    return match.group(2) if match else None


def scrape_job_ad(job_id):
    """Fetch the ad page for `job_id` and return its fields as a dict.

    Missing fields are stored as None, except 'Description', which falls back
    to an empty string.
    """
    # NOTE(review): the URL slug is fixed to 'data-scientist' for every ad;
    # presumably the site resolves the page from the id alone - confirm.
    job_URL = 'https://www.reed.co.uk/jobs/data-scientist/' + job_id
    job_page = requests.get(job_URL, timeout=30)
    job_soup = BeautifulSoup(job_page.content, 'html.parser')

    # Stringify the <script> tags once; both regex lookups reuse the result.
    scripts_text = str(job_soup.findAll('script'))

    # The poster name is nested inside the hiringOrganization span. Either
    # level may be absent, and a missing inner span must yield None rather
    # than an AttributeError.
    hiring_org = job_soup.find("span", {"itemprop": "hiringOrganization"})
    if hiring_org is None:
        ad_poster = None
    else:
        ad_poster = _first_text(hiring_org, "span", {"itemprop": "name"})

    return {
        'Description': _first_text(job_soup, "span", {"itemprop": "description"}, default=''),
        'Salary': _first_text(job_soup, "span", {"data-qa": "salaryLbl"}),
        'Location': _first_text(job_soup, "span", {"data-qa": "regionLbl"}),
        'Contract': _first_text(job_soup, "span", {"data-qa": "jobTypeMobileLbl"}),
        'ad_poster': ad_poster,
        'ad_poster_type': _script_value(scripts_text, RECRUITER_TYPE_RE),
        'job_Knowledge_Domain': _script_value(scripts_text, KNOWLEDGE_DOMAIN_RE),
    }


# Job id -> dict of scraped fields, for every anchor collected from the result pages.
job_ads_container = {}

for page_refs in job_ref_all:
    for ref in page_refs:
        job_id = ref["data-id"]
        job_ads_container[job_id] = scrape_job_ad(job_id)

### Saving the job ads dictionary

In [138]:
# Persist the scraped ads (job id -> field dict) as a JSON file.
# Create the output folder first so the write does not fail on a fresh checkout.
os.makedirs('data', exist_ok=True)

with open('data/job_ads_container_large.json', 'w', encoding='utf-8') as fp:
    json.dump(job_ads_container, fp)

### Load

In [19]:
# Read the ads back from the JSON file written by the save cell above, so a
# fresh Restart & Run All loads what this notebook just produced.
with open('data/job_ads_container_large.json', encoding='utf-8') as data_file:
    job_ads_container_load = json.load(data_file)

In [131]:
# Number of distinct job ids stored by the scraping loop.
len(job_ads_container)

241

### Appendix - Scraping job page

In [32]:
# `job_refs` was not defined in any cell of this notebook. Rebuild it from the
# first results page using the same selector as the listing loop.
job_refs = soup1.findAll("a", {"data-id": True})
job_URL = 'https://www.reed.co.uk/jobs/senior-data-scientist/'+job_refs[1]["data-id"]
# A timeout keeps the cell from hanging indefinitely if the site never answers.
job_page = requests.get(job_URL, timeout=30)

In [33]:
# Parse the fetched ad page with the built-in HTML parser.
job_soup = BeautifulSoup(markup=job_page.content, features='html.parser')

In [38]:
# All <span itemprop="description"> elements on the ad page.
job_desc = job_soup.findAll("span", attrs={"itemprop": "description"})

In [96]:
# Fourth child node of the first description span.
job_desc[0].contents[3]

<p>Alexa Shopping Spoken Language Understanding team is looking for a senior data scientist to join a recently established team in London. The team’s mission is to ‘put the customer in the loop’ to improve the shopping experience on Alexa. The team will be  building systems to collect and use explicit and implicit signals from customer behavior. As the lead scientist on the team you will drive the investigation to make this data actionable. You will have considerable scope to direct the research to find the maximum  impact. This could include identifying leading indicators of customer dissatisfaction, using customer provided signals to improve existing Alexa language understanding models or operationalizing entirely new models to improve the experience for our customers.<br/><br/>This is a blue-sky role that gives you a chance to roll up your sleeves and dive into big data sets in order to build simulations and experimentation systems at scale, build optimization algorithms and leverag

In [41]:
# `descriptions` was not defined in any cell of this notebook. Presumably it
# held the first results page's description blocks (25 ads + 2 advertised ones).
descriptions = soup1.findAll("div", {"class": "description"})
len(descriptions)

27

In [44]:
# Inspect the entry at index 25 of `descriptions`.
descriptions[25]

<div class="description">
<p>We are currently recruiting a enthusiastic and bold Senior <span class="highlight">Data</span> <span class="highlight">Scientist</span> for a leading dating/social network organisation. As a Senior <span class="highlight">Data</span> <span class="highlight">Scientist</span> you will be responsible for working with large, complex <span class="highlight">data</span> sets and much more. The Senior <span class="highlight">Data</span> <span class="highlight">Scientist</span>...</p>
</div>

In [132]:
# Fetch and parse one specific ad page to try out the regex extraction on it.
job_URL = 'https://www.reed.co.uk/jobs/data-scientist/40810605'
# A timeout keeps the cell from hanging indefinitely if the site never answers.
job_page = requests.get(job_URL, timeout=30)
job_soup = BeautifulSoup(job_page.content, 'html.parser')


In [134]:
# Extract the `jobKnowledgeDomain` value from the stringified <script> tags:
# take the first match and keep its second capture group (the value itself).
re.findall(r"(jobKnowledgeDomain: )'(\w+\s\w+\s\w+|\w+\s\w+|\w+|\s)", str(job_soup.findAll('script')))[0][1]

'Marketing And Media'