## WEB CRAWLING A JOB WEBSITE TO EXTRACT ALL JOB RELATED INFO
This workbook demonstrates usage of web scrapping to gather all job related information from a website

In [1]:
import os
from bs4 import BeautifulSoup
import requests 

We consider a website **Timesjobs** that has all job related information to scrape.

In [2]:
url="https://www.timesjobs.com/candidate/job-search.html?searchType=Home_Search&from=submit&asKey=OFF&txtKeywords=&cboPresFuncArea=35&clusterName=CLUSTER_FA&hc=CLUSTER_FA"

In [3]:
result=requests.get(url)

Check whether we indeed get a response or not.

In [4]:
result.status_code

200

Now we create a file that stores the frontend code which then we open and convert into a string that we read from.

In [5]:
with open('timesjobs.html','w') as f:
    f.write(result.text)

In [6]:
os.listdir()

['.DS_Store',
 'timesjobs.html',
 'response.html',
 '.jovianrc',
 'WEB SCRAPPING.ipynb',
 '.ipynb_checkpoints',
 'WEB SCRAPPING INFO FROM A JOB WEBSITE.ipynb']

In [7]:
timesjobs_html_text=""
with open('timesjobs.html','r') as f:
    timesjobs_html_text+=f.read()

In [8]:
import lxml

In [9]:
soup=BeautifulSoup(timesjobs_html_text,'lxml')

In [10]:
print(soup.prettify)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [11]:
res=soup.find_all('li',class_='job-bx')

In [12]:
res

[<li class="clearfix job-bx wht-shd-bx">
 <header class="clearfix">
 <!--
 -->
 <!-- -->
 <div class="d-flex d-flex-l-r job-title__logo">
 <div class="d-flex d-flex-l-r">
 <span class="logo-container">
 <i class="default-company-logo"></i>
 </span>
 <div>
 <h2 class="heading-trun" title="Manual Test Engineer Required in Canada Under PNP program">
 <a href="https://www.timesjobs.com/job-detail/manual-test-engineer-required-in-canada-under-pnp-program-world-overseas-immigration-consultancy-pvt-ltd-australia-canada-2-to-7-yrs-jobid-NMmhRdQ1bIBzpSvf__PLUS__uAgZw==&amp;source=srp" onclick="logViewUSBT('view','69387137','software , software engineer , software developer , programmer , web developer , Manual Testing , Automated Testing  Manual Testing , Automation Manual Testing','Australia,  Canada','2 - 7','IT Software : Software Products &amp; Services','1','' )" target="_blank">
       Manual Test Engineer Required in Canada Under PNP program</a> </h2>
 <div class="d-flex d-flex-align-ite

In [13]:
res=soup.find('i','experience')
print(type(res.next_sibling))

<class 'bs4.element.NavigableString'>


In [48]:
def Get_Job_Details(bs_obj):
    ans={}
    res=bs_obj.find_all('li',class_='job-bx')
    
    for item in res:
        k=item.a.string.strip().strip('\t').strip('\n')
        ans[k]={}
        ans[k]['Hosted By']=item.h3.string.strip().strip('\t').strip('\n')
        ans[k]['Job Description']=item.find('li',class_='job-description__').string.strip().strip('\t').strip('\n')
        ans[k]['Location']=list(item.find('li',class_='location-tru').children)[-1].strip().strip('\t').strip('\n')
        ans[k]['Experience']=item.find('i','experience').next_sibling
        ans[k]['Salary']=str(item.find('i','salary').next_sibling).replace(' ','').replace('\t','').replace('\n','')
        ans[k]['Posted']=str(list(item.find('span',class_='sim-posted').children)[1].string)
    return ans

In [49]:
IT_jobs_details=Get_Job_Details(soup)

In [50]:
IT_jobs_details

{'Manual Test Engineer Required in Canada Under PNP program': {'Hosted By': 'World Overseas immigration Consultancy Pvt Ltd',
  'Job Description': 'Work with developers to design algorithms and flowcharts.Produce clean, efficient code based on specifications.Integrate software components and third...',
  'Location': 'Australia,  Canada',
  'Experience': '2 - 7 Years',
  'Salary': '39-44Lakhs',
  'Posted': 'Posted today'},
 'Power Platform Developer': {'Hosted By': 'D B Consultancy',
  'Job Description': 'Urgent opening for Power Platform Developer-Pune/BangalorePosition:Power Platform DeveloperExperience:5+YearsBudget:10-12LPA(Slightly negotiable)Locat...',
  'Location': 'Bengaluru / Bangalore,  Pune',
  'Experience': '5 - 10 Years',
  'Salary': 'Notdisclosed',
  'Posted': 'Posted today'},
 'Software Engineer\\Sr. Software Engineer - J48459': {'Hosted By': 'SAMPOORNA CONSULTANTS PVT LTD',
  'Job Description': 'Knowledge/Qualifications/ Technical Competencies Knowledge, Skills & Abiliti

#### OBTAIN JOBS THAT WERE RECENTLY POSTED

In [54]:
type(IT_jobs_details['Angular | 4 to 8 years | Pune & Hyderabad']['Posted'])

str

In [51]:
import regex as re

In [58]:
def Obtain_Jobs_Recently_Posted(doc):
    ans=[]
    for item in doc:
        if re.search('today',doc[item]['Posted']) or re.search('a few days ago',doc[item]['Posted']):
            ans.append(item)
    return ans

In [59]:
res=Obtain_Jobs_Recently_Posted(IT_jobs_details)

In [60]:
res

['Manual Test Engineer Required in Canada Under PNP program',
 'Power Platform Developer',
 'Software Engineer\\Sr. Software Engineer - J48459',
 'Innovation Engineer - AI - J48457',
 'Data Analyst - AI/ML - J48458',
 '.NET  API + Azure-(UK) - Con- BLR - J48411',
 'Systems Engineer - C',
 'Senior Software Engineer - C',
 'Data Engineer - C',
 'Java Angular | 4 to 8 years | Pune & Hyderabad',
 'Angular | 4 to 8 years | Pune & Hyderabad',
 'System Engineer(SAP BASIS WITH HANA) | 6 to 9 YEARS | MUMBAI, PUNE & BENGALURU',
 'Senior Data Curation Engineer - Remote',
 'Business Advisor - B',
 'Senior Tradeshows & Events Specialist EMEA (maternity cover) Job Details | Boston Scientific',
 'Sr, Data Engineer HEREDIA Job Details | Boston Scientific',
 'Guidewire Developer | 6 to 12 years | PAN India',
 'Project Manager, Technical Job Details | Boston Scientific',
 'Vedlikeholdstekniker elektro Job Details | ExxonMobil',
 'Business Transformation Consultant: IoT & PLM',
 'Server I/O Logic Design'

Above we find the list of jobs posted either **a few days ago** or **on the current day when viewed**