# Web Scraping Using Python for Job Search

In [7]:
## loading all the required libraries
import csv
from datetime import datetime
from urllib.parse import quote_plus

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [8]:
## Function that builds the search URL for a position and location
def get_url(position, location):
    """Build the Indeed (ca.indeed.com) search URL for a position and location.

    Parameters
    ----------
    position : str
        Job title or keywords to search for, e.g. "Data Scientist".
    location : str
        Location text to search in, e.g. "Ontario,Canada".

    Returns
    -------
    str
        The search URL with both values encoded as the ``q`` and ``l``
        query parameters.
    """
    temp = "https://ca.indeed.com/jobs?q={}&l={}"
    # quote_plus still turns spaces into '+', but it also percent-encodes
    # characters such as '&', '+', '#' or '/' that would otherwise be read
    # as URL syntax and silently change the query.
    return temp.format(quote_plus(position), quote_plus(location))

In [9]:
def searchSimplifier(val):
    """Turn a ``str.find`` result into a 0/1 "found" flag.

    ``str.find`` returns the index of the first match, which is 0 when the
    match sits at the very start of the text, and -1 when there is no match.

    Parameters
    ----------
    val : int
        Index returned by ``str.find``.

    Returns
    -------
    int
        1 if ``val`` denotes a match (``val >= 0``), otherwise 0.
    """
    # Index 0 is a valid hit; only a negative value means "not found".
    return 1 if val >= 0 else 0

In [10]:
## BeautifulSoup is used to extract the HTML content of each job card on a search page

# Search strings looked up in the job description, in the order their 0/1
# flags appear at the end of the record returned by get_record().
_SKILL_KEYWORDS = (
    "Python", "Hadoop", "SAS", " R ", "java", "Excel", "SQL", "SPSS",
    "tableau", "Spark", "communication", "presentation", "problem solving",
    "project management", "consulting", "leadership",
)


def _skill_flags(description):
    """Return one 0/1 flag per entry of ``_SKILL_KEYWORDS`` for ``description``."""
    lowered = description.lower()
    flags = []
    for needle in _SKILL_KEYWORDS:
        # Search strings containing capitals (e.g. "SAS", " R ", "Excel") are
        # matched exactly so that words like "excellent" do not count as hits;
        # all-lowercase search strings are matched case-insensitively so that
        # "Java", "Tableau" or "Communication" are recognised as well.
        haystack = lowered if needle == needle.lower() else description
        # `in` also counts a match at position 0 of the description.
        flags.append(int(needle in haystack))
    return tuple(flags)


def get_record(content, timeout=30):
    """Extract job data for a single record.

    Parameters
    ----------
    content : BeautifulSoup element
        The element of one job card; ``main`` passes the
        ``div.job_seen_beacon`` elements of a search result page.
    timeout : float, optional
        Seconds to wait for the job detail page before ``requests`` raises
        a timeout error (default 30).

    Returns
    -------
    tuple
        ``(job_title, company_name, job_location, job_summary, post_date,
        today, salary_range, job_url)`` followed by one 0/1 flag for each
        entry of ``_SKILL_KEYWORDS``. ``job_summary`` and ``salary_range``
        are ``np.nan`` when the card has no such element. All flags are 0
        when the detail page has no description element.
    """
    atag = content.select("h2.jobTitle a")
    job_title = atag[0].span.get('title')
    job_url = "https://indeed.com" + atag[0].get("href")
    company_name = content.find("span", class_="companyName").text.strip()
    job_location = content.find("div", "companyLocation").text

    # find() (and the .li lookup) return None for an absent element, so a
    # missing optional field surfaces as AttributeError; nothing else is
    # swallowed here.
    try:
        job_summary = content.find("tr", class_="underShelfFooter").li.text.strip()
    except AttributeError:
        job_summary = np.nan
    post_date = content.find("span", "date").text.strip()
    today = datetime.today().strftime('%Y-%m-%d')
    try:
        salary_range = content.find("div", "attribute_snippet").text.strip()
    except AttributeError:
        salary_range = np.nan

    # Fetch the job's own page to scan the full description for skills.
    detail_url = "https://indeed.com" + content.find("a", "jcs-JobTitle").get('href')
    detail_page = requests.get(detail_url, timeout=timeout)
    # Name the parser explicitly so parsing does not depend on which optional
    # parser libraries happen to be installed.
    detail_soup = BeautifulSoup(detail_page.content, "html.parser")
    description_tag = detail_soup.find("div", "jobsearch-jobDescriptionText")
    # A page without the description element yields all-zero flags instead of
    # aborting the whole scrape.
    description = description_tag.get_text() if description_tag is not None else ""

    record = (job_title, company_name, job_location, job_summary, post_date,
              today, salary_range, job_url) + _skill_flags(description)

    return record

In [11]:
def main(position, location, output_path='results.csv', timeout=30):
    """Scrape all result pages of a job search and save the records as CSV.

    Parameters
    ----------
    position : str
        Job title or keywords passed to ``get_url``.
    location : str
        Location text passed to ``get_url``.
    output_path : str, optional
        CSV file to write; it is overwritten if it exists
        (default 'results.csv').
    timeout : float, optional
        Seconds to wait for each search page before ``requests`` raises a
        timeout error (default 30).

    Returns
    -------
    None
        The result is the CSV file: a header row followed by one row per
        record returned by ``get_record``.
    """
    header = ['JobTitle', 'Company', 'Location', 'Summary', 'PostDate', 'ExtractDate', 'Salary', 'JobUrl', 'PythonFind', 'HadoopFind',
              'SASFind', 'RFind', 'javaFind', 'ExcelFind', 'SQLFind', 'SPSSFind', 'tableauFind', 'SparkFind', 'communicationFind', 'presentationFind',
              'problemsolvingFind', 'projectmanagementFind', 'consultingFind', 'leadershipFind']
    records = []
    url = get_url(position, location)

    # Walk the result pages until a page has no usable "Next" link.
    while True:
        webpage = requests.get(url, timeout=timeout)
        # Name the parser explicitly so parsing does not depend on which
        # optional parser libraries happen to be installed.
        soup = BeautifulSoup(webpage.content, "html.parser")
        contents = soup.find_all("div", attrs={"class": "job_seen_beacon"})
        records.extend(get_record(content) for content in contents)

        next_link = soup.find("a", attrs={"aria-label": "Next"})
        next_href = next_link.get('href') if next_link is not None else None
        if not next_href:
            # No "Next" anchor, or one without a target: this was the last page.
            break
        url = "https://indeed.com" + next_href

    with open(output_path, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(header)
        writer.writerows(records)
            

In [12]:
# Run the scraper for "Data Scientist" postings in Ontario, Canada; main() writes the records to results.csv
main("Data Scientist", "Ontario,Canada")

In [13]:
# Load the scraped records back from the CSV file written by main()
df = pd.read_csv('results.csv')

In [14]:
# Preview the first rows of the scraped job postings
df.head()

Unnamed: 0,JobTitle,Company,Location,Summary,PostDate,ExtractDate,Salary,JobUrl,PythonFind,HadoopFind,...,SQLFind,SPSSFind,tableauFind,SparkFind,communicationFind,presentationFind,problemsolvingFind,projectmanagementFind,consultingFind,leadershipFind
0,Data Scientist I,TD Bank,"Toronto, ON",We are looking for someone to work as part of ...,PostedJust posted,2022-06-17,Full-time,https://indeed.com/pagead/clk?mo=r&ad=-6NYlbfk...,0,0,...,0,0,0,0,0,0,0,0,0,1
1,"Data Scientist, Consumer Finance Science",Amazon Dev Centre Canada ULC,"Toronto, ON+6 locations","Collaborating with our dedicated product, data...",Posted3 days ago,2022-06-17,Full-time,https://indeed.com/rc/clk?jk=790af72c28d7eac3&...,1,0,...,1,0,0,0,0,0,0,0,0,0
2,Applied Scientist - Text IQ,Relativity,Remote in Ontario,Strong computer science fundamentals in algori...,Posted8 days ago,2022-06-17,Full-time,https://indeed.com/rc/clk?jk=c03641578aa6eec6&...,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Data Scientist/Engineer,CHEO,"Temporarily Remote in Ottawa, ON","Understanding of data structures, data modelin...",EmployerActive 3 days ago,2022-06-17,,https://indeed.com/company/CHEO/jobs/Data-Scie...,0,0,...,0,0,0,0,1,1,0,0,0,1
4,Data Scientist,Royal Bank of Canada,"Toronto, ON","Strong data profiling, cleaning, mining and te...",PostedJust posted,2022-06-17,Full-time,https://indeed.com/rc/clk?jk=045a970c0fa2aadb&...,1,0,...,1,0,0,1,1,1,0,0,0,0
