# Web Scraping from Indeed


## 1. Import requests & BeautifulSoup

In [1]:
import requests
from bs4 import BeautifulSoup

## 2. Request pagination
 - Request 50 jobs per page
 - Use the URL of an Indeed search for "data intern"

In [2]:
# Number of job postings requested per results page (sent as the `limit` query parameter).
LIMIT = 50
# Indeed search URL for the query "data intern"; LIMIT is interpolated into the query string.
URL = f"https://www.indeed.com/jobs?q=data+intern&limit={LIMIT}"

In [3]:
def extract_indeed_pages():
    """Return the highest page number linked from the Indeed search page.

    Downloads ``URL``, locates the ``div`` with class ``pagination`` and reads
    the numeric labels of the links inside it. Links whose label is not a
    number (such as the trailing "Next" link) are ignored.

    Returns:
        int: the largest page number found, or ``1`` when the page has no
        pagination block or no numbered links (a single page of results).

    Raises:
        requests.HTTPError: if the server answers with an error status.
        requests.RequestException: on network failures or timeouts.
    """
    # A timeout keeps the notebook from hanging forever on a stalled connection.
    response = requests.get(URL, timeout=10)
    # Fail loudly on 4xx/5xx instead of trying to parse an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    pagination = soup.find("div", {"class": "pagination"})
    if pagination is None:
        # No pagination block was rendered, so there is nothing beyond page 1.
        return 1

    page_numbers = []
    for link in pagination.find_all("a"):
        label = link.get_text(strip=True)
        # Keep only numbered links; navigation links like "Next" are skipped.
        if label.isdecimal():
            page_numbers.append(int(label))

    return max(page_numbers) if page_numbers else 1

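- The last pagination link is labelled "Next" rather than a page number, so it is excluded when reading the page numbers
- Store the last Indeed page number in `last_indeed_page`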

In [4]:
# Store the page number returned by extract_indeed_pages() for the scraping loop below.
last_indeed_page = extract_indeed_pages()

## 3. Extract job details
- title
- company
- location
- link (job_id)

### 3.1 Define `extract_job` to pull the details out of one job card

In [5]:
def extract_job(html):
    """Build a job record from one job-card element.

    Args:
        html: a parsed job-card element (a BeautifulSoup tag) that carries a
            ``data-jk`` attribute and contains a ``div.title`` with a link,
            a ``span.company`` and a ``div.recJobLoc``.

    Returns:
        dict: ``title``, ``company``, ``location`` and ``link`` (the
        ``viewjob`` URL built from the card's ``data-jk`` id).

    Raises:
        AttributeError, TypeError, KeyError: if one of the expected elements
        or attributes is missing from the card.
    """
    title = html.find("div", {"class": "title"}).find("a")["title"]

    company_span = html.find("span", {"class": "company"})
    company_anchor = company_span.find("a")
    # Prefer the link text when the company name is wrapped in an anchor.
    company_node = company_anchor if company_anchor is not None else company_span
    # get_text() joins all nested strings. `.string` is None whenever the
    # element has more than one child (e.g. part of the name is wrapped in
    # <b>), and str(None) would store the literal text "None" as the company.
    company = company_node.get_text().strip()

    location = html.find("div", {"class": "recJobLoc"})["data-rc-loc"]
    job_id = html["data-jk"]
    return {
        'title': title,
        'company': company,
        'location': location,
        'link': f"https://www.indeed.com/viewjob?jk={job_id}"
    }

### 3.2 Request each results page from Indeed and extract its jobs

In [6]:
def extract_indeed_jobs(last_page):
    """Scrape the job cards from the first ``last_page`` result pages.

    Args:
        last_page (int): number of result pages to request; page ``p`` is
            fetched with ``start = p * LIMIT`` appended to ``URL``.

    Returns:
        list[dict]: one dict per job card, as produced by ``extract_job``,
        in page order. Pages that answer with an error status are reported
        and contribute no jobs.

    Raises:
        requests.RequestException: on network failures or timeouts.
    """
    jobs = []
    for page in range(last_page):
        print(f"Scrapping page: {page}")
        # A timeout keeps one stalled request from hanging the whole run.
        response = requests.get(f"{URL}&start={page*LIMIT}", timeout=10)
        if not response.ok:
            # Report the failed page rather than silently parsing an error
            # document that contains no job cards, then carry on.
            print(f"Skipping page {page}: HTTP {response.status_code}")
            continue
        soup = BeautifulSoup(response.text, "html.parser")
        cards = soup.find_all("div", {"class": "jobsearch-SerpJobCard"})
        # Distinct name for the card so the HTTP response is not shadowed.
        jobs.extend(extract_job(card) for card in cards)
    return jobs

In [7]:
# Scrape every page and show the returned list directly as the cell output.
# NOTE(review): the return value is not stored here; the following cell repeats
# the same call and assigns it to `indeed_jobs`, so every page is downloaded twice.
extract_indeed_jobs(last_indeed_page)

Scrapping page: 0
Scrapping page: 1
Scrapping page: 2
Scrapping page: 3
Scrapping page: 4
Scrapping page: 5
Scrapping page: 6
Scrapping page: 7
Scrapping page: 8
Scrapping page: 9
Scrapping page: 10
Scrapping page: 11
Scrapping page: 12
Scrapping page: 13
Scrapping page: 14
Scrapping page: 15
Scrapping page: 16
Scrapping page: 17
Scrapping page: 18
Scrapping page: 19


[{'title': 'Data Management Intern',
  'company': 'Northeast Charter Schools Network',
  'location': 'Albany, NY',
  'link': 'https://www.indeed.com/viewjob?jk=e9a1676031f9e8e2'},
 {'title': 'Data Analytics Intern',
  'company': 'Global Atlantic Financial Group',
  'location': 'Des Moines, IA',
  'link': 'https://www.indeed.com/viewjob?jk=a67812272d8c2a28'},
 {'title': '2020 Software Engineer Intern',
  'company': 'Bloomberg',
  'location': 'New York, NY',
  'link': 'https://www.indeed.com/viewjob?jk=02de239c0c5c6f24'},
 {'title': 'Intern to Full Time Position',
  'company': 'LA Models',
  'location': 'Los Angeles, CA',
  'link': 'https://www.indeed.com/viewjob?jk=19b757fb968c0d7b'},
 {'title': '2020 Tax Intern',
  'company': 'Guardian Life Insurance Company',
  'location': 'New York, NY',
  'link': 'https://www.indeed.com/viewjob?jk=c163e9cb08854c11'},
 {'title': 'Web Design Intern',
  'company': 'Town & Country',
  'location': 'New York, NY',
  'link': 'https://www.indeed.com/viewjob

In [8]:
# Scrape all result pages and keep the list of job dicts for saving later.
indeed_jobs = extract_indeed_jobs(last_indeed_page)

Scrapping page: 0
Scrapping page: 1
Scrapping page: 2
Scrapping page: 3
Scrapping page: 4
Scrapping page: 5
Scrapping page: 6
Scrapping page: 7
Scrapping page: 8
Scrapping page: 9
Scrapping page: 10
Scrapping page: 11
Scrapping page: 12
Scrapping page: 13
Scrapping page: 14
Scrapping page: 15
Scrapping page: 16
Scrapping page: 17
Scrapping page: 18
Scrapping page: 19


In [9]:
# Display the scraped jobs (a list of dicts with title, company, location and link).
indeed_jobs

[{'title': 'Data Management Intern',
  'company': 'Northeast Charter Schools Network',
  'location': 'Albany, NY',
  'link': 'https://www.indeed.com/viewjob?jk=e9a1676031f9e8e2'},
 {'title': 'Data Analytics Intern',
  'company': 'Global Atlantic Financial Group',
  'location': 'Des Moines, IA',
  'link': 'https://www.indeed.com/viewjob?jk=a67812272d8c2a28'},
 {'title': '2020 Software Engineer Intern',
  'company': 'Bloomberg',
  'location': 'New York, NY',
  'link': 'https://www.indeed.com/viewjob?jk=02de239c0c5c6f24'},
 {'title': 'Intern to Full Time Position',
  'company': 'LA Models',
  'location': 'Los Angeles, CA',
  'link': 'https://www.indeed.com/viewjob?jk=19b757fb968c0d7b'},
 {'title': '2020 Tax Intern',
  'company': 'Guardian Life Insurance Company',
  'location': 'New York, NY',
  'link': 'https://www.indeed.com/viewjob?jk=c163e9cb08854c11'},
 {'title': 'Operation Intern',
  'company': 'Criteria for Success, Inc.',
  'location': 'New York, NY',
  'link': 'https://www.indeed.

## 4. Saving to CSV

In [10]:
import csv

In [15]:
def save_file(jobs):
    """Write ``jobs`` to ``indeed_jobs.csv`` in the current working directory.

    Args:
        jobs: iterable of job dicts as returned by ``extract_job``. Each row
            is written from the dict's values in insertion order, which is
            expected to match the header: title, company, location, link.

    Returns:
        None. An existing ``indeed_jobs.csv`` is overwritten.
    """
    # newline="" lets the csv module control line endings (otherwise blank
    # rows can appear on Windows). UTF-8 keeps non-ASCII company names
    # writable regardless of the platform's default encoding. The with-block
    # guarantees the file is flushed and closed.
    with open("indeed_jobs.csv", mode="w", newline="", encoding="utf-8") as file:
        writer = csv.writer(file)
        writer.writerow(["Title", "Company", "Location", "Link"])
        # Iterate the argument, not the notebook-level `indeed_jobs` global.
        for job in jobs:
            writer.writerow(list(job.values()))
    return

In [16]:
# Write the scraped jobs to indeed_jobs.csv in the current working directory.
save_file(indeed_jobs)

In [17]:
# NOTE(review): pandas is imported here rather than with the other imports at the top of the notebook.
import pandas as pd
# Read the saved CSV back to check what was written.
# NOTE(review): this rebinds `indeed_jobs` from the scraped list of dicts to a
# DataFrame, so earlier cells that use `indeed_jobs` no longer see the list if
# they are re-run after this cell.
indeed_jobs = pd.read_csv("indeed_jobs.csv")
# Show the first rows of the loaded table.
indeed_jobs.head()

Unnamed: 0,Title,Company,Location,Link
0,Data Management Intern,Northeast Charter Schools Network,"Albany, NY",https://www.indeed.com/viewjob?jk=e9a1676031f9...
1,Data Analytics Intern,Global Atlantic Financial Group,"Des Moines, IA",https://www.indeed.com/viewjob?jk=a67812272d8c...
2,2020 Software Engineer Intern,Bloomberg,"New York, NY",https://www.indeed.com/viewjob?jk=02de239c0c5c...
3,Intern to Full Time Position,LA Models,"Los Angeles, CA",https://www.indeed.com/viewjob?jk=19b757fb968c...
4,2020 Tax Intern,Guardian Life Insurance Company,"New York, NY",https://www.indeed.com/viewjob?jk=c163e9cb0885...
