In [1]:
#importing libraries
!pip install pandasql
from bs4 import BeautifulSoup
import requests
from random import random
from time import sleep
from collections import namedtuple
import smtplib
import csv
import pandas as pd
import numpy as np
import pandasql as psql
from pandasql import sqldf

Collecting pandasql
  Downloading https://files.pythonhosted.org/packages/6b/c4/ee4096ffa2eeeca0c749b26f0371bd26aa5c8b611c43de99a4f86d3de0a7/pandasql-0.7.3.tar.gz
Building wheels for collected packages: pandasql
  Building wheel for pandasql (setup.py) ... [?25l[?25hdone
  Created wheel for pandasql: filename=pandasql-0.7.3-cp37-none-any.whl size=26820 sha256=a72b45b41c9cf4d45db3946b585d74f063255786e0ac332ed70c8494247d8551
  Stored in directory: /root/.cache/pip/wheels/53/6c/18/b87a2e5fa8a82e9c026311de56210b8d1c01846e18a9607fc9
Successfully built pandasql
Installing collected packages: pandasql
Successfully installed pandasql-0.7.3


In [16]:
#scraping data
def generate_url(job_title, job_location):
    url_template = "https://in.indeed.com//jobs?q={}&l={}"
    url = url_template.format(job_title, job_location)
    return url


def save_record_to_csv(record, filepath, create_new_file=False):
    """Save an individual record to file; set `new_file` flag to `True` to generate new file"""
    header = ["JobTitle", "Company", "Location", "Salary", "PostDate", "Summary", "JobUrl"]
    if create_new_file:
        with open(filepath, mode='w', newline='', encoding='utf-8') as f:
            writer = csv.writer(f)
            writer.writerow(header)
    else:
        with open(filepath, mode='a+', newline='', encoding='utf-8') as f:
            writer = csv.writer(f)
            writer.writerow(record)

def collect_job_cards_from_page(html):
    soup = BeautifulSoup(html, 'html.parser')
    cards = soup.find_all('div', 'jobsearch-SerpJobCard')
    return cards, soup


def sleep_for_random_interval():
    seconds = random() * 10
    sleep(seconds)


def request_jobs_from_indeed(url):
    headers = {
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,'
                  'application/signed-exchange;v=b3;q=0.9',
        'accept-encoding': 'gzip, deflate, br',
        'accept-language': 'en-US,en;q=0.9',
        'cache-control': 'max-age=0',
        'sec-fetch-dest': 'document',
        'sec-fetch-mode': 'navigate',
        'sec-fetch-site': 'none',
        'sec-fetch-user': '?1',
        'upgrade-insecure-requests': '1',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/87.0.4280.67 Safari/537.36 Edg/87.0.664.47 '
    }
    response = requests.get(url, headers=headers)
    if response.status_code == 200:
        return response.text
    else:
        return None


def find_next_page(soup):
    try:
        pagination = soup.find("a", {"aria-label": "Next"}).get("href")
        return "https://in.indeed.com/" + pagination
    except AttributeError:
        return None


def extract_job_card_data(card):
    atag = card.h2.a
    try:
        job_title = atag.get('title')
    except AttributeError:
        job_title = ''
    try:
        company = card.find('span', 'company').text.strip()
    except AttributeError:
        company = ''
    try:
        location = card.find('div', 'recJobLoc').get('data-rc-loc')
    except AttributeError:
        location = ''
    try:
        job_summary = card.find('div', 'summary').text.strip()
    except AttributeError:
        job_summary = ''
    try:
        post_date = card.find('span', 'date').text.strip()
    except AttributeError:
        post_date = ''
    try:
        salary = card.find('span', 'salarytext').text.strip()
    except AttributeError:
        salary = ''
    job_url = 'https://in.indeed.com/' + atag.get('href')
    return job_title, company, location, job_summary, salary, post_date, job_url


def main(job_title, job_location, filepath, email=None):
    unique_jobs = set()  # track job urls to avoid collecting duplicate records
    total_pages_extract=15
    print("Starting to scrape indeed for `{}` in `{}`".format(job_title, job_location))
    url = generate_url(job_title, job_location)
    save_record_to_csv(None, filepath, create_new_file=True)

    for page in range(0,total_pages_extract):
      print(url)
      html = request_jobs_from_indeed(url)
      if not html:
        break
      cards, soup = collect_job_cards_from_page(html)
      for card in cards:
        record = extract_job_card_data(card)
        if not record[-1] in unique_jobs:
          save_record_to_csv(record, filepath)
          unique_jobs.add(record[-1])
      sleep_for_random_interval()
      url = find_next_page(soup)
      if not url:
        break
      print('Finished collecting {:,d} job postings.'.format(len(unique_jobs)))
    
if __name__ == '__main__':
    # job search settings
    title = ' '
    loc = 'India'
    path = 'raw_data.csv'


    # without email settings
    main(title, loc, path)

Starting to scrape indeed for ` ` in `India`
https://in.indeed.com//jobs?q= &l=India
Finished collecting 15 job postings.
https://in.indeed.com//jobs?q=+&l=India&start=10
Finished collecting 30 job postings.
https://in.indeed.com//jobs?q=+&l=India&start=20
Finished collecting 40 job postings.
https://in.indeed.com//jobs?q=+&l=India&start=30
Finished collecting 55 job postings.
https://in.indeed.com//jobs?q=+&l=India&start=40
Finished collecting 58 job postings.
https://in.indeed.com//jobs?q=+&l=India&start=50
Finished collecting 71 job postings.
https://in.indeed.com//jobs?q=+&l=India&start=60
Finished collecting 78 job postings.
https://in.indeed.com//jobs?q=+&l=India&start=70
Finished collecting 91 job postings.
https://in.indeed.com//jobs?q=+&l=India&start=80
Finished collecting 106 job postings.
https://in.indeed.com//jobs?q=+&l=India&start=90
Finished collecting 118 job postings.
https://in.indeed.com//jobs?q=+&l=India&start=100
Finished collecting 131 job postings.
https://in.ind

In [20]:
#reading raw data
import pandas as pd
data=pd.read_csv("/content/raw_data.csv")

In [21]:
data.drop(['PostDate'], axis = 1,inplace=True) 

In [118]:
data.rename(columns = {'Salary':'Summary','Summary':'PostDate'}, inplace = True)

In [114]:
#Data Transformation
jobsinindia=psql.sqldf("""  
select *from data  
where Location LIKE 'Bengaluru%' 
OR Location LIKE 'Delhi%' 
OR Location LIKE 'New Delhi%' 
OR Location LIKE 'Kolkata%'
OR Location LIKE 'Chennai%' 
OR Location LIKE 'Hyderabad%' 
OR Location LIKE 'Ahmedabad%' 
OR Location LIKE 'Pune%' 
OR Location LIKE 'Kanpur%' 
OR Location LIKE 'Visakhapatnam%'
OR Location LIKE 'Surat%' 
OR Location LIKE 'Jaipur%'
OR Location LIKE 'Nagpur%' 
OR Location LIKE 'Patna%'  
""")

In [119]:
recently_posted=psql.sqldf("""
select *from jobsinindia
where PostDate LIKE '1 days ago%' OR 
PostDate  LIKE '2 days ago%' OR 
PostDate  LIKE '3 days ago%' OR 
PostDate  LIKE '4 days ago%' OR 
PostDate  LIKE '5 days ago%'
""")

recently_posted.head(5)

Unnamed: 0,JobTitle,Company,Location,Summary,PostDate,JobUrl
0,Administrative Business Partner,Google,"Bengaluru, Karnataka",Due to the current health crisis related to CO...,4 days ago,https://in.indeed.com//rc/clk?jk=70023c86205af...
1,Associate - Gurgaon 10 C - Energy,PwC,"Delhi, Delhi",A career in our Government and Public Sector T...,3 days ago,https://in.indeed.com//rc/clk?jk=a98b84eb22efc...
2,Clerk And Computer Operator,SHREE GANESH MEDICOS,"Delhi, Delhi",COMPUTER OPERATOR FOR PHARMACY STORE IN DELHI ...,3 days ago,https://in.indeed.com//rc/clk?jk=592301b7dada6...
3,Event Coordinator,Walmart Global Technology Services,"Bengaluru, Karnataka",Conducting short- and long-term planning and m...,5 days ago,https://in.indeed.com//rc/clk?jk=7079e6aa8efa6...
4,Coordination Responsible Production,Tetra Pak,"Pune, Maharashtra",At Tetra Pak we touch millions of lives every ...,5 days ago,https://in.indeed.com//rc/clk?jk=b5d9df8881da6...


In [120]:
recently_posted.to_csv('/content/sample_data/jobsinindia.csv')