<a href="https://colab.research.google.com/github/NohmanAudi/Indeed_Job_Posts/blob/main/scraping_dirty.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Adapted from a basic BeautifulSoup scraping tutorial that only covered a limited set of extractions: https://medium.com/codex/web-scraping-with-beautifulsoup-66a3a2b3b60
import requests
import bs4
from bs4 import BeautifulSoup
import pandas as pd

In [2]:
# Browser-like User-Agent header; the scraping functions below send it with every request.
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36"}
link="https://de.indeed.com/jobs?q=Data+Analyst&l=Berlin&start=10"

# Send a single HTTP request as a smoke test. The timeout (in seconds) keeps the
# cell from hanging forever if the server never answers.
r=requests.get(link,headers=headers,timeout=30)

# Display the response status code as the cell output.
r.status_code

200

In [3]:
"""URL Template"""
# Search-results URL with three positional placeholders; get_href fills them with
# (position, city, start offset) via str.format.
url_temp= "https://de.indeed.com/jobs?q={}&l={}&start={}"
# Site root that get_href uses to turn the scraped href values into full URLs.
base_link="https://de.indeed.com"

"""This function takes the URL template, designation and city as inputs.
It navigates through the top 200 search results and scans all the <a> tags and returns a list of 
all the href attributes."""

def get_href(url_temp,position,city):
    """Collect the URL behind every ``<a href=...>`` on the search-result pages.

    The template is filled with ``position``, ``city`` and a paging offset that
    runs 0, 10, ..., 200 (inclusive), so 21 result pages are downloaded.

    Relies on the notebook-level ``headers`` (request headers) and ``base_link``
    (site root used to resolve relative hrefs).

    Args:
        url_temp: URL template with three ``{}`` placeholders, filled in the
            order (position, city, start offset).
        position: Search term, inserted verbatim into the URL (pass it already
            URL-encoded, e.g. ``"Data+Analyst"``).
        city: Location term, inserted verbatim into the URL.

    Returns:
        list of str: one URL per anchor that has an ``href`` attribute, in page
        order; duplicates are kept.

    Raises:
        requests.RequestException: if a page cannot be downloaded, including
            when the timeout below expires.
    """
    # Local import so this cell does not depend on an edit to the import cell.
    from urllib.parse import urljoin

    results_per_city=200
    request_timeout=30  # seconds; without it a stalled connection blocks forever
    href_list=[]
    for start in range(0,results_per_city+10,10):
        url=url_temp.format(position,city,start)
        r=requests.get(url,headers=headers,timeout=request_timeout)
        soup=BeautifulSoup(r.text,"html.parser")

        # Only anchors that actually carry an href attribute are of interest.
        # urljoin resolves relative hrefs against the site root and leaves
        # already-absolute hrefs untouched (plain concatenation would glue the
        # site root in front of a second scheme/host).
        href_list.extend(
            urljoin(base_link,anchor['href'])
            for anchor in soup.find_all('a')
            if anchor.has_attr('href')
        )

    return href_list

In [4]:
"""This function takes the list of all the href attributes as input, 
finds the URLs with the mentioned strings and returns a list of those URLs."""

def get_job_links(href_list):
    """Filter a list of URLs down to the ones that point at job postings.

    A URL is kept when it contains the substring ``/rc/clk`` or ``/company/``.
    Input order is preserved and each matching entry appears once per
    occurrence in ``href_list``.

    Args:
        href_list: iterable of URL strings.

    Returns:
        list of str: the matching URLs.
    """
    posting_markers = ('/rc/clk', '/company/')
    return [
        url
        for url in href_list
        if any(marker in url for marker in posting_markers)
    ]

In [5]:
"""This function takes the list of the URLs of the job postings and the city and does the following:
1. Send HTTP request to each of the URL.
2. Creates a soup object with html parsing.
3. Extracts title, company name, location and job description from each of the webpage and returns a dataframe.
4. Starts with mock section names and their headings, following the expected logical order as observed on the site."""

def _text_or_none(tag):
    """Return ``tag.text``, or None when ``tag`` is None (the lookup found nothing)."""
    return None if tag is None else tag.text


def _following_text(tag, steps=1):
    """Return the text of the element reached by ``steps`` successive ``find_next()`` calls.

    Returns None as soon as ``tag`` or any intermediate result is None.
    """
    for _ in range(steps):
        if tag is None:
            return None
        tag = tag.find_next()
    return _text_or_none(tag)


def get_job_df(job_links,city):
    """Download every job posting and collect its fields into a DataFrame.

    For each URL the page is fetched (using the notebook-level ``headers``) and
    parsed with BeautifulSoup. A posting is skipped when the download fails or
    when the title, the company name or the description container cannot be
    found. Footer-derived fields (``company_alt``, ``date``) are None when the
    footer is missing.

    Section columns: every ``<ul>`` inside the description is paired with the
    closest preceding ``<b>`` element. Only when exactly 3 or 4 such pairs
    exist are they assigned, in document order, to offer / tasks / requirement
    (/ additional) and the matching ``b_*`` heading columns; otherwise all
    section columns stay None. These cells hold the BeautifulSoup elements
    themselves (not strings), which is what the later cells calling ``.text``
    on them rely on.

    Args:
        job_links: iterable of job-posting URLs.
        city: value stored in ``job_location`` for every row.

    Returns:
        pandas.DataFrame with the columns date, job_location, job_title,
        company, company_alt, b_offer, offer, b_tasks, tasks, b_requirement,
        requirement, b_additional, additional, job_description (one row per
        posting that was kept; empty frame with these columns if none).
    """
    columns = ["date", "job_location", "job_title", "company", "company_alt",
               "b_offer", "offer", "b_tasks", "tasks",
               "b_requirement", "requirement", "b_additional", "additional",
               "job_description"]
    section_keys = ("offer", "tasks", "requirement", "additional")
    request_timeout = 30  # seconds; without it a stalled connection blocks forever

    # Rows are collected in a list and the frame is built once at the end:
    # DataFrame.append no longer exists in pandas 2.x and copied the whole
    # frame on every iteration.
    records = []
    for link in job_links:
        try:
            response = requests.get(link, headers=headers, timeout=request_timeout)
        except requests.RequestException as error:
            # One unreachable posting should not discard everything scraped so far.
            print(f"Skipping {link}: {error}")
            continue
        page = BeautifulSoup(response.text, "html.parser")

        title = _text_or_none(
            page.find('h1', {'class': 'icl-u-xs-mb--xs icl-u-xs-mt--none jobsearch-JobInfoHeader-title'})
        )
        company = _following_text(
            page.find('div', {'class': 'icl-u-lg-mr--sm icl-u-xs-mr--xs'})
        )
        description = page.find('div', {'class': 'jobsearch-jobDescriptionText'})
        if title is None or company is None or description is None:
            continue

        # NOTE(review): both footer fields are positional (first / second element
        # after the footer <div>). The saved output of this notebook contains rows
        # where `date` holds CSS text and `company_alt` holds the posting age, so
        # these positions do not land on the same element for every page layout.
        footer = page.find('div', {'class': 'jobsearch-JobMetadataFooter'})
        company_alt = _following_text(footer)
        date = _following_text(footer, steps=2)

        # job_location is always the `city` argument. The previous on-page
        # location lookup was computed but never stored, so it is omitted.
        record = {
            "date": date,
            "job_location": city,
            "job_title": title,
            "company": company,
            "company_alt": company_alt,
            "job_description": description.text,
        }
        for key in section_keys:
            record[key] = None
            record["b_" + key] = None

        sections = [(ul, ul.find_previous('b')) for ul in description.find_all('ul')]
        if len(sections) in (3, 4):
            # zip stops at the shorter input, so with 3 sections the
            # "additional" columns keep their None default.
            for key, (body, heading) in zip(section_keys, sections):
                record[key] = body
                record["b_" + key] = heading

        records.append(record)

    return pd.DataFrame(records, columns=columns)

In [6]:
"""Calling all the above functions inside this function which takes the URL template, designation and city as inputs."""

def get_job_postings(url_temp,position,city):
    """Run the whole scraping pipeline for one search.

    Chains the three steps defined above: gather every link from the result
    pages (``get_href``), keep the job-posting links (``get_job_links``) and
    scrape each posting into a DataFrame (``get_job_df``).

    Args:
        url_temp: search URL template, passed through to ``get_href``.
        position: search term, passed through to ``get_href``.
        city: location term, passed to ``get_href`` and ``get_job_df``.

    Returns:
        The DataFrame produced by ``get_job_df``.
    """
    all_hrefs = get_href(url_temp, position, city)
    posting_urls = get_job_links(all_hrefs)
    return get_job_df(posting_urls, city)

In [7]:
# Scrape the postings for the search term 'Data+Analyst' in Berlin; the term is
# inserted into the URL as given, so it is passed with '+' in place of the space.
data_analyst_df = get_job_postings(url_temp,position='Data+Analyst',city='Berlin')

In [8]:
# Preview the first rows of the scraped postings.
data_analyst_df.head()

Unnamed: 0,date,job_location,job_title,company,company_alt,b_offer,offer,b_tasks,tasks,b_requirement,requirement,b_additional,additional,job_description
0,.css-fjuv6g{box-sizing:border-box;background:n...,Berlin,Business Data Analyst – Level 1 (m/f/d),ICE International Copyright Enterprise Germany...,vor 9 Tagen,[You are responsible for: ],"[\n, [Data extraction based on customer and in...",[Minimum Requirements: ],"[\n, [You’re keen to delve into the world of c...",[We offer You: ],"[\n, [Competitive salary], \n, [Diverse work e...",[Application Requirements: ],"[\n, [Your current CV], \n]",\nWhat is ICE?ICE is the result of a big visio...
1,.css-fjuv6g{box-sizing:border-box;background:n...,Berlin,Data Analyst (gn),eduki,vor 30 Tagen,[Us for You!],[[We want to offer you a product with which yo...,[You for us!],[[You will help us to foster a data-driven min...,[That's you!],"[[You deliver good knowledge of statistics, a ...",,,Company© eduki*About us*eduki was founded in 2...
2,vor 23 Tagen,Berlin,Intern (f/m/d) - Data Analyst,SAP,SAP,,,,,,,,,\n\nWe help the world run better\n\n Our compa...
3,vor 30+ Tagen,Berlin,Data Analyst Intern (f/m/d),Delivery Hero,Delivery Hero,[ Your mission:],"[\n, [[ Engage within the team and key stakeho...",[ Your heroic skills:],"[\n, [[ You are freshly graduated or about to ...",[ Nice to have:],"[\n, [[ Understanding of big data tools (e.g R...",[ Why Delivery Hero?],"[\n, [[ English is our working language and ou...",\n\n\nWant to be a Hero? Join the #1 GLOBAL LE...
4,vor 1 Tag,Berlin,Data Analyst (m/f/d),ToolTime GmbH,ToolTime GmbH,,[[[\nSupport the management team by tracking a...,,[[[\nProven experience as data analyst or busi...,,[[[\nA modern office space as well as the abil...,,,DESCRIPTION\nAbout us\n\nToolTime is a fast-gr...


In [9]:
# Keep only the postings for which a b_tasks heading was captured (non-null).
df = data_analyst_df[data_analyst_df["b_tasks"].notna()]
df.head()

Unnamed: 0,date,job_location,job_title,company,company_alt,b_offer,offer,b_tasks,tasks,b_requirement,requirement,b_additional,additional,job_description
0,.css-fjuv6g{box-sizing:border-box;background:n...,Berlin,Business Data Analyst – Level 1 (m/f/d),ICE International Copyright Enterprise Germany...,vor 9 Tagen,[You are responsible for: ],"[\n, [Data extraction based on customer and in...",[Minimum Requirements: ],"[\n, [You’re keen to delve into the world of c...",[We offer You: ],"[\n, [Competitive salary], \n, [Diverse work e...",[Application Requirements: ],"[\n, [Your current CV], \n]",\nWhat is ICE?ICE is the result of a big visio...
1,.css-fjuv6g{box-sizing:border-box;background:n...,Berlin,Data Analyst (gn),eduki,vor 30 Tagen,[Us for You!],[[We want to offer you a product with which yo...,[You for us!],[[You will help us to foster a data-driven min...,[That's you!],"[[You deliver good knowledge of statistics, a ...",,,Company© eduki*About us*eduki was founded in 2...
3,vor 30+ Tagen,Berlin,Data Analyst Intern (f/m/d),Delivery Hero,Delivery Hero,[ Your mission:],"[\n, [[ Engage within the team and key stakeho...",[ Your heroic skills:],"[\n, [[ You are freshly graduated or about to ...",[ Nice to have:],"[\n, [[ Understanding of big data tools (e.g R...",[ Why Delivery Hero?],"[\n, [[ English is our working language and ou...",\n\n\nWant to be a Hero? Join the #1 GLOBAL LE...
5,vor 30+ Tagen,Berlin,Data Analyst (f/m/d),BMG RIGHTS MANAGEMENT GmbH - Corporate,Bertelsmann,"[[], \nYour Responsibilities]",[[\nDeliver reporting and self-service dashboa...,"[[], \nYour Profile]","[[\nStrong SQL skills, accompanied with an und...","[[], \nNow let’s see what’s in it for you]","[[[\nThe unique BMG culture, empowering and dr...","[[], \nNow let’s see what’s in it for you]","[[[\nA 21st century working environment, colla...",We are looking for a Data Analyst (f/m/d) to j...
7,vor 30+ Tagen,Berlin,Data Analyst (m/f/d),Audible,Audible,"[As a Data Analyst, you will...]","[\n, [Build and maintain basic data artifacts ...","[As a Data Analyst, you will...]","[\n, [ Quantitative degree (Economics, Mathema...","[As a Data Analyst, you will...]","[\n, [ Understand how to use one or more indus...",,,\n\n Good storytelling starts with great list...


In [10]:
# Check which columns still contain missing values >> b_offer, b_additional and additional
df.isna().any()

date               False
job_location       False
job_title          False
company            False
company_alt        False
b_offer             True
offer              False
b_tasks            False
tasks              False
b_requirement      False
requirement        False
b_additional        True
additional          True
job_description    False
dtype: bool

In [11]:
# Extract the stripped heading text from every b_tasks element to inspect the wording used.
vocabs = df["b_tasks"].apply(lambda heading: heading.text.strip())

In [12]:
# Regex alternation of case-sensitive fragments that mark a requirements / experience heading.
exp_keywords = 'Require|WHAT W|Skill|skill|exper|Exper|Profil|profil'
# Show the headings that contain at least one of the fragments.
vocabs[vocabs.str.contains(exp_keywords, na=False)]

0                      Minimum Requirements:
3                        Your heroic skills:
5                               Your Profile
15     Hiermit bewegen Sie uns - Ihr Profil:
17                Your Skills and Experience
                       ...                  
293                      Your heroic skills:
295                              Dein Profil
306                             Your Profile
308         Beneficial skills and experience
311                            Requirements:
Name: b_tasks, Length: 72, dtype: object

In [13]:
# Index labels of the rows whose b_tasks heading matches the experience keywords;
# they select the sample used below to build the requirement vocabulary.
is_exp_heading = vocabs.str.contains(exp_keywords, na=False)
vocab_idx = vocabs.loc[is_exp_heading].index
vocab_idx

Int64Index([  0,   3,   5,  15,  17,  20,  21,  30,  35,  39,  44,  49,  50,
             51,  54,  59,  61,  63,  78,  83, 101, 106, 111, 115, 117, 123,
            129, 130, 131, 133, 134, 136, 137, 139, 140, 141, 151, 152, 161,
            162, 170, 181, 182, 184, 188, 189, 197, 201, 210, 214, 225, 226,
            228, 229, 233, 235, 238, 243, 244, 245, 251, 261, 265, 271, 282,
            289, 290, 293, 295, 306, 308, 311],
           dtype='int64')

In [14]:
# Number of rows whose b_tasks heading matched the experience keywords.
len(vocab_idx)

72

In [15]:
# Stripped text of the `tasks` element for every row selected by vocab_idx.
exp_vocabs = df.loc[vocab_idx, "tasks"].apply(lambda section: section.text.strip())

In [16]:
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk import ngrams, FreqDist
import nltk
from nltk.corpus import stopwords

# Fetch each NLTK resource once (this cell used to request 'stopwords' twice).
nltk.download('stopwords')
nltk.download('punkt')

english_stopwords = stopwords.words('english')
# Frozen-set copy for the filter below: membership is tested once per token, and
# a set lookup does not scan the whole stop-word list each time.
stopword_lookup = frozenset(english_stopwords)

# Lower-case each section text, split it on whitespace and drop English stop words.
# NOTE: only English stop words are removed; the saved top-20 token list below
# still contains 'und', so German function words pass through this filter.
clean_exp = exp_vocabs.str.lower().str.split().apply(
    lambda words: [word for word in words if word not in stopword_lookup]
)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [17]:
# One row per posting and one column per token position; shorter rows are padded.
df1 = pd.DataFrame(clean_exp.tolist())
# Flatten the grid, count every token and show the 20 most frequent ones.
token_counts = df1.stack().value_counts()
token_counts.head(20).index

Index(['experience', 'data', 'skills', 'knowledge', 'business', 'strong',
       'und', 'years', 'sql', 'working', 'work', 'ability', 'plus',
       'communication', 'degree', 'analytics', 'tools', 'field', 'good',
       'team'],
      dtype='object')

To generalize beyond this particular position, only keywords that are not specific to the role are kept. Position-specific words such as (business, analytics, sql) are therefore left out, as are words that are relevant to many sections, such as (work, team).
This leaves ('experience', 'knowledge', 'skills',
       'strong', 'ability', 'years', 'degree', 'plus', 'good').

Next step: move on to a new notebook that builds a clean DataFrame by taking these keywords into consideration while scraping.