<a href="https://colab.research.google.com/github/NohmanAudi/Indeed_Job_Posts/blob/main/Job_postings_clean.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#Scraping code source: https://medium.com/codex/web-scraping-with-beautifulsoup-66a3a2b3b60
from datetime import timedelta, datetime
from urllib.parse import urljoin

import bs4
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
# Browser-like User-Agent header; get_href and get_job_df read this module-level
# name for every request they send.
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36"}
# Sample search URL, used in the visible notebook only for the connectivity check below.
link="https://de.indeed.com/jobs?q=Data+Analyst&l=Berlin&start=10"

""" Sending HTTP Request"""
r=requests.get(link,headers=headers)

"""Check response status Code"""
# Last expression of the cell, so the notebook displays the HTTP status code.
r.status_code

200

In [3]:
"""URL Template"""
# Search URL template; get_href fills the three {} placeholders with position, city
# and start offset, in that order.
# fromage=30 was added to limit results to postings from the last 30 days, because
# (per the original author's note) Indeed does not show a specific posting date beyond that.
url_temp= "https://de.indeed.com/jobs?q={}&l={}&fromage=30&start={}"
# Site root that get_href uses as the base for the hrefs it collects.
base_link="https://de.indeed.com"

"""This function takes the URL template, designation and city as inputs.
It navigates through the top 200 search results and scans all the <a> tags and returns a list of 
all the href attributes."""

def get_href(url_temp,position,city,results_per_city=200,timeout=30):
    """Collect the link target of every <a href=...> on the search result pages.

    Result pages are requested at start offsets 0, 10, ..., results_per_city
    (inclusive), using the module-level ``headers``. Every href found is
    resolved against the module-level ``base_link``.

    Args:
        url_temp: URL template with three "{}" placeholders, filled in order
            with position, city and the start offset.
        position: Search term, inserted into the template as-is.
        city: Location, inserted into the template as-is.
        results_per_city: Highest start offset to request (pages advance in
            steps of 10). Defaults to 200, the value that used to be hard-coded.
        timeout: Seconds to wait for each HTTP response; requests raises an
            exception when it is exceeded instead of blocking indefinitely.

    Returns:
        list of str: resolved hrefs in page order; duplicates are kept.
    """
    href_list=[]
    for start in range(0,results_per_city+10,10):
        url=url_temp.format(position,city,start)
        r=requests.get(url,headers=headers,timeout=timeout)
        soup=BeautifulSoup(r.text,"html.parser")

        # href=True keeps only the anchors that actually carry an href attribute.
        for anchor in soup.find_all('a',href=True):
            # urljoin resolves relative hrefs against the site root and leaves
            # already-absolute ones untouched; plain string concatenation
            # produced malformed URLs for the latter.
            href_list.append(urljoin(base_link,anchor['href']))

    return href_list

In [4]:
"""This function takes the list of all the href attributes as input, 
finds the URLs with the mentioned strings and returns a list of those URLs."""

def get_job_links(href_list):
    """Return the URLs from href_list that contain '/rc/clk' or '/company/'.

    Input order is preserved and duplicates are kept.
    """
    markers = ('/rc/clk', '/company/')
    return [url for url in href_list if any(marker in url for marker in markers)]

In [5]:
""" This function looks for all ul elements in the soup request corresponding to a list section,
Extracts the section to a pandas series, check if a section contains the keywords,
then sort them depending on how many keyowrds they contain,
and returns the first one or two
"""

re_reqmnt = ['experience', 'knowledge', 'skills', 'strong', 'ability', 'years', 'good']

def get_requirement(soup_req):
    """Pick the list sections of a job description that read most like requirements.

    Every <ul> inside the 'jobsearch-jobDescriptionText' div is treated as one
    section. A section is scored by how many of its lower-cased,
    whitespace-separated words appear in ``re_reqmnt`` (repeats count).
    Sections with no hit are discarded; the rest are ranked by score, highest
    first, with ties kept in document order.

    Args:
        soup_req: Parsed job page; only ``find`` on it, ``find_all`` on the
            description div and ``text`` on each <ul> are used.

    Returns:
        tuple: (qual, add_qual) - the stripped text of the best and second-best
        section. A missing entry is np.nan, including when the description
        contains no <ul> at all.

    Raises:
        AttributeError: if the description div is not found (``find`` returned
            None). get_job_df relies on this to skip such pages.
    """
    container = soup_req.find("div", {'class':'jobsearch-jobDescriptionText'})
    sections = [ul.text.strip() for ul in container.find_all('ul')]

    keywords = set(re_reqmnt)
    scored = []
    for position, text in enumerate(sections):
        hits = sum(1 for word in text.lower().split() if word in keywords)
        if hits:
            # Negated hit count: an ascending sort then puts the richest section
            # first; position breaks ties in document order.
            scored.append((-hits, position, text))
    scored.sort()
    ranked = [text for _, _, text in scored]

    qual = ranked[0] if ranked else np.nan
    add_qual = ranked[1] if len(ranked) > 1 else np.nan
    return (qual, add_qual)


In [6]:
"""This function takes the list of the URLs of the job postings and the city and does the following:
1. Send HTTP request to each of the URL.
2. Creates a soup object with html parsing.
3. Extracts title, company name, location and job description from each of the webpage and returns a dataframe."""


def _parse_job_page(soup_req, city):
    """Extract one posting's fields from its parsed page; return None if any lookup fails."""
    try:
        title = soup_req.find('h1', {'class': 'icl-u-xs-mb--xs icl-u-xs-mt--none jobsearch-JobInfoHeader-title'}).text
        company = soup_req.find('div', {'class': 'icl-u-lg-mr--sm icl-u-xs-mr--xs'}).find_next().text
        # The element following the metadata footer div is read as the alternative
        # company name, and the element after that one as the relative posting date.
        footer_first = soup_req.find('div', {'class': 'jobsearch-JobMetadataFooter'}).find_next()
        company_alt = footer_first.text
        date = footer_first.find_next().text
        Company_rating = soup_req.find(itemprop="ratingValue").get("content")
        requirement, add_req = get_requirement(soup_req)
        desc = soup_req.find('div', {'class': 'jobsearch-jobDescriptionText'}).text
    except Exception:
        # Best effort: a page that lacks any expected element is skipped.
        # Exception (not a bare except) lets KeyboardInterrupt/SystemExit through.
        return None

    return {"job_location": city, "job_title": title, "company": company,
            "Company_rating": Company_rating, "job_description": desc,
            "company_alt": company_alt, "date": date,
            "requirement": requirement, "add_req": add_req}


def get_job_df(job_links,city):
    """Download each job posting and assemble the extracted fields into a DataFrame.

    For every URL the page is fetched with the module-level ``headers``, parsed
    with html.parser and passed to ``_parse_job_page``. Postings whose request
    fails or whose page lacks an expected element are skipped. The
    ``job_location`` column is filled with ``city``; no location is read from
    the page (the original computed one but never used it).

    Args:
        job_links: Iterable of job posting URLs.
        city: Value stored in the job_location column of every row.

    Returns:
        pandas.DataFrame with the columns date, job_location, job_title, company,
        Company_rating, company_alt, requirement, add_req, job_description and a
        fresh 0..n-1 index. Empty (columns only) when nothing could be parsed.
    """
    columns = ["date", "job_location", "job_title", "company", "Company_rating",
               "company_alt", "requirement", "add_req", "job_description"]

    # Rows are collected in a list and turned into a frame once: DataFrame.append
    # was removed in pandas 2.0 and copied the whole frame on every call.
    records = []
    for link in job_links:
        try:
            req = requests.get(link, headers=headers, timeout=30)
        except requests.RequestException:
            # One unreachable or timed-out posting should not abort the whole scrape.
            continue
        record = _parse_job_page(BeautifulSoup(req.text, "html.parser"), city)
        if record is not None:
            records.append(record)

    return pd.DataFrame(records, columns=columns)

In [7]:
"""Calling all the above functions inside this function which takes the URL template, designation and city as inputs."""

def get_job_postings(url_temp,position,city):
    """Run the whole scrape for one search: collect hrefs, keep job links, build the frame.

    Chains get_href -> get_job_links -> get_job_df and returns the resulting DataFrame.
    """
    return get_job_df(get_job_links(get_href(url_temp, position, city)), city)

In [8]:
# Runs the full scrape for the search term 'Data+Analyst' in 'Berlin'. This sends
# live HTTP requests, so the result depends on when the cell is executed.
data = get_job_postings(url_temp,position='Data+Analyst',city='Berlin')

  # This is added back by InteractiveShellApp.init_path()


In [9]:
# Persist the scraped frame to job_data.csv in the current working directory.
data.to_csv('job_data.csv')

In [10]:
# Reload the scrape from the CSV written by the previous cell. The relative path
# matches that to_csv call, so the notebook also runs outside Colab, where the
# working directory is not /content.
data_analyst_df = pd.read_csv('job_data.csv', index_col=0)
data_analyst_df.head()

Unnamed: 0,date,job_location,job_title,company,Company_rating,company_alt,requirement,add_req,job_description
0,vor 21 Tagen,Berlin,IT Business Analyst – ServiceMax,Johnson Controls,3.7,Johnson Controls,Min. 3 years working experience with Salesforc...,Involvement in full cycle project implementati...,\n\nJob Details \nWhat you will do \n\nDo you ...
1,vor 15 Tagen,Berlin,Data Analyst (m/f/d),Zalando,3.1,Zalando,An analytical & entrepreneurial mindset with p...,,\nAs a Data Analyst (m/f/d) for our Convenienc...
2,vor 19 Tagen,Berlin,Data Analyst - Marketing (m/f/x),reBuy reCommerce GmbH,3.0,reBuy reCommerce GmbH,"More than 2 years of experience in analytics, ...",,\n\n Company Description:\n \n\n\n\n We are...
3,vor 5 Tagen,Berlin,EMEA Sales Operations Analyst,AWS EMEA SARL (Germany Branch),3.5,Amazon.com,,,"Bachelor’s degree in finance, business, econom..."
4,vor 7 Tagen,Berlin,Senior Data Analyst - Global Data (f/m/d),Delivery Hero,3.7,Delivery Hero,Want to be a Hero? Join the #1 GLOBAL LEADER i...,More than 4 years experience as a BI/Data Anal...,\n\n\n\n\n Want to be a Hero? Join the #1 ...


In [11]:
data_analyst_df.shape

(137, 9)

In [12]:
!pip install langdetect #Language detection library ported from Google's language-detection.
from langdetect import detect

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting langdetect
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[K     |████████████████████████████████| 981 kB 17.3 MB/s 
Building wheels for collected packages: langdetect
  Building wheel for langdetect (setup.py) ... [?25l[?25hdone
  Created wheel for langdetect: filename=langdetect-1.0.9-py3-none-any.whl size=993242 sha256=1f11d75f66501a08ec14da39e3823b22d5a0623a6ff01bb23158ec6297eb4a8c
  Stored in directory: /root/.cache/pip/wheels/c5/96/8a/f90c59ed25d75e50a8c10a1b1c2d4c402e4dacfa87f3aff36a
Successfully built langdetect
Installing collected packages: langdetect
Successfully installed langdetect-1.0.9


In [13]:
# Detect the language of every job description, then drop (in place) the rows
# that langdetect classifies as German.
description_language = data_analyst_df['job_description'].apply(detect)
idx_ger_records = data_analyst_df.loc[description_language == 'de'].index
data_analyst_df.drop(index=idx_ger_records, inplace=True)

In [14]:
data_analyst_df.date.str.startswith('.css').value_counts()

False    102
True      10
Name: date, dtype: int64

In [15]:
data_analyst_df.date.isna().value_counts()

False    112
Name: date, dtype: int64

In [16]:
# One scrape attempt produced a record with a missing date; fall back to company_alt for it.
# The result is assigned back to the column instead of calling fillna(..., inplace=True)
# on the attribute-accessed Series, which does not update the frame under pandas Copy-on-Write.
data_analyst_df['date'] = data_analyst_df['date'].fillna(data_analyst_df['company_alt'])

In [17]:
# The footer element read as "date" is sometimes wrong. With the new de.indeed layout
# it contains CSS text (starting with '.c'); for those rows company_alt, which is also
# taken from the footer, holds the right date, so it is copied over.
# The old de.indeed layout produced "Diesen Job melden" instead and was handled by:
#data_analyst_df.loc[data_analyst_df.date == "Diesen Job melden", 'date'] = data_analyst_df.company_alt
# na=False counts a missing date as "no match", so the boolean mask never contains NaN
# (a mask with NaN makes .loc raise).
css_in_date = data_analyst_df['date'].str.startswith('.c', na=False)
data_analyst_df.loc[css_in_date, 'date'] = data_analyst_df['company_alt']

In [18]:
data_analyst_df.date.value_counts()

vor 13 Tagen    13
vor 9 Tagen     12
vor 1 Tag        9
vor 26 Tagen     8
vor 21 Tagen     6
vor 29 Tagen     6
Heute            5
vor 12 Tagen     5
vor 20 Tagen     4
vor 2 Tagen      4
vor 22 Tagen     4
vor 27 Tagen     4
vor 5 Tagen      4
vor 19 Tagen     4
vor 15 Tagen     3
vor 28 Tagen     3
vor 23 Tagen     3
vor 14 Tagen     3
vor 4 Tagen      2
vor 16 Tagen     2
vor 8 Tagen      2
vor 24 Tagen     2
vor 10 Tagen     2
vor 7 Tagen      1
vor 6 Tagen      1
Name: date, dtype: int64

In [19]:
data_analyst_df.company.value_counts()

Delivery Hero                     12
Deutsche Bank                     12
Zalando                           11
wefox                              7
Wayfair                            4
Solarisbank                        3
Arrow Electronics, Inc.            3
Arvato infoscore GmbH              3
Lieferando                         3
Pepper Media Holding GmbH          3
HelloFresh                         2
Beets&Roots GmbH                   2
Flaconi GmbH                       2
ImmoScout24                        2
Amazon Development Center DEU      2
Deloitte Consulting GmbH           2
Aroundhome                         2
Google                             1
Amazon Germany Holdco 2 GmbH       1
Salesforce                         1
HERE Technologies                  1
Campusjäger by Workwise            1
Johnson Controls                   1
DemoUp Cliplister                  1
sennder                            1
Luxoft                             1
MEININGER Hotels                   1
S

In [20]:
#This time getting the company from the footer if missing
# read_csv turns empty fields into NaN by default, so after the CSV round trip a
# missing company is NaN rather than ''; both cases are checked.
company_missing = data_analyst_df['company'].isna() | (data_analyst_df['company'] == '')
data_analyst_df.loc[company_missing, 'company'] = data_analyst_df['company_alt']

In [21]:
# company_alt is no longer needed once the date and company fallbacks have been applied.
# errors='ignore' keeps the cell safe to re-run after the column is already gone.
data_analyst_df.drop(columns='company_alt', inplace=True, errors='ignore')

In [22]:
data_analyst_df.head()

Unnamed: 0,date,job_location,job_title,company,Company_rating,requirement,add_req,job_description
0,vor 21 Tagen,Berlin,IT Business Analyst – ServiceMax,Johnson Controls,3.7,Min. 3 years working experience with Salesforc...,Involvement in full cycle project implementati...,\n\nJob Details \nWhat you will do \n\nDo you ...
1,vor 15 Tagen,Berlin,Data Analyst (m/f/d),Zalando,3.1,An analytical & entrepreneurial mindset with p...,,\nAs a Data Analyst (m/f/d) for our Convenienc...
2,vor 19 Tagen,Berlin,Data Analyst - Marketing (m/f/x),reBuy reCommerce GmbH,3.0,"More than 2 years of experience in analytics, ...",,\n\n Company Description:\n \n\n\n\n We are...
3,vor 5 Tagen,Berlin,EMEA Sales Operations Analyst,AWS EMEA SARL (Germany Branch),3.5,,,"Bachelor’s degree in finance, business, econom..."
4,vor 7 Tagen,Berlin,Senior Data Analyst - Global Data (f/m/d),Delivery Hero,3.7,Want to be a Hero? Join the #1 GLOBAL LEADER i...,More than 4 years experience as a BI/Data Anal...,\n\n\n\n\n Want to be a Hero? Join the #1 ...


In [23]:
#date posted to actual date
# "Heute" (today) and "Gerade geschaltet" (just posted) count as 0 days ago; every
# other value is reduced to its digits, e.g. "vor 21 Tagen" -> 21.
# The steps are chained and assigned back to the column instead of using
# replace(..., inplace=True) on the attribute-accessed Series, which does not
# update the frame under pandas Copy-on-Write.
days_ago = (
    data_analyst_df['date']
    .replace(['Heute', 'Gerade geschaltet'], '0')
    .str.replace(r'\D+', '', regex=True)
    .astype(int)
)
# Read the clock once so every row is measured from the same day.
today = datetime.now().date()
data_analyst_df['date'] = days_ago.apply(lambda days: today - timedelta(days))

In [24]:
data_analyst_df.head()

Unnamed: 0,date,job_location,job_title,company,Company_rating,requirement,add_req,job_description
0,2022-05-12,Berlin,IT Business Analyst – ServiceMax,Johnson Controls,3.7,Min. 3 years working experience with Salesforc...,Involvement in full cycle project implementati...,\n\nJob Details \nWhat you will do \n\nDo you ...
1,2022-05-18,Berlin,Data Analyst (m/f/d),Zalando,3.1,An analytical & entrepreneurial mindset with p...,,\nAs a Data Analyst (m/f/d) for our Convenienc...
2,2022-05-14,Berlin,Data Analyst - Marketing (m/f/x),reBuy reCommerce GmbH,3.0,"More than 2 years of experience in analytics, ...",,\n\n Company Description:\n \n\n\n\n We are...
3,2022-05-28,Berlin,EMEA Sales Operations Analyst,AWS EMEA SARL (Germany Branch),3.5,,,"Bachelor’s degree in finance, business, econom..."
4,2022-05-26,Berlin,Senior Data Analyst - Global Data (f/m/d),Delivery Hero,3.7,Want to be a Hero? Join the #1 GLOBAL LEADER i...,More than 4 years experience as a BI/Data Anal...,\n\n\n\n\n Want to be a Hero? Join the #1 ...


In [25]:
# Rows for which no requirement section was extracted (requirement is NaN).
requirement_missing = data_analyst_df['requirement'].isna()
data_req_na = data_analyst_df.loc[requirement_missing]
data_req_na

Unnamed: 0,date,job_location,job_title,company,Company_rating,requirement,add_req,job_description
3,2022-05-28,Berlin,EMEA Sales Operations Analyst,AWS EMEA SARL (Germany Branch),3.5,,,"Bachelor’s degree in finance, business, econom..."
78,2022-05-04,Berlin,ERP Functional Consultant,"Arrow Electronics, Inc.",3.5,,,\nPosition:\n ERP Functional Consultant\n \n\n...


In [35]:
data_req_na.job_description.iloc[0]

"Bachelor’s degree in finance, business, economics, mathematics, or related fields Advanced skills in MS Excel, working with large datasets Ability to multitask and conduct sophisticated and creative analysis of complex data and translate the results into actionable deliverables, messages and presentations Excellent organization skills as well as written and verbal communication skills Ability to learn new tools quickly \n\n\n Job summary\n  This position can be located in Luxemburg, Berlin, London, Madrid or Milan offices\n \n As an EMEA Sales Operations analyst, you will support EMEA Sales managers as well as Sales Strategy & Ops leaders leaders on running sales processes aligned to day-to-day execution of the business as well as driving implementation of new tools and processes to continuously drive productivity improvement in our sales teams.\n  You will be the central SME for operational requests raised by EMEA's sales teams, owning resolution or collaborating with internal end-us

In [26]:
#split the job description by blank lines and see where it leads from there
#data_req_na.job_description[0].split('\n\n')

Getting some css in the date field.
Needs a fix! SOLVED ✔

Convert date from date posted to actual date SOLVED ✔

Information extraction from job description

NLP/ Topic Modeling/ Clustering???

**Refactoring**

*   Refactor get_job_df function. Move sections logic to get_requirement. Solved ✔

*   Refactor company and date fixes by using functions



Add feature: company rating. Solved ✔

Add an experience-level feature? Derived from the search query, or from the requirements?

Check which companies ask for years of experience that do not match the experience level stated in the job post (HR accountability?).

In [32]:
data_analyst_df.job_description[0]

'\n\nJob Details \nWhat you will do \n\nDo you have experience with ServiceMax? If so, join our business transformation program! \nIT Business Analyst role is a business-facing IT analyst position accountable for the successful delivery of IT projects that enable the EMEALA Service Business. This position defines and develops business applications that support Service business processes. \n\nHow you will do it \n\nBuild and deploy end to end IT solution for Service business and markets, involving configuring and integrating standard applications with none or minimum customization and software deployment. \nWork together with business partners, IT Business Lead, delivery team, Product owner, architects and third parties to develop, guide and implement IT solutions. \nClose collaboration with partners to define and document user stories, existing and to-be business processes, and functional requirements applicable for a line of business on EMEALA region level or country specific requirem