<a href="https://colab.research.google.com/github/NohmanAudi/Indeed_Job_Posts/blob/main/Job_postings_clean.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#Scraping code source: https://medium.com/codex/web-scraping-with-beautifulsoup-66a3a2b3b60
import requests
import bs4
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
from datetime import timedelta, datetime

In [2]:
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36"}
link="https://de.indeed.com/jobs?q=Data+Analyst&l=Berlin&start=10"

""" Sending HTTP Request"""
r=requests.get(link,headers=headers)

"""Check response status Code"""
r.status_code

200

In [3]:
"""URL Template"""
url_temp= "https://de.indeed.com/jobs?q={}&l={}&fromage=30&start={}" #### changing url_temp to specify the postend time to 30 days as indeed doesn't show specific date beyond
base_link="https://de.indeed.com"

"""This function takes the URL template, designation and city as inputs.
It navigates through the top 200 search results and scans all the <a> tags and returns a list of 
all the href attributes."""

def get_href(url_temp,position,city):
    results_per_city=200
    href_list=[]
    for start in range(0,results_per_city+10,10):
        url=url_temp.format(position,city,start)
        r=requests.get(url,headers=headers)
        soup=BeautifulSoup(r.text,"html.parser")    
       
        for i in soup.find_all('a'):
            # if tag has attribute of class
            if i.has_attr( "href" ):
                k=i['href']
                href_list.append(base_link+k)
    
    return href_list

In [4]:
"""This function takes the list of all the href attributes as input, 
finds the URLs with the mentioned strings and returns a list of those URLs."""

def get_job_links(href_list):
    job_links=[]
    for a in href_list:
        if a.find('/rc/clk')!=-1:
            job_links.append(a)
        elif a.find('/company/')!=-1:
            job_links.append(a)
    return job_links

In [5]:
"""This function takes the list of the URLs of the job postings and the city and does the following:
1. Send HTTP request to each of the URL.
2. Creates a soup object with html parsing.
3. Extracts title, company name, location and job description from each of the webpage and returns a dataframe."""



re_reqmnt = ['experience', 'knowledge', 'skills', 'strong', 'ability', 'years', 'good']



def get_job_df(job_links,city):
    df=pd.DataFrame(columns=[ "date", "job_location", "job_title", "company", "company_alt","requirement", "add_req", "job_description"])
    
    for i in job_links:
        req=requests.get(i,headers=headers)
        soup_req=BeautifulSoup(req.text,"html.parser")
        try:
            title=soup_req.find('h1',{'class': 'icl-u-xs-mb--xs icl-u-xs-mt--none jobsearch-JobInfoHeader-title'}).text
        except:
            continue
        try:
            company=soup_req.find('div',{'class':'icl-u-lg-mr--sm icl-u-xs-mr--xs'}).find_next().text #this worked for one company, trying for all
        except:
            continue
        
        company_alt=soup_req.find('div',{'class':'jobsearch-JobMetadataFooter'}).find_next().text #grabbing comapny name from footer


        try:
            location=soup_req.find('div',{'class':'jobsearch-InlineCompanyRating icl-u-xs-mt--xs jobsearch-DesktopStickyContainer-companyrating'}).text
        except:
            location=city
        
        try:
            sections = pd.Series([x.text.strip() for x in soup_req.find("div", {'class':'jobsearch-jobDescriptionText'}).find_all('ul')])
            see = sections.str.lower().str.split().apply(lambda words: [" ".join(word for word in words if word in re_reqmnt)])
            see_idx = see.apply(lambda x: x[0]).replace("", np.nan).dropna().index
            qualifications = sections[see_idx].reset_index(drop=True).sort_values(ascending=False)
            if len(qualifications) > 1:
              qual = qualifications[0]
              add_qual = qualifications[1]
            elif len(qualifications) == 1:
              qual = qualifications[0]
              add_qual = np.nan
            else:
              qual = np.nan
              add_qual = np.nan

        except:
          continue

        try:
            desc=soup_req.find('div',{'class':'jobsearch-jobDescriptionText'}).text
        except:
            continue
        date=soup_req.find('div',{'class':'jobsearch-JobMetadataFooter'}).find_next().find_next().text #this works

        df = df.append({"job_location":city, "job_title":title, "company":company, "job_description":desc, "company_alt": company_alt, "date": date,
                "requirement": qual, "add_req": add_qual}, ignore_index=True)


    
    return df

In [6]:
"""Calling all the above functions inside this function which takes the URL template, designation and city as inputs."""

def get_job_postings(url_temp,position,city):
    
    href_list= get_href(url_temp,position,city)
    
    job_links= get_job_links(href_list)
    
    job_df= get_job_df(job_links,city)
    
    return job_df

In [7]:
data_analyst_df = get_job_postings(url_temp,position='Data+Analyst',city='Berlin')



In [8]:
data_analyst_df.to_csv('job_data.csv')

In [23]:
data_analyst_df = pd.read_csv('/content/job_data.csv', index_col=0)
data_analyst_df.head()

Unnamed: 0,date,job_location,job_title,company,company_alt,requirement,add_req,job_description
0,vor 1 Tag,Berlin,Intern (f/m/d) - Data Analyst,SAP,SAP,Preferred fields of study: Business Informatic...,,\n\nWe help the world run better\n\n Our compa...
1,.css-fjuv6g{box-sizing:border-box;background:n...,Berlin,Finance Data Analyst,Kenjo GmbH,vor 4 Tagen,"Have robust Relational Model, SQL and BI softw...",,\nWelcome to your Kenjo application journey\n*...
2,vor 8 Tagen,Berlin,Data Analyst (m/f/d),ToolTime GmbH,ToolTime GmbH,Proven experience as data analyst or business ...,A modern office space as well as the ability t...,\n\n\n\n DESCRIPTION\n \n\n\nAbout us\n T...
3,.css-fjuv6g{box-sizing:border-box;background:n...,Berlin,Business Data Analyst – Level 1 (m/f/d),ICE International Copyright Enterprise Germany...,vor 16 Tagen,"You’re keen to delve into the world of coding,...",,What is ICE?ICE is the result of a big vision ...
4,vor 4 Tagen,Berlin,Data Analyst (m/f/d),Zalando,Zalando,An analytical & entrepreneurial mindset with p...,,As a Data Analyst (m/f/d) for our Convenience ...


In [24]:
data_analyst_df.shape

(308, 8)

In [25]:
#how many missing requirement records do we have?
data_analyst_df[data_analyst_df.requirement.isna()]

Unnamed: 0,date,job_location,job_title,company,company_alt,requirement,add_req,job_description
16,vor 3 Tagen,Berlin,Data Analyst / Data Consultant (m/w/d) für den...,Sparkassen Finanzportal GmbH,Sparkassen Finanzportal GmbH,,,Data Analyst / Data Consultant (m/w/d) für den...
18,vor 1 Tag,Berlin,Data Analyst bei Chip / EFahrer.com (m/w/d),EFahrer.com,Hubert Burda Media,,,Data Analyst bei Chip / EFahrer.com (m/w/d)\nA...
21,vor 11 Tagen,Berlin,Business Analyst (m/w/d),Mercedes-Benz Tech Innovation GmbH,Mercedes-Benz Group AG,,,Aufgaben\n\nFachliche Führung eines Squad Team...
28,vor 16 Tagen,Berlin,Data Analyst im Bereich Process Mining (m/w/d),Volkswagen Group Services GmbH,Volkswagen Group Services GmbH,,,\n\n\nUnser Stellenangebot\nData Mining ist Ih...
29,vor 1 Tag,Berlin,Data Analyst:in – Java-basierte Analyse von Fa...,IAV GmbH,IAV GmbH,,,\n\nDiese Herausforderung erwartet dich:\n\n D...
...,...,...,...,...,...,...,...,...
288,vor 16 Tagen,Berlin,Data Analyst (m/w/d) - Cyber Insurance,Project A Ventures,Project A Ventures,,,Wir suchen ein neues Teammitglied für eines un...
291,vor 23 Tagen,Berlin,Business Analyst (M/W/D),Steep Consult,Steep Consult,,,Wir suchen eine/n leidenschaftliche/n Business...
295,vor 15 Tagen,Berlin,(Senior) Data Analyst:in CRO & Data Science / ...,E. Breuninger GmbH & Co.,E. Breuninger GmbH & Co.,,,Unternehmensbeschreibung\nFashion und Lifestyl...
297,vor 15 Tagen,Berlin,Senior Business Analyst (m/w/d),Aroundhome,Aroundhome,,,\n\nAufgaben\n\n\n Als Senior Business Analyst...


In [26]:
data_analyst_df.date.str.startswith('.css').value_counts()

False    279
True      29
Name: date, dtype: int64

In [27]:
#index_rep = data_analyst_df[data_analyst_df.date == "Diesen Job melden"].date.index #bad values for old de.indeed format
index_rep = data_analyst_df[data_analyst_df.date.str.startswith('.css')].date.index #bad values for new de.indeed format

In [28]:
#footers had it wrong sometimes, replaced with company_alt that had the right date which exist in the footer as well

actual_date = data_analyst_df.iloc[index_rep].company_alt
fake_date = data_analyst_df.iloc[index_rep].date
data_analyst_df.date.replace(fake_date.values, actual_date.values, inplace=True)

In [29]:
data_analyst_df.date.value_counts()

vor 17 Tagen         40
vor 1 Tag            20
vor 10 Tagen         20
vor 2 Tagen          19
vor 19 Tagen         17
vor 15 Tagen         17
vor 12 Tagen         17
vor 18 Tagen         14
vor 26 Tagen         14
vor 16 Tagen         13
vor 5 Tagen          13
vor 29 Tagen         12
vor 8 Tagen          11
vor 24 Tagen         11
vor 9 Tagen          10
vor 3 Tagen           9
vor 4 Tagen           9
vor 23 Tagen          9
vor 11 Tagen          8
vor 13 Tagen          5
vor 22 Tagen          5
vor 25 Tagen          4
Heute                 4
vor 20 Tagen          2
Gerade geschaltet     1
vor 6 Tagen           1
vor 7 Tagen           1
vor 14 Tagen          1
vor 30 Tagen          1
Name: date, dtype: int64

In [30]:
data_analyst_df.company.value_counts()

Zalando                                  13
Delivery Hero                            12
Wayfair                                   5
wefox                                     5
Taxfix                                    5
                                         ..
UMI Urban Mobility International GmbH     1
Vista                                     1
Ultimate                                  1
METRO.digital                             1
Solactive AG                              1
Name: company, Length: 209, dtype: int64

In [31]:
ind = data_analyst_df[data_analyst_df.company == ""].company.index

In [32]:
#This time getting the company from the footer
actual = data_analyst_df.iloc[ind].company_alt
fake = data_analyst_df.iloc[ind].company
data_analyst_df.company.replace(fake.values, actual.values, inplace=True)

In [33]:
data_analyst_df.drop(columns='company_alt', inplace=True) #dropping company_alt after extracting actual date and company values

In [34]:
data_analyst_df.company.value_counts()

Zalando                                  13
Delivery Hero                            12
Wayfair                                   5
wefox                                     5
Taxfix                                    5
                                         ..
UMI Urban Mobility International GmbH     1
Vista                                     1
Ultimate                                  1
METRO.digital                             1
Solactive AG                              1
Name: company, Length: 209, dtype: int64

In [35]:
data_analyst_df.head()

Unnamed: 0,date,job_location,job_title,company,requirement,add_req,job_description
0,vor 1 Tag,Berlin,Intern (f/m/d) - Data Analyst,SAP,Preferred fields of study: Business Informatic...,,\n\nWe help the world run better\n\n Our compa...
1,vor 17 Tagen,Berlin,Finance Data Analyst,Kenjo GmbH,"Have robust Relational Model, SQL and BI softw...",,\nWelcome to your Kenjo application journey\n*...
2,vor 8 Tagen,Berlin,Data Analyst (m/f/d),ToolTime GmbH,Proven experience as data analyst or business ...,A modern office space as well as the ability t...,\n\n\n\n DESCRIPTION\n \n\n\nAbout us\n T...
3,vor 17 Tagen,Berlin,Business Data Analyst – Level 1 (m/f/d),ICE International Copyright Enterprise Germany...,"You’re keen to delve into the world of coding,...",,What is ICE?ICE is the result of a big vision ...
4,vor 4 Tagen,Berlin,Data Analyst (m/f/d),Zalando,An analytical & entrepreneurial mindset with p...,,As a Data Analyst (m/f/d) for our Convenience ...


In [38]:
#date posted to actual date
data_analyst_df.date.replace(['Heute', 'Gerade geschaltet'], 0, inplace=True) #replacing date "Heute": today and "Gerade geschaltet":Just posted, with 0
data_analyst_df.date.replace(regex=[r'\D+'], value="", inplace=True) #remove non digits
data_analyst_df.date = data_analyst_df.date.astype(int).apply(lambda x: datetime.now().date()-timedelta(x))

In [39]:
data_analyst_df.head()

Unnamed: 0,date,job_location,job_title,company,requirement,add_req,job_description
0,2022-05-21,Berlin,Intern (f/m/d) - Data Analyst,SAP,Preferred fields of study: Business Informatic...,,\n\nWe help the world run better\n\n Our compa...
1,2022-05-05,Berlin,Finance Data Analyst,Kenjo GmbH,"Have robust Relational Model, SQL and BI softw...",,\nWelcome to your Kenjo application journey\n*...
2,2022-05-14,Berlin,Data Analyst (m/f/d),ToolTime GmbH,Proven experience as data analyst or business ...,A modern office space as well as the ability t...,\n\n\n\n DESCRIPTION\n \n\n\nAbout us\n T...
3,2022-05-05,Berlin,Business Data Analyst – Level 1 (m/f/d),ICE International Copyright Enterprise Germany...,"You’re keen to delve into the world of coding,...",,What is ICE?ICE is the result of a big vision ...
4,2022-05-18,Berlin,Data Analyst (m/f/d),Zalando,An analytical & entrepreneurial mindset with p...,,As a Data Analyst (m/f/d) for our Convenience ...


Getting some css in the date field.
Needs a fix! SOLVED ✔

Convert date from date posted to actual date SOLVED ✔

**Refactoring**

*   Refactor get_job_df function. Move the sections logic to another function.

*   Refactor company and date fixes by using functions



Add feature, company rating from (glassdoor? indeed?)

Add experience level feature

check which companies add years of experience that unmatch the experience level in the job post #HR accountability