<a href="https://colab.research.google.com/github/NohmanAudi/Indeed_Job_Posts/blob/main/Job_postings_clean.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#Scraping code source: https://medium.com/codex/web-scraping-with-beautifulsoup-66a3a2b3b60
import requests
import bs4
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np

In [2]:
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36"}
link="https://de.indeed.com/jobs?q=Data+Analyst&l=Berlin&start=10"

""" Sending HTTP Request"""
r=requests.get(link,headers=headers)

"""Check response status Code"""
r.status_code

200

In [3]:
"""URL Template"""
url_temp= "https://de.indeed.com/jobs?q={}&l={}&start={}"
base_link="https://de.indeed.com"

"""This function takes the URL template, designation and city as inputs.
It navigates through the top 200 search results and scans all the <a> tags and returns a list of 
all the href attributes."""

def get_href(url_temp,position,city):
    results_per_city=200
    href_list=[]
    for start in range(0,results_per_city+10,10):
        url=url_temp.format(position,city,start)
        r=requests.get(url,headers=headers)
        soup=BeautifulSoup(r.text,"html.parser")    
       
        for i in soup.find_all('a'):
            # if tag has attribute of class
            if i.has_attr( "href" ):
                k=i['href']
                href_list.append(base_link+k)
    
    return href_list

In [4]:
"""This function takes the list of all the href attributes as input, 
finds the URLs with the mentioned strings and returns a list of those URLs."""

def get_job_links(href_list):
    job_links=[]
    for a in href_list:
        if a.find('/rc/clk')!=-1:
            job_links.append(a)
        elif a.find('/company/')!=-1:
            job_links.append(a)
    return job_links

In [5]:
"""This function takes the list of the URLs of the job postings and the city and does the following:
1. Send HTTP request to each of the URL.
2. Creates a soup object with html parsing.
3. Extracts title, company name, location and job description from each of the webpage and returns a dataframe."""



re_reqmnt = ['experience', 'knowledge', 'skills', 'strong', 'ability', 'years', 'good']



def get_job_df(job_links,city):
    df=pd.DataFrame(columns=[ "date", "job_location", "job_title", "company", "company_alt","requirement", "add_req", "job_description"])
    
    for i in job_links:
        req=requests.get(i,headers=headers)
        soup_req=BeautifulSoup(req.text,"html.parser")
        try:
            title=soup_req.find('h1',{'class': 'icl-u-xs-mb--xs icl-u-xs-mt--none jobsearch-JobInfoHeader-title'}).text
        except:
            continue
        try:
            company=soup_req.find('div',{'class':'icl-u-lg-mr--sm icl-u-xs-mr--xs'}).find_next().text #this worked for one company, trying for all
        except:
            continue
        
        company_alt=soup_req.find('div',{'class':'jobsearch-JobMetadataFooter'}).find_next().text #grabbing comapny name from footer


        try:
            location=soup_req.find('div',{'class':'jobsearch-InlineCompanyRating icl-u-xs-mt--xs jobsearch-DesktopStickyContainer-companyrating'}).text
        except:
            location=city
        
        try:
            sections = pd.Series([x.text.strip() for x in soup_req.find("div", {'class':'jobsearch-jobDescriptionText'}).find_all('ul')])
            see = sections.str.lower().str.split().apply(lambda words: [" ".join(word for word in words if word in re_reqmnt)])
            see_idx = see.apply(lambda x: x[0]).replace("", np.nan).dropna().index
            qualifications = sections[see_idx].reset_index(drop=True).sort_values(ascending=False)
            if len(qualifications) > 1:
              qual = qualifications[0]
              add_qual = qualifications[1]
            elif len(qualifications) == 1:
              qual = qualifications[0]
              add_qual = np.nan
            else:
              qual = np.nan
              add_qual = np.nan

        except:
          continue

        try:
            desc=soup_req.find('div',{'class':'jobsearch-jobDescriptionText'}).text
        except:
            continue
        date=soup_req.find('div',{'class':'jobsearch-JobMetadataFooter'}).find_next().find_next().text #this works

        df = df.append({"job_location":city, "job_title":title, "company":company, "job_description":desc, "company_alt": company_alt, "date": date,
                "requirement": qual, "add_req": add_qual}, ignore_index=True)


    
    return df

In [6]:
"""Calling all the above functions inside this function which takes the URL template, designation and city as inputs."""

def get_job_postings(url_temp,position,city):
    
    href_list= get_href(url_temp,position,city)
    
    job_links= get_job_links(href_list)
    
    job_df= get_job_df(job_links,city)
    
    return job_df

In [7]:
data_analyst_df = get_job_postings(url_temp,position='Data+Analyst',city='Berlin')



In [8]:
data_analyst_df.to_csv('job_data.csv')
data_analyst_df.head()

Unnamed: 0,date,job_location,job_title,company,company_alt,requirement,add_req,job_description
0,vor 1 Tag,Berlin,Intern (f/m/d) - Data Analyst,SAP,SAP,Preferred fields of study: Business Informatic...,,\n\nWe help the world run better\n\n Our compa...
1,vor 8 Tagen,Berlin,Data Analyst (m/f/d),ToolTime GmbH,ToolTime GmbH,Proven experience as data analyst or business ...,A modern office space as well as the ability t...,DESCRIPTION\nAbout us\n\nToolTime is a fast-gr...
2,.css-fjuv6g{box-sizing:border-box;background:n...,Berlin,Finance Data Analyst,Kenjo GmbH,vor 4 Tagen,"Have robust Relational Model, SQL and BI softw...",,\nWelcome to your Kenjo application journey\n*...
3,.css-fjuv6g{box-sizing:border-box;background:n...,Berlin,Data Analyst (gn),eduki,vor 30+ Tagen,"You deliver good knowledge of statistics, a hi...",,Company© eduki*About us*eduki was founded in 2...
4,.css-fjuv6g{box-sizing:border-box;background:n...,Berlin,Business Data Analyst – Level 1 (m/f/d),ICE International Copyright Enterprise Germany...,vor 16 Tagen,"You’re keen to delve into the world of coding,...",,What is ICE?ICE is the result of a big vision ...


In [None]:
data_analyst_df[data_analyst_df.requirement.isna()]

Unnamed: 0,date,job_location,job_title,company,company_alt,requirement,add_req,job_description
12,vor 17 Tagen,Berlin,(Junior) Data Analyst / BI Developer (w/m/d),Daimler Group Services Berlin GmbH,Mercedes-Benz Group AG,,,\n\n\n\nAufgaben \n\n\n\n\n\n Die Daimle...
13,vor 30+ Tagen,Berlin,Data Engineer/ Data Analyst (w/m/d),Transdev GmbH,Transdev,,,\n\n\n\nZur Verstärkung unseres Teams in Berli...
14,vor 30+ Tagen,Berlin,Business & Data Analyst,dkb,dkb,,,\n\n\n\n Business & Data Analyst (m/w/d)\n ...
15,vor 30+ Tagen,Berlin,Data Analyst (m/w/d),auticon GmbH,auticon GmbH,,,\n\n\n\n\n\n\n\nData Analyst (m/w/d)\nmit Auti...
23,vor 30+ Tagen,Berlin,Data Analyst (m/w/d),AOK-Bundesverband,AOK-Bundesverband GbR,,,Vielfältige Aufgaben erwarten Sie\nDatenbankab...
32,.css-fjuv6g{box-sizing:border-box;background:n...,Berlin,Data Analyst / Software Developer (d/m/w),VDI/VDE Innovation + Technik GmbH,vor 30+ Tagen,,,\nIhre Aufgaben bei uns: \n\n(Weiter-)Entwickl...
33,vor 8 Tagen,Berlin,Data Analyst im Bereich Process Mining (m/w/d),Volkswagen Group Services GmbH,Volkswagen Group Services GmbH,,,Unser Stellenangebot\nData Mining ist Ihre Lei...
36,vor 2 Tagen,Berlin,Data Analyst m/w/d,Precise Hotels & Resorts GmbH,HOGAPAGE.de,,,\n\nData Analyst m/w/d\n\n\n \nWerden Sie Teil...
51,vor 5 Tagen,Berlin,Data Analyst (m/w/d),Michael Page,Michael Page,,,Hersteller-unabhängiges Unternehmen|Agiles und...
54,vor 5 Tagen,Berlin,(Senior) Data Analyst / Business Analyst (m/w/d),Weltbild GmbH & CO. KG,Verlagsgruppe Weltbild,,,\n\n\nWeltbild als einer der größten Multikana...


In [None]:
data_analyst_df.shape

(76, 8)

In [11]:
data_analyst_df[data_analyst_df.date.str.startswith('.css')].date.index

Int64Index([  2,   3,   4,   6,  17,  19,  26,  27,  28,  29,  31,  32,  38,
             44,  60,  90,  93,  94,  98, 102, 110, 123, 145, 222, 235, 255],
           dtype='int64')

In [12]:
#index_rep = data_analyst_df[data_analyst_df.date == "Diesen Job melden"].date.index #bad values for old indeed.de format
index_rep = data_analyst_df[data_analyst_df.date.str.startswith('.css')].date.index #bad values for new indeed.de format 

In [13]:
#footers had it wrong sometimes, replaced with company_alt that had the right date which exist in the footer as well

actual_date = data_analyst_df.iloc[index_rep].company_alt
fake_date = data_analyst_df.iloc[index_rep].date
data_analyst_df.date.replace(fake_date.values, actual_date.values, inplace=True)

In [14]:
data_analyst_df.date.value_counts()

vor 30+ Tagen    188
vor 1 Tag         11
vor 2 Tagen        7
vor 19 Tagen       7
vor 12 Tagen       7
vor 9 Tagen        7
vor 8 Tagen        6
vor 26 Tagen       5
vor 11 Tagen       5
vor 10 Tagen       5
vor 17 Tagen       4
vor 3 Tagen        4
vor 22 Tagen       3
vor 16 Tagen       3
vor 15 Tagen       3
vor 23 Tagen       3
vor 13 Tagen       2
vor 30 Tagen       2
vor 29 Tagen       2
vor 4 Tagen        2
vor 18 Tagen       2
vor 5 Tagen        1
vor 25 Tagen       1
Name: date, dtype: int64

In [15]:
data_analyst_df.company.value_counts()

Delivery Hero                   19
                                 9
Zalando                          9
AUTO1                            8
Deutsche Bank                    8
                                ..
Takeda Pharmaceutical            1
Mambu                            1
E.ON Digital Technology GmbH     1
Zimmer Biomet                    1
i-potentials GmbH                1
Name: company, Length: 178, dtype: int64

In [16]:
ind = data_analyst_df[data_analyst_df.company == ""].company.index

In [17]:
#This time getting the company from the footer
actual = data_analyst_df.iloc[ind].company_alt
fake = data_analyst_df.iloc[ind].company
data_analyst_df.company.replace(fake.values, actual.values, inplace=True)

In [18]:
data_analyst_df.company.value_counts()

Delivery Hero                   19
Adevinta Group                   9
Zalando                          9
AUTO1                            8
Deutsche Bank                    8
                                ..
Takeda Pharmaceutical            1
Mambu                            1
E.ON Digital Technology GmbH     1
Zimmer Biomet                    1
i-potentials GmbH                1
Name: company, Length: 178, dtype: int64

In [19]:
data_analyst_df.head()

Unnamed: 0,date,job_location,job_title,company,company_alt,requirement,add_req,job_description
0,vor 1 Tag,Berlin,Intern (f/m/d) - Data Analyst,SAP,SAP,Preferred fields of study: Business Informatic...,,\n\nWe help the world run better\n\n Our compa...
1,vor 8 Tagen,Berlin,Data Analyst (m/f/d),ToolTime GmbH,ToolTime GmbH,Proven experience as data analyst or business ...,A modern office space as well as the ability t...,DESCRIPTION\nAbout us\n\nToolTime is a fast-gr...
2,vor 30+ Tagen,Berlin,Finance Data Analyst,Kenjo GmbH,vor 4 Tagen,"Have robust Relational Model, SQL and BI softw...",,\nWelcome to your Kenjo application journey\n*...
3,vor 30+ Tagen,Berlin,Data Analyst (gn),eduki,vor 30+ Tagen,"You deliver good knowledge of statistics, a hi...",,Company© eduki*About us*eduki was founded in 2...
4,vor 30+ Tagen,Berlin,Business Data Analyst – Level 1 (m/f/d),ICE International Copyright Enterprise Germany...,vor 16 Tagen,"You’re keen to delve into the world of coding,...",,What is ICE?ICE is the result of a big vision ...


Getting some css in the date field.
Needs a fix!

Add feature, company rating from (glassdoor?)

Add experience level feature

check which companies add years of experience that unmatch the experience level in the job post #HR accountability