# Web Scraping for Data Jobs in CO

Here is the link to the search query

https://www.indeed.com/jobs?q=data+scientist&l=CO

As you can see at the bottom of the page, there are links to a series of result pages for this search.
If you click on the second page, the search URL changes to

https://www.indeed.com/jobs?q=data+scientist&l=CO&start=10

If you click on the third page, the URL changes to

https://www.indeed.com/jobs?q=data+scientist&l=CO&start=20

Hence, to reach more pages, we can format the search string (**change the start=??** part) and call **requests.get in a loop**


Scrape 10 pages and build a pandas DataFrame containing the following information:

   + job title, name of the company, location, summary of job description
   + Indicator columns(with value True/False) about keywords Python, SQL, AWS, Machine learning, Deep Learning, Text Mining, STATA, SAS, Tableau

In [1]:
import pandas as pd
import requests as rq
import numpy as np
import json
import re
import time
from bs4 import BeautifulSoup as bsp

In [2]:
# data+analyst OR data+scientist
## Walk the first 10 result pages of each query and collect, for every job card,
## its title and the absolute link to its detail page. The two lists are
## index-aligned: job_title_list[k] belongs to detail_page_list[k].
mainpage="https://www.indeed.com"
detail_page_list,job_title_list=[],[]
for job in ["analyst","scientist"]:
    for i in range(0,10):
        ## result pages are addressed by offset: start=0,10,...,90
        page=i*10
        search_string=f"https://www.indeed.com/jobs?q=data+{job}&l=CO&start={page}"
        ## always pass a timeout: without one a stalled connection blocks the cell forever
        response=rq.get(search_string,timeout=30)
        ## raise an error if the connection fails
        if response.status_code !=200:
            raise ValueError(f'The connection for page {search_string} failed.')
        soup= bsp(response.text, 'lxml')
        ## get hyperlink for each job
        title_link=soup.find_all('a',{"data-tn-element":"jobTitle"})
        for hyperlink in title_link:
            ## get job title
            job_title_list.append(hyperlink.get_text().strip("\n"))
            ## hrefs are site-relative, so prefix the host to get a usable link
            detail_page_list.append(mainpage+hyperlink['href'])

In [5]:
## Enter each detail page for more job information (company name, location and
## description markup). The three lists stay index-aligned with detail_page_list;
## pages with no details get None placeholders and are recorded in no_content.
company_name_list,company_location_list,detail_description_list=[],[],[]
no_content=set()
for i,individual_link in enumerate(detail_page_list):
    ## always pass a timeout: without one a stalled connection blocks the cell forever
    response_detail=rq.get(individual_link,timeout=30)
    if response_detail.status_code !=200:
        raise ValueError(f'The connection for page {individual_link} failed.')
    soup_detail=bsp(response_detail.text,'lxml')
    company_info=soup_detail.find_all('div',{'class':'icl-u-lg-mr--sm icl-u-xs-mr--xs'})
    ## job links contain advertisement for Indeed Prime which would need to be excluded
    ## both header divs are required: [0] holds the company name and the location is
    ## read from the sibling after [1], so a single match must not be indexed
    if len(company_info)>1:
        ## get the companyname and location
        company_name_list.append(company_info[0].get_text().strip("\n"))
        company_location_list.append(company_info[1].next_sibling.get_text().strip("\n"))
        ## keep the description markup as a string; keywords are searched in it later
        job_description=str(soup_detail.find_all("div",{"class":"jobsearch-JobComponent-description icl-u-xs-mt--md"}))
        detail_description_list.append(job_description)
    else:
        print(f'i={i}',company_info)
        no_content.add(i)
        company_name_list.append(None)
        company_location_list.append(None)
        detail_description_list.append(None)

i=11 []
i=13 []
i=16 []
i=58 []
i=78 []
i=88 []
i=114 []
i=160 []
i=163 []
i=173 []
i=187 []
i=204 []
i=224 []
i=225 []
i=261 []
i=263 []
i=294 []
i=322 []
i=330 []
i=333 []
i=339 []


In [7]:
## Sometimes a detail page comes back without the expected content even though it is
## not an Indeed Prime ad. Re-run this cell repeatedly to fill those holes until every
## remaining None is due to an Indeed Prime page.
print(no_content)
filled=set()
for i in no_content:
    individual_link=detail_page_list[i]
    ## always pass a timeout: without one a stalled connection blocks the cell forever
    response_detail=rq.get(individual_link,timeout=30)
    if response_detail.status_code !=200:
        raise ValueError(f'The connection for page {individual_link} failed.')
    soup_detail=bsp(response_detail.text,'lxml')
    company_info=soup_detail.find_all('div',{'class':'icl-u-lg-mr--sm icl-u-xs-mr--xs'})
    ## job links contain advertisement for Indeed Prime which would need to be excluded
    ## both header divs are required: [0] is the company name, location follows [1]
    if len(company_info)>1:
        ## store plain strings (not one-element lists), stripped the same way as the
        ## first pass, so every entry in these lists has the same type
        company_name_list[i]=company_info[0].get_text().strip("\n")
        company_location_list[i]=company_info[1].next_sibling.get_text().strip("\n")
        ## keep the description markup as a string; keywords are searched in it later
        job_description=str(soup_detail.find_all("div",{"class":"jobsearch-JobComponent-description icl-u-xs-mt--md"}))
        detail_description_list[i]=job_description
        filled.add(i)
print(filled)
## whatever is still unfilled stays flagged for the next run of this cell
no_content=no_content-filled

set()
set()


In [8]:
## verify above: for every index still flagged, show its link and what was stored
## (summary_description_list was never defined; the descriptions live in
## detail_description_list)
for i in no_content:
    print(detail_page_list[i],detail_description_list[i])
## all the links below are ads for Indeed Prime

In [9]:
## one indicator list per keyword, index-aligned with detail_description_list
Key_Python,Key_SQL,Key_AWS=[],[],[]
Key_Machine_Learning,Key_Deep_Learning,Key_Text_Mining=[],[],[]
Key_SAS,Key_Tableau,Key_Stata=[],[],[]
## (target list, regex pattern, regex flags); acronyms are matched case-sensitively,
## everything else ignores case
keyword_searches=[
    (Key_Python,'Python',re.IGNORECASE),
    (Key_SQL,'SQL',0),
    (Key_AWS,'AWS',0),
    (Key_Machine_Learning,'Machine Learning',re.IGNORECASE),
    (Key_Deep_Learning,'Deep Learning',re.IGNORECASE),
    (Key_Text_Mining,'Text Mining',re.IGNORECASE),
    (Key_SAS,'SAS',0),
    (Key_Tableau,'Tableau',re.IGNORECASE),
    (Key_Stata,'Stata',re.IGNORECASE),
]
## search for key words
for position,raw_description in enumerate(detail_description_list):
    if position in no_content:
        ## no description was scraped for this posting: the indicator is unknown
        for hits,_,_ in keyword_searches:
            hits.append(None)
        continue
    description_text=str(raw_description)
    for hits,pattern,flags in keyword_searches:
        hits.append(re.search(pattern,description_text,flags) is not None)

In [10]:
## assemble the scraped lists into a single table, one row per job posting;
## every list has one entry per scraped link, so the columns line up by position
data={
    "Title":job_title_list,
    "Company":company_name_list,
    "Location":company_location_list,
    "SummaryDescription":detail_description_list,
    "Python":Key_Python,
    "SQL":Key_SQL,
    "AWS":Key_AWS,
    "Machine_Learning":Key_Machine_Learning,
    "Deep_Learning":Key_Deep_Learning,
    "Text_Mining":Key_Text_Mining,
    "SAS":Key_SAS,
    "Tableau":Key_Tableau,
    "STATA":Key_Stata,
}
job_indeed_df=pd.DataFrame.from_dict(data)
## show the column types as a quick sanity check
print(job_indeed_df.dtypes)

Title                 object
Company               object
Location              object
SummaryDescription    object
Python                  bool
SQL                     bool
AWS                     bool
Machine_Learning        bool
Deep_Learning           bool
Text_Mining             bool
SAS                     bool
Tableau                 bool
STATA                   bool
dtype: object


# City with the most job postings

In [11]:
## load data
## every column is cast to str before de-duplicating (presumably so that all cells are
## hashable for drop_duplicates); note this turns the boolean flags into 'True'/'False'
df=job_indeed_df.astype("str").drop_duplicates()
## reduce each location to a bare city key: drop everything that is not a letter plus
## the state code, e.g. "Aurora, CO 80011" -> "Aurora".
## regex=True is required: pandas >= 2.0 treats the pattern literally by default
location_key=df.Location.str.replace("[^a-zA-Z]|CO","",regex=True)
## get unique cities/towns
unique_location=set(location_key)
print(unique_location)
## one boolean indicator column per city. Compare for equality: a substring test
## would also flag 'ColoradoSprings' rows as 'Colorado'
for city in unique_location:
    df[city]=location_key==city
df.head()

{'Centennial', 'ColoradoSprings', 'Boulder', 'GreenwoodVillage', 'LoneTree', 'Lafayette', 'Aurora', 'Denver', 'Broomfield', 'Littleton', 'Loveland', 'Englewood', 'Colorado', 'GrandJunction', 'Louisville', 'Henderson', 'Superior', 'Westminster', 'Lakewood', 'Longmont', 'Golden', 'HighlandsRanch'}


Unnamed: 0,Title,Company,Location,SummaryDescription,Python,SQL,AWS,Machine_Learning,Deep_Learning,Text_Mining,...,Colorado,GrandJunction,Louisville,Henderson,Superior,Westminster,Lakewood,Longmont,Golden,HighlandsRanch
0,Data Analyst,Private,"Aurora, CO 80011","[<div class=""jobsearch-JobComponent-descriptio...",False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,Sales Analyst,Science Interactive Group,"Englewood, CO","[<div class=""jobsearch-JobComponent-descriptio...",False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,Intern Analyst,Real Capital Solutions,"Louisville, CO 80027","[<div class=""jobsearch-JobComponent-descriptio...",False,False,False,False,False,False,...,False,False,True,False,False,False,False,False,False,False
3,2020 Summer Intern: Data Analyst,SPECTRUM,"Greenwood Village, CO 80121","[<div class=""jobsearch-JobComponent-descriptio...",True,True,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,Director of Data Analytics,Suited Connector,"Englewood, CO 80112","[<div class=""jobsearch-JobComponent-descriptio...",False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [12]:
## Number of postings per city, top five.
## A manual tally over the cleaned location strings is one way to get it:
##   count={i:0 for i in unique_location}
##   for i in df.Location.str.replace("[^a-zA-Z]|CO","",regex=True):
##       if i in count:
##           count[i]+=1
## ------------------------------------------------------
## or directly sum the boolean city-indicator columns. Select them first, as a list:
## pandas >= 2.0 rejects a set as an indexer, and summing the whole frame would also
## concatenate every text column for nothing. sorted() keeps the column order stable.
df[sorted(unique_location)].sum().sort_values(ascending=False).head()

Denver        88
Boulder       18
Englewood     15
Broomfield    14
Aurora        11
dtype: object

# Top 3 most in-demand skills (like Python, AWS, SQL ...)



In [14]:
## the frame was cast to str when it was loaded, so the skill flags are the strings
## 'True'/'False'; convert them back to booleans before counting
skill_columns=['Python','SQL','AWS','Machine_Learning','Deep_Learning','Text_Mining','SAS','Tableau','STATA']
str_to_bool={'True': True, 'False': False}
df_skills=df[skill_columns].replace(str_to_bool)
## postings mentioning each skill, most demanded first (the top 3 are the first three rows)
skill_counts=df_skills.sum()
skill_counts.sort_values(ascending=False)

Python              119
SQL                 113
Machine_Learning     75
Tableau              49
AWS                  36
SAS                  31
Deep_Learning        25
STATA                 5
Text_Mining           3
dtype: int64