In [2]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [3]:
from bs4 import BeautifulSoup   # For HTML parsing
import urllib2  # Website connections
import re  # Regular expressions
from time import sleep  # To prevent overwhelming the server between connections
from collections import Counter  # Keep track of our term counts
from nltk.corpus import stopwords  # Filter out stopwords, such as 'the', 'or', 'and'
import pandas as pd  # For converting results to a dataframe and bar chart plots
%matplotlib inline

In [4]:
def text_cleaner(website):
    
    try:
        site = urllib2.urlopen(website).read() # Connect to the job posting
    except: 
        return
    
    soup_obj = BeautifulSoup(site, 'lxml')
    
    if len(soup_obj) == 0:    # In case lxml doesn't work, try another one
        soup_obj = BeautifulSoup(site, 'html5lib')
        
    for script in soup_obj(["script", "style"]):
        script.extract() # Remove these two elements from the BS4 object
        
    text = soup_obj.get_text()
    
    lines = (line.strip() for line in text.splitlines()) # break into lines
    
    # break multi-headlines into a line each
    chunks = (phrase.strip() for line in lines for phrase in line.split("  ")) 
    
    text = ''.join(chunk for chunk in chunks if chunk).encode('utf-8')
    
    # Now clean out all of the unicode junk
    
    try:
        text = text.decode('unicode_escape').encode('ascii', 'ignore') # Need this as some websites aren't formatted
    except:                                                            # in a way that this works, can occasionally throw
        return                                                         # an exception
        
    text = re.sub("[^a-zA-Z+3]"," ", text)  # Now get rid of any terms that aren't words (include 3 for d3.js)
                                             # Also include + for C++
    
    text = re.sub(r"([a-z])([A-Z])", r"\1 \2", text) # Fix spacing issue from merged words
    
    text = text.lower().split()  # Go to lower case and split them apart
    
    stop_words = set(stopwords.words("english")) # Filter out any stop words
    text = [w for w in text if not w in stop_words]
    
    text = list(set(text))
    
    return text

In [5]:
text_cleaner('https://www.vocabulary.com/dictionary/king')

['founder',
 'chaldea',
 'consolidated',
 'resistance',
 'dynasty',
 'four',
 'barons',
 'battle',
 'idolatry',
 'affairsa',
 'religious',
 'children',
 'competition',
 'antoinette',
 'whose',
 'venice',
 'simon',
 'terms',
 'augustine',
 'vikings',
 'navarreking',
 'paris',
 'father',
 'young',
 'charles',
 'charge',
 'edward',
 'bruceking',
 'th',
 'bannockburn',
 'include',
 'sent',
 'philistines',
 'started',
 'activities',
 'supported',
 'seized',
 'jezebel',
 'tudor',
 'calais',
 'improvement',
 'reopened',
 'account',
 'pompey',
 'every',
 'reforms',
 'trouble',
 'vast',
 'theking',
 'alaricking',
 'judaism',
 'flodden',
 'louis',
 'iithe',
 'list',
 'brother',
 'leader',
 'queen',
 'assignments',
 'chequerone',
 'antonyms',
 'succession',
 'becket',
 'noted',
 'gordiuslegendary',
 'rufusthe',
 'revolution',
 'invader',
 'leaders',
 'preeminencehigh',
 'educators',
 'regained',
 'sign',
 'sumerian',
 'elegant',
 'invaded',
 'capet',
 'mussolini',
 'victor',
 'jesus',
 'establish

In [106]:
job_descriptions = [] # Store all our descriptions in this list

def skills_info(city, state):
    final_job = 'data+scientist'
    
    if city is not None:
        final_city = city.split() 
        final_city = '+'.join(word for word in final_city)
        final_site_list = ['http://www.indeed.com/jobs?q=%22', final_job, '%22&l=', final_city,
                   '%2C+', state] # Join all of our strings together so that indeed will search correctly
    else:
        final_site_list = ['http://www.indeed.com/jobs?q="', final_job, '"']
    
    final_site = ''.join(final_site_list)
    
    base_url = 'http://www.indeed.com'
    
    try:
        html = urllib2.urlopen(final_site).read() # Open up the front page of our search first
    except:
        'That city/state combination did not have any jobs. Exiting . . .' # In case the city is invalid
        return
    
    soup = BeautifulSoup(html, 'lxml')
    
    num_jobs_area = soup.find(id = 'searchCount').string.encode('utf-8')
    job_numbers = re.findall('\d+', num_jobs_area)
    
    if len(job_numbers) > 3: # Have a total number of jobs greater than 1000
        total_num_jobs = (int(job_numbers[1])*1000) + int(job_numbers[2])
    else:
        total_num_jobs = int(job_numbers[1]) 
    
    city_title = city
    if city is None:
        city_title = 'Nationwide'
    
    num_pages = total_num_jobs/10 # This will be how we know the number of times we need to iterate over each new
                                  # search result page
    
    for i in xrange(1,num_pages+1): # Loop through all of our search result pages
        print 'Getting page', i
        start_num = str(i*10) # Assign the multiplier of 10 to view the pages we want
        current_page = ''.join([final_site, '&start=', start_num])
        html_page = urllib2.urlopen(current_page).read() # Get the page
        
        page_obj = BeautifulSoup(html_page, 'lxml') # Locate all of the job links
        job_link_area = page_obj.find(id = 'resultsCol') # The center column on the page where the job postings exist
        
        job_URLS = [base_url + link.get('href') for link in job_link_area.find_all('a') if link.get('href') is not None] # Get the URLS for the jobs
        
        job_URLS = filter(lambda x:'clk' in x, job_URLS) # Now get just the job related URLS
        
        for j in xrange(0,len(job_URLS)):
            final_description = text_cleaner(job_URLS[j])
            if final_description: # So that we only append when the website was accessed correctly
                job_descriptions.append(final_description)
            sleep(1) # So that we don't be jerks. If you have a very fast internet connection you could hit the server a lot! 
    
    print 'Done with collecting the job postings!' 
    print 'There were', len(job_descriptions), 'jobs successfully found.'
    
skills_info(city = 'Hartford', state = 'CT')

Getting page 1
Getting page 2
Getting page 3
Done with collecting the job postings!
There were 25 jobs successfully found.


In [87]:
def get_soup(job, city=None, state=None):
    soup_list = []
    final_job = job.split()
    final_job = '+'.join(word for word in final_job)
    
    if city is not None:
        final_city = city.split() 
        final_city = '+'.join(word for word in final_city)
        final_site_list = ['http://www.indeed.com/jobs?q=%22', final_job, '%22&l=', final_city,
                   '%2C+', state] # Join all of our strings together so that indeed will search correctly
    elif state is not None:
        final_site_list = ['http://www.indeed.com/jobs?q=%22', final_job, '%22&l=', state]
    else:
        final_site_list = ['http://www.indeed.com/jobs?q="', final_job, '"']
    
    final_site = ''.join(final_site_list)
    
    base_url = 'http://www.indeed.com'
    
    try:
        html = urllib2.urlopen(final_site).read() # Open up the front page of our search first
    except:
        'That city/state combination did not have any jobs. Exiting . . .' # In case the city is invalid
        return
    
    soup = BeautifulSoup(html, 'lxml')
    soup_list.append(soup)
    
    num_jobs_area = soup.find(id = 'searchCount').string.encode('utf-8')
    job_numbers = re.findall('\d+', num_jobs_area)
    
    if len(job_numbers) > 3: # Have a total number of jobs greater than 1000
        total_num_jobs = (int(job_numbers[1])*1000) + int(job_numbers[2])
    else:
        total_num_jobs = int(job_numbers[1]) 
    
    city_title = city
    if city is None:
        city_title = 'Nationwide'
    
    num_pages = total_num_jobs/10 # This will be how we know the number of times we need to iterate over each new
                                  # search result page
    
    for i in xrange(1,num_pages): # Loop through all of our search result pages
        start_num = str(i*10) # Assign the multiplier of 10 to view the pages we want
        current_page = ''.join([final_site, '&start=', start_num])
        html_page = urllib2.urlopen(current_page).read() # Get the page
        page_obj = BeautifulSoup(html_page, 'lxml') # Locate all of the job links
        soup_list.append(page_obj)
        sleep(1)
        
    return soup_list

def extract_job_title_from_result(soup_list):
    jobs = []
    for soup in soup_list:
        for div in soup.find_all(name="div", attrs={"class":"row"}):
            for a in div.find_all(name="a", attrs={"data-tn-element":"jobTitle"}):
                jobs.append(a["title"])
    return(jobs)

def extract_company_from_result(soup_list): 
    companies = []
    for soup in soup_list:
        for div in soup.find_all(name='div', attrs={'class':'row'}):
            company = div.find_all(name='span', attrs={'class':'company'})
            if len(company) > 0:
                for b in company:
                    companies.append(b.text.strip())
            else:
                sec_try = div.find_all(name='span', attrs={'class':'result-link-source'})
                for span in sec_try:
                    companies.append(span.text.strip())
    return(companies)

def extract_location_from_result(soup_list):
    locations = []
    for soup in soup_list:
        spans = soup.findAll('span', attrs={'class': 'location'})
        for span in spans:
            locations.append(span.text.strip())
            
    return(locations)

def extract_summary_from_result(soup_list): 
    summaries = []
    for soup in soup_list:
        spans = soup.findAll('span', attrs={'class': 'summary'})
        for span in spans:
            summaries.append(span.text.strip())
    return(summaries)

In [93]:
soup_list = get_soup(job = 'Data Scientist', city = 'Hartford', state = 'CT')

data = {
    'job_titles' : extract_job_title_from_result(soup_list),
    'companies' : extract_company_from_result(soup_list),
    'summary' : extract_summary_from_result(soup_list)
}

In [96]:
import pandas as pd
df = pd.DataFrame().from_dict(data)
df.head()

Unnamed: 0,companies,job_titles,summary
0,The Hartford,Associate Data Scientist - Incubation Lab,"Communicate with Data Scientists, Data Enginee..."
1,Travelers,Data Scientist- (Machine Learning & Artificial...,"Analyze source data, working with structured a..."
2,DISNEY,Data Science Manager (Data Scientist),You will be charged with delivering actionable...
3,The Hartford,Associate Data Scientist - Incubation Lab,"Communicate with Data Scientists, Data Enginee..."
4,Aetna,Data Scientist,Experience with data extraction and analysis. ...
