In [1]:
from bs4 import BeautifulSoup # For HTML parsing
import urllib # Website connections
import re # Regular expressions
from time import sleep # To prevent overwhelming the server between connections
from collections import Counter # Keep track of our term counts
from nltk.corpus import stopwords # Filter out stopwords, such as 'the', 'or', 'and'
import pandas as pd # For converting results to a dataframe and bar chart plots
%matplotlib inline
import requests

In [2]:
def text_cleaner(website):
    '''
    This function just cleans up the raw html so that I can look at it.
    Inputs: a URL to investigate
    Outputs: Cleaned text only
    '''
    try:
        site = requests.get(website).content # Connect to the job posting
    except: 
        return   # Need this in case the website isn't there anymore or some other weird connection problem 
    
    soup_obj = BeautifulSoup(site) # Get the html from the site
    
    if len(soup_obj) == 0: # In case the default parser lxml doesn't work, try another one
        soup_obj = BeautifulSoup(site, 'html5lib')
    
    
    for script in soup_obj(["script", "style"]):
        script.extract() # Remove these two elements from the BS4 object
    
    

    text = soup_obj.get_text() # Get the text from this

    

    lines = (line.strip() for line in text.splitlines()) # break into lines

    
    
    chunks = (phrase.strip() for line in lines for phrase in line.split("  ")) # break multi-headlines into a line each
    

    text = ''.join(chunk for chunk in chunks if chunk).encode('utf-8') # Get rid of all blank lines and ends of line
    
    # Now clean out all of the unicode junk (this line works great!!!)
    
    try:
        text = text.decode('unicode_escape').encode('ascii', 'ignore') # Need this as some websites aren't formatted
    except:                                                            # in a way that this works, can occasionally throw
        return                                                         # an exception
   
    
    #text = re.sub("[^a-zA-Z+3]"," ", text)  # Now get rid of any terms that aren't words (include 3 for d3.js)
                                             # Also include + for C++
    #text = re.sub(r"([a-z])([A-Z])", r"\1 \2", text) # Fix spacing issue from merged words
    
    text = text.lower().split()  # Go to lower case and split them apart
    
    
    stop_words = set(stopwords.words("english")) # Filter out any stop words
    text = [w for w in text if not w in stop_words]
    
    
    
    text = list(set(text)) # Last, just get the set of these. Ignore counts (we are just looking at whether a term existed
                           # or not on the website)
    
    return text

In [3]:
r = text_cleaner('https://improbable.io/careers/opportunities/games-engineer')

In [4]:
website = "https://www.linkedin.com/jobs/view/1371623902/"

try:
    site = requests.get(website).content # Connect to the job posting
except: 
      print("lost site")# Need this in case the website isn't there anymore or some other weird connection problem 

soup_obj = BeautifulSoup(site) # Get the html from the site

if len(soup_obj) == 0: # In case the default parser lxml doesn't work, try another one
    soup_obj = BeautifulSoup(site, 'html5lib')


for script in soup_obj(["script", "style"]):
    script.extract() # Remove these two elements from the BS4 object
    
soup_obj

<!DOCTYPE html>
<html lang="en"><head><meta content="d_jobs_guest_details" name="pageKey"/><meta content="https://static-exp1.licdn.com/sc/h/23cdp6duc7j59h1n8r0ctrfxz" name="seoBeaconWorkerUrl"/><meta content="en_US" name="locale"/><meta data-app-id="com.linkedin.jobs-guest-frontend.d_web" data-custom-tracking-code="" data-tracking-page-type="" id="config"/><link href="https://uk.linkedin.com/jobs/view/front-end-software-engineer-developer-consumer-onboarding-at-lloyds-banking-group-1371623902" rel="canonical"/><link href="https://static-exp1.licdn.com/scds/common/u/images/logos/favicons/v1/favicon.ico" rel="icon"/><title>Lloyds Banking Group hiring Front End Software Engineer/Developer - Consumer Onboarding in London, GB | LinkedIn</title><meta content="Posted 1 day ago. Our team are leading the way for our customers &amp;amp;amp;amp;amp; colleagues in transforming &amp;amp;amp;amp;amp;…See this and similar jobs on LinkedIn." name="description"/><meta content="noarchive" name="robots"

In [5]:
s = text_cleaner('https://www.linkedin.com/jobs/view/1333850119/')

In [6]:
r

[b'vast',
 b'world',
 b'great',
 b'everyone',
 b'tech',
 b'community',
 b'multiserver',
 b'community.pricingspatialos',
 b'today.engineering',
 b'roles.',
 b'rpgs',
 b'working',
 b'religion,',
 b'everything',
 b'engine.use',
 b'a',
 b'this',
 b'requirements',
 b'richly',
 b'allowing',
 b'exceptional',
 b'casesany',
 b'platform,',
 b'potential',
 b'titles',
 b'opportunities',
 b'bright',
 b'gdk',
 b'meaningful,',
 b'why',
 b'cryengine).excellent',
 b'new,',
 b'who',
 b'go',
 b'were',
 b'unityquickly',
 b'augment',
 b'quickly',
 b'see',
 b'complete',
 b'big',
 b'teams',
 b'yourself.',
 b'regular',
 b'mechanics,',
 b'equally',
 b'transformative.your',
 b'search',
 b'challenges.overviewhead',
 b'gameplay',
 b'costs.documentationeverything',
 b'our',
 b'makes',
 b'unrealthe',
 b'policyprivacy',
 b'age,',
 b'improbablewe',
 b'the',
 b'least',
 b'veteran',
 b'top',
 b'for',
 b'player',
 b'docs',
 b'become',
 b'robust',
 b'worlds.forumsvisit',
 b'design',
 b'power',
 b'will',
 b'workflows',
 b

In [7]:
r

[b'vast',
 b'world',
 b'great',
 b'everyone',
 b'tech',
 b'community',
 b'multiserver',
 b'community.pricingspatialos',
 b'today.engineering',
 b'roles.',
 b'rpgs',
 b'working',
 b'religion,',
 b'everything',
 b'engine.use',
 b'a',
 b'this',
 b'requirements',
 b'richly',
 b'allowing',
 b'exceptional',
 b'casesany',
 b'platform,',
 b'potential',
 b'titles',
 b'opportunities',
 b'bright',
 b'gdk',
 b'meaningful,',
 b'why',
 b'cryengine).excellent',
 b'new,',
 b'who',
 b'go',
 b'were',
 b'unityquickly',
 b'augment',
 b'quickly',
 b'see',
 b'complete',
 b'big',
 b'teams',
 b'yourself.',
 b'regular',
 b'mechanics,',
 b'equally',
 b'transformative.your',
 b'search',
 b'challenges.overviewhead',
 b'gameplay',
 b'costs.documentationeverything',
 b'our',
 b'makes',
 b'unrealthe',
 b'policyprivacy',
 b'age,',
 b'improbablewe',
 b'the',
 b'least',
 b'veteran',
 b'top',
 b'for',
 b'player',
 b'docs',
 b'become',
 b'robust',
 b'worlds.forumsvisit',
 b'design',
 b'power',
 b'will',
 b'workflows',
 b

# Importance and Wordcloud

In [8]:
# For data and matrix manipulation
import pandas as pd
import numpy as np

# For visualisation
import seaborn as sns
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import cufflinks as cf
import plotly.graph_objs as go
from plotly.plotly import iplot

# For string manipulation
import re 
import string

# For text pre-processing
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer

# For assigning sentiment polarity scores
from textblob import TextBlob

# For extracting features -- i.e. the document-term matrix
from sklearn.feature_extraction.text import CountVectorizer

# For splitting the data into a training and a testing set
from sklearn.model_selection import train_test_split

# For using Naive Bayes - the typical basic machine learning algorithm of choice for Text analytics
from sklearn.naive_bayes import MultinomialNB

# For evaluating our machine learning model 
from sklearn.metrics import accuracy_score, classification_report


# Necessary dependencies from NLTK
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

ImportError: 
The plotly.plotly module is deprecated,
please install the chart-studio package and use the
chart_studio.plotly module instead. 


In [None]:
# Find the sum of occurences of each term
sum_of_words = r.sum(axis= 0)

# Create a list of tuples where each element represents the term in question and how many times it occurs in our 
# corpus.
words_freq = [(word, sum_of_words[0, idx]) for word, idx in vector.vocabulary_.items()]

# Sort in decreasing order of frequency.
words_freq = sorted(words_freq, key = lambda x: x[1], reverse=True)

words_freq

In [9]:
def skills_info(city = None, state = None):
    '''
    This function will take a desired city/state and look for all new job postings
    on Indeed.com. It will crawl all of the job postings and keep track of how many
    use a preset list of typical data science skills. The final percentage for each skill
    is then displayed at the end of the collation. 
    
    Inputs: The location's city and state. These are optional. If no city/state is input, 
    the function will assume a national search (this can take a while!!!).
    Input the city/state as strings, such as skills_info('Chicago', 'IL').
    Use a two letter abbreviation for the state.
    
    Output: A bar chart showing the most commonly desired skills in the job market for 
    a data scientist. 
    '''
    
    final_job = 'data + scientist' # searching for data scientist exact fit("data scientist" on Indeed search)
    
    # Make sure the city specified works properly if it has more than one word (such as San Francisco)
    if city is not None:
        final_city = city.split() 
        final_city = '+'.join(word for word in final_city)
        final_site_list = ['http://www.indeed.com/jobs?q=%22', final_job, '%22&l=', final_city,
                   '%2C+', state] # Join all of our strings together so that indeed will search correctly
    else:
        final_site_list = ['http://www.indeed.com/jobs?q="', final_job, '"']

    final_site = ''.join(final_site_list) # Merge the html address together into one string


    base_url = 'http://www.indeed.com'


    try:
        html = requests.get(website).content # Open up the front page of our search first
    except:
        'That city/state combination did not have any jobs. Exiting . . .' # In case the city is invalid
        return
    soup = BeautifulSoup(html) # Get the html from the first page

    # Now find out how many jobs there were

    num_jobs_area = soup.find(id = 'searchCount').string.encode('utf-8') # Now extract the total number of jobs found
                                                                         # The 'searchCount' object has this

    job_numbers = re.findall('\d+', num_jobs_area) # Extract the total jobs found from the search result


    if len(job_numbers) > 3: # Have a total number of jobs greater than 1000
        total_num_jobs = (int(job_numbers[2])*1000) + int(job_numbers[3])
    else:
        total_num_jobs = int(job_numbers[2]) 

    city_title = city
    if city is None:
        city_title = 'Nationwide'
    
    print('There were', total_num_jobs, 'jobs found,', city_title )# Display how many jobs were found

    num_pages = total_num_jobs/10 # This will be how we know the number of times we need to iterate over each new
                                  # search result page
    job_descriptions = [] # Store all our descriptions in this list

    for i in xrange(1,num_pages+1): # Loop through all of our search result pages
        print('Getting page'), i
        start_num = str(i*10) # Assign the multiplier of 10 to view the pages we want
        current_page = ''.join([final_site, '&start=', start_num])
        # Now that we can view the correct 10 job returns, start collecting the text samples from each
        
        html_page = requests.get(current_page).content # Get the page
        
        page_obj = BeautifulSoup(html_page) # Locate all of the job links
        job_link_area = page_obj.find(id = 'resultsCol') # The center column on the page where the job postings exist
        
        job_URLS = [base_url + link.get('href') for link in job_link_area.find_all('a')] # Get the URLS for the jobs
        
        job_URLS = filter(lambda x:'clk' in x, job_URLS) # Now get just the job related URLS
        
    
        for j in xrange(0,len(job_URLS)):
            final_description = text_cleaner(job_URLS[j])
            if final_description: # So that we only append when the website was accessed correctly
                job_descriptions.append(final_description)
            sleep(1) # So that we don't be jerks. If you have a very fast internet connection you could hit the server a lot! 
    
    print('Done with collecting the job postings!')    
    print('There were', len(job_descriptions), 'jobs successfully found.')


    doc_frequency = Counter() # This will create a full counter of our terms. 
    [doc_frequency.update(item) for item in job_descriptions] # List comp

    # Now we can just look at our final dict list inside doc_frequency

    # Obtain our key terms and store them in a dict. These are the key data science skills we are looking for

    prog_lang_dict = Counter({'R':doc_frequency['r'], 'Python':doc_frequency['python'],
                    'Java':doc_frequency['java'], 'C++':doc_frequency['c++'],
                   'Ruby':doc_frequency['ruby'],
                  'Perl':doc_frequency['perl'], 'Matlab':doc_frequency['matlab'],
                  'JavaScript':doc_frequency['javascript'], 'Scala': doc_frequency['scala']})
                  
    analysis_tool_dict = Counter({'Excel':doc_frequency['excel'],  'Tableau':doc_frequency['tableau'],
                      'D3.js':doc_frequency['d3.js'], 'SAS':doc_frequency['sas'],
                      'SPSS':doc_frequency['spss'], 'D3':doc_frequency['d3']})  

    hadoop_dict = Counter({'Hadoop':doc_frequency['hadoop'], 'MapReduce':doc_frequency['mapreduce'],
               'Spark':doc_frequency['spark'], 'Pig':doc_frequency['pig'],
               'Hive':doc_frequency['hive'], 'Shark':doc_frequency['shark'],
               'Oozie':doc_frequency['oozie'], 'ZooKeeper':doc_frequency['zookeeper'],
               'Flume':doc_frequency['flume'], 'Mahout':doc_frequency['mahout']})
               
    database_dict = Counter({'SQL':doc_frequency['sql'], 'NoSQL':doc_frequency['nosql'],
                 'HBase':doc_frequency['hbase'], 'Cassandra':doc_frequency['cassandra'],
                 'MongoDB':doc_frequency['mongodb']})
                 
           
    overall_total_skills = prog_lang_dict + analysis_tool_dict + hadoop_dict + database_dict # Combine our Counter objects

    

    final_frame = pd.DataFrame(overall_total_skills.items(), columns = ['Term', 'NumPostings']) # Convert these terms to a 
                                                                                                # dataframe 

    # Change the values to reflect a percentage of the postings 

    final_frame.NumPostings = (final_frame.NumPostings)*100/len(job_descriptions) # Gives percentage of job postings 
                                                                                  #  having that term 

    # Sort the data for plotting purposes

    final_frame.sort(columns = 'NumPostings', ascending = False, inplace = True)

    # Get it ready for a bar plot
    
    final_plot = final_frame.plot(x = 'Term', kind = 'bar', legend = None, 
                              title = 'Percentage of Data Scientist Job Ads with a Key Skill, ' + city_title)
    
    final_plot.set_ylabel('Percentage Appearing in Job Ads')
    fig = final_plot.get_figure() # Have to convert the pandas plot object to a matplotlib object
    
    
    return fig, final_frame # End of the function

In [10]:
seattle_info = skills_info(city = 'New York', state = 'NY') 

AttributeError: 'NoneType' object has no attribute 'string'

In [None]:
seattle_info.info()

In [14]:
def skills_info(city = None, state = None):
    '''
    This function will take a desired city/state and look for all new job postings
    on Indeed.com. It will crawl all of the job postings and keep track of how many
    use a preset list of typical data science skills. The final percentage for each skill
    is then displayed at the end of the collation. 
    
    Inputs: The location's city and state. These are optional. If no city/state is input, 
    the function will assume a national search (this can take a while!!!).
    Input the city/state as strings, such as skills_info('Chicago', 'IL').
    Use a two letter abbreviation for the state.
    
    Output: A bar chart showing the most commonly desired skills in the job market for 
    a data scientist. 
    '''
    
    final_job = 'data + scientist' # searching for data scientist exact fit("data scientist" on Indeed search)
    
    # Make sure the city specified works properly if it has more than one word (such as San Francisco)
    if city is not None:
        final_city = city.split() 
        final_city = '+'.join(word for word in final_city)
        final_site_list = ['http://www.indeed.com/jobs?q=%22', final_job, '%22&l=', final_city,
                   '%2C+', state] # Join all of our strings together so that indeed will search correctly
    else:
        final_site_list = ['http://www.indeed.com/jobs?q="', final_job, '"']

    final_site = ''.join(final_site_list) # Merge the html address together into one string


    base_url = 'http://www.indeed.com'

    website = "https://www.indeed.co.uk/jobs?q=java&l=London"
    try:
        html = requests.get(website).content # Open up the front page of our search first
    except:
        'That city/state combination did not have any jobs. Exiting . . .' # In case the city is invalid
        return
    soup = BeautifulSoup(html) # Get the html from the first page

    # Now find out how many jobs there were

    num_jobs_area = soup.find(id = 'searchCount').string.encode('utf-8') 

In [15]:
skills_info(city = 'New York', state = 'New York')

In [None]:
t = text_cleaner('https://www.linkedin.com/jobs/search/?f_C=1214&locationId=OTHERS.worldwide')

In [13]:
t

NameError: name 't' is not defined

In [None]:
a = text_cleaner('https://www.linkedin.com/jobs/search/?currentJobId=1332555063&f_C=1214&locationId=OTHERS.worldwide')

In [None]:
a

In [None]:
m = text_cleaner('https://jobs.ubs.com')

In [None]:
m = text_cleaner('https://www.google.com/search?q=careers+at+ubs&rlz=1C1CHBF_en-GBGB810GB810&oq=careers+at+ubs+&aqs=chrome..69i57j0l5.4397j0j7&sourceid=chrome&ie=UTF-8&ibp=htl;jobs&sa=X&ved=2ahUKEwiX9IO-qr7jAhWEQUEAHZ9sD8EQiYsCKAF6BAgGEBA#fpstate=tldetail&htidocid=yG2N6pHkVHWalayKAAAAAA%3D%3D&htivrt=jobs')

In [None]:
m

In [None]:
u = text_cleaner('https://www.indeed.com/cmp/UBS/jobs')

In [None]:
u

In [None]:
l = text_cleaner('https://www.bbc.co.uk/news/uk-politics-49008826')

In [None]:
l

In [None]:
o = text_cleaner('https://www.indeed.co.uk/jobs?q=UBS&l&advn=4724745093346406&vjk=ea7946042bd5eac6')

In [None]:
o

In [None]:
i = text_cleaner('https://uk.linkedin.com/in/chris-finn-a40749b')

In [None]:
i