In [1]:
#https://jessesw.com/Data-Science-Skills/

In [1]:
from bs4 import BeautifulSoup # For HTML parsing
import urllib2 # Website connections
import re # Regular expressions
from time import sleep # To prevent overwhelming the server between connections
from collections import Counter # Keep track of our term counts
from nltk.corpus import stopwords # Filter out stopwords, such as 'the', 'or', 'and'
import pandas as pd # For converting results to a dataframe and bar chart plots
%matplotlib inline

In [2]:
def text_cleaner(website):
    '''
    This function just cleans up the raw html so that I can look at it.
    Inputs: a URL to investigate
    Outputs: Cleaned text only
    '''
    try:
        site = urllib2.urlopen(website).read() # Connect to the job posting
    except: 
        return   # Need this in case the website isn't there anymore or some other weird connection problem 
    
    soup_obj = BeautifulSoup(site) # Get the html from the site
    
    for script in soup_obj(["script", "style"]):
        script.extract() # Remove these two elements from the BS4 object
    
    

    text = soup_obj.get_text() # Get the text from this
    
        
    
    lines = (line.strip() for line in text.splitlines()) # break into lines
    
        
        
    chunks = (phrase.strip() for line in lines for phrase in line.split("  ")) # break multi-headlines into a line each
    
    def chunk_space(chunk):
        chunk_out = chunk + ' ' # Need to fix spacing issue
        return chunk_out  
        
    
    text = ''.join(chunk_space(chunk) for chunk in chunks if chunk).encode('utf-8') # Get rid of all blank lines and ends of line
        
        
    # Now clean out all of the unicode junk (this line works great!!!)
        
    try:
        text = text.decode('unicode_escape').encode('ascii', 'ignore') # Need this as some websites aren't formatted
    except:                                                            # in a way that this works, can occasionally throw
        return                                                         # an exception
       
        
    text = re.sub("[^a-zA-Z.+3]"," ", text)  # Now get rid of any terms that aren't words (include 3 for d3.js)
                                                # Also include + for C++
        
       
    text = text.lower().split()  # Go to lower case and split them apart
        
        
    stop_words = set(stopwords.words("english")) # Filter out any stop words
    text = [w for w in text if not w in stop_words]
        
        
        
    text = list(set(text)) # Last, just get the set of these. Ignore counts (we are just looking at whether a term existed
                            # or not on the website)
        
    return text

In [3]:
def text_cleaner(website):
    '''
    This function just cleans up the raw html so that I can look at it.
    Inputs: a URL to investigate
    Outputs: Cleaned text only
    '''
    try:
        site = urllib2.urlopen(website).read() # Connect to the job posting
    except: 
        return   # Need this in case the website isn't there anymore or some other weird connection problem 
    
    soup_obj = BeautifulSoup(site) # Get the html from the site
    
    for script in soup_obj(["script", "style"]):
        script.extract() # Remove these two elements from the BS4 object
    
    

    text = soup_obj.get_text() # Get the text from this
    
        
    
    lines = (line.strip() for line in text.splitlines()) # break into lines
    
        
        
    chunks = (phrase.strip() for line in lines for phrase in line.split("  ")) # break multi-headlines into a line each
    
    def chunk_space(chunk):
        chunk_out = chunk + ' ' # Need to fix spacing issue
        return chunk_out  
        
    
    text = ''.join(chunk_space(chunk) for chunk in chunks if chunk).encode('utf-8') # Get rid of all blank lines and ends of line
        
        
    # Now clean out all of the unicode junk (this line works great!!!)
        
    try:
        text = text.decode('unicode_escape').encode('ascii', 'ignore') # Need this as some websites aren't formatted
    except:                                                            # in a way that this works, can occasionally throw
        return                                                         # an exception
       
        
    text = re.sub("[^a-zA-Z.+3]"," ", text)  # Now get rid of any terms that aren't words (include 3 for d3.js)
                                                # Also include + for C++
        
       
    text = text.lower().split()  # Go to lower case and split them apart
        
        
    stop_words = set(stopwords.words("english")) # Filter out any stop words
    text = [w for w in text if not w in stop_words]
        
        
        
    text = list(set(text)) # Last, just get the set of these. Ignore counts (we are just looking at whether a term existed
                            # or not on the website)
        
    return text

In [4]:
sample = text_cleaner('https://www.indeed.com/job/ux-researcher-0e613ff3bce04c82')
sample[:20] # Just show the first 20 words



 BeautifulSoup([your markup])

to this:

 BeautifulSoup([your markup], "lxml")

  markup_type=markup_type))


['roi',
 'help',
 'skip',
 'searchclose',
 'competitive',
 'trade',
 'customer',
 'relationships.to',
 'interpersonal',
 'including',
 'human',
 'research.manage',
 'keywords',
 'contractsalary',
 'benefit.consolidate',
 'find',
 'wireframes',
 'impact',
 'content',
 'like']

In [5]:
def skills_info(city = None, state = None):
    '''
    This function will take a desired city/state and look for all new job postings
    on Indeed.com. It will crawl all of the job postings and keep track of how many
    use a preset list of typical data science skills. The final percentage for each skill
    is then displayed at the end of the collation. 
        
    Inputs: The location's city and state. These are optional. If no city/state is input, 
    the function will assume a national search (this can take a while!!!).
    Input the city/state as strings, such as skills_info('Chicago', 'IL').
    Use a two letter abbreviation for the state.
    
    https://www.indeed.com/jobs?q=data+scientist%22&l=san+francisco,+CA  
    
    Output: A bar chart showing the most commonly desired skills in the job market for 
    a data scientist. 
    '''      
    
    
    final_job = 'python' # searching for data scientist exact fit("data scientist" on Indeed search)
    
    # Make sure the city specified works properly if it has more than one word (such as San Francisco)
    if city is not None:
        final_city = city.split() 
        final_city = '+'.join(word for word in final_city)
        final_site_list = ['http://www.indeed.com/jobs?q=%22', final_job, '%22&l=', final_city,
                    '%2C+', state] # Join all of our strings together so that indeed will search correctly
    else:
        final_site_list = ['http://www.indeed.com/jobs?q="', final_job, '"']

    final_site = ''.join(final_site_list) # Merge the html address together into one string

    
    base_url = 'http://www.indeed.com'
    
    
    try:
        html = urllib2.urlopen(final_site).read() # Open up the front page of our search first
    except:
        'That city/state combination did not have any jobs. Exiting . . .' # In case the city is invalid
        return
    soup = BeautifulSoup(html) # Get the html from the first page
    
    # Now find out how many jobs there were
    
    num_jobs_area = soup.find(id = 'searchCount').string.encode('utf-8') # Now extract the total number of jobs found
                                                                        # The 'searchCount' object has this
    
    job_numbers = re.findall('\d+', num_jobs_area) # Extract the total jobs found from the search result
    
    print(job_numbers)
    if len(job_numbers) > 3: # Have a total number of jobs greater than 1000
        total_num_jobs = (int(job_numbers[2])*1000) + int(job_numbers[3])
    else:
        total_num_jobs = int(job_numbers[2]) 
    
    city_title = city
    if city is None:
        city_title = 'Nationwide'
        
    print 'There were', total_num_jobs, 'jobs found,', city_title # Display how many jobs were found
    
    num_pages = total_num_jobs/10 # This will be how we know the number of times we need to iterate over each new
                                      # search result page
    job_descriptions = [] # Store all our descriptions in this list
    
    for i in xrange(1,num_pages+1): # Loop through all of our search result pages
        print 'Getting page', i
        start_num = str(i*10) # Assign the multiplier of 10 to view the pages we want
        current_page = ''.join([final_site, '&start=', start_num])
        # Now that we can view the correct 10 job returns, start collecting the text samples from each
            
        html_page = urllib2.urlopen(current_page).read() # Get the page
        
        
        page_obj = BeautifulSoup(html_page) # Locate all of the job links
        job_link_area = page_obj.find(id = 'resultsCol') # The center column on the page where the job postings exist

        job_URLS = [base_url + link.get('href') for link in job_link_area.find_all('a')] # Get the URLS for the jobs
            
        job_URLS = filter(lambda x:'clk' in x, job_URLS) # Now get just the job related URLS
            
        
        for j in xrange(0,len(job_URLS)):
            final_description = text_cleaner(job_URLS[j])
            if final_description: # So that we only append when the website was accessed correctly
                job_descriptions.append(final_description)
            sleep(1) # So that we don't be jerks. If you have a very fast internet connection you could hit the server a lot! 
        
    print 'Done with collecting the job postings!'    
    print 'There were', len(job_descriptions), 'jobs successfully found.'
    
    
    doc_frequency = Counter() # This will create a full counter of our terms. 
    [doc_frequency.update(item) for item in job_descriptions] # List comp
    
    # Now we can just look at our final dict list inside doc_frequency
    
    # Obtain our key terms and store them in a dict. These are the key data science skills we are looking for
    
    prog_lang_dict = Counter({'R':doc_frequency['r'], 'Python':doc_frequency['python'],
                    'Java':doc_frequency['java'], 'C++':doc_frequency['c++'],
                    'Ruby':doc_frequency['ruby'],
                    'Perl':doc_frequency['perl'], 'Matlab':doc_frequency['matlab'],
                    'JavaScript':doc_frequency['javascript'], 'Scala': doc_frequency['scala']})
                      
    analysis_tool_dict = Counter({'Excel':doc_frequency['excel'],  'Tableau':doc_frequency['tableau'],
                        'D3.js':doc_frequency['d3.js'], 'SAS':doc_frequency['sas'],
                        'SPSS':doc_frequency['spss'], 'D3':doc_frequency['d3']})  

    hadoop_dict = Counter({'Hadoop':doc_frequency['hadoop'], 'MapReduce':doc_frequency['mapreduce'],
                'Spark':doc_frequency['spark'], 'Pig':doc_frequency['pig'],
                'Hive':doc_frequency['hive'], 'Shark':doc_frequency['shark'],
                'Oozie':doc_frequency['oozie'], 'ZooKeeper':doc_frequency['zookeeper'],
                'Flume':doc_frequency['flume'], 'Mahout':doc_frequency['mahout']})
                
    database_dict = Counter({'SQL':doc_frequency['sql'], 'NoSQL':doc_frequency['nosql'],
                    'HBase':doc_frequency['hbase'], 'Cassandra':doc_frequency['cassandra'],
                    'MongoDB':doc_frequency['mongodb']})
                     
               
    overall_total_skills = prog_lang_dict + analysis_tool_dict + hadoop_dict + database_dict # Combine our Counter objects
    
        
    
    final_frame = pd.DataFrame(overall_total_skills.items(), columns = ['Term', 'NumPostings']) # Convert these terms to a 
                                                                                                # dataframe 
    
    # Change the values to reflect a percentage of the postings 
    
    final_frame.NumPostings = (final_frame.NumPostings)*100/len(job_descriptions) # Gives percentage of job postings 
                                                                                    #  having that term 
    
    # Sort the data for plotting purposes
    
    final_frame.sort(columns = 'NumPostings', ascending = False, inplace = True)
    
    # Get it ready for a bar plot
        
    final_plot = final_frame.plot(x = 'Term', kind = 'bar', legend = None, 
                            title = 'Percentage of Data Scientist Job Ads with a Key Skill, ' + city_title)
        
    final_plot.set_ylabel('Percentage Appearing in Job Ads')
    fig = final_plot.get_figure() # Have to convert the pandas plot object to a matplotlib object
        
        
    return fig, final_frame # End of the function

In [6]:
silicon_val_info = skills_info(city = 'San Francisco', state = 'CA')

['1', '3', '978']
There were 978 jobs found, San Francisco
Getting page 1


TypeError: cannot concatenate 'str' and 'NoneType' objects

In [38]:
city, state = "San Francisco", 'CA'
'''
This function will take a desired city/state and look for all new job postings
on Indeed.com. It will crawl all of the job postings and keep track of how many
use a preset list of typical data science skills. The final percentage for each skill
is then displayed at the end of the collation. 

Inputs: The location's city and state. These are optional. If no city/state is input, 
the function will assume a national search (this can take a while!!!).
Input the city/state as strings, such as skills_info('Chicago', 'IL').
Use a two letter abbreviation for the state.

https://www.indeed.com/jobs?q=data+scientist%22&l=san+francisco,+CA  

Output: A bar chart showing the most commonly desired skills in the job market for 
a data scientist. 
'''      

#data+scientist+
final_job = 'python' # searching for data scientist exact fit("data scientist" on Indeed search)

# Make sure the city specified works properly if it has more than one word (such as San Francisco)
if city is not None:
    final_city = city.split() 
    final_city = '+'.join(word for word in final_city)
    final_site_list = ['http://www.indeed.com/jobs?q=%22', final_job, '%22&l=', final_city,
                '%2C+', state] # Join all of our strings together so that indeed will search correctly
else:
    final_site_list = ['http://www.indeed.com/jobs?q="', final_job, '"']

final_site = ''.join(final_site_list) # Merge the html address together into one string


base_url = 'http://www.indeed.com'


try:
    html = urllib2.urlopen(final_site).read() # Open up the front page of our search first
except:
    'That city/state combination did not have any jobs. Exiting . . .' # In case the city is invalid
#     return
soup = BeautifulSoup(html) # Get the html from the first page

# Now find out how many jobs there were

num_jobs_area = soup.find(id = 'searchCount').string.encode('utf-8') # Now extract the total number of jobs found
                                                                    # The 'searchCount' object has this

job_numbers = re.findall('\d+', num_jobs_area) # Extract the total jobs found from the search result

print('hello')
print(job_numbers[1]+job_numbers[2])
job_numbers[1] = job_numbers[1]+job_numbers[2]

if len(job_numbers) > 3: # Have a total number of jobs greater than 1000
    total_num_jobs = (int(job_numbers[2])*1000) + int(job_numbers[3])
else:
    total_num_jobs = int(job_numbers[1]) 

city_title = city
if city is None:
    city_title = 'Nationwide'

print 'There were', total_num_jobs, 'jobs found,', city_title # Display how many jobs were found

num_pages = total_num_jobs/10 # This will be how we know the number of times we need to iterate over each new
                                  # search result page
job_descriptions = [] # Store all our descriptions in this list

print(num_pages, "number of pages")

for i in xrange(1,num_pages+1): # Loop through all of our search result pages
    print 'Getting page', i
    start_num = str(i*10) # Assign the multiplier of 10 to view the pages we want
    current_page = ''.join([final_site, '&start=', start_num])
    # Now that we can view the correct 10 job returns, start collecting the text samples from each
    sleep(5)
    print (current_page, "current page")
    print ("Oyeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee")
    
    html_page = urllib2.urlopen(current_page).read() # Get the page

    page_obj = BeautifulSoup(html_page) # Locate all of the job links
    job_link_area = page_obj.find(id = 'resultsCol') # The center column on the page where the job postings exist
    print ("job_link_area")
    #print (job_link_area)
    
    print ((link.get('href') for link in job_link_area.find_all('a')))
    print (base_url, "base_url")
    # This thing no longer exists
    
    #job_URLS = [base_url + link.get('href') for link in job_link_area.find_all('a')] # Get the URLS for the jobs

    #job_URLS = filter(lambda x:'clk' in x, job_URLS) # Now get just the job related URLS
   # print job_URLS

#     for j in xrange(0,len(job_URLS)):
#         final_description = text_cleaner(job_URLS[j])
#         if final_description: # So that we only append when the website was accessed correctly
#             job_descriptions.append(final_description)
#         sleep(1) # So that we don't be jerks. If you have a very fast internet connection you could hit the server a lot! 

print 'Done with collecting the job postings!'    
print 'There were', len(job_descriptions), 'jobs successfully found.'


doc_frequency = Counter() # This will create a full counter of our terms. 
[doc_frequency.update(item) for item in job_descriptions] # List comp

# Now we can just look at our final dict list inside doc_frequency

# Obtain our key terms and store them in a dict. These are the key data science skills we are looking for

prog_lang_dict = Counter({'R':doc_frequency['r'], 'Python':doc_frequency['python'],
                'Java':doc_frequency['java'], 'C++':doc_frequency['c++'],
                'Ruby':doc_frequency['ruby'],
                'Perl':doc_frequency['perl'], 'Matlab':doc_frequency['matlab'],
                'JavaScript':doc_frequency['javascript'], 'Scala': doc_frequency['scala']})

analysis_tool_dict = Counter({'Excel':doc_frequency['excel'],  'Tableau':doc_frequency['tableau'],
                    'D3.js':doc_frequency['d3.js'], 'SAS':doc_frequency['sas'],
                    'SPSS':doc_frequency['spss'], 'D3':doc_frequency['d3']})  

hadoop_dict = Counter({'Hadoop':doc_frequency['hadoop'], 'MapReduce':doc_frequency['mapreduce'],
            'Spark':doc_frequency['spark'], 'Pig':doc_frequency['pig'],
            'Hive':doc_frequency['hive'], 'Shark':doc_frequency['shark'],
            'Oozie':doc_frequency['oozie'], 'ZooKeeper':doc_frequency['zookeeper'],
            'Flume':doc_frequency['flume'], 'Mahout':doc_frequency['mahout']})

database_dict = Counter({'SQL':doc_frequency['sql'], 'NoSQL':doc_frequency['nosql'],
                'HBase':doc_frequency['hbase'], 'Cassandra':doc_frequency['cassandra'],
                'MongoDB':doc_frequency['mongodb']})


overall_total_skills = prog_lang_dict + analysis_tool_dict + hadoop_dict + database_dict # Combine our Counter objects



final_frame = pd.DataFrame(overall_total_skills.items(), columns = ['Term', 'NumPostings']) # Convert these terms to a 
                                                                                            # dataframe 

# Change the values to reflect a percentage of the postings 

final_frame.NumPostings = (final_frame.NumPostings)*100/len(job_descriptions) # Gives percentage of job postings 
                                                                                #  having that term 

# Sort the data for plotting purposes

#final_frame.sort(columns = 'NumPostings', ascending = False, inplace = True)

# Get it ready for a bar plot

#final_plot = final_frame.plot(x = 'Term', kind = 'bar', legend = None, 
 #                       title = 'Percentage of Data Scientist Job Ads with a Key Skill, ' + city_title)

#final_plot.set_ylabel('Percentage Appearing in Job Ads')
#fig = final_plot.get_figure() # Have to convert the pandas plot object to a matplotlib object


#return fig, final_frame # End of the function

hello
3981
There were 3981 jobs found, San Francisco
(398, 'number of pages')
Getting page 1
('http://www.indeed.com/jobs?q=%22python%22&l=San+Francisco%2C+CA&start=10', 'current page')
Oyeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee
job_link_area
<generator object <genexpr> at 0x000000000E5A98B8>
('http://www.indeed.com', 'base_url')
Getting page 2
('http://www.indeed.com/jobs?q=%22python%22&l=San+Francisco%2C+CA&start=20', 'current page')
Oyeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee
job_link_area
<generator object <genexpr> at 0x000000000CF468B8>
('http://www.indeed.com', 'base_url')
Getting page 3
('http://www.indeed.com/jobs?q=%22python%22&l=San+Francisco%2C+CA&start=30', 'current page')
Oyeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee
job_link_area
<generator object <genexpr> at 0x000000000DFE3F30>
('http://www.indeed.com', 'base_url')
Getting page 4
('http://www.indeed.com/jobs?q=%22python%22&l=San+Francisco%2C+CA&start=40', 'current page')
Oyeeeeeeeeeeeeeeeeeeeeeeeeee

KeyboardInterrupt: 

In [24]:
site = urllib2.urlopen('https://www.indeed.com/jobs?q=%22python%22&l=San%20Francisco,%20CA&start=30').read() # Connect to the job posting
soup = BeautifulSoup(site) # Get the html from the site
for div in soup.find_all('div',attrs={"class" : "jobtitle turnstileLink"}):
    print div#text[div.text]

In [45]:
import urllib2
from bs4 import BeautifulSoup

URL = "http://www.indeed.com/jobs?q=data+scientist+%2420%2C000&l=New+York&start=10"

soup = BeautifulSoup(urllib2.urlopen(URL).read(), 'html.parser')

results = soup.find_all('div', attrs={'data-tn-component': 'organicJob'})

#print results

for x in results:
    company = x.find('span', attrs={"class":"summary"})
    print 'summary:', company.text.strip()

    company = x.find('span', attrs={"class":"location"})
    print 'location:', company.text.strip()
    
    company = x.find('span', attrs={"class":"date"})
    print 'posted:', company.text.strip()
    
    company = x.find('span', attrs={"class":"company"})
    print 'company:', company.text.strip()
    
    job = x.find('a', attrs={'data-tn-element': "jobTitle"})
    print 'job:', job.text.strip()

    salary = x.find('nobr')
    if salary:
        print 'salary:', salary.text.strip()

    print '----------'

company: HelloFresh is seeking a Data Scientist to help spearhead our personalization initiative. Promote a data smart and data aware culture within a large...
company: New York, NY
posted: 16 days ago
job: Data Scientist, Personalization
----------
company: Director of Data Science*. Identify patterns in large data sets. Ability to lead and manage a team of scientists....
company: New York, NY
posted: 30+ days ago
job: Director of Data Science
----------
company: Research Data Analyst. The Educational research data analyst reports to the Educational Director and focuses on compiling survey data, producing statistical...
company: New York State
posted: 30+ days ago
job: Research Data Analyst
----------
company: About the Data Scientist position. Pluvio is a data driven tech company looking for an experienced Lead Data Scientist to help us scale our data science team....
company: New York State
posted: 30+ days ago
job: Data Science Engineer
----------
company: 3+ years professional exp

In [56]:
import urllib2
from bs4 import BeautifulSoup

data =[]

city, state = "New York", 'NY'
    

#data+scientist+
final_job = 'python' # searching for data scientist exact fit("data scientist" on Indeed search)

# Make sure the city specified works properly if it has more than one word (such as San Francisco)
if city is not None:
    final_city = city.split() 
    final_city = '+'.join(word for word in final_city)
    final_site_list = ['http://www.indeed.com/jobs?q=%22', final_job, '%22&l=', final_city,
                '%2C+', state] # Join all of our strings together so that indeed will search correctly
else:
    final_site_list = ['http://www.indeed.com/jobs?q="', final_job, '"']

final_site = ''.join(final_site_list) # Merge the html address together into one string

base_url = 'http://www.indeed.com'

try:
    html = urllib2.urlopen(final_site).read() # Open up the front page of our search first
except:
    'That city/state combination did not have any jobs. Exiting . . .' # In case the city is invalid
#     return
soup = BeautifulSoup(html) # Get the html from the first page

# Now find out how many jobs there were

num_jobs_area = soup.find(id = 'searchCount').string.encode('utf-8') # Now extract the total number of jobs found
                                                                    # The 'searchCount' object has this

job_numbers = re.findall('\d+', num_jobs_area) # Extract the total jobs found from the search result

print('hello')
print(job_numbers[1]+job_numbers[2])
job_numbers[1] = job_numbers[1]+job_numbers[2]

if len(job_numbers) > 3: # Have a total number of jobs greater than 1000
    total_num_jobs = (int(job_numbers[2])*1000) + int(job_numbers[3])
else:
    total_num_jobs = int(job_numbers[1]) 

city_title = city
if city is None:
    city_title = 'Nationwide'

print 'There were', total_num_jobs, 'jobs found,', city_title # Display how many jobs were found

num_pages = total_num_jobs/10 # This will be how we know the number of times we need to iterate over each new
                                  # search result page
job_descriptions = [] # Store all our descriptions in this list

print(num_pages, "number of pages")


for i in xrange(1,num_pages+1): # Loop through all of our search result pages
    print 'Getting page', i
    start_num = str(i*10) # Assign the multiplier of 10 to view the pages we want
    current_page = ''.join([final_site, '&start=', start_num])
    # Now that we can view the correct 10 job returns, start collecting the text samples from each
    sleep(5)
    print (current_page, "current page")


    URL = current_page #"http://www.indeed.com/jobs?q=data+scientist+%2420%2C000&l=New+York&start=10"

    soup = BeautifulSoup(urllib2.urlopen(URL).read(), 'html.parser')

    results = soup.find_all('div', attrs={'data-tn-component': 'organicJob'})

    #print results

    for x in results:
        summary = x.find('span', attrs={"class":"summary"})
        #print 'company:', company.text.strip()

        location = x.find('span', attrs={"class":"location"})
        #print 'company:', location.text.strip()

        job = x.find('a', attrs={'data-tn-element': "jobTitle"})
        #print 'job:', job.text.strip()
        
        date = x.find('span', attrs={"class":"date"})
        #print 'posted:', company.text.strip()
    
        company = x.find('span', attrs={"class":"company"})
        #print 'company:', company.text.strip()

        data.append([summary.text.strip(),location.text.strip(),job.text.strip(), date.text.strip(), company.text.strip()])
        
        print (data)
        print '----------'



hello
4745
There were 4745 jobs found, New York
(474, 'number of pages')
Getting page 1
('http://www.indeed.com/jobs?q=%22python%22&l=New+York%2C+NY&start=10', 'current page')
[[u'Expert in Python. Contribute to the design and development of python data workflow management platform....', u'New York, NY 10152 (Midtown area)', u'Data Engineer (Python)', u'9 days ago', u'Crux Informatics']]
----------
[[u'Expert in Python. Contribute to the design and development of python data workflow management platform....', u'New York, NY 10152 (Midtown area)', u'Data Engineer (Python)', u'9 days ago', u'Crux Informatics'], [u'Full Stack Python Developer Intern. You are experienced in building applications using Python, HTML, CSS, JS and jQuery....', u'New York, NY', u'Full Stack Python Developer Intern', u'3 days ago', u'EnerKnol']]
----------
[[u'Expert in Python. Contribute to the design and development of python data workflow management platform....', u'New York, NY 10152 (Midtown area)', u'Data 

('http://www.indeed.com/jobs?q=%22python%22&l=New+York%2C+NY&start=20', 'current page')
[[u'Expert in Python. Contribute to the design and development of python data workflow management platform....', u'New York, NY 10152 (Midtown area)', u'Data Engineer (Python)', u'9 days ago', u'Crux Informatics'], [u'Full Stack Python Developer Intern. You are experienced in building applications using Python, HTML, CSS, JS and jQuery....', u'New York, NY', u'Full Stack Python Developer Intern', u'3 days ago', u'EnerKnol'], [u'Excellent computational data engineering know-how with complete fluency with Python and SQL. Extraordinary Internship Opportunity....', u'New York, NY', u'Data Science Internship', u'30+ days ago', u'Seaport Global Holdings LLC'], [u'Java, Python, C++ or Groovy is a prerequisite. We are seeking a creative and research-minded portfolio analyst to join one of our growing Quantitative...', u'New York, NY', u'Portfolio Research Analyst', u'30+ days ago', u'Two Sigma Investments, 

('http://www.indeed.com/jobs?q=%22python%22&l=New+York%2C+NY&start=30', 'current page')
[[u'Expert in Python. Contribute to the design and development of python data workflow management platform....', u'New York, NY 10152 (Midtown area)', u'Data Engineer (Python)', u'9 days ago', u'Crux Informatics'], [u'Full Stack Python Developer Intern. You are experienced in building applications using Python, HTML, CSS, JS and jQuery....', u'New York, NY', u'Full Stack Python Developer Intern', u'3 days ago', u'EnerKnol'], [u'Excellent computational data engineering know-how with complete fluency with Python and SQL. Extraordinary Internship Opportunity....', u'New York, NY', u'Data Science Internship', u'30+ days ago', u'Seaport Global Holdings LLC'], [u'Java, Python, C++ or Groovy is a prerequisite. We are seeking a creative and research-minded portfolio analyst to join one of our growing Quantitative...', u'New York, NY', u'Portfolio Research Analyst', u'30+ days ago', u'Two Sigma Investments, 

[[u'Expert in Python. Contribute to the design and development of python data workflow management platform....', u'New York, NY 10152 (Midtown area)', u'Data Engineer (Python)', u'9 days ago', u'Crux Informatics'], [u'Full Stack Python Developer Intern. You are experienced in building applications using Python, HTML, CSS, JS and jQuery....', u'New York, NY', u'Full Stack Python Developer Intern', u'3 days ago', u'EnerKnol'], [u'Excellent computational data engineering know-how with complete fluency with Python and SQL. Extraordinary Internship Opportunity....', u'New York, NY', u'Data Science Internship', u'30+ days ago', u'Seaport Global Holdings LLC'], [u'Java, Python, C++ or Groovy is a prerequisite. We are seeking a creative and research-minded portfolio analyst to join one of our growing Quantitative...', u'New York, NY', u'Portfolio Research Analyst', u'30+ days ago', u'Two Sigma Investments, LLC.'], [u'You write great Python. Packaging techniques as reusable modules written in p

('http://www.indeed.com/jobs?q=%22python%22&l=New+York%2C+NY&start=40', 'current page')
[[u'Expert in Python. Contribute to the design and development of python data workflow management platform....', u'New York, NY 10152 (Midtown area)', u'Data Engineer (Python)', u'9 days ago', u'Crux Informatics'], [u'Full Stack Python Developer Intern. You are experienced in building applications using Python, HTML, CSS, JS and jQuery....', u'New York, NY', u'Full Stack Python Developer Intern', u'3 days ago', u'EnerKnol'], [u'Excellent computational data engineering know-how with complete fluency with Python and SQL. Extraordinary Internship Opportunity....', u'New York, NY', u'Data Science Internship', u'30+ days ago', u'Seaport Global Holdings LLC'], [u'Java, Python, C++ or Groovy is a prerequisite. We are seeking a creative and research-minded portfolio analyst to join one of our growing Quantitative...', u'New York, NY', u'Portfolio Research Analyst', u'30+ days ago', u'Two Sigma Investments, 

[[u'Expert in Python. Contribute to the design and development of python data workflow management platform....', u'New York, NY 10152 (Midtown area)', u'Data Engineer (Python)', u'9 days ago', u'Crux Informatics'], [u'Full Stack Python Developer Intern. You are experienced in building applications using Python, HTML, CSS, JS and jQuery....', u'New York, NY', u'Full Stack Python Developer Intern', u'3 days ago', u'EnerKnol'], [u'Excellent computational data engineering know-how with complete fluency with Python and SQL. Extraordinary Internship Opportunity....', u'New York, NY', u'Data Science Internship', u'30+ days ago', u'Seaport Global Holdings LLC'], [u'Java, Python, C++ or Groovy is a prerequisite. We are seeking a creative and research-minded portfolio analyst to join one of our growing Quantitative...', u'New York, NY', u'Portfolio Research Analyst', u'30+ days ago', u'Two Sigma Investments, LLC.'], [u'You write great Python. Packaging techniques as reusable modules written in p

KeyboardInterrupt: 

In [57]:
import pandas as pd

data

labels = ['Summary', 'Location', 'Role', 'Date', 'Company']
df = pd.DataFrame.from_records(data, columns=labels)

df

Unnamed: 0,Summary,Location,Role,Date,Company
0,Expert in Python. Contribute to the design and...,"New York, NY 10152 (Midtown area)",Data Engineer (Python),9 days ago,Crux Informatics
1,Full Stack Python Developer Intern. You are ex...,"New York, NY",Full Stack Python Developer Intern,3 days ago,EnerKnol
2,Excellent computational data engineering know-...,"New York, NY",Data Science Internship,30+ days ago,Seaport Global Holdings LLC
3,"Java, Python, C++ or Groovy is a prerequisite....","New York, NY",Portfolio Research Analyst,30+ days ago,"Two Sigma Investments, LLC."
4,You write great Python. Packaging techniques a...,"New York, NY",Data Science Intern,11 days ago,Dataiku
5,We prefer experience with JavaScript or Python...,"New York, NY",Software Programmer,30+ days ago,Tarifica
6,Python needs to be upgraded from 2.5.2 to the ...,"New York, NY",Python Developer,20 days ago,New York State Psychiatry Institute
7,I.T. Support Assistants support routine operat...,"New York, NY",Junior System Support Engineer IT Support- Hourly,30+ days ago,Borough of Manhattan Community College
8,We are looking for a Marketing Analytics profe...,"New York, NY",Marketing Data Analyst,3 days ago,Pefin
9,Python or Shell Scripting language familiarity...,"New York, NY",Software Engineer - Backend,30+ days ago,SmartThings


In [62]:
df['Company'].unique

<bound method Series.unique of 0                           Crux Informatics
1                                   EnerKnol
2                Seaport Global Holdings LLC
3                Two Sigma Investments, LLC.
4                                    Dataiku
5                                   Tarifica
6        New York State Psychiatry Institute
7     Borough of Manhattan Community College
8                                      Pefin
9                                SmartThings
10                                    Vitals
11                              L&T Infotech
12                                 Synechron
13                                    Rukkus
14                 Intercontinental Exchange
15                               iHeartRadio
16              Tudor Investment Corporation
17                           Rent the Runway
18                             Gartner, Inc.
19                           CA CIB Americas
20                       Crédit Agricole CIB
21         Source of Fut