In [1]:
#!/usr/bin/env python
# coding: utf-8

#  
# - Python web scraping libraries you need for the course and how to install them.
# - How to extract URLs from one webpage.
# - How to extract other text data pieces from one webpage.
# - How to crawl multiple webpages and extract data from each of them.
# - How to handle navigation links and move to *next* pages.
# - How to save your scraped data into a CSV file.
# - And finally, a quick overview about *other* popular web scraping frameworks.

from pathlib import Path
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup


In [2]:

# Listing page the crawl starts from (the `npo` search on Craigslist Boston).
url = "https://boston.craigslist.org/search/npo"


# Dictionary refresher: start with a single entry ...
d = dict(key='value')
print(d)


# ... then add a second entry under a new key.
d.update({'new key': 'new value'})
print(d)


{'key': 'value'}
{'key': 'value', 'new key': 'new value'}


**Using the developer tools in the Internet Explorer web browser to find the classes used in the web page**
![title](Figures_Used/DeveloperTools.png)

![title](Figures_Used/elementInspect_result_tile.png)

![title](Figures_Used/elementInspect_result_tile2.png)

![title](Figures_Used/emelmentInstpect_result_date.png)

![title](Figures_Used/elementInspect_Location.png)

![title](Figures_Used/elementInspect_Location2.png)

![title](Figures_Used/elementInspectPosition1.png)

![title](Figures_Used/elementInspectPosition2.png)

In [3]:
# Crawl every page of the listing and, for each posting, open its detail page.
# Result: npo_jobs maps a running job number (1, 2, ...) to
# [title, location, date, link, job_attributes, job_description].
REQUEST_TIMEOUT = 30  # seconds; without a timeout a stalled connection hangs the cell

npo_jobs = {}
job_no = 0
# Walk a separate variable so the starting `url` is never overwritten and
# re-running this cell always begins at the first page.
page_url = url
while True:

    response = requests.get(page_url, timeout=REQUEST_TIMEOUT)
    # Fail loudly on a blocked/failed listing request instead of parsing an
    # error page and ending the crawl with silently incomplete data.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    jobs = soup.find_all('p', {'class': 'result-info'})

    for job in jobs:

        # The title anchor carries both the title text and the detail-page link.
        title_tag = job.find('a', {'class': 'result-title'})
        link = title_tag.get('href') if title_tag else None
        if not link:
            # Without a link there is no detail page to fetch; skip this entry.
            continue
        title = title_tag.text

        location_tag = job.find('span', {'class': 'result-hood'})
        # The slice drops the first two characters and the last one that wrap
        # the location text in the listing markup.
        location = location_tag.text[2:-1] if location_tag else "N/A"

        date_tag = job.find('time', {'class': 'result-date'})
        date = date_tag.text if date_tag else "N/A"

        job_response = requests.get(link, timeout=REQUEST_TIMEOUT)
        job_soup = BeautifulSoup(job_response.text, 'html.parser')

        # A detail page may lack these sections (e.g. a posting that was
        # removed); record "N/A" rather than crashing the whole crawl.
        description_tag = job_soup.find('section', {'id': 'postingbody'})
        job_description = description_tag.text if description_tag else "N/A"
        job_attributes_tag = job_soup.find('p', {'class': 'attrgroup'})
        job_attributes = job_attributes_tag.text if job_attributes_tag else "N/A"

        job_no += 1
        npo_jobs[job_no] = [title, location, date, link, job_attributes, job_description]

    # Follow the "next page" link; stop when the tag is missing or its href is empty.
    next_tag = soup.find('a', {'title': 'next page'})
    next_href = next_tag.get('href') if next_tag else None
    if not next_href:
        break
    # urljoin resolves both relative and absolute hrefs against the current page.
    page_url = urljoin(page_url, next_href)
    print(page_url)
        





![title](Figures_Used/elementInspect_Attr.png)

In [4]:
# Report how many postings were collected, then tabulate them:
# one row per job number, one column per scraped field.
print("Total Jobs:", job_no)
npo_jobs_df = pd.DataFrame.from_dict(
    data=npo_jobs,
    orient='index',
    columns=[
        'Job Title',
        'Location',
        'Date',
        'Link',
        'Job Attributes',
        'Job Description',
    ],
)

# Preview the first rows of the table.
npo_jobs_df.head()


Total Jobs: 116


Unnamed: 0,Job Title,Location,Date,Link,Job Attributes,Job Description
1,Quality and Compliance Manager,"Waltham, MA",May 13,https://boston.craigslist.org/gbs/npo/d/waltha...,\ncompensation: Competitive salary\n\nemployme...,\n\nQR Code Link to This Post\n\n\nExciting op...
2,Administrative Assistant,"Waltham, MA",May 13,https://boston.craigslist.org/gbs/npo/d/waltha...,\ncompensation: Competitive pay\n\nemployment ...,\n\nQR Code Link to This Post\n\n\nSpringwell ...
3,Human Resources Recruitment Coordinator,"Waltham, MA",May 13,https://boston.craigslist.org/gbs/npo/d/waltha...,\ncompensation: Competitive\n\nemployment type...,\n\nQR Code Link to This Post\n\n\nSpringwell ...
4,Nurse Manager,"Waltham, MA",May 13,https://boston.craigslist.org/gbs/npo/d/waltha...,\ncompensation: Competitive salary\n\nemployme...,\n\nQR Code Link to This Post\n\n\nExciting op...
5,"Registered Nurse, 70k/year","Waltham, MA",May 13,https://boston.craigslist.org/gbs/npo/d/waltha...,\ncompensation: $70k annually Salary\n\nemploy...,\n\nQR Code Link to This Post\n\n\nAre you a R...


In [6]:
# Save the scraped table as CSV. to_csv does not create missing folders, so
# make sure the target directory exists first (a no-op when it already does).
output_path = Path('data/npo_jobs.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
npo_jobs_df.to_csv(output_path)