In [1]:
#!/usr/bin/env python
# coding: utf-8

#  
# - Python web scraping libraries you need for the course and how to install them.
# - How to extract URLs from one webpage.
# - How to extract other text data pieces from one webpage.
# - How to crawl multiple webpages and extract data from each of them.
# - How to handle navigation links and move to *next* pages.
# - How to save your scraped data into a CSV file.
# - And finally, a quick overview about *other* popular web scraping frameworks.

from bs4 import BeautifulSoup
import requests 
import pandas as pd


In [2]:

url = "https://boston.craigslist.org/search/npo"


# Create a dictionary
d = {'key':'value'}
print(d)


# Update the dictionary
d['new key'] = 'new value'
print(d)


{'key': 'value'}
{'key': 'value', 'new key': 'new value'}


**Using developer tools in the ie web browser to find classes used in the web page**
![title](Figures_Used/DeveloperTools.png)

![title](Figures_Used/elementInspect_result_tile.png)

![title](Figures_Used/elementInspect_result_tile2.png)

![title](Figures_Used/emelmentInstpect_result_date.png)

![title](Figures_Used/elementInspect_Location.png)

![title](Figures_Used/elementInspect_Location2.png)

![title](Figures_Used/elementInspectPosition1.png)

![title](Figures_Used/elementInspectPosition2.png)

In [15]:
npo_jobs = {}
job_no = 0
while True:
    # connect to url
    response = requests.get(url)
    print(response)

    # get url text
    data = response.text
    print(data)
    #
    soup = BeautifulSoup(data,'html.parser')
    # get all the result-info
    jobs = soup.find_all('p',{'class':'result-info'})
    
    for job in jobs:
        # connect to result tittle, get the text
        title = job.find('a',{'class':'result-title'}).text
        print(title)
        # location, tag is span, result-hood
        location_tag = job.find('span',{'class':'result-hood'})
        location = location_tag.text[2:-1] if location_tag else "N/A"
        # connect to date, get the text
        date = job.find('time', {'class': 'result-date'}).text
        # find link, get the link
        link = job.find('a', {'class': 'result-title'}).get('href')
        
        # enter job link
        job_response = requests.get(link)
        # get the text in job link
        job_data = job_response.text
        # create object soup
        job_soup = BeautifulSoup(job_data, 'html.parser')
        # find the section, get the id : postingbody text
        job_description = job_soup.find('section',{'id':'postingbody'}).text
        # find p, get class:attrgroup text
        job_attributes_tag = job_soup.find('p',{'class':'attrgroup'})
        job_attributes = job_attributes_tag.text if job_attributes_tag else "N/A"
        
        job_no+=1
        npo_jobs[job_no] = [title, location, date, link, job_attributes, job_description]
        
        
#       print('Job Title:', title, '\nLocation:', location, '\nDate:', date, '\nLink:', link,"\n", job_attributes, '\nJob Description:', job_description,'\n---')
        
    url_tag = soup.find('a',{'title':'next page'})
    if url_tag.get('href'):
        url= 'https://boston.craigslist.org' + url_tag.get('href')
        print(url)
    else:
        break
        





<Response [200]>
<!DOCTYPE html>
<html>
<head>
    
	<meta charset="UTF-8">
	<meta http-equiv="X-UA-Compatible" content="IE=Edge">
	<meta name="viewport" content="width=device-width,initial-scale=1">
	<meta property="og:site_name" content="craigslist">
	<meta name="twitter:card" content="preview">
	<meta property="og:title" content="boston nonprofit jobs - craigslist">
	<meta name="description" content="boston nonprofit jobs - craigslist">
	<meta property="og:description" content="boston nonprofit jobs - craigslist">
	<meta property="og:url" content="https://boston.craigslist.org/search/npo">
	<meta name="smartbanner:api" content="true">
	<meta name="smartbanner:title" content="the craigslist app">
	<meta name="smartbanner:author" content="what&#39;s old is new">
	<meta name="smartbanner:icon-apple" content="/images/app_icon.png">
	<meta name="smartbanner:icon-google" content="/images/app_icon.png">
	<meta name="smartbanner:button" content="view">
	<meta name="smartbanner:close-label" 

![title](Figures_Used/elementInspect_Attr.png)

In [16]:
print("Total Jobs:", job_no)
npo_jobs_df = pd.DataFrame.from_dict(npo_jobs, orient = 'index', columns = ['Job Title','Location','Date', 'Link', 'Job Attributes', 'Job Description'])


npo_jobs_df.head()


Total Jobs: 0


Unnamed: 0,Job Title,Location,Date,Link,Job Attributes,Job Description


In [6]:
npo_jobs_df.to_csv('data/npo_jobs.csv')