Module 4: Project Basics

In [1]:
# Import necessary libraries
import requests
from bs4 import BeautifulSoup
from time import sleep
import pandas as pd
import re

In [164]:
# This function reads specific sections of the advertisement and store them in a dictionary.
def getad (thisurl):
  """Fetch one ad page and return its main fields as a dictionary.

  Parameters:
    thisurl: URL of the individual advertisement page.

  Returns:
    dict with the keys 'URL', 'Post id' (int), 'Title', 'Date Posted',
    'Description' and 'Garage Sales Dates' (the text values are stripped strings).

  Raises:
    requests.HTTPError: the server answered with an error status.
    AttributeError: one of the expected page elements is missing.
    IndexError: the posting-info block contains no number.
  """
  # A timeout keeps one stalled connection from blocking the whole scraping loop.
  thispage = requests.get(thisurl, timeout=30)
  thispage.raise_for_status()   # Surface HTTP errors here instead of parsing an error page
  bs = BeautifulSoup(thispage.text,'html.parser')

  # Post id number: the first run of digits found in the posting-info block
  postinfo = bs.find(class_='postinginfos').text.strip()
  postid = int(re.findall(r'\d+', postinfo)[0])

  title = bs.find('h1').text.strip()   # Post title

  # Post description: keep only the text after the first '\n\n\n' separator
  # (presumably page boilerplate that precedes the ad text - TODO confirm).
  # If the separator is absent, keep the whole text instead of crashing on None.
  description = bs.find(id='postingbody').text.strip()
  separator = re.search('\n\n\n', description)
  if separator is not None:
    description = description[separator.end():]

  adpostdate = bs.find('time').text.strip()   # Date ad was posted

  # Garage sale dates: drop the leading "dates" label and join the entries with commas
  gsdates = bs.find(class_='attrgroup').text.strip()
  gsdates = gsdates.replace("dates\n\n\n","").replace("\n\n\n",",")

  return {'URL':thisurl, 'Post id':postid, 'Title':title, 
          'Date Posted':adpostdate, 'Description':description, 'Garage Sales Dates':gsdates}

In [165]:
# This boolean function checks if a term is found in a string (ex., in ad title or description).
def search_term (term,field):
    """Return True when `term` occurs anywhere in `field`, ignoring case."""
    needle = term.casefold()
    haystack = field.casefold()
    return needle in haystack

In [4]:
# Read empty csv file to initialize the dataframe.
# The file is expected to hold only the header row, so df starts with the
# column names and no records.
df = pd.read_csv('PhoenixCraigslist.csv')

In [5]:
df   # Check dataframe headers in csv file

Unnamed: 0,Post id,Title,URL,Date Posted,Description,Garage Sales Dates


In [7]:
# Request the "Garage & Moving Sales" page on Craigslist with the listings
listurl = 'https://phoenix.craigslist.org/search/gms'
# A timeout keeps the notebook from hanging forever on a stalled connection.
pagelistings = requests.get(listurl, timeout=30)
# Stop here with a clear HTTP error rather than later, when parsing an error page.
pagelistings.raise_for_status()

In [8]:
# Use BeautifulSoup to parse the loaded page.
# 'html.parser' selects the HTML parser from Python's standard library.
bslistings = BeautifulSoup(pagelistings.text,'html.parser')

In [9]:
bslistings   # Check content of loaded page

<!DOCTYPE html>

<html>
<head>
<meta charset="utf-8"/>
<meta content="IE=Edge" http-equiv="X-UA-Compatible"/>
<meta content="width=device-width,initial-scale=1" name="viewport"/>
<meta content="craigslist" property="og:site_name"/>
<meta content="preview" name="twitter:card"/>
<meta content="phoenix garage &amp; moving sales - craigslist" property="og:title"/>
<meta content="phoenix garage &amp; moving sales - craigslist" name="description"/>
<meta content="phoenix garage &amp; moving sales - craigslist" property="og:description"/>
<meta content="https://phoenix.craigslist.org/search/gms" property="og:url"/>
<title>phoenix garage &amp; moving sales - craigslist</title>
<link href="https://phoenix.craigslist.org/search/gms" rel="canonical"/>
<script id="ld_breadcrumb_data" type="application/ld+json">
    {"@context":"https://schema.org","itemListElement":[{"item":{"name":"phoenix.craigslist.org","@id":"https://phoenix.craigslist.org"},"position":1,"@type":"ListItem"},{"item":{"name":"fo

In [10]:
# The ad listings are found under an unordered list with id='search-results'.
listings = bslistings.find(id='search-results')   # Store the list of ads in a variable.
# find_all('a') returns every anchor inside that list, not only the ad titles:
# each ad contributes an image link, a title link and a '#' "restore" link.
links = listings.find_all('a')   # Search for all links to the individual ad pages.

In [11]:
links   # Check which links are available.

[<a class="result-image gallery empty" href="https://phoenix.craigslist.org/evl/gms/d/mesa-storage-unit-auction-193/7440530259.html"></a>,
 <a class="result-title hdrlnk" data-id="7440530259" href="https://phoenix.craigslist.org/evl/gms/d/mesa-storage-unit-auction-193/7440530259.html" id="postid_7440530259">Storage Unit Auction #193</a>,
 <a class="restore-link" href="#">
 <span class="restore-narrow-text">restore</span>
 <span class="restore-wide-text">restore this posting</span>
 </a>,
 <a class="result-image gallery empty" href="https://phoenix.craigslist.org/evl/gms/d/mesa-storage-unit-auction-151/7440529125.html"></a>,
 <a class="result-title hdrlnk" data-id="7440529125" href="https://phoenix.craigslist.org/evl/gms/d/mesa-storage-unit-auction-151/7440529125.html" id="postid_7440529125">Storage Unit Auction #151</a>,
 <a class="restore-link" href="#">
 <span class="restore-narrow-text">restore</span>
 <span class="restore-wide-text">restore this posting</span>
 </a>,
 <a class="res

In [12]:
# Print the href of every anchor to see how the ad URLs are laid out on the page.
links = listings.find_all('a')
for anchor in links:
  print(anchor['href'])
# Returned pattern: [0]-URL1, [1]-URL1, [2]-#, ...

https://phoenix.craigslist.org/evl/gms/d/mesa-storage-unit-auction-193/7440530259.html
https://phoenix.craigslist.org/evl/gms/d/mesa-storage-unit-auction-193/7440530259.html
#
https://phoenix.craigslist.org/evl/gms/d/mesa-storage-unit-auction-151/7440529125.html
https://phoenix.craigslist.org/evl/gms/d/mesa-storage-unit-auction-151/7440529125.html
#
https://phoenix.craigslist.org/evl/gms/d/mesa-storage-unit-auction-030/7440526166.html
https://phoenix.craigslist.org/evl/gms/d/mesa-storage-unit-auction-030/7440526166.html
#
https://phoenix.craigslist.org/evl/gms/d/mesa-storage-unit-auction-124/7440528009.html
https://phoenix.craigslist.org/evl/gms/d/mesa-storage-unit-auction-124/7440528009.html
#
https://phoenix.craigslist.org/evl/gms/d/mesa-storage-unit-auction-324/7440851394.html
https://phoenix.craigslist.org/evl/gms/d/mesa-storage-unit-auction-324/7440851394.html
#
https://phoenix.craigslist.org/evl/gms/d/mesa-storage-unit-auction-400/7440956349.html
https://phoenix.craigslist.org/ev

In [13]:
# Compile the URLs for the page of each individual ad post.
# Store this list in the existing dataframe obtained from the csv file.
seen_urls = set(df['URL'].dropna())   # URLs already stored; compared by exact match
new_ads = []                          # Ads fetched in this run, one dict per ad
try:
  for eachlink in links:
    full_url = eachlink.get('href', '#')   # Anchors without an href are treated as placeholders
    # Skip placeholder links ('#') and ads that were already scraped. Each ad is
    # linked twice on the page (image + title), so the set also drops that repeat.
    if '#' in full_url or full_url in seen_urls:
      continue
    print ('getting', full_url)
    new_ads.append(getad(full_url))
    seen_urls.add(full_url)
    sleep(15)   # Pause between requests so the site is not flooded
finally:
  # Keep whatever was fetched even if one ad fails part-way through the loop.
  # DataFrame.append was removed in pandas 2.0, so all new rows are concatenated once.
  if new_ads:
    df = pd.concat([df, pd.DataFrame(new_ads)], ignore_index=True)

getting https://phoenix.craigslist.org/evl/gms/d/mesa-storage-unit-auction-193/7440530259.html
getting https://phoenix.craigslist.org/evl/gms/d/mesa-storage-unit-auction-151/7440529125.html
getting https://phoenix.craigslist.org/evl/gms/d/mesa-storage-unit-auction-030/7440526166.html
getting https://phoenix.craigslist.org/evl/gms/d/mesa-storage-unit-auction-124/7440528009.html
getting https://phoenix.craigslist.org/evl/gms/d/mesa-storage-unit-auction-324/7440851394.html
getting https://phoenix.craigslist.org/evl/gms/d/mesa-storage-unit-auction-400/7440956349.html
getting https://phoenix.craigslist.org/evl/gms/d/mesa-storage-unit-auction-442/7440957345.html
getting https://phoenix.craigslist.org/nph/gms/d/phoenix-lugchevy-wheels-with/7434046266.html
getting https://phoenix.craigslist.org/nph/gms/d/phoenix-chevy-wheels-lb-10-lb-weights/7439457025.html
getting https://phoenix.craigslist.org/nph/gms/d/phoenix-estate-sale/7443039431.html
getting https://phoenix.craigslist.org/nph/gms/d/phoe

getting https://tucson.craigslist.org/gms/d/tucson-thrift-store-going-out-of/7432738934.html
getting https://tucson.craigslist.org/gms/d/tucson-new-year-countertop-sale/7430915672.html
getting https://tucson.craigslist.org/gms/d/tucson-voyager-market-daze-craft-fair/7443581860.html
getting https://tucson.craigslist.org/gms/d/green-valley-household-garage-sale/7442720083.html
getting https://tucson.craigslist.org/gms/d/tucson-moving-sale-everything-but-the/7431573220.html
getting https://tucson.craigslist.org/gms/d/tucson-huge-family-sale/7443483713.html
getting https://tucson.craigslist.org/gms/d/tucson-top-hat-estate-sales-fri-sat/7442894202.html
getting https://tucson.craigslist.org/gms/d/tucson-storage-sale/7441660956.html
getting https://tucson.craigslist.org/gms/d/green-valley-patio-estate-sale/7442202758.html
getting https://tucson.craigslist.org/gms/d/tucson-yard-sale-11-and-12-east-side/7442680677.html
getting https://tucson.craigslist.org/gms/d/catalina-estate-saleeverything-m

In [14]:
df   # Check the information obtained from the scraping.

Unnamed: 0,Post id,Title,URL,Date Posted,Description,Garage Sales Dates
0,7440530259,Storage Unit Auction #193,https://phoenix.craigslist.org/evl/gms/d/mesa-...,2022-02-01 14:58,Online Storage Auction\nwww.lockerfox.com\n2/2...,"wednesday 2022-02-02,\ntuesday 2022-02-08,\nwe..."
1,7440529125,Storage Unit Auction #151,https://phoenix.craigslist.org/evl/gms/d/mesa-...,2022-02-01 14:55,Online Storage Auction\nwww.lockerfox.com\n2/2...,"wednesday 2022-02-02,\ntuesday 2022-02-08,\nwe..."
2,7440526166,Storage Unit Auction #030,https://phoenix.craigslist.org/evl/gms/d/mesa-...,2022-02-01 14:49,Online Storage Auction\nwww.lockerfox.com\n2/2...,"wednesday 2022-02-02,\ntuesday 2022-02-08,\nwe..."
3,7440528009,Storage Unit Auction #124,https://phoenix.craigslist.org/evl/gms/d/mesa-...,2022-02-01 14:53,Online Storage Auction\nwww.lockerfox.com\n2/2...,"wednesday 2022-02-02,\ntuesday 2022-02-08,\nwe..."
4,7440851394,Storage Unit Auction #324,https://phoenix.craigslist.org/evl/gms/d/mesa-...,2022-02-02 11:25,Online Storage Auction\nwww.lockerfox.com\n2/2...,"wednesday 2022-02-02,\ntuesday 2022-02-08,\nwe..."
...,...,...,...,...,...,...
115,7443184122,Huge Estate Sale (Foothills),https://yuma.craigslist.org/gms/d/yuma-huge-es...,2022-02-07 19:27,Coming later this week so mark this one down. ...,"thursday 2022-02-10,\nfriday 2022-02-11,\nsatu..."
116,7442166368,Yard Sale (Yuma Foothills),https://yuma.craigslist.org/gms/d/yuma-yard-sa...,2022-02-05 11:06,"Tools, outdoor kitchen, indoor and outdoor fur...",thursday 2022-02-10
117,7440553920,GARAGE SALE!! (Foothills),https://yuma.craigslist.org/gms/d/yuma-garage-...,2022-02-01 14:52,HOORAY...A Garage Sale!\n13950 E 54th Drive Yu...,"thursday 2022-02-10,\nsaturday 2022-02-12"
118,7442945838,Moving Sale (YUMA),https://yuma.craigslist.org/gms/d/yuma-moving-...,2022-02-07 10:16,"Moving sale, RV parts, electric smoker, cook w...","friday 2022-02-11,\nsaturday 2022-02-12"


In [17]:
# With the default index=True, the row index is also written as an unnamed first column.
df.to_csv('all_results.csv')   # Save all scraped information in a backup csv file.

After the data has been collected, the following section searches each ad for the lookup terms.

In [166]:
# Global variables
lookup_terms = ['mattress','cabinet','wrench']   # Terms searched for in each ad's title and description
# Column names of the results dataframe.
# Note: the results use 'Garage Sale Dates', while the scraped dataframe uses 'Garage Sales Dates'.
headers = ['Post id','Terms','In T or D','URL','Title','Description','Date Posted','Garage Sale Dates']
df_res = pd.DataFrame(columns = headers)   # Empty dataframe that will hold the ads matching the lookup terms

In [167]:
df_res   # Check headers of new df_res.
         # Note: 'Terms' holds the lookup term that was matched, and
         # 'In T or D' indicates where it was found: 'T' (title), 'D' (description) or 'T&D' (both).

Unnamed: 0,Post id,Terms,In T or D,URL,Title,Description,Date Posted,Garage Sale Dates


In [None]:
# This section populates the dataframe df_res with the results of the analysis.
# Only matches are added to df_res: one row per (ad, term) pair that matched.
matches = []   # Collected as plain dicts; the dataframe is built once at the end.

for i in df.index:
    lookup_title = df.at[i,'Title']
    lookup_descr = df.at[i,'Description']
    for t in lookup_terms:
        res_title = search_term(t,lookup_title)   # Check for terms in title.
        res_descr = search_term(t,lookup_descr)   # Check for terms in description.
        if not (res_title or res_descr):
            continue

        # Indicate if matched term was found in title, description, or both.
        if res_title and res_descr:
            found_in = 'T&D'
        elif res_title:
            found_in = 'T'
        else:
            found_in = 'D'

        # Each match is its own row, so 'Terms' holds just the matched term.
        # (Reading the 'Terms' cell of a freshly added row yields NaN, which cannot be
        # concatenated with a string.)
        matches.append({
            'Post id': df.at[i,'Post id'],
            'Terms': t,
            'In T or D': found_in,
            'URL': df.at[i,'URL'],
            'Title': lookup_title,
            'Description': lookup_descr,
            'Date Posted': df.at[i,'Date Posted'],
            'Garage Sale Dates': df.at[i,'Garage Sales Dates'],
        })

# Rebuilding df_res from scratch keeps this cell idempotent: re-running it never
# leaves rows from a previous run behind.
df_res = pd.DataFrame(matches, columns=headers)

In [188]:
df_res   # Check dataframe with the results from the analysis.
         # It shows the full records that matched the words, plus which words were found and where in the ad.

Unnamed: 0,Post id,Terms,In T or D,URL,Title,Description,Date Posted,Garage Sale Dates
0,7439907175,mattress,D,https://phoenix.craigslist.org/evl/gms/d/mesa-...,Storage Unit Auction C060 (Mesa),"shoes, boxes, patio furniture, mattress, box s...",2022-01-31 09:10,"wednesday 2022-02-02,\nwednesday 2022-02-09"
1,7442605011,cabinet,D,https://phoenix.craigslist.org/evl/gms/d/gold-...,Moving Sale (Gold Canyon),"Moving sale Thursday Friday and Saturday, Febr...",2022-02-06 14:05,"thursday 2022-02-10,\nfriday 2022-02-11,\nsatu..."
2,7441028255,cabinet,D,https://phoenix.craigslist.org/nph/gms/d/glend...,~ESTATE SALE!~GARAGE SALE~YARD SALE~Everything...,This is it. I'm cleaning out everything! Come ...,2022-02-02 17:52,"thursday 2022-02-10,\nfriday 2022-02-11,\nsatu..."
3,7441914080,cabinet,D,https://phoenix.craigslist.org/wvl/gms/d/peori...,MOVING SALE (SUN CITY),Moving Sale: Asian lacquer cabinet with mirror...,2022-02-04 19:02,"thursday 2022-02-10,\nsunday 2022-02-13"
4,7443596623,cabinet,D,https://phoenix.craigslist.org/nph/gms/d/phoen...,Moving Sale High End Items (Phoenix),Round table with six chairs for dining room $1...,2022-02-08 18:23,"friday 2022-02-11,\nsaturday 2022-02-12,\nwedn..."
5,7442907032,cabinet,D,https://phoenix.craigslist.org/evl/gms/d/mesa-...,Moving Sale (Mesa),"Moving sale, we have a china cabinet, dressers...",2022-02-07 10:13,"friday 2022-02-11,\nsaturday 2022-02-12,\nsund..."
6,7443345286,mattress,D,https://phoenix.craigslist.org/evl/gms/d/eloy-...,Garage sale (Eloy),3 family garage sale. Women’s designer clothi...,2022-02-08 09:42,saturday 2022-02-12
7,7443295131,mattress,D,https://phoenix.craigslist.org/wvl/gms/d/phoen...,Saturday Garage Sale (Litchfield Park),Tons of great stuff this Saturday morning Febr...,2022-02-08 08:07,saturday 2022-02-12
8,7441572873,cabinet,D,https://phoenix.craigslist.org/evl/gms/d/mesa-...,Garage / Moving Sale (Encore @ Eastmark),"Misc. Kitchen - small appliances, Crock Pot, R...",2022-02-04 07:02,saturday 2022-02-12
9,7443002439,mattress,D,https://phoenix.craigslist.org/wvl/gms/d/peori...,ESTATE SALE (sun city),"dining room set, dinette set, wine rack, side ...",2022-02-07 12:53,"saturday 2022-02-12,\nsunday 2022-02-13"
