### Import Libraries 

 - Request library allows you to send HTTP request in python to a specific URL. In our case we send an HTTP request to Zillow
 - Time module allows to handle time related task including formatting dates, waiting and representing time
 - The random module allows you to generate random 
 - The bs4 module allows you to pull data from HTML document after you get a response from HTTP request
 - The os modules allows ou to interact with operating systems including changing working directory
 - The selenium module allows you to automate interaction with a web browser including sending URL request and extracting HTML
   document response

In [2]:
import requests
import time
from bs4 import BeautifulSoup
from random import sample 
from random import randint
import pandas as pd 
import os
import json
from datetime import datetime
import numpy as np


### Set Path
 - Identify your destination folder
 - Use os change directory to set your destination directory as the default. That is where all outputs will be exported to

In [3]:
path = "C:\\Users\\padu\\Desktop\\Zillow\\Final\\InitialCraiglist"
os.chdir(path)

###  Set URL 
- https:// yourcityname.craigslist.org/search/apa
- This URL is created to search for the number of pages in a later step

In [4]:
url = 'https://charlotte.craigslist.org/search/apa'

### Set Headers

- We set headers here because with headers, Craigslist without will know we a webscraping their data and block our request. 


In [5]:
headers = {
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'accept-encoding': 'gzip, deflate, br',
    'accept-language' : 'en-US,en;q=0.9',
    'cache-control': 'max-age=0',
    'cookie': 'cl_b=4|51d3ac35358ee42ae349258960e058830d03e113|1633897219_XNdY; cl_def_hp=charlotte',
    'sec-fetch-dest': 'document',
    'sec-fetch-mode': 'navigate',
    'sec-fetch-stie': 'same-origin',
    'sec-fetch-user': '?1',
    'upgrade-insecure-requests': '1',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:87.0) Gecko/20100101 Firefox/87.0'
    }


### Test Response
- We test response from our request, as long as it returns 200, we know the resquest is working

In [6]:
response = requests.get(url, headers = headers)
response
print(response.status_code)

200


In [8]:
# This step reports the time code is intialized 
print("Script Start: " + strftime("%Y-%m-%d %H:%M:%S"))

# In this step we identified the total number of pages for all the rental listings on craiglist in our city
# to do this we create a beautiful soup object from our request response
# we then use the find method to search through the div class for the class id "search-legend"
# under the 'search-legend' class we search for 'totalcount' which has the total number of pages
# After finding the total number of pages (in our case 3000),
# we create a range values between 0 and 3000 with a step of 120 since there are listing on each page

soup = BeautifulSoup(response.text, 'html.parser')
results_num = soup.find('div', class_= 'search-legend')
results_total = int(results_num.find('span', class_='totalcount').text)
pages = np.arange(0, results_total+1, 120)

# Create an outfile file name, I called mine FinalCraiglist and formatted it a date time stamp
# Note: If you are scraping multiple times in a day, then you need to format the time stamp with hours 
# that way you don't overwrite already exported data

finalfile = "FinalCraiglist" + "_" + "{:%Y_%m_%d}".format(datetime.now()) +".csv"

# We create a list that holds the results

result = []

# We loop through each of the pages and append the number at the end of the search URL 

for page in pages:
    print(page)
    
    # The URL inlcude the name of the city in this case charlotte, and appended the page number as string
    # for example 'https://yourcity.craigslist.org/d/apartments-housing-for-rent/search/apa?'+"s=" +str(page)
    
    url = 'https://charlotte.craigslist.org/d/apartments-housing-for-rent/search/apa?'+"s=" +str(page)
    
    # If we overload the Craigslist server with repeated request, they will block us so we set random wait 
    # with the time module 
    time.sleep(randint(0, 5))
    
    # We send a request to the URL and pass our headers as an additional argument 
    response = requests.get(url, headers = headers)
    response
    
    # We create a beautifulsoup object for our request response 
    soup = BeautifulSoup(response.text, 'html.parser')
    
    # We search through the beautiful soup object using the find method for the class id "result-row"
    # The 'result-row' is a list of all the rental listings on each page
    
    posts  = soup.find_all('li', class_= 'result-row')
    time.sleep(randint(0, 2))
   
    # We loop through each listings to extract the URL associated with each individual listings
    # this would allow us to access listing specific URL 
    
    for post in posts:
        
        # We use the find method to search for the class id 'result-title hdrlnk', 
        # this id stores the URL for each rental listings 
        
        listingURL = post.find('a', class_='result-title hdrlnk')['href']
        time.sleep(randint(0, 5))
        
        # We pass the URL of each listing as a request, and create a beautiful soup object for the results
        
        responserent = requests.get(listingURL, headers = headers)
        renthtml = BeautifulSoup(responserent.text, 'html.parser')
        
        # We extract the variables for each rental listings. 
        # For rental price, date of rental posting, neighborhood of rental listing, and rental URL
        # we can extract them for general html document
        # for the listing address, description text, number of bathrooms, latitude, longitude and floor size
        # we extract them from rental listing specific html document
        
        # We append the results to a list 
        
        try:
            result.append({
                
            'Post_datetime':post.find('time', class_= 'result-date')['datetime'],    
            'Post Price': renthtml.find('span',{'class': 'price'}).text,
            'Post hood': post.find('span',class_ = 'result-hood'),
            'Post title': renthtml.find('span',{'id': 'titletextonly'}).text,
            'PostURL': post.find('a', class_='result-title hdrlnk')['href'],
            'address':  renthtml.find('div', {'class': 'mapaddress'}).text,
            'overview': renthtml.find('section', {'id': 'postingbody'}),
            'bedroom_baths':renthtml.find('span', {'class': 'shared-line-bubble'}).text,
            'Title': renthtml.find('span',{'id':'titletextonly'}).text,
            'Price': renthtml.find('span',{'class': 'price'}).text,
            'lat': renthtml.find('div',{'class':'viewposting'})['data-latitude'],
            'lon': renthtml.find('div',{'class':'viewposting'})['data-longitude'],
            'Other':renthtml.find('p', {'class': 'attrgroup'})

                
            })
            
        except:
            pass
        
        
        # The results are converted to a pandas dataframe, and exported as CSV file 
   
        Craiglistdata = pd.DataFrame(result)
        Craiglistdata.to_csv(finalfile, index=False)
        
print("Script End: " + strftime("%Y-%m-%d %H:%M:%S"))

Script Start: 2022-02-16 13:21:42
0
120
240
360
480
600
720
840
960
1080
1200
1320
1440
1560
1680
1800
1920
2040
2160
2280
2400
2520
2640
2760
2880
3000
Script End: 2022-02-16 16:14:05
