## Steven Nasseri
#### craigslist web scraping function


# USE DOUBLE VPN FOR BEST RESULTS

In [1]:
import re

import pandas as pd
import requests
from bs4 import BeautifulSoup
from requests.exceptions import Timeout as TO

In [2]:

#### Function to get data from craigslist pages ####
# input: url of a search results page

# example url: https://newjersey.craigslist.org/search/cta#search=1~gallery~0~0

def _parse_price(text):
    """Turn a price label such as '$14,995' into an int; return None if it is not a whole number."""
    cleaned = text.strip().strip('$').replace(',', '').strip()
    try:
        return int(cleaned)
    except ValueError:
        return None


def _parse_listing_attributes(listing_soup):
    """Collect the attributes found in the ``p.attrgroup`` blocks of one listing page.

    Spans of the form ``"name: value"`` become ``{name: value}`` entries. The
    first span of a group that has no colon is stored under ``'title'``, unless
    a non-empty title has already been recorded.
    """
    attributes = {}
    for group in listing_soup.find_all('p', class_="attrgroup"):
        for position, span in enumerate(group.find_all('span')):
            text = span.get_text()
            if ':' in text:
                # split on the first colon only, so values may contain colons
                name, value = text.split(':', 1)
                attributes[name.strip()] = value.strip()
            elif position == 0 and not attributes.get('title'):
                attributes['title'] = text
    return attributes


def get_data(given_url, timeout=5, skip_links=6, verbose=True):
    """Scrape a craigslist search results page and every listing it links to.

    Parameters
    ----------
    given_url : str
        URL of a search results page, e.g.
        ``https://newjersey.craigslist.org/search/cta#search=1~gallery~0~0``.
    timeout : float, optional
        Seconds to wait for each HTTP response (search page and listing pages).
    skip_links : int, optional
        Number of leading links on the search page to discard before treating
        the rest as listing URLs. The default of 6 is what this function has
        always discarded.
    verbose : bool, optional
        When True (default), print every scraped listing as it is collected.

    Returns
    -------
    pandas.DataFrame or None
        One row per listing with its attributes plus ``url`` and ``price``
        (``price`` is None when no usable price could be paired with the
        listing). Rows without a title and exact duplicate rows are removed.
        ``None`` is returned when the scrape could not be completed: a request
        timed out or failed to connect, or the search page had no results list.
        The error is printed in that case.
    """
    # requests raises its own ConnectionError, which is NOT a subclass of the
    # builtin ConnectionError, so it has to be named explicitly to be caught.
    network_errors = (TO, requests.exceptions.ConnectionError, ConnectionError)

    ## fetch the search results page
    try:
        search_page = requests.get(given_url, timeout=timeout)
    except network_errors as e:
        print(e)
        return None

    search_soup = BeautifulSoup(search_page.content, 'html.parser')

    ## locate the results list
    results = search_soup.find('ol', class_="cl-static-search-results")
    if results is None:
        print(f"no search results list found at {given_url}")
        return None

    ## prices, kept position-for-position (None marks an unparseable label)
    prices = [_parse_price(tag.get_text())
              for tag in results.find_all('div', class_="price")]

    ## listing urls: every link on the page except the leading non-listing ones
    links = [a['href'] for a in search_soup.find_all('a') if a.get('href')]
    urls = links[skip_links:]

    records = []

    for position, url in enumerate(urls):
        try:
            listing_page = requests.get(url, timeout=timeout)
            listing_soup = BeautifulSoup(listing_page.content, 'html.parser')
        except network_errors as e:
            # a failed listing aborts the whole scrape, as before
            print(e)
            return None

        record = _parse_listing_attributes(listing_soup)
        record['url'] = url

        # assumes the i-th listing link lines up with the i-th price element
        # on the search page -- TODO confirm against the page markup
        record['price'] = prices[position] if position < len(prices) else None

        # an empty title ends the scrape without recording this listing
        if record.get('title') == '':
            break

        records.append(record)

        if verbose:
            for index, value in record.items():
                print(f"{index}: {value}")
            print('\n\n')

    #### Clean data before returning dataframe ####

    df = pd.DataFrame(records)

    # nothing scraped, or no listing had a title: every row would be dropped
    if 'title' not in df.columns:
        return df.iloc[0:0]

    ## remove rows without a title, then exact duplicates
    df = df[df['title'].notna()]
    df = df.drop_duplicates()

    return df
        
    
    

In [3]:
## search all 4 NJ sections of craigslist (the dataframes are concatenated later); North Jersey first
north_jersey_url = "https://newjersey.craigslist.org/search/cta#search=1~gallery~0~0"
data_north_jersey = get_data(north_jersey_url)


title: 1997 FORD F-SUPER DUTY
VIN: 3FELF47F1VMA64962
drive: fwd
fuel: diesel
odometer: 164417
paint color: white
title status: clean
transmission: manual
type: other
url: https://newjersey.craigslist.org/ctd/d/paterson-1997-ford-450-sd-flat-bed-tow/7689802014.html
price: 14995



title: 2008 volvo s40
cylinders: 5 cylinders
drive: 4wd
fuel: gas
odometer: 133000
paint color: red
title status: clean
transmission: automatic
url: https://newjersey.craigslist.org/ctd/d/jersey-city-2008-volvo-s40-t5/7689801848.html
price: 5999



title: 2011 cadillac cts
condition: excellent
cylinders: 6 cylinders
drive: 4wd
fuel: gas
odometer: 110000
title status: clean
transmission: automatic
url: https://newjersey.craigslist.org/ctd/d/jersey-city-2011-cadillac-cts-30l-luxury/7689800975.html
price: 7999



title: 2013 Ford explorer
condition: excellent
cylinders: 6 cylinders
drive: 4wd
fuel: gas
odometer: 118000
title status: clean
transmission: automatic
url: https://newjersey.craigslist.org/ctd/d/jersey-

In [4]:
# preview the last five North Jersey listings
data_north_jersey.tail(n=5)

Unnamed: 0,title,VIN,drive,fuel,odometer,paint color,title status,transmission,type,url,price,cylinders,condition,size
282,2013 Toyota Tacoma,5TFTX4GN7DX024164,rwd,gas,152696,white,clean,automatic,truck,https://newjersey.craigslist.org/ctd/d/sussex-...,14900,4 cylinders,good,full-size
283,2004 Lincoln Town Car,1LNHM83W84Y663308,rwd,gas,86949,custom,clean,automatic,sedan,https://newjersey.craigslist.org/ctd/d/sussex-...,7900,8 cylinders,good,
284,2015 Ford Fiesta,3FADP4BJ3FM202563,fwd,gas,45859,custom,clean,automatic,sedan,https://newjersey.craigslist.org/ctd/d/sussex-...,7900,4 cylinders,good,
285,2007 2007 nissan murano,,fwd,gas,157860,grey,clean,automatic,hatchback,https://newjersey.craigslist.org/cto/d/dover-2...,2800,6 cylinders,good,mid-size
286,2013 ram 2500,3C6TR5DT4DG560505,4wd,gas,223661,black,clean,automatic,pickup,https://newjersey.craigslist.org/ctd/d/garfiel...,4000,8 cylinders,excellent,full-size


In [53]:
# scrape the Jersey Shore section
jersey_shore_url = "https://jerseyshore.craigslist.org/search/cta#search=1~gallery~0~0"
data_jersey_shore = get_data(jersey_shore_url)


title: 2016 gmc yukon denali
condition: excellent
cylinders: 8 cylinders
drive: 4wd
fuel: gas
odometer: 121500
paint color: black
title status: clean
transmission: automatic
type: SUV
url: https://jerseyshore.craigslist.org/cto/d/brielle-yukon-denali/7688485977.html
price: 23500



title: 2012 Chevy Tahoe PPV
condition: good
cylinders: 8 cylinders
drive: 4wd
fuel: gas
odometer: 249000
paint color: black
size: full-size
title status: clean
transmission: automatic
type: SUV
url: https://jerseyshore.craigslist.org/cto/d/jackson-2012-chevy-tahoe-ppv/7688435984.html
price: 7000



title: 2011 volvo c70
VIN: YV1672MC8BJ110081
condition: good
cylinders: 4 cylinders
drive: fwd
fuel: gas
odometer: 93900
paint color: black
size: mid-size
title status: clean
transmission: automatic
type: convertible
url: https://jerseyshore.craigslist.org/cto/d/brick-2011-volvo-c70-convertible/7688402105.html
price: 7500



title: 2019 acura rdx
condition: excellent
drive: 4wd
fuel: gas
odometer: 58823
paint colo

In [54]:
# scrape the Central NJ section
central_nj_url = "https://cnj.craigslist.org/search/cta#search=1~gallery~0~0"
data_central_nj = get_data(central_nj_url)


title: 2000 dodge durango
condition: good
cylinders: 8 cylinders
drive: 4wd
fuel: gas
odometer: 163000
paint color: red
size: full-size
title status: clean
transmission: automatic
type: SUV
url: https://cnj.craigslist.org/cto/d/somerville-two-2000-dodge-durangos/7688502757.html
price: 3500



title: 1987 Chevy Monte Carlo SS
condition: fair
cylinders: 8 cylinders
drive: rwd
fuel: gas
odometer: 156000
paint color: black
title status: clean
transmission: automatic
url: https://cnj.craigslist.org/cto/d/old-bridge-87-monte-carlo-ss/7688502641.html
price: 8500



title: 2006 ford escape limited
drive: 4wd
fuel: gas
odometer: 73000
title status: clean
transmission: automatic
url: https://cnj.craigslist.org/cto/d/bridgewater-low-mileage-ford-escape/7688500269.html
price: 4500



title: 2009 chrysler aspen
VIN: 1A8HW58P59F708523
condition: fair
cylinders: 8 cylinders
drive: 4wd
fuel: gas
odometer: 152350
paint color: white
size: full-size
title status: clean
transmission: automatic
type: SUV
u

In [57]:
# scrape the South Jersey section
south_jersey_url = "https://southjersey.craigslist.org/search/cta#search=1~gallery~0~0"
data_south_jersey = get_data(south_jersey_url)


title: 1957 chevy belair
fuel: gas
odometer: 123456
title status: clean
transmission: other
url: https://southjersey.craigslist.org/cto/d/marlton-57-chevy/7688505335.html
price: 8500



title: 2012 jeep liberty
fuel: gas
odometer: 95000
title status: clean
transmission: automatic
url: https://southjersey.craigslist.org/cto/d/philadelphia-2012-jeep-liberty-latitude/7688502862.html
price: 8500



title: 2011 LAND ROVER LR4
VIN: SALAG2D4XBA558064
fuel: gas
odometer: 133670
paint color: white
title status: clean
transmission: automatic
type: SUV
url: https://southjersey.craigslist.org/ctd/d/new-brunswick-2011-land-rover-lr4/7688491187.html
price: 8900



title: 2016 FORD F-250
VIN: 1FDBF2A63GEC45941
drive: fwd
fuel: gas
odometer: 91984
paint color: white
title status: clean
transmission: automatic
type: pickup
url: https://southjersey.craigslist.org/ctd/d/new-brunswick-2016-ford-250-f250-250/7688490977.html
price: 16499



title: 2020 ISUZU NRR
VIN: JALE5W167L7306216
drive: fwd
fuel: diese

In [72]:
# stack the four regional dataframes into one and renumber the rows from 0
regional_frames = [data_north_jersey, data_jersey_shore, data_central_nj, data_south_jersey]
data = pd.concat(regional_frames, ignore_index=True)

In [73]:
### clean data ###

## narrow down car titles specifically to brands
# Brands to search for. Entries are lowercase because titles are lowercased
# before matching (capitalised entries could never match). List order is the
# match priority: the first entry found in a title wins.
car_brands = [
    'ferrari', 'honda', 'hyundai', 'infiniti', 'jaguar', 'jeep', 'kia', 'lamborghini',
    'land rover', 'lexus', 'maserati', 'mazda', 'mclaren', 'mercedes-benz', 'mini',
    'mitsubishi', 'nissan', 'peugeot', 'porsche', 'ram', 'renault', 'rolls royce',
    'saab', 'subaru', 'suzuki', 'tesla', 'toyota', 'volkswagen', 'volvo', 'hummer',
    'pontiac', 'acura', 'alfa romeo', 'aston martin', 'audi', 'bentley', 'bmw',
    'bugatti', 'buick', 'cadillac', 'chevrolet', 'chrysler', 'citroen', 'dodge',
    'fiat', 'ford', 'geely', 'general motors', 'gmc', 'chevy', 'koenigsegg'
]

# One compiled pattern per brand. The lookarounds require that the brand is not
# embedded in a longer run of letters, so 'ram' no longer matches 'panoramic'
# and 'mini' no longer matches 'minivan', while 'ram1500' still matches 'ram'.
brand_patterns = [
    (brand, re.compile(rf"(?<![a-z]){re.escape(brand)}(?![a-z])"))
    for brand in car_brands
]


def find_brand(title):
    """Return the first entry of car_brands found in title as a standalone word, else None.

    Matching is case-insensitive. Non-string titles (e.g. NaN) yield None.
    """
    if not isinstance(title, str):
        return None
    lowered = title.lower()
    return next((brand for brand, pattern in brand_patterns if pattern.search(lowered)), None)


## create a new column in dataframe with the brand of the car
# check each title for a brand
data['brand'] = data['title'].apply(find_brand)

data.head()

Unnamed: 0,title,condition,fuel,odometer,title status,transmission,url,price,cylinders,drive,paint color,size,type,VIN,brand
0,2010 ferrari california,excellent,gas,32000,clean,automatic,https://newjersey.craigslist.org/cto/d/wood-ri...,90000,,,,,,,ferrari
1,2017 dodge durango gt,good,gas,156000,clean,automatic,https://newjersey.craigslist.org/cto/d/caldwel...,10500,6 cylinders,4wd,black,full-size,SUV,,dodge
2,2009 ford explorer xlt 4wd,good,gas,95895,clean,automatic,https://newjersey.craigslist.org/cto/d/wayne-2...,7500,6 cylinders,4wd,black,,,,ford
3,2018 BMW X1,,gas,50570,clean,,https://newjersey.craigslist.org/ctd/d/woodbur...,20995,,,blue,full-size,SUV,WBXHT3C37J5F92685,bmw
4,2019 ACURA TLX,,gas,37111,clean,,https://newjersey.craigslist.org/ctd/d/woodbur...,27495,,fwd,black,,sedan,19UUB2F68KA007621,acura


In [74]:
# write the combined listings to a csv file, leaving out the row index
output_csv = 'craigslist_data.csv'
data.to_csv(output_csv, index=False)