In [9]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import multiprocessing as mp
from tqdm import tqdm
from datetime import datetime
import sys

In [10]:
# Root folder for this scrape run. The species list that is read, the per-species
# listing CSVs and the progress CSV that are written all have their paths built
# by prefixing this folder name.
base_folder = 'Endangered Current Sept'

In [11]:
def get_search(search_term):
    """Scrape the eBay search results for one term and save the listings to CSV.

    The term is searched as an exact (quoted) phrase. For every listing on the
    first results page the listing's own page is fetched as well, to collect
    seller, location, sale type, stock, watcher and date information.

    Args:
        search_term: Search phrase; spaces are turned into '+' for the query
            string. The unmodified phrase is used in the output file name.

    Returns:
        A tuple ``(URL, count, found)``:
        URL   -- the search URL that was requested.
        count -- text of the result counter found on the page.
        found -- False when the counter is '0' or the listing container is
                 missing (nothing is written); True once the CSV was written.

    Side effects:
        Issues HTTP requests (search page, each listing page, and the bid
        history page when one is linked) and writes
        ``base_folder + "/current sales/current_sales_<search_term>.csv"``.
    """
    # Initial query. The caller's term is kept intact because it names the
    # output file further down.
    query = '+'.join(search_term.split(' '))
    URL = 'https://www.ebay.com/sch/i.html?_sacat=0&_udlo&_udhi&_ftrt=901&_ftrv=1&_sabdlo&_sabdhi&_samilow&_samihi&_sop=12&_dmd=1&_ipg=200&_fosrp=1&_nkw=%22'+query+'%22&rt=nc&LH_PrefLoc=2&_trksid=p2045573.m1684'
    page = requests.get(URL)
    soup = BeautifulSoup(page.content, 'html.parser')

    # Get number of results; the counter is looked up in two alternative places
    try:
        count = soup.find('div', id='cbelm').find_all('span', class_='rcnt')[0].text
    except Exception:
        count = soup.find_all('span', class_='nllclt')[0].find_all('b')[0].text

    # If no results are returned then skip it
    if count == '0':
        return URL, count, False

    # Locate the container that holds the listings
    results = soup.find('ul', id='ListViewInner')
    if results is None:
        # A counter was found but no listing container: report it instead of
        # failing with an AttributeError on the next line.
        print('No listing container found: ' + URL)
        return URL, count, False

    # Get listing items
    items = results.find_all('li', class_='sresult', recursive=False)

    # Capture all data in this list
    dict_list = []

    # Loop through listings
    for item in items:
        # Grab all the high level data
        item_id = item.get('id')
        title_and_link = item.find_all('a', class_='vip')[0]
        title = title_and_link.get('title').replace('Click this link to access ','')
        link = title_and_link.get('href')
        price = item.find_all('li', class_='lvprice')[0].find_all('span')[0].text.strip()
        try:
            image = item.find_all('img', class_='img')[0].get('src')
        except Exception:
            image = 'NA'

        # Detailed Scraping
        try:
            deeper_info = BeautifulSoup(requests.get(link).content, 'html.parser')
        except Exception:
            # Skip the listing: carrying on would reuse the previous listing's
            # page (or an undefined name) for every field below.
            print(link)
            continue

        # Get seller info; listings without a seller box are skipped
        try:
            seller_info = deeper_info.find_all('div', class_='si-inner')[0]
            seller_name = seller_info.find_all('span', class_='mbg-nw')[0].text
            seller_link = seller_info.find_all('a')[0].get('href')
        except Exception:
            continue

        # Get locality
        try:
            origin = deeper_info.find_all('div', id='itemLocation')[0].find_all('span')[0].text.strip()
        except Exception:
            origin = 'Unknown'

        # Get sales type, num bids, num stock.
        # A bid-history link marks the listing as an auction; anything else is
        # treated as an instant sale.
        try:
            num_bids = deeper_info.find('a', id='vi-VR-bid-lnk').find('span', id='qty-test').text.strip()
            sell_type = 'Auction'
            num_stock = 1
        except Exception:
            sell_type = 'Instant'
            num_bids = 0
            try:
                num_stock = deeper_info.find('span', id='qtySubTxt').find_all('span')[0].text.strip().replace(' available','')
            except Exception:
                num_stock = 1

        # Get num watchers
        try:
            # str.replace needs the replacement argument; without it this
            # lookup always failed and every listing was recorded as 0.
            num_watchers = deeper_info.find('div', id='why2buy').find_all('span', class_='w2b-sgl')[0].text.strip().replace(' watchers', '')
        except Exception:
            num_watchers = 0

        # Get starting date: last cell of the last row of the bid history
        # table when there is one, otherwise the revision note in the
        # description section.
        try:
            bid_link = deeper_info.find('a', id='vi-VR-bid-lnk').get('href')
            bid_page = requests.get(bid_link)
            bid_info = BeautifulSoup(bid_page.content, 'html.parser')
            date = bid_info.find_all('table', class_='ui-component-table_wrapper')[0].find_all('tr')[-1].find_all('td')[-1].get_text().strip()
        except Exception:
            try:
                revision_text = deeper_info.find('div', id='vi-desc-maincntr').find_all('div', class_='vi-desc-revHistory')[0].parent()[5].get_text().strip()
                date = revision_text.replace('Last updated on','').replace('View all revisions','').strip()
            except Exception:
                date = 'Unknown'

        # Put the data into a dictionary and append to the list
        # (key order determines the CSV column order)
        dict_list.append({
            'id': item_id,
            'title': title,
            'link': link,
            'price': price,
            'origin': origin,
            'image': image,
            'seller_name': seller_name,
            'seller_link': seller_link,
            'sell_type': sell_type,
            'num_bids': num_bids,
            'num_watchers': num_watchers,
            'num_stock': num_stock,
            'date': date,
        })

    # convert data to pandas dataframe and save species search results
    df = pd.DataFrame(dict_list)
    df.to_csv(base_folder+"/current sales/current_sales_"+search_term+".csv", index=False, encoding='utf-8-sig')

    return URL, count, True

In [None]:
# Date stamp used in the progress file name (hard-coded, not read from the clock)
date = '2021-08-30'

# The progress file is both read (to resume) and rewritten after every species
result_path = base_folder+'/current_search_result_'+date+'.csv'

# Load file containing past scrapes so that we can resume where we left off if we stopped.
# Only a missing or empty progress file means "start fresh"; any other failure
# is raised rather than silently restarting and overwriting existing progress.
try:
    done_list = pd.read_csv(result_path)
    done = done_list['species'].tolist()
    done_list = done_list.to_dict('records')
except (FileNotFoundError, pd.errors.EmptyDataError):
    done_list = []
    done = []

# Get list of species
search_list = pd.read_csv(base_folder+'/endangered_species_list.csv')['Species'].tolist()
for search in tqdm(search_list):
    # Skip species that already have a recorded result
    if search in done:
        continue

    # Get scrape results
    URL, count, result = get_search(search)

    # Append scrape results to the list
    search_result_dict = {'species': search, 'found': result, 'URL': URL, 'count': count}
    done_list.append(search_result_dict)
    # Remember it for the rest of this run too, so a species listed twice is
    # handled the same way on a fresh run as on a resumed one.
    done.append(search)

    # Save metadata after every species so an interrupted run can resume
    df_result = pd.DataFrame(done_list)
    df_result.to_csv(result_path, index=False)

 98%|█████████▊| 9282/9425 [15:45<03:28,  1.46s/it]   