In [17]:
import pandas as pd
import requests
import sys
from bs4 import BeautifulSoup
import multiprocessing as mp
from tqdm import tqdm
from datetime import datetime
import re

In [18]:
base_folder = 'Butterfly'

In [19]:
def get_search(search):
    # Initial query
    search_term = '+'.join(search.split(' '))
    URL = 'https://www.ebay.com/sch/i.html?_sacat=0&LH_Sold=1&_udlo=&_udhi=&_samilow=&_samihi=&_sop=12&_dmd=1&_ipg=200&LH_Complete=1&_fosrp=1&_nkw=%22'+search_term+'%22&rt=nc&LH_PrefLoc=2'
    page = requests.get(URL,{'country':'US'})
    soup = BeautifulSoup(page.content, 'html.parser')
    
    found = False
    cnt = 0
    while not found:
        if cnt > 10:
            return URL, 0, False, cnt
        cnt = cnt+1
        
        page = api.get(URL,{'country':'US'})
        soup = BeautifulSoup(page['body'], 'html.parser')
        try:
            soup.find('div', id='cbelm').find_all('span', class_='rcnt')[0].text
            found = True
        except:
            found = False
            
    # Get number of results
    try:
        count = soup.find('div', id='cbelm').find_all('span', class_='rcnt')[0].text
    except:
        try:
            count = soup.find_all('span', class_='nllclt')[0].find_all('b')[0].text
        except:
            print(URL)
            count = soup.find_all('span', class_='rcnt')[0].text
        
    #If no results are returned then skip it
    if count == '0':
        return URL, count, False

    # Get the main list container of the results
    results = soup.find('ul', id='ListViewInner')

    # Get listing items
    items = results.find_all('li', class_='sresult', recursive=False)

    # Capture all data in this list
    dict_list = []

    # Loop through listings
    for item in items:
        # Define empty dictionary
        dict_item = {
            'id': None,
            'title': None,
            'link': None,
            'image': None,
            'price_sold': None,
            'end_date': None,
            'auction_end_date':None,
            'origin': None,
            'seller_name': None,
            'seller_link': None,
            'sell_type': None,
            'num_bids': 0,
            'num_sold': 1,
            #'description': None
        }

        # Grab all the high level data data
        id = item.get('id')
        title_and_link = item.find_all('a', class_='vip')[0]
        title = title_and_link.get('title').replace('Click this link to access ','')
        link = title_and_link.get('href')
        price_info = item.find_all('ul', class_='lvprices', recursive=False)[0].find_all('li', class_='lvprice')[0]
        price_best_offer = price_info.find_all('span', class_='sboffer')
        price_bid_sold = price_info.find_all('span', class_='bidsold')
        end_date = item.find_all('ul', class_='lvdetails', recursive=False)[0].find_all('span', class_='tme')[0].find_all('span')[0].text.strip()
        
        try:
            image = item.find_all('img', class_='img')[0].get('src')
        except:
            image = 'NA'

        # Detailed Scraping
        deeper_info = BeautifulSoup(requests.get(link).content, 'html.parser')
        
        # Get seller info
        try:
            seller_info = deeper_info.find_all('div', class_='si-content')[0]
            seller_name = seller_info.find_all('span', class_='mbg-nw')[0].text
            seller_link = seller_info.find_all('a')[0].get('href')
        except:
            seller_name = 'Unknown'
            seller_link = 'Unknown'

        # Get sales type, num bids and num sold
        try:
            #num_bids = deeper_info.find('a', id='vi-VR-bid-lnk').find_all('span')[0].text.strip()
            num_bids = deeper_info.find('div',id='mainContent').find_all('span', text=re.compile("bids"))[0].parent()[0].text.strip()
            auction_end_date = deeper_info.find('span', id='bb_tlft').text.strip()
            sell_type = 'Auction'
            num_sold = 1
        except:
            sell_type = 'Instant'
            num_bids = 0
            auction_end_date = None
            try:
                num_sold = deeper_info.find('div',id='mainContent').find_all('a', text=re.compile("sold"))[0].text.strip().replace(' sold','')
            except:
                num_sold = 1
                
        # Get locality
        ### NEEDS UPDATING ####
        try:
            origin = deeper_info.find('div',id='mainContent').find_all('div', text=re.compile("Item location"))[0].parent()[1].text.strip()
        except:
            origin = 'Unknown'

        # Description
        #description_src = deeper_info.find_all('iframe')[0].get('src')
        #description = ' '.join(BeautifulSoup(requests.get(description_src).content, 'html.parser').find('div', id='ds_div').text.strip().split())

        # Put the data into the dictionary and append to the list
        dict_item['id'] = id
        dict_item['title'] = title
        dict_item['link'] = link
        dict_item['image'] = image
        if len(price_best_offer) > 0:
            dict_item['price_sold'] = price_best_offer[0].text.strip()
        else:
            dict_item['price_sold'] = price_bid_sold[0].text.strip()
        dict_item['end_date'] = end_date
        dict_item['auction_end_date'] = auction_end_date
        dict_item['num_bids'] = num_bids
        dict_item['origin'] = origin
        dict_item['seller_name'] = seller_name
        dict_item['seller_link'] = seller_link
        dict_item['sell_type'] = sell_type
        dict_item['num_sold'] = num_sold
        #dict_item['description'] = description

        dict_list.append(dict_item)

    # convert data to pandas dataframe and save species search result to CSV
    df = pd.DataFrame(dict_list)
    df.to_csv(base_folder+"/past sales/past_sales_"+search+".csv", index=False, encoding = 'utf-8-sig')
    
    return URL, count, True

In [20]:
# Get today's date
date = datetime.today().strftime('%Y-%m-%d')

# Load file containing past scraps so that we can resume where we left off if we stopped
try:
    done_list = pd.read_csv(base_folder+'/past_search_result_'+date+'.csv')
    done = done_list['species'].tolist()
    done_list = done_list.to_dict('records')
except:
    done_list = []
    done = []

# Get list of species
#search_list = pd.read_csv(base_folder+'/butterfly species list_2020_2021_march_version_part1.csv')['Species'].tolist()
search_list = ['Teinopalpus aureus']
# Loop through list of species, making sure to skip the ones we already finished
for search in tqdm(search_list):
    if search in done:
        continue
    # Get scrape results
    URL, count, result = get_search(search)
    
    # Append scrape results to the list
    search_result_dict = {'species': None, 'found': None, 'URL': None, 'count': None}
    search_result_dict['URL'] = URL
    search_result_dict['species'] = search
    search_result_dict['found'] = result
    search_result_dict['count'] = count
    done_list.append(search_result_dict)
    
    # Save metadata
    df_result = pd.DataFrame(done_list)
    df_result.to_csv(base_folder+"/past_search_result_"+date+".csv", index=False)
    

  0%|                                                                                            | 0/1 [00:00<?, ?it/s]

<!DOCTYPE html>
<html lang="en"><head><meta content="IE=Edge" http-equiv="X-UA-Compatible"/><meta content="width=device-width, initial-scale=1" name="viewport"/><link href="https://pages.ebay.com/favicon.ico" rel="icon"/><link href="//ir.ebaystatic.com" rel="dns-prefetch"/><link href="//secureir.ebaystatic.com" rel="dns-prefetch"/><link href="//i.ebayimg.com" rel="dns-prefetch"/><link href="//rover.ebay.com" rel="dns-prefetch"/><script>$ssgST=new Date().getTime();</script><meta charset="utf-8"/><title>Security Measure</title><style>
			#dCF_captcha_text {
				display: none !important;
			}
			.pgHeading {				
				margin: 10px 0px 20px 10px;
				padding: 10px 0px 0px 0px;				
			}			
			.pgHeading > h1 {
				line-height: 27px;
				text-align: left;
				margin: 0px 10px 20px 0px;
				font-size: 1.2em;				
				font-family: Arial, Helvetica, sans-serif;
				font-weight: normal;
				color: #5d5d5d;
			}
			.pgCenter {				
				margin: 10px 10px 10px 10px;
				text-align: justify;
				font-si




IndexError: list index out of range