In [1]:
!pip install requests



In [2]:
!pip install scrapy

Collecting scrapy
[?25l  Downloading https://files.pythonhosted.org/packages/eb/9f/81a270190802cf02d49a495a2ee9291ea1d21f969a900880285dd7444d74/Scrapy-2.2.1-py2.py3-none-any.whl (241kB)
[K     |█▍                              | 10kB 18.3MB/s eta 0:00:01[K     |██▊                             | 20kB 2.8MB/s eta 0:00:01[K     |████                            | 30kB 3.8MB/s eta 0:00:01[K     |█████▍                          | 40kB 4.1MB/s eta 0:00:01[K     |██████▉                         | 51kB 3.3MB/s eta 0:00:01[K     |████████▏                       | 61kB 3.7MB/s eta 0:00:01[K     |█████████▌                      | 71kB 3.9MB/s eta 0:00:01[K     |██████████▉                     | 81kB 4.4MB/s eta 0:00:01[K     |████████████▎                   | 92kB 4.5MB/s eta 0:00:01[K     |█████████████▋                  | 102kB 4.4MB/s eta 0:00:01[K     |███████████████                 | 112kB 4.4MB/s eta 0:00:01[K     |████████████████▎               | 122kB 4.4MB/s eta 

In [44]:
import scrapy
import requests
import json
from datetime import date, timedelta, datetime
import pandas as pd
from google.colab import drive

In [4]:
def item_json(url):
    '''Gets the item data from the JSON embedded in an item page.

    The page is fetched once (10 second timeout) and four pieces are read from it:
    the "buyma:track_item_json" and "buyma:recent_item_json" meta tags (parsed as
    JSON), the access counter and the favourite counter.

    INPUT: Item page URL
    OUTPUT: dict holding 'url', 'access_count' (int) and 'fav_count' (int),
            followed by the keys of recent_item_json and then the keys of
            track_item_json (on a key clash the later source wins)
    RAISES: ValueError when one of the four pieces is missing from the page, when
            a meta tag does not hold valid JSON, or when a counter is not an
            integer. Errors raised by requests.get propagate unchanged.
    '''

    # Get a response from the URL
    page = scrapy.Selector(text=requests.get(url, timeout=10).text)

    def first_match(xpath, label):
        # .get() gives None when nothing matches; name the missing piece instead
        # of failing later with an opaque TypeError/AttributeError.
        value = page.xpath(xpath).get()
        if value is None:
            raise ValueError('{} not found on {}'.format(label, url))
        return value

    def to_count(text):
        # Defensive: a counter rendered with thousands separators ("1,234")
        # would otherwise make int() fail.
        return int(text.replace(',', ''))

    # Get tracking data
    track_item_json = json.loads(
        first_match('//meta[@name="buyma:track_item_json"]/@content', 'track_item_json meta tag'))
    # Get main item data
    recent_item_json = json.loads(
        first_match('//meta[@name="buyma:recent_item_json"]/@content', 'recent_item_json meta tag'))
    # Get access data
    access = first_match('//span[@class="ac_count"]/text()', 'access count')
    # Get favourite data (the number is the text in front of the first '人')
    fav = first_match('//span[@class="fav_count"]/text()', 'favourite count').split('人')[0]

    # Combine JSON data and output
    return {'url': url,
            'access_count': to_count(access),
            'fav_count': to_count(fav),
            **recent_item_json,
            **track_item_json}

In [16]:
def seller_list(buyer_page_url, previous_days):
    '''Gets the base data for the items listed on a buyer's sales pages.

    Pages are requested as sales_1.html, sales_2.html, ... for the buyer id taken
    from the given URL. The next page is fetched only while the last sale date on
    the current page is newer than (today - previous_days). The threshold only
    controls paging: every item found on a fetched page is returned, including
    items older than the threshold.

    Scraping is best effort. If a page cannot be fetched or parsed, the reason is
    printed and the items gathered so far are returned.

    INPUT: Buyer page url (the buyer id is its 5th '/'-separated part),
           number of days from today to previously check
    OUTPUT: List of dicts with the keys 'url_ext', 'img', 'item_name',
            'sold_amount' (int) and 'sold_date' (datetime)
    '''

    items = []
    dt_threshold = datetime.today() - timedelta(previous_days)
    buyer_id = buyer_page_url.split('/')[4]
    page_number = 1

    while True:
        url = 'https://www.buyma.com/buyer/{}/sales_{}.html'.format(buyer_id, page_number)
        try:
            response = scrapy.Selector(text=requests.get(url, timeout=10).text)
            buyer_table = response.xpath('//div[@id="buyeritemtable"]')

            # Get item urls
            item_url_extensions = buyer_table.xpath('..//li[@class="buyeritemtable_img"]/a/@href').extract()
            # Get item_images
            item_images = buyer_table.xpath('..//img/@src').extract()
            # Get item_names
            item_names = buyer_table.xpath('..//img/@alt').extract()
            # Get sold amounts: the integer between the full-width colon and '個'
            sold_amounts_unformatted = buyer_table.xpath('..//li[@class="buyeritemtable_info"]/p[2]/text()').extract()
            sold_amounts = [int(i.split('：')[1].split('個')[0]) for i in sold_amounts_unformatted]
            # Get sold dates: the YYYY/MM/DD text after the full-width colon
            sold_dates_unformatted = buyer_table.xpath('..//li[@class="buyeritemtable_info"]/p[3]/text()').extract()
            sold_dates = [datetime.strptime(i.split('：')[1], '%Y/%m/%d') for i in sold_dates_unformatted]

            # One record per item url; a shorter sibling column raises IndexError,
            # which is reported below instead of producing misaligned records.
            page_items = [{'url_ext': item_url_extensions[i],
                           'img': item_images[i],
                           'item_name': item_names[i],
                           'sold_amount': sold_amounts[i],
                           'sold_date': sold_dates[i]}
                          for i in range(len(item_url_extensions))]
        except Exception as exc:
            # Keep what was gathered so far, but say why paging stopped.
            # KeyboardInterrupt is deliberately not caught.
            print('Stopped at buyer page {}: {!r}'.format(page_number, exc))
            break

        # A page without any sale dates means we ran past the last sales page.
        if not sold_dates:
            print('No more item pages to get in time frame')
            break

        items += page_items
        print('Buyer page:', page_number)
        print('Last date sold:', sold_dates[-1])

        # Loop check
        if sold_dates[-1] > dt_threshold:
            page_number += 1
        else:
            print('end')
            break

    return items

In [37]:
def all_listed_items_details(buyer_page_url, previous_days):
    '''Gets the base data and the item-page data for a buyer's listed items.

    The sales-list records are collected with seller_list (which applies the
    date threshold while paging). Each record's item page is then scraped with
    item_json and merged into the record; on a key clash the item-page value wins.

    An item page that cannot be scraped does not abort the run: the reason is
    printed and the record gets {'ERROR': 'PAGE UNAVAILABLE'} instead of the
    item-page data.

    INPUT: Buyer page url, number of days from today to previously check
    OUTPUT: List of dicts, one per sales-list record
    '''

    items = []
    for listing in seller_list(buyer_page_url, previous_days):
        item_url = 'https://www.buyma.com{}'.format(listing.get('url_ext'))
        try:
            item_data = item_json(item_url)
        except Exception as exc:
            # Catch Exception only, so Ctrl-C can still stop a long scrape,
            # and show the cause that the error marker alone would hide.
            print('Could not scrape {}: {!r}'.format(item_url, exc))
            item_data = {'ERROR': 'PAGE UNAVAILABLE'}
        items.append({**listing, **item_data})

    return items


In [25]:
# Collect the sales-list records for buyer 4880785; paging stops at the first
# page whose last sale is not newer than 60 days before today.
buyer_page_data = seller_list('https://www.buyma.com/buyer/4880785/sales_1.html', 60)

Buyer page: 1
Last date sold: 2020-06-03 00:00:00
Buyer page: 2
Last date sold: 2020-02-26 00:00:00
end


In [38]:
# Scrape the sales list for buyer 4880785 and then every listed item's page.
# NOTE(review): the name `all` shadows Python's built-in all() for the rest of
# the kernel session; consider renaming it together with the cell that reads it.
all = all_listed_items_details('https://www.buyma.com/buyer/4880785/sales_1.html', 60)

Buyer page: 1
Last date sold: 2020-06-03 00:00:00
Buyer page: 2
Last date sold: 2020-02-26 00:00:00
end
https://www.buyma.com/item/49084902/
https://www.buyma.com/item/46113739/
https://www.buyma.com/item/43629712/
https://www.buyma.com/item/44765205/
https://www.buyma.com/item/45827554/
https://www.buyma.com/item/50010752/
https://www.buyma.com/item/50780264/
https://www.buyma.com/item/49084902/
https://www.buyma.com/item/48680473/
https://www.buyma.com/item/50497403/
https://www.buyma.com/item/41341917/
https://www.buyma.com/item/41341917/
https://www.buyma.com/item/50703853/
https://www.buyma.com/item/47552561/
https://www.buyma.com/item/49904328/
https://www.buyma.com/item/31016249/
https://www.buyma.com/item/31016326/
https://www.buyma.com/item/35284276/
https://www.buyma.com/item/49597210/
https://www.buyma.com/item/33409827/
https://www.buyma.com/item/47390396/
https://www.buyma.com/item/46929553/
https://www.buyma.com/item/44958487/
https://www.buyma.com/item/48654017/
https://

In [41]:
# Build a DataFrame from the list of scraped item records held in `all`.
df_all = pd.DataFrame(all)

In [43]:
# Preview the first rows of the combined item data.
df_all.head()

Unnamed: 0,url_ext,img,item_name,sold_amount,sold_date,url,access_count,fav_count,syo_id,syo_name,syo_img1,syo_img_090_1,syo_img_210_1,syo_url,tanka_format,discount_percentage,on_timesale,brand_name_eigo,brand_url,category_id,category,buyer_id,brand_id,model_id,season_id,thm_id,kokaidate,yukodate,cate_id1,cate_id2,cate_id3,tag_ids,reference_price_kbn,reference_price,timesale_start_date,timesale_end_date,item_id,price,coupon,ERROR
0,/item/49084902/,https://static-buyma-com.akamaized.net/imgdata...,Dior ネックレス CD ロゴ アイコン 真鍮 シルバー 限定 直営 男,1,2020-07-18,https://www.buyma.com/item/49084902/,960.0,42.0,49084902.0,Dior ネックレス CD ロゴ アイコン 真鍮 シルバー 限定 直営 男,https://static-buyma-com.akamaized.net/imgdata...,https://static-buyma-com.akamaized.net/imgdata...,https://static-buyma-com.akamaized.net/imgdata...,https://www.buyma.com/item/49084902/,136500.0,,,Dior,https://www.buyma.com/brand/CHRISTIAN_DIOR_%E3...,3360.0,メンズファッション/アクセサリー,4880785.0,163.0,0.0,33.0,98.0,2019-11-18T01:30:23+09:00,2020-08-01T23:59:59+09:00,1002.0,2206.0,3360.0,"[104, 130, 430, 440, 445, 446, 447, 448, 502, ...",0.0,,,,49084902.0,136500.0,,
1,/item/46113739/,https://static-buyma-com.akamaized.net/imgdata...,Dior バッグ ショルダー ブラック オブリーク ロゴ 直営店 19AW,1,2020-07-17,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,PAGE UNAVAILABLE
2,/item/43629712/,https://static-buyma-com.akamaized.net/imgdata...,オプション/MY CLOSET*,1,2020-07-14,https://www.buyma.com/item/43629712/,38.0,0.0,43629712.0,オプション/MY CLOSET*,https://static-buyma-com.akamaized.net/imgdata...,https://static-buyma-com.akamaized.net/imgdata...,https://static-buyma-com.akamaized.net/imgdata...,https://www.buyma.com/item/43629712/,14000.0,,,,,3244.0,レディースファッション/その他ファッション,4880785.0,0.0,0.0,0.0,0.0,2019-05-10T18:22:14+09:00,2020-07-31T23:59:59+09:00,1001.0,2112.0,3244.0,,0.0,,,,43629712.0,14000.0,,
3,/item/44765205/,https://static-buyma-com.akamaized.net/imgdata...,Dior サンダル ネオプレン カーフ ブラック CD ロゴ メンズ 19AW,1,2020-07-10,https://www.buyma.com/item/44765205/,487.0,20.0,44765205.0,Dior サンダル ネオプレン カーフ ブラック CD ロゴ メンズ 19AW,https://static-buyma-com.akamaized.net/imgdata...,https://static-buyma-com.akamaized.net/imgdata...,https://static-buyma-com.akamaized.net/imgdata...,https://www.buyma.com/item/44765205/,169700.0,,,Dior,https://www.buyma.com/brand/CHRISTIAN_DIOR_%E3...,3320.0,メンズファッション/靴・ブーツ・サンダル,4880785.0,163.0,0.0,31.0,98.0,2019-06-22T01:01:48+09:00,2020-08-22T23:59:59+09:00,1002.0,2204.0,3320.0,"[105, 106, 167, 341, 414, 419, 445, 446, 447, ...",0.0,,,,44765205.0,169700.0,,
4,/item/45827554/,https://static-buyma-com.akamaized.net/imgdata...,CHANEL ポーチ カード コイン ホルダー マト CC ラム ブラック,1,2020-07-10,https://www.buyma.com/item/45827554/,261.0,9.0,45827554.0,CHANEL ポーチ カード コイン ホルダー マト CC ラム ブラック,https://static-buyma-com.akamaized.net/imgdata...,https://static-buyma-com.akamaized.net/imgdata...,https://static-buyma-com.akamaized.net/imgdata...,https://www.buyma.com/item/45827554/,175500.0,,,CHANEL,https://www.buyma.com/brand/CHANEL_%E3%82%B7%E...,3170.0,レディースファッション/財布・小物,4880785.0,158.0,97.0,28.0,98.0,2019-07-29T02:21:59+09:00,2020-07-29T23:59:59+09:00,1001.0,2114.0,3170.0,"[106, 336, 430, 445, 446, 447, 448, 502, 504, ...",0.0,,,,45827554.0,175500.0,,


In [45]:
# Mount your Drive to the Colab VM.
drive.mount('/gdrive')

# Write the DataFrame to a CSV file on the mounted Drive.
# The path is handed to pandas directly so that pandas opens the file itself,
# with an explicit UTF-8 encoding for the Japanese text, instead of relying on
# the defaults of a text handle opened by hand.
df_all.to_csv('/gdrive/My Drive/foo.csv', encoding='utf-8')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /gdrive


In [5]:
# Example: scrape a single item page and display the merged dict.
item_json('https://www.buyma.com/item/53799545/')

{'access_count': 607,
 'brand_id': 158,
 'brand_name_eigo': 'CHANEL',
 'brand_url': 'https://www.buyma.com/brand/CHANEL_%E3%82%B7%E3%83%A3%E3%83%8D%E3%83%AB.html',
 'buyer_id': 4880785,
 'cate_id1': 1002,
 'cate_id2': 2204,
 'cate_id3': 3321,
 'category': 'メンズファッション/靴・ブーツ・サンダル',
 'category_id': 3321,
 'coupon': None,
 'discount_percentage': None,
 'fav_count': 26,
 'item_id': 53799545,
 'kokaidate': '2020-04-27T11:07:53+09:00',
 'model_id': 104,
 'on_timesale': '',
 'price': 153720,
 'reference_price': None,
 'reference_price_kbn': '0',
 'season_id': 32,
 'syo_id': 53799545,
 'syo_img1': 'https://static-buyma-com.akamaized.net/imgdata/item/200427/0053799545/256874963/100.jpg',
 'syo_img_090_1': 'https://static-buyma-com.akamaized.net/imgdata/item/200427/0053799545/256874963/090.jpg',
 'syo_img_210_1': 'https://static-buyma-com.akamaized.net/imgdata/item/200427/0053799545/256874963/210.jpg',
 'syo_name': 'CHANEL スニーカー シューズ 靴 CC ホワイト ロゴ 人気 新作 男',
 'syo_url': 'https://www.buyma.com/item/5

In [19]:
# Example: display the raw sales-list records for buyer 4880785 (60-day paging threshold).
seller_list('https://www.buyma.com/buyer/4880785/sales_1.html', 60)


Buyer page: 1
Last date sold: 2020-06-03 00:00:00
Buyer page: 2
Last date sold: 2020-02-26 00:00:00
end


[{'img': 'https://static-buyma-com.akamaized.net/imgdata/item/191118/0049084902/227591698/210.jpg',
  'item_name': 'Dior ネックレス CD ロゴ アイコン 真鍮 シルバー 限定 直営 男',
  'sold_amount': 1,
  'sold_date': datetime.datetime(2020, 7, 18, 0, 0),
  'url_ext': '/item/49084902/'},
 {'img': 'https://static-buyma-com.akamaized.net/imgdata/item/190808/0046113739/209733446/210.jpg',
  'item_name': 'Dior バッグ ショルダー ブラック オブリーク ロゴ 直営店 19AW',
  'sold_amount': 1,
  'sold_date': datetime.datetime(2020, 7, 17, 0, 0),
  'url_ext': '/item/46113739/'},
 {'img': 'https://static-buyma-com.akamaized.net/imgdata/item/190510/0043629712/195083141/210.jpg',
  'item_name': 'オプション/MY CLOSET*',
  'sold_amount': 1,
  'sold_date': datetime.datetime(2020, 7, 14, 0, 0),
  'url_ext': '/item/43629712/'},
 {'img': 'https://static-buyma-com.akamaized.net/imgdata/item/190620/0044765205/201770895/210.jpg',
  'item_name': 'Dior サンダル ネオプレン カーフ ブラック CD ロゴ メンズ 19AW',
  'sold_amount': 1,
  'sold_date': datetime.datetime(2020, 7, 10, 0, 0),
  'u