In [1]:
import os
import re
import json
import pandas as pd
import time

In [2]:
def get_text(text):
    """Return ``text`` with HTML tags removed and whitespace normalised.

    Parameters
    ----------
    text : bytes or unicode text
        Raw HTML fragment. Byte strings are decoded as ASCII and every
        undecodable byte is dropped (the original implementation relied on
        ``unicode(..., errors='ignore')``, which uses the interpreter default
        encoding -- ASCII on a stock Python 2). Text that is already unicode
        is used unchanged; the original raised TypeError for it.

    Returns
    -------
    unicode text
        The fragment with every ``<...>`` tag removed, CR/LF runs and runs of
        spaces collapsed to a single space, and both ends stripped.

    Notes
    -----
    A ``<`` that is never closed by a ``>`` ends the text: everything from
    that character onward is discarded.
    """
    if isinstance(text, bytes):
        text = text.decode('ascii', 'ignore')

    # Any run of line-break characters becomes one space.
    text = re.sub(r'[\r\n]+', ' ', text.strip())

    # Drop complete tags, i.e. each '<' up to the first '>' that follows it.
    without_tags = re.sub(r'<[^>]*>', '', text)
    # Whatever '<' is left has no closing '>': cut the text there.
    without_tags = without_tags.split('<', 1)[0]

    return re.sub(' +', ' ', without_tags).strip()

In [3]:
def parse_item(filename):
    """Parse one saved item page into a flat dict of listing fields.

    Parameters
    ----------
    filename : str
        Path of the saved page. The part before the first '?' is stored as
        ``itemID`` and the third-from-last '/'-separated component is stored
        as ``date``.

    Returns
    -------
    dict or None
        ``None`` when the file is empty, has no ``<title>``, has the title
        'we rise again', or lacks the ``<div class="body">`` ...
        ``<a name="shipping">`` section. Otherwise a dict with the keys
        ``title``, ``itemID``, ``btc_price``, ``description``, ``seller``,
        ``ship_to``, ``seller_location``, ``date`` and ``img``.
        ``img`` is ``'<stylesheet path>######<image element id>'`` or ``None``
        when either part is missing.

    Raises
    ------
    IndexError
        If the body section is present but the heading, price, vendor,
        shipping or description markup is missing. This is left to the
        caller to handle.
    ValueError
        If the price text does not reduce to a valid float.
    """
    with open(filename) as fp:
        content = fp.read()
    if not content:
        return None

    page_titles = re.findall(r'<title>(.*?)</title>', content, re.DOTALL)
    if not page_titles:
        return None
    if get_text(page_titles[0]) == u'we rise again':
        return None

    # Stylesheet under /images/. The match must stay inside one href value
    # and end in a literal ".css"; the previous pattern ('.*?.css') could run
    # past the closing quote into a later <link> on the same line.
    css_links = re.findall(r'<link href="(/images/[^"]*?\.css)"', content)
    img_file = css_links[0] if css_links else None

    body_sections = re.findall(r'<div class="body".*?>(.*)<a name="shipping">',
                               content, re.DOTALL)
    if not body_sections:
        return None
    main_content = body_sections[0]

    image_names = re.findall('<div class="item_image main" id="(.*?)">', main_content)
    image_name = image_names[0] if image_names else None
    if len(image_names) > 1:
        # Only the first image id is kept; report pages that carry more.
        print("%d images %s" % (len(image_names), filename))

    title = re.findall('<h2>(.*)</h2>', main_content)[0]

    price_text = re.findall(r'<div class="price_big">(.*?)</div>',
                            main_content, re.DOTALL)[0]
    # Keep only digits and dots before converting.
    price_digits = re.sub(r"[^.0-9]", "", price_text)
    price_btc = float(price_digits) if price_digits else None

    seller = re.findall('vendor: <a.*?>(.*?)</a>', main_content)[0]
    ships_from = get_text(re.findall('ships from: (.*)', main_content)[0])
    ships_to = get_text(re.findall('ships to: (.*)', main_content)[0])

    raw_text = re.findall(r'<div class="container container_large".*?>(.*)',
                          main_content, re.DOTALL)[0]
    text = get_text(raw_text)

    iteminfo = {
        'title': title,
        'itemID': filename.split('?', 1)[0],
        'btc_price': price_btc,
        'description': text,
        'seller': seller,
        'ship_to': ships_to,
        'seller_location': ships_from,
        # presumably the crawl-date directory name -- confirm against the
        # on-disk layout
        'date': filename.split('/')[-3],
        'img': None,
    }
    if img_file and image_name:
        iteminfo['img'] = img_file + '######' + image_name

    return iteminfo

In [7]:
# Number of files whose parsing raised; analyze() increments it.
error_count = 0
# Parsed listings, keyed by file path up to the first '?'; filled by analyze().
item = dict()
# Directory whose sub-directories are scanned for saved item pages.
root_path = '/media/intel/m2/silkroad2/'

In [8]:
def analyze(path):
    """Parse every saved page under ``<path>/items`` into the global ``item``.

    Each file is keyed by its full path up to the first '?'. A file whose key
    is already in ``item`` is skipped, and a ``None`` result from
    ``parse_item`` is not stored.

    Any ``Exception`` raised while handling a file is treated as a parse
    failure: the file name is printed and the global ``error_count`` is
    incremented. Scanning of this directory stops once ``error_count``
    reaches exactly 10000. KeyboardInterrupt and SystemExit are deliberately
    NOT caught, so a long run can still be interrupted (the previous bare
    ``except:`` swallowed them).

    Parameters
    ----------
    path : str
        Directory that contains an ``items`` sub-directory.

    Returns
    -------
    int
        Always 1.
    """
    global error_count
    webpage_folder_path = os.path.join(path, 'items')
    for page_name in os.listdir(webpage_folder_path):
        filename = os.path.join(webpage_folder_path, page_name)
        key = filename.split('?')[0]
        try:
            if key not in item:
                parse_res = parse_item(filename)
                if parse_res is not None:
                    item[key] = parse_res
        except Exception:
            print(filename)
            error_count += 1
            if error_count == 10000:
                break
    return 1

In [9]:
# Full path of every entry directly under root_path, in sorted order.
paths = sorted(os.path.join(root_path, entry) for entry in os.listdir(root_path))

In [10]:
# Parse each directory in turn and report the elapsed wall-clock seconds.
time0 = time.time()
for path in paths:
    if not os.path.isdir(path):
        continue
    analyze(path)
    # analyze() stops early at 10000 failures; stop the outer loop as well.
    if error_count == 10000:
        break
print(time.time() - time0)

236.942967892


In [11]:
# Report how many listings were parsed, then save them all as one JSON object.
print(len(item))
with open('silkroad2.json', 'w') as json_file:
    json.dump(item, json_file)

363968


In [29]:
import pandas as pd
import re
import os
import json
import time
from shutil import copy2
import shutil
import hashlib

In [13]:
# Load the saved listings; orient='index' turns each top-level key into a row.
data = pd.read_json('silkroad2.json', orient='index')

data.shape[0]

363968

In [75]:
# Number of distinct seller names.
len(data['seller'].unique())

1332

In [72]:
# Keep the first row for each (seller, title) pair.
df_unique = data.drop_duplicates(['seller', 'title'])

df_unique.shape[0]

35114

In [73]:
# Drop listings without an image reference, then order rows by index.
df_unique = df_unique[df_unique['img'].notnull()].sort_index()

In [23]:
root_path = '/media/intel/m2/silkroad2/'
# root + date + img. The '######<element id>' suffix of img is still attached
# here; image_seller_path() splits it off.
df_unique['image_location'] = (
    root_path + df_unique['date'].astype(str) + df_unique['img']
)

In [39]:
# Parallel lists, one entry per row of df_unique.
seller_name_list = list(df_unique['seller'])

image_location = list(df_unique['image_location'])

# md5 hex digest of each row's index label, used as the image file stem.
itemID = [hashlib.md5(key).hexdigest() for key in df_unique.index.astype(str)]

In [66]:
target_path = '/media/intel/m2/imgSilkRoad'
try:
    os.makedirs(target_path)
except OSError:
    # An existing directory is fine. Anything else (permissions, a file in
    # the way, ...) is a real problem and must not be hidden.
    if not os.path.isdir(target_path):
        raise
# One directory per seller: every run of non-alphanumeric characters or
# underscores in the seller name is replaced by '00'.
# NOTE(review): names that differ only in such characters map to the same
# directory -- confirm that is acceptable.
seller_paths = [os.path.join(target_path, re.sub(r'[\W_]+', '00', x)) for x in seller_name_list]

In [67]:
def image_seller_path(itemID, image_location, seller_path):
    """Extract the base64 JPEGs embedded in a stylesheet into ``seller_path``.

    Parameters
    ----------
    itemID : str
        Stem for the output file names; image number ``i`` is written to
        ``<seller_path>/<itemID><i>.jpg``.
    image_location : str
        ``'<stylesheet path>######<element id>'``. Only the stylesheet path
        is used.
        NOTE(review): the element id is ignored, so every JPEG found in the
        stylesheet is written for this item -- confirm this is intended.
    seller_path : str
        Existing directory that receives the decoded images.

    Returns
    -------
    int
        Number of images written; 0 when the stylesheet does not exist.
    """
    # Local import: the notebook's import cells do not load base64.
    import base64

    image_location, _ = image_location.split('######')
    if not os.path.isfile(image_location):
        return 0
    with open(image_location) as fp:
        image_files = fp.read()

    # The payload ends at the FIRST closing quote. The previous greedy '(.*)'
    # ran to the last quote on the line and corrupted the data whenever
    # another quoted value followed.
    imgbase64 = re.findall(r"content: url\('data:image/jpeg;base64,([^'\n]*)'",
                           image_files)
    for i, encoded in enumerate(imgbase64):
        image_tar_path = os.path.join(seller_path, itemID + str(i) + '.jpg')
        with open(image_tar_path, "wb") as out:
            out.write(base64.b64decode(encoded))
    return len(imgbase64)

In [68]:
# Write every item's images into its seller's directory, then report the
# elapsed seconds and the total number of images written.
time0 = time.time()
uniq_item_count = len(itemID)
img_count = 0
for i in xrange(uniq_item_count):
    seller_dir = seller_paths[i]
    if not os.path.isdir(seller_dir):
        os.makedirs(seller_dir)
    img_count += image_seller_path(itemID[i], image_location[i], seller_dir)
print("%s %s" % (time.time() - time0, img_count))

14.3879628181 11414


In [71]:
# Remove seller directories that ended up without any file.
for seller_path in set(seller_paths):
    if os.path.isdir(seller_path) and not os.listdir(seller_path):
        os.rmdir(seller_path)