In [1]:
import os
import re
import json
import pandas as pd
import time

In [2]:
def get_text(text):
    """Strip HTML tags from ``text`` and normalise its whitespace.

    Carriage returns and newlines become spaces, everything from a ``<`` up to
    the next ``>`` is removed, runs of spaces are collapsed to one and the
    result is trimmed.  If a ``<`` has no closing ``>``, the text after it is
    discarded.

    Parameters
    ----------
    text : str
        HTML fragment.  May be a byte string or an already-decoded text
        string.

    Returns
    -------
    Text string.  Byte-string input is decoded as ASCII with undecodable
    bytes dropped; text input is returned as text without re-decoding.
    """
    # CR and LF are both plain separators; any run of spaces this leaves
    # behind is merged by the collapsing pass at the end.
    text = re.sub('[\r\n]+', ' ', text.strip())

    kept = []
    while text:
        before, lt, rest = text.partition('<')
        kept.append(before)
        if not lt:
            # No more tags: `before` was the remaining text.
            break
        _tag, gt, text = rest.partition('>')
        if not gt:
            # Unterminated tag: drop everything after the stray '<'.
            break
    clean_text = ''.join(kept)

    # Only byte strings need decoding; decoding text again would raise.
    if isinstance(clean_text, bytes):
        clean_text = clean_text.decode('ascii', 'ignore')

    clean_text = re.sub(' +', ' ', clean_text)
    return clean_text.strip()

In [3]:
def _first_clean_match(pattern, html):
    """Return ``get_text`` of the first ``pattern`` capture in ``html``, or None if no match."""
    found = re.findall(pattern, html)
    return get_text(found[0]) if found else None


def parse_item(filename):
    """Extract the listing fields from one saved item page.

    Parameters
    ----------
    filename : str
        Path of the saved HTML page, e.g.
        ``/media/intel/m2/blackbankmarket/2014-12-06/item/1793``.  The third
        '/'-separated component from the end is stored as the record's
        ``date``.

    Returns
    -------
    dict or None
        ``None`` when ``filename`` is not a regular file or the page has no
        main listing section.  Otherwise a dict with the keys ``title``,
        ``itemID`` (the filename itself), ``btc_price`` (the digits as a
        string), ``description``, ``seller``, ``seller_email``,
        ``seller_location``, ``date`` and ``img``.  ``seller_email`` and
        ``seller_location`` are ``None`` when the page does not show them.

    Raises
    ------
    IndexError
        If the listing section is present but the image/title, price, vendor
        or description cannot be located in it.
    """
    if not os.path.isfile(filename):
        return None

    with open(filename) as fp:
        content = fp.read()

    # Everything of interest sits between the "Market" breadcrumb link and the
    # first "heavy tiny inline pad5 light-gray-color" div.
    main_content_pattern = re.compile(
        '<a class=dark href="/market/home">Market</a>(.*?)'
        '<div class="heavy tiny inline pad5 light-gray-color">', re.DOTALL)
    sections = re.findall(main_content_pattern, content)
    if not sections:
        return None
    main_content = sections[0]

    # Two markups are handled for the image tag: a quoted title followed by
    # alt=, or an unquoted title followed by style=.
    img_tmp = re.findall('img src="(.*?)" title="(.*?)" alt=', main_content)
    if not img_tmp:
        img_tmp = re.findall('img src="(.*?)" title=(.*?) style=', main_content)
    image_path, title = img_tmp[0]

    price_pattern = re.compile(r'>Price:.*?>([\.0-9]*?) BTC', re.DOTALL)
    price_btc = re.findall(price_pattern, main_content)[0]

    # Optional field: stays None when the page has no "Ships from:" entry.
    ships_from = _first_clean_match('>Ships from:<.*>(.*?)<', main_content)

    seller_pattern = re.compile('>Vendor:<.*?>.*?<a href="/vendor/(.*?)">', re.DOTALL)
    seller = re.findall(seller_pattern, main_content)[0]

    # Optional field: stays None when no "Contact:" address is shown.
    seller_email = _first_clean_match('>Contact:<.*> (.*?@.*?)<', main_content)

    # The description runs from the item-description div up to the
    # "Item Rating:" label that precedes the shop navigation link.
    text_pattern = re.compile(
        '>Description<.*?<div class="small item-description pad5 round".*?>'
        r'(.*)>Item Rating:<.*<a href="/market/home\?navi=shop', re.DOTALL)
    text = get_text(re.findall(text_pattern, main_content)[0])

    return {
        'title': title,
        'itemID': filename,
        'btc_price': price_btc,
        'description': text,
        'seller': seller,
        'seller_email': seller_email,
        'seller_location': ships_from,
        'date': filename.split('/')[-3],
        'img': image_path,
    }

In [7]:
# Top-level folder of the saved pages; analyze() looks for an 'item' folder
# inside each entry below it.
root_path = '/media/intel/m2/blackbankmarket/'
# Page path -> parsed record, filled in by analyze().
item = dict()
# Running total of pages whose parsing raised an exception.
error_count = 0

In [8]:
def analyze(path, max_errors=100):
    """Parse every saved page in ``path``/item into the global ``item`` dict.

    Each file in the folder is passed to ``parse_item``; non-empty results are
    stored in ``item`` keyed by the file's full path.  A file whose parsing
    raises is printed and counted in the global ``error_count``; scanning of
    this folder stops once that counter reaches ``max_errors``.

    Parameters
    ----------
    path : str
        Folder expected to contain an ``item`` sub-folder.
    max_errors : int, optional
        Global error total at which scanning stops (default 100).

    Returns
    -------
    None when ``path`` has no ``item`` sub-folder, otherwise 1.
    """
    global error_count
    webpage_folder_path = os.path.join(path, 'item')
    if not os.path.isdir(webpage_folder_path):
        return None
    for page_name in os.listdir(webpage_folder_path):
        filename = os.path.join(webpage_folder_path, page_name)
        try:
            parse_res = parse_item(filename)
        except Exception:
            # Exception rather than a bare except, so Ctrl-C and SystemExit
            # still abort the run instead of being counted as parse errors.
            print(filename)
            error_count += 1
            # >= so a counter that is already past the limit still stops.
            if error_count >= max_errors:
                break
        else:
            if parse_res:
                item[filename] = parse_res
    return 1

In [9]:
# Full path of every entry directly under root_path, in sorted order.
paths = sorted(os.path.join(root_path, entry) for entry in os.listdir(root_path))

In [10]:
# Run analyze() on each folder in order and report the elapsed seconds.
# The loop ends early once the shared error counter hits 100.
time0 = time.time()
for path in paths:
    if not os.path.isdir(path):
        continue
    analyze(path)
    if error_count == 100:
        break
print(time.time() - time0)

73.2687330246


In [11]:
# Report how many pages were parsed, then persist them all as one JSON object.
print(len(item))
with open('blackbank.json', 'w') as json_out:
    json.dump(item, json_out)

92368


In [12]:
import pandas as pd
import re
import os
import json
import time
from shutil import copy2
import shutil
import hashlib

In [13]:
# Reload the parsed records; orient='index' turns each top-level JSON key
# into a row label.
data = pd.read_json('blackbank.json', orient='index')

len(data.index)

92368

In [14]:
# Number of distinct sellers across all records.
len(data['seller'].unique())

834

In [15]:
# Keep the first record for each (seller, title) pair.
df_unique = data.drop_duplicates(subset=['seller', 'title'])

len(df_unique.index)

10918

In [16]:
# Drop rows without an image path, then order the rest by row label.
has_image = df_unique['img'].notnull()
df_unique = df_unique[has_image].sort_index()

In [17]:
root_path = '/media/intel/m2/blackbankmarket/'
# On-disk image path = root + the row's date (as text) + the image path from the page.
date_folder = df_unique['date'].astype(str)
df_unique['image_location'] = root_path + date_folder + df_unique['img']

In [18]:
# Three parallel lists, one entry per row of df_unique.
seller_name_list = list(df_unique['seller'])

image_location = list(df_unique['image_location'])

# File name for each copied image: md5 hex digest of the row label.
itemID = [hashlib.md5(row_label).hexdigest() for row_label in df_unique.index.astype(str)]

In [19]:
target_path = '/media/intel/m2/imgBlackBank'
# Create the output root only when it is missing, so a real failure (e.g.
# permission denied) is reported instead of being silently ignored.
if not os.path.isdir(target_path):
    os.makedirs(target_path)
# One folder per seller: every run of non-word characters or underscores in the
# name is replaced by '00'.
# NOTE(review): names that differ only in such characters share a folder.
seller_paths = [os.path.join(target_path, re.sub(r'[\W_]+', '00', x)) for x in seller_name_list]

In [20]:
def image_seller_path(itemID, image_location, seller_path):
    """Copy one image into a seller folder under the name ``itemID``.

    Parameters
    ----------
    itemID : str
        File name to give the copy inside ``seller_path``.
    image_location : str
        Path of the source image.
    seller_path : str
        Destination folder.

    Returns
    -------
    int
        1 if the source was a regular file and was copied, otherwise 0.
    """
    if os.path.isfile(image_location):
        destination = os.path.join(seller_path, itemID)
        copy2(image_location, destination)
        return 1
    return 0

In [21]:
# Copy each unique item's image into its seller's folder, creating folders on
# demand, then report the elapsed seconds and the number of images copied.
time0 = time.time()
uniq_item_count = len(itemID)
img_count = 0
for item_hash, source_image, seller_dir in zip(itemID, image_location, seller_paths):
    if not os.path.isdir(seller_dir):
        os.makedirs(seller_dir)
    img_count += image_seller_path(item_hash, source_image, seller_dir)
print('%s %s' % (time.time() - time0, img_count))

8.05149507523 8115


In [22]:
# Remove seller folders that ended up with no files in them.
for seller_path in set(seller_paths):
    if os.path.isdir(seller_path) and not os.listdir(seller_path):
        os.rmdir(seller_path)