In [8]:
from bs4 import BeautifulSoup
from multiprocessing.dummy import Pool as ThreadPool
from woocommerce import API
from lxml import etree

import re
import requests
import os
import json
import pandas as pd
import numpy as np

In [3]:
def get_items_by_category(category_id, retries=5, timeout=30):
    """Return absolute product-page URLs listed on one catalog category page.

    The category page is requested with ``SHOWALL_1=1`` and every
    ``div.catalog-item-title`` link found on it is collected.

    Args:
        category_id: Value substituted into ``https://growtrade.ru/catalog/{}/``.
        retries: Maximum number of download attempts.
        timeout: Per-request timeout in seconds, so a stalled connection
            cannot hang the notebook indefinitely.

    Returns:
        A list of absolute URLs; empty when the page could not be downloaded.
    """
    template = 'https://growtrade.ru/catalog/{}/'
    items = []
    url = template.format(category_id)

    r = None
    for _ in range(retries):
        try:
            r = requests.get(url, params={'SHOWALL_1': '1'}, timeout=timeout)
        except requests.RequestException as exc:
            # A network error counts as a failed attempt instead of aborting
            # the whole retry loop.
            print('request failed for category {0}: {1}'.format(category_id, exc))
            continue
        if r.status_code == 200:
            break

    if r is None or r.status_code != 200:
        status = r.status_code if r is not None else 'no response'
        print('stopping with category {0}: status code = {1}'.format(category_id, status))
        return items

    soup = BeautifulSoup(r.text, 'lxml')
    for tag in soup.find_all('div', class_='catalog-item-title'):
        anchor = tag.find('a')
        href = anchor.get('href') if anchor is not None else None
        if not href:
            # Title block without a usable link: nothing to collect.
            continue
        items.append('https://growtrade.ru' + href)

    return items

In [4]:
def download_item(item_link):
    """Download one product page and save its name, description and images.

    A folder named after the product (with characters that are invalid in
    file names removed) is created in the current working directory and
    filled with ``name.txt``, ``description.txt`` and ``1.jpg``, ``2.jpg``, ...

    Links already present in the global ``processed_items`` list are skipped;
    a successfully handled link is appended to that list.

    Args:
        item_link: Absolute URL of the product page.

    Returns:
        None.
    """
    if item_link in processed_items:
        return

    for _ in range(5):
        r = requests.get(item_link)
        if r.status_code == 200:
            break

    if r.status_code != 200:
        print('stopping with item {0}: status code = {1}'.format(item_link, r.status_code))
        # Every other path returns None; the old `return items` leaked an
        # unrelated global (or raised NameError on a fresh kernel).
        return

    soup = BeautifulSoup(r.text, 'lxml')

    name = soup.find('h1', id='pagetitle').text.strip()

    # Drop characters that cannot appear in a directory name.
    item_name = re.sub('[\\\\/:*?\"<>|]', '', name)

    # Create the folder for the item files; exist_ok lets a re-run (or two
    # products that sanitize to the same name) proceed instead of crashing.
    path = os.path.join(os.getcwd(), item_name)
    print(path)
    os.makedirs(path, exist_ok=True)

    # Explicit UTF-8 so the output does not depend on the platform locale.
    with open(os.path.join(path, 'name.txt'), 'w', encoding='utf-8') as f:
        f.write(name)

    # Description: inner HTML of the description block without its first
    # <h4> and first <div>. A page without a description yields an empty file.
    desc = soup.find('div', class_='catalog-detail-full-desc')
    desc_html = ''
    if desc is not None:
        for tag_name in ('h4', 'div'):
            node = desc.find(tag_name)
            if node is not None:
                node.decompose()
        desc_html = desc.decode_contents()

    with open(os.path.join(path, 'description.txt'), 'w', encoding='utf-8') as f:
        f.write(desc_html)

    # Pictures: saved as 1.jpg, 2.jpg, ... in page order.
    index = 1
    for tag in soup.find_all('div', class_='catalog-detail-image'):
        anchor = tag.find('a')
        href = anchor.get('href') if anchor is not None else None
        if not href:
            continue
        img = requests.get('https://growtrade.ru' + href)
        if img.status_code != 200:
            # Do not store an HTML error page under a .jpg name.
            print('skipping image {0}: status code = {1}'.format(href, img.status_code))
            continue
        with open(os.path.join(path, str(index) + '.jpg'), 'wb') as f:
            f.write(img.content)
        index += 1

    processed_items.append(item_link)


In [5]:
def process_item(item_link, catalog_df, pics_df):
    """Scrape one product page and add its data to the two result tables.

    Links already present in the global ``processed_items`` list are skipped;
    a successfully scraped link is appended to that list.

    Args:
        item_link: Absolute URL of the product page. The third path segment
            from the end is stored as the product's category.
        catalog_df: DataFrame with ``name``, ``description``, ``category`` rows.
        pics_df: DataFrame with ``name``, ``img_url`` rows (one per image).

    Returns:
        ``(catalog_df, pics_df)`` with the new rows added, or the inputs
        unchanged when the link was already processed or the download failed.
    """
    if item_link in processed_items:
        return catalog_df, pics_df

    for _ in range(5):
        r = requests.get(item_link)
        if r.status_code == 200:
            break
    if r.status_code != 200:
        print('stopping with item {0}: status code = {1}'.format(item_link, r.status_code))
        return catalog_df, pics_df

    soup = BeautifulSoup(r.text, 'lxml')

    # Name
    name = soup.find('h1', id='pagetitle').text.strip()

    # Description: inner HTML of the description block without its first
    # <h4> and first <div>; stays None when the page has no description.
    desc = soup.find('div', class_='catalog-detail-full-desc')
    if desc is not None:
        for tag_name in ('h4', 'div'):
            node = desc.find(tag_name)
            if node is not None:
                node.decompose()
        desc = desc.decode_contents()

    # Category
    cat = item_link.split('/')[-3]

    catalog_row = {
        'name': name,
        'description': desc,
        'category': cat
    }

    # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
    catalog_df = pd.concat([catalog_df, pd.DataFrame([catalog_row])], ignore_index=True)

    # Pictures: one row per image link found on the page.
    pics_rows = []
    for tag in soup.find_all('div', class_='catalog-detail-image'):
        anchor = tag.find('a')
        href = anchor.get('href') if anchor is not None else None
        if not href:
            continue
        pics_rows.append({
            'name': name,
            'img_url': 'https://growtrade.ru' + href
        })
    if pics_rows:
        pics_df = pd.concat([pics_df, pd.DataFrame(pics_rows)], ignore_index=True)

    processed_items.append(item_link)

    return catalog_df, pics_df

In [None]:
# Download the category tree from the catalog root page

category = dict()          # child category id -> child category name
category_parent = dict()   # child category id -> parent section name

catalog = 'https://growtrade.ru/catalog/'

for i in range(5):
    r = requests.get(catalog)
    if r.status_code == 200:
        break

if r.status_code != 200:
    # Stop here: parsing an error page would silently produce empty mappings.
    # (The old message referenced `category_id`, which does not exist yet.)
    raise RuntimeError(
        'stopping with catalog {0}: status code = {1}'.format(catalog, r.status_code)
    )

soup = BeautifulSoup(r.text, 'lxml')
for cat in soup.find_all('div', class_='catalog-section-info'):
    parent = cat.find('div', class_='catalog-section-title')
    childs = cat.find('div', class_='catalog-section-childs')
    if parent is None or childs is None:
        # Section without a title or without a child block: nothing to record.
        continue

    parent_name = parent.getText().strip()
    for child in childs.find_all('a'):
        link = child.get('href')
        # The numeric id is the last path segment before the trailing slash.
        child_id = int(link.split('/')[-2])
        category[child_id] = child.getText()
        category_parent[child_id] = parent_name

In [106]:
# Collect the product links of every category

items = list()
for category_id in category:
    cur_items = get_items_by_category(category_id)
    items.extend(cur_items)


# Scrape every product (data and image URLs) and save both tables to Excel files

processed_items = list()

catalog = pd.DataFrame(columns=['name', 'description', 'category'])
pics = pd.DataFrame(columns=['name', 'img_url'])

for item in items:
    catalog, pics = process_item(item, catalog, pics)

pics.to_excel('pics.xlsx')
catalog.to_excel('catalog.xlsx')

In [5]:
# Load the price list (CSV) and the scraped tables (Excel) from disk

price = pd.read_csv('price.csv', encoding='windows-1251', sep=';')
price = price.rename(columns={'Номенклатура': 'name'})
price['name'] = price['name'].str.strip()

# Normalize the numeric text columns: drop non-breaking and regular spaces and
# turn the comma into a dot (presumably thousands separators and a decimal
# comma -- confirm against price.csv).
for column in ('Цена', 'Остаток'):
    price[column] = (
        price[column]
        .str.replace(u'\xa0', u' ', regex=False)
        .str.replace(u' ', u'', regex=False)
        .str.replace(u',', u'.', regex=False)
    )

catalog = pd.read_excel('catalog.xlsx')
catalog['name'] = catalog['name'].str.strip()
# Assign the result back: fillna(inplace=True) on a selected column is chained
# assignment and does not update the frame under pandas Copy-on-Write.
catalog['description'] = catalog['description'].fillna('').str.strip()

# Keep only the products present in both the scraped catalog and the price list.
merged = pd.merge(catalog, price, on='name', how='inner')

pics = pd.read_excel('pics.xlsx')
pics['name'] = pics['name'].str.strip()

In [108]:
# Upload the categories to the store: parent sections first, then the child
# categories attached to them.

parent_id = dict()   # parent section name -> id returned by the store
for cat in set(category_parent.values()):
    data = {
        "name": cat
    }
    r = wcapi.post("products/categories", data)
    if r.status_code != 201:
        # Fail with the API's own message instead of a KeyError on 'id'.
        raise RuntimeError(
            'category upload failed for {0!r}: status code = {1}, response = {2}'.format(
                cat, r.status_code, r.text
            )
        )
    parent_id[cat] = r.json()['id']


category_id = dict()   # source-site category id -> id returned by the store
for cat in category:
    data = {
        "name": category[cat],
        "parent": parent_id[category_parent[cat]]
    }
    r = wcapi.post("products/categories", data)
    if r.status_code != 201:
        raise RuntimeError(
            'category upload failed for {0!r}: status code = {1}, response = {2}'.format(
                category[cat], r.status_code, r.text
            )
        )
    category_id[cat] = r.json()['id']

In [205]:
# Translate the values of the 'category' column through the category_id mapping.
merged.replace({'category': category_id}, inplace=True)
# Assign the result back: fillna(inplace=True) on a selected column is chained
# assignment and does not update the frame under pandas Copy-on-Write.
merged['Цена'] = merged['Цена'].fillna('')

In [251]:
# State for the product upload loop.
df = merged
sku_id = dict()        # SKU -> product id returned by the store
uploaded_sku = set()   # SKUs uploaded so far; the upload loop adds to it

In [274]:
# Create one store product per row of df. Rows whose SKU is already recorded in
# sku_id are skipped; the loop stops at the first response other than 201.
for i in range(len(df)):
    sku = df['Артикул'][i]
    if sku in sku_id:
        continue

    product_name = df['name'][i]
    data = {
        'name': product_name,
        'sku': sku,
        'regular_price': df['Цена'][i],
        'description': df['description'][i],
        'categories': [{'id': int(df['category'][i])}],
        'images': [{'src': url} for url in pics[pics.name == product_name]['img_url']],
        'manage_stock': True,
        'stock_quantity': int(float(df['Остаток'][i])),
    }

    r = wcapi.post("products", data)

    if r.status_code == 201:
        # Remember the created product so the row is skipped next time.
        sku_id[sku] = r.json()['id']
        uploaded_sku.add(sku)
        continue

    print('ERROR: ' + str(r.status_code))
    print('SKU: ' + str(sku))
    print('DATA: ' + str(data))
    print(r.json())
    print('i: ' + str(i))
    break

In [14]:
# WooCommerce REST API client.
# Credentials come from the environment (WC_CONSUMER_KEY / WC_CONSUMER_SECRET)
# instead of being stored in the notebook; a missing variable raises KeyError.
# NOTE(review): the key pair that used to be hard-coded here should be revoked.
wcapi = API(
    url='https://growprofi.ru/',
    consumer_key=os.environ['WC_CONSUMER_KEY'],
    consumer_secret=os.environ['WC_CONSUMER_SECRET'],
    timeout=200
)

In [15]:
# Load the SKU -> product id mapping from sku_id.json.
with open('sku_id.json', 'r') as sku_file:
    sku_id = json.load(sku_file)

In [21]:
len(sku_id)

876

ОБНОВЛЕНИЕ НАЛИЧИЯ:

In [16]:
# Reload the price list and convert the price column to whole numbers.
price = pd.read_csv('price.csv', encoding='windows-1251', sep=';')

price = price.rename(columns={'Номенклатура': 'name'})
price['name'] = price['name'].str.strip()

# Normalize the numeric text columns: drop non-breaking and regular spaces and
# turn the comma into a dot (presumably thousands separators and a decimal
# comma -- confirm against price.csv).
for column in ('Цена', 'Остаток'):
    price[column] = (
        price[column]
        .str.replace(u'\xa0', u' ', regex=False)
        .str.replace(u' ', u'', regex=False)
        .str.replace(u',', u'.', regex=False)
    )

# Missing prices become 0; prices are rounded down to an integer.
# The result is assigned back because fillna(inplace=True) on a selected column
# is chained assignment and does not update the frame under Copy-on-Write.
price['Цена'] = np.floor(price['Цена'].fillna('0').astype('float')).astype('int')

In [17]:
# Push the current price and stock of every known SKU to the store.

# Pack-size pattern ("<N> шт/уп"). It is defined here as well so this cell runs
# on a fresh kernel; keep it identical to the pattern used by the name-cleanup
# cell at the end of the notebook.
regexp = r',*\s*(\d+)\s*шт/уп'

updated = 0

for i in range(price.shape[0]):
    sku = price['Артикул'][i]
    if sku not in sku_id:
        continue

    # When the name carries a pack size N, the price is divided by N and the
    # stock multiplied by N (presumably the price list is per pack while the
    # store sells single pieces -- confirm).
    amnt = 1
    many = re.search(regexp, price['name'][i])
    if many:
        # `or 1` guards against a "0 шт/уп" name causing a division by zero.
        amnt = int(many.group(1)) or 1

    data = {
        'regular_price': str(int(price['Цена'][i] / amnt)),
        'stock_quantity': int(float(price['Остаток'][i]) * amnt)
    }

    product_id = sku_id[sku]

    r = wcapi.put('products/' + str(product_id), data)

    if r.status_code != 200:
        print('ERROR: ' + str(r.status_code))
        print('SKU: ' + str(sku))
        print('DATA: ' + str(data))
        print(r.json())
        print('i: ' + str(i))
        break

    updated += 1

ERROR: 400
SKU: MONT-PFCH
DATA: {'regular_price': '3205', 'stock_quantity': 0}
{'code': 'woocommerce_rest_product_invalid_id', 'message': 'Неверный ID.', 'data': {'status': 400}}
i: 3


РАЗОВАЯ КОРРЕКТИРОВКА ОПИСАНИЯ на основе xml файла

In [75]:
# Overwrite store descriptions with the <Description> text of every child of
# the XML root, matched to a product through its <Id> (used as the SKU).
tree = etree.parse("avito_changed_desc.xml")
root = tree.getroot()

# Iterating the element directly replaces the deprecated getchildren();
# enumerate supplies the position reported in the error output.
for i, ad in enumerate(root):
    sku = ad.find('Id').text
    desc = ad.find('Description').text

    if sku not in sku_id:
        # No store product is known for this SKU: report and move on instead
        # of failing with a KeyError.
        print('SKIPPED (unknown SKU): ' + str(sku))
        continue

    data = {
        'description': desc
    }

    product_id = sku_id[sku]

    r = wcapi.put('products/' + str(product_id), data)

    if r.status_code != 200:
        print('ERROR: ' + str(r.status_code))
        print('SKU: ' + str(sku))
        print('DATA: ' + str(data))
        print(r.json())
        print('i: ' + str(i))
        break

РАЗОВАЯ КОРРЕКТИРОВКА ИМЕН (ШТ/УП) на сайте

In [13]:
# Pack-size marker in product names: optional commas and whitespace, the number
# of pieces (captured), then "шт/уп". Raw string so \s and \d are not treated
# as (invalid) string escape sequences.
regexp = r',*\s*(\d+)\s*шт/уп'

In [109]:
# Remove the pack-size marker (see `regexp`), empty "()" pairs and doubled
# whitespace from the name of every known product, updating only changed names.
for i, product_id in enumerate(sku_id.values()):
    r = wcapi.get('products/{}'.format(product_id))
    if r.status_code != 200:
        # The product could not be read (e.g. a stale id in sku_id): report it
        # instead of failing with a KeyError on 'name'.
        print('ERROR: ' + str(r.status_code))
        print('ID: ' + str(product_id))
        print(r.json())
        print('i: ' + str(i))
        break

    name = r.json()['name']
    edited_name = re.sub(regexp, '', name)
    edited_name = re.sub(r'\(\)', '', edited_name)
    edited_name = re.sub(r'\s\s', ' ', edited_name)

    if name != edited_name:
        data = {
            'name': edited_name
        }
        r = wcapi.put('products/' + str(product_id), data)

        if r.status_code != 200:
            print('ERROR: ' + str(r.status_code))
            print('ID: ' + str(product_id))
            print('DATA: ' + str(data))
            print(r.json())
            print('i: ' + str(i))
            break