In [1]:
import requests
import html
from bs4 import BeautifulSoup
from re import search
import time
import warnings
import json
import csv

# Silence every warning for the whole session.
# NOTE(review): presumably added to hide the warning that requests emits for
# verify=False in get_response — confirm; a blanket 'ignore' also hides any
# other warning raised by the libraries used below.
warnings.filterwarnings('ignore')

In [2]:
def get_cards(soup):
    """Extract the card list of a deck from its parsed page.

    Every ``<tr>`` in ``soup`` is inspected; rows that contain a
    ``<td class="col-name">`` cell are treated as cards. The card name is the
    stripped text of the first ``<a>`` inside that cell, and the number of
    copies is the integer that follows ``"× "`` in the cell's markup.

    Args:
        soup: Parsed deck page exposing BeautifulSoup's ``find_all``/``find`` API.

    Returns:
        dict: card name (str) -> number of copies (int). Extraction is
        all-or-nothing: if any card row cannot be parsed, an empty dict is
        returned rather than a partial list.
    """
    cards = {}
    try:
        # The whole document contains exactly one table; its rows are the cards.
        for row in soup.find_all('tr'):
            name_cell = row.find('td', class_='col-name')
            if name_cell is None:
                continue  # header or other non-card row
            # Take only the digits after "× " so trailing markup on the same
            # line does not break the int() conversion.
            count = search(r'(?<=× )[ \t]*\d+', str(name_cell)).group(0)
            cards[name_cell.find('a').string.strip()] = int(count)
    except Exception:
        # Deliberately best-effort, but unlike a bare ``except`` this lets
        # KeyboardInterrupt / SystemExit stop a long-running scrape.
        return {}
    return cards


In [3]:
def _stat_count(deck_stats, css_class):
    """Return the leading integer of one deck-stats ``<li>``, or 0 if it cannot be read."""
    try:
        return int(deck_stats.find('li', class_=css_class).string.split()[0])
    except Exception:
        # Best-effort: a missing or unparseable entry counts as zero. Catching
        # Exception (not a bare except) keeps Ctrl-C working during a scrape.
        return 0


def get_data(soup):
    """Collect the fields of one deck from its parsed page.

    Args:
        soup: Parsed deck page exposing BeautifulSoup's ``find``/``find_all`` API.

    Returns:
        dict with the keys ``regim``, ``rating``, ``Class``, ``minion_count``,
        ``spell_count``, ``weapon_count``, ``craft_cost``, ``creation_date``,
        ``deck_type``, ``deck_archetype``, ``cards`` and ``code``.

    Raises:
        AttributeError, IndexError, ValueError: if a required element (rating,
            deck code, breadcrumb, craft cost, creation date, deck type or
            archetype) is missing or malformed. Only the three card-type
            counts fall back to 0.
    """
    # Format: the page carries <p class="is-std"> only for standard decks.
    regim = 'standart' if soup.find('p', class_='is-std') else 'wild'

    # Rating and deck code live in data-* attributes, read from the tag's markup.
    rating_string = str(soup.find('div', class_='deck-rating-form'))
    rating = int(search(r'data-rating-sum="([^"]+)"', rating_string).group(1))

    code_string = str(soup.find('button', class_='copy-button button'))
    code = search(r'data-clipboard-text="([^"]+)"', code_string).group(1)

    # The hero class is the name of the third breadcrumb item.
    breadcrumb = soup.find_all('li', class_='b-breadcrumb-item')
    Class = breadcrumb[2].find('span', itemprop = 'name').string

    deck_stats = soup.find('ul', class_='t-deck-stats')
    deck_details = soup.find('ul', class_='deck-details')

    minion_count = _stat_count(deck_stats, 't-deck-card-count-minions')
    spell_count = _stat_count(deck_stats, 't-deck-card-count-spells')
    weapon_count = _stat_count(deck_stats, 't-deck-card-count-weapons')

    craft_cost = int(deck_details.find('span', class_ ='craft-cost').string)
    # First whitespace-free token after "Created: <span>" in the details markup.
    creation_date = search(r'(?<=Created: <span>)[^\s]+', str(deck_details)).group(0)
    # Two spans share the class 'deck-type': the first holds the type itself,
    # the second wraps a link whose text is the archetype.
    deck_type = deck_details.find('span', class_ = 'deck-type').string
    deck_archetype = deck_details.find_all('span', class_ = 'deck-type')[1].find('a').string

    cards = get_cards(soup)

    return {
        'regim': regim,
        'rating': rating,
        'Class': Class,
        'minion_count': minion_count,
        'spell_count': spell_count,
        'weapon_count': weapon_count,
        'craft_cost': craft_cost,
        'creation_date': creation_date,
        'deck_type': deck_type,
        'deck_archetype': deck_archetype,
        'cards': cards,
        'code': code,
    }


In [4]:
def get_response(deck_url, wait_time = 0):
    """Download a page from hearthpwn.com and return it parsed.

    On HTTP 429 the request is retried after a pause that grows by one second
    with every attempt (``wait_time + 1``, ``wait_time + 2``, ...). Any other
    status code is treated as final and its body is parsed as-is.

    Args:
        deck_url: Path appended verbatim to ``https://www.hearthpwn.com/``.
        wait_time: Seconds already waited by the previous attempt; the first
            retry sleeps ``wait_time + 1`` seconds.

    Returns:
        BeautifulSoup: the page parsed with the ``lxml`` parser.
    """
    home = 'https://www.hearthpwn.com/'
    # Retry in a loop rather than recursively, so each throttled response is
    # closed before sleeping and the call stack does not grow with retries.
    while True:
        # NOTE(review): verify=False disables TLS certificate verification —
        # confirm this is still required.
        with requests.get(home + deck_url, verify=False) as deck:
            status_code = deck.status_code
            page_content = deck.text if status_code != 429 else None

        if status_code != 429:
            soup = BeautifulSoup(page_content, 'lxml')
            time.sleep(0.4)  # short pause between consecutive deck requests
            return soup

        print('error 429', end = ' ')
        wait_time += 1
        time.sleep(wait_time)


In [5]:
# Load the card catalogue; every entry is read through its 'name' field.
with open('allcards.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

# Collapse duplicate names (the data does contain them) before sorting.
unique_names = {card['name'] for card in data}
# 'Rank 2' / 'Rank 3' cards cannot be used to build a deck, so they are dropped.
card_names = [name for name in sorted(unique_names)
              if not ('Rank 2' in name or 'Rank 3' in name)]

In [6]:
# Deck fields written as the leading CSV columns, in this order.
keys = ['regim', 'rating', 'Class', 'minion_count', 'spell_count', 'weapon_count',
                      'craft_cost', 'creation_date', 'deck_type', 'deck_archetype', 'code']

def append_to_csv(filename, deck_list, card_names):
    """Append one CSV row per deck to ``filename``.

    Each row consists of the deck's ``keys`` fields followed by one column per
    entry of ``card_names`` holding the number of copies of that card in the
    deck (0 if absent). Cards of a deck that are not listed in ``card_names``
    are ignored.

    Args:
        filename: Path of the CSV file; opened in append mode.
        deck_list: Iterable of deck dicts holding every field in ``keys`` plus
            a ``'cards'`` mapping of card name -> count.
        card_names: Sequence of card names defining the card columns.
    """
    # Map each name to its first position once, instead of scanning the list
    # twice (``in`` + ``.index``) for every card of every deck. setdefault keeps
    # the first occurrence, which is what list.index would return.
    column_of = {}
    for position, name in enumerate(card_names):
        column_of.setdefault(name, position)

    with open(filename, mode = 'a', newline = '', encoding = 'utf-8') as file:
        writer = csv.writer(file)
        for deck in deck_list:
            row = [0]*len(card_names)
            for card_name, count in deck['cards'].items():
                index = column_of.get(card_name)
                if index is not None:
                    row[index] = count
            writer.writerow([deck[key] for key in keys] + row)
        

In [None]:
def init_csv(filename, headers):
    """Create a new CSV file that contains only the header row.

    The file is opened in exclusive-creation mode, so an existing data file is
    never truncated by an accidental re-run.

    Args:
        filename: Path of the CSV file to create.
        headers: Sequence of column names to write as the first row.

    Raises:
        FileExistsError: if ``filename`` already exists.
    """
    with open(filename, mode='x', newline='', encoding='utf-8') as file:
        csv.writer(file).writerow(headers)

# One-time initialisation of the data file; deliberately not executed here.
# To create it, call init_csv('DataTable.csv', keys + card_names) once.

In [None]:
# Listing pages downloaded in this run; the range is edited by hand between runs.
page_numbers = range(4595, 4700)
pages = ['https://www.hearthpwn.com/decks?page=' + str(i) + '&sort=datemodified' for i in page_numbers]
deck_list = []
for page_number, page in zip(page_numbers, pages):
    page_req = requests.get(page)

    # Parse the raw response text: the HTML parser decodes entities itself, and
    # unescaping the whole document beforehand can turn escaped text into
    # markup. The parser is named explicitly, matching get_response.
    soup = BeautifulSoup(page_req.text, 'lxml')
    page_content = soup.find_all('td', class_='col-name')

    # Each deck row links to its page; cells without a usable link are skipped.
    links = (item.find('a') for item in page_content)
    url_list = [link['href'] for link in links if link is not None and link.get('href')]

    # Decks are added to deck_list one whole listing page at a time.
    fresh_deck_list = [get_data(get_response(deck_url)) for deck_url in url_list]
    deck_list += fresh_deck_list

    print(page_number, end= ' ')  # progress: number of the page just finished

    time.sleep(2)  # pause between listing pages


In [8]:
# Append the decks collected in deck_list to the main data file. append_to_csv
# opens the file in append mode, so running this cell again with the same
# deck_list writes the same rows a second time.
append_to_csv('DataTable.csv', deck_list, card_names)

Добавление карт делается "руками": сначала выставляются страницы, которые надо скачать, затем скачанные данные дописываются в конец основного файла, после чего выставляются следующие страницы и т. д.