In [1]:
import requests
from bs4 import BeautifulSoup
from tqdm.notebook import tqdm
from time import sleep

from multiprocessing import Lock

In [2]:
log_lock = Lock()

def log_error_code(status_code):
    with log_lock:
        print("WARNING: " + (requests.status_codes._codes[r.status_code])[0], file=sys.stderr)
        

def log(message, lock=None):
    with log_lock:
        print(message, file=sys.stderr)

In [61]:
def get_page(url, n_attempts=5, t_sleep=0.1, headers=None):
    for _ in range(n_attempts):
        r = requests.get(url, headers=headers)
        if r.status_code == 200:
            return r

        log_error_code(r.status_code)
        sleep(t_sleep)
    return None

In [64]:
def get_page_contents(url, n_attempts=5, t_sleep=0.1, headers=None):
    page = get_page(url, n_attempts, t_sleep, headers)
    if page is not None:
        return page.content
    return None

# Задача 
Выполним обкачку сайта https://gg.deals/games/?sort=metascore&type=1

In [4]:
host = 'https://gg.deals'
main_url = host + '/games/?sort=metascore&type=1'

# **Этап 1**

Найдем URLы и ID первых 300 игр

In [5]:
def current_page_game_URLs_and_IDs(content):
    body = BeautifulSoup(content, 'html.parser').find('body')
    badges = body.find_all('div', class_=lambda s: s and s.startswith('with-badges'))

    urls_and_ids = list()
    for badge in badges:
        id = badge['data-container-game-id']
        url = host + badge.find('a')['href']
        urls_and_ids.append((url, id))

    return urls_and_ids

In [6]:
def get_urls_and_ids(npages):
    content = get_page_contents(main_url)
    if content is None:
        log("ERROR: unable to download url " + main_url)
        return None
    urls_and_ids = current_page_game_URLs_and_IDs(content)
    for i in tqdm(range(2, npages + 1)):
        current_url = main_url + f'&page={i}'
        content = get_page_contents(current_url)
        if content is None:
            log("ERROR: unable to download url " + current_url)
            return None
        urls_and_ids.extend(current_page_game_URLs_and_IDs(content))
    return urls_and_ids

In [7]:
ngames = 300
npages = (ngames // 24) + 1

urls_and_ids = get_urls_and_ids(npages)[:300]

  0%|          | 0/12 [00:00<?, ?it/s]

# **Этап 2**

Напишем функцию, позволяющую вытаскивать данные с сайта

In [9]:
import re

    поле "name" – название игры;

In [10]:
def name(body):
    return body.find('ul', class_='breadcrumbs-list').find('a', class_='active').find('span').text

    Поле "image" – ссылка на постер игры;
    Поле "market_url" – ссылка на игру в оригинальном магазине;

In [80]:
def info_image(body):
    info_image = dict()
    info_image['image'] = body.find('img', class_="image-game")['src']
    widget = body.find('a', class_='game-link-widget')
    if widget is not None:
        page = get_page(widget['href'])
        info_image['matket_url'] = page.url
    return info_image

    Поля "wishlist_count", "alert_count", "owners_count" – значения соответвующих счетчиков.

In [12]:
def collection_actions(body):
    collection_actions = body.find('div', class_=lambda s: s and s.startswith('game-collection-actions'))
    info = collection_actions.find_all('div', class_='game-action-wrap')
    names = ['wishlist_count', 'alert_count', 'owners_count']
    counts = [int(k.find('span', class_='count').text) for k in info]
    return {*zip(names, counts)}

    Группы полей, если имеются:
        "release_date" – дата релиза (выхода) игры;
        "developer" – разработчик игры;
        "metacritic_score" – рейтинг Metascore;
        "user_score" – рейтинг Userscore;
        "review_label", "review_positive_pctg", "review_count" – общий пользовательский вердикт (например, Very Positive), доля позитивных обзоров, общее число обзоров на игру;
        "genres" – список жанров игры;
        "tags" – список тегов игры;
        "features" – список особенностей игры.

In [98]:
def info_side(body):
    content = body.find('div', class_='game-info-content')
    details = content.find('div', id='game-info-side')

    info_content = dict()
    for name in ['release', 'developer']:
        section = details.find('div', class_=lambda s: s and s.endswith(name))
        if section is None:
            continue
        info_content[name] = section.find('p').text

    reviews = details.find('div', class_=lambda s: s and s.endswith('reviews'))

    if reviews is not None:
        circles = reviews.find_all('a', class_=lambda s: s and s.startswith('score-circle'))
        for circle in circles:
            name = circle['class'][1].split('-')[1]
            info_content[name] = float(circle.find('span').text)

        steam_bar = reviews.find('a', class_='score-grade')
        if steam_bar is not None:
            reviews_label = steam_bar.find('span')
            info_content['review_label'] = ' '.join(reviews_label.text.split()[:-1])
            info_content['review_positive_pctg'] = float(reviews_label['title'].split()[0][:-1])
            info_content['review_count'] = int(''.join(re.findall(r'\d', reviews_label.find('span').text)))

    for game_info in ['genres', 'tags', 'features']:
        subsection = content.find('div', id=f'game-info-{game_info}')
        if subsection is None:
            continue
        badges = subsection.find_all('a', class_='badge')
        info_content.update({game_info: list()})
        for badge in badges:
            info_content[game_info].append(badge.text)    

    return info_content

    поле "dlcs" – список ссылок на DLC (дополнения) к игре, поле "packs" – список ссылок на Packs (расширенные версии игр); списки могут быть пустыми;

In [104]:
def dlcs_and_packs(body, id):
    info = dict()
    for name in ['dlc', 'packs']:
        section = body.find('section', id=lambda s: s and s.startswith(f'game-{name}'))
        if section is None:
            continue
        info[name] = section.find_all('a', attrs={'class':'full-link', 'href': True})
        if section.find('div', class_='list-show-more') is not None:
            headers = {'x-requested-with': 'XMLHttpRequest'}
            content = get_page_contents(host + f'/games/relations/{id}/?type={name}&offset=4&hideKeyshops=0', headers=headers)
            if content is None:
                log("ERROR: unable to get " + id + f" {name}")
                continue
            s = BeautifulSoup(content, 'html.parser')
            info[name].extend(s.find_all('a', attrs={'class':'full-link', 'href': True}))
        info[name] = [host + x['href'] for x in info[name]]
    return info

    поле "pc_systems" – список поддерживаемых ОС компьютеров;

In [100]:
def pc_systems(body):
    tabs = body.find('div', class_='game-requirements-tabs')
    if tabs is not None:
        return {'pc_systems': [x.text for x in tabs.find_all('a')]}
    return {}

    поле "price_history" – список цен на игру в оригинальных магазинах (голубая линия) за весь имеющийся период.

In [105]:
def price_history(body, id):
    headers = {'x-requested-with': 'XMLHttpRequest'}
    url = host + f'/ru/games/chartHistoricalData/{id}/?hideKeyshops=0'
    page = get_page(url, headers=headers)
    if page is None:
        log("ERROR: unable to get " + id + " price history")
        return {}
    prices = dict(page.json())
    if 'chartData' not in prices:
        return {}
    prices = prices['chartData']['deals']
    for old_dict in prices:
        old_dict['ts'] = old_dict.pop('x') / 1000
        old_dict['price'] = old_dict.pop('y')
        old_dict.pop('name', None)
    return {'price_history': prices}

### **Итоговая функция**

In [106]:
def process_game(url, id):
    content = get_page_contents(url)
    if content is None:
        log("ERROR: unable to download url " + url)
        return {}
    soup = BeautifulSoup(content, 'html.parser')
    head = soup.find('head')
    body = soup.find('body')
    info = {
        'url': url,
        "name": name(body),
    }
    for func in [info_image, collection_actions, info_side, pc_systems]:
        info.update(func(body))
    info.update(dlcs_and_packs(body, id))
    info.update(price_history(body, id))
    return info

# **Этап 3**

Выполним обкачку каждого полученного URLa

In [18]:
import gzip
import json
import codecs

from multiprocessing.dummy import Pool, Queue

In [108]:
queue = Queue()
for elem in urls_and_ids:
    queue.put(elem)

def process_page_wrapper(i):
    with gzip.open('data/part_{:05d}.jsonl.gz'.format(i), mode='wb') as f_json:
        f_json = codecs.getwriter('utf8')(f_json)

        while not queue.empty():
            url, id = queue.get()
            try:
                record = process_game(url, id)
            except:
                log("ERROR: unexpected exception cought")
            record_str = json.dumps(record, ensure_ascii=False)
            print(record_str, file=f_json)

            with lock:
                pbar.update(1)


with Pool(processes=3) as pool, tqdm(total=queue.qsize()) as pbar:
    lock = pbar.get_lock()
    pool.map(process_page_wrapper, range(pool._processes))

  0%|          | 0/300 [00:00<?, ?it/s]

**Готово!** 

Прочитаем первый получившийся json

In [114]:
import json

In [137]:
with gzip.open('data/part_00000.jsonl.gz', mode='r') as f_json:
    first = f_json.readlines()[0]
    answer = json.loads(first)
    print(json.dumps(answer, indent=4))

{
    "url": "https://gg.deals/game/half-life/",
    "name": "Half-Life",
    "image": "https://img.gg.deals/92/dc/9224fd2b8ef1b6a2823c1dc0a7c3123f34f1_307xt176.jpg",
    "matket_url": "https://store.steampowered.com/app/70/",
    "wishlist_count": 478,
    "owners_count": 12103,
    "alert_count": 75,
    "release": "08 Nov 1998",
    "developer": "Valve",
    "metascore": 96.0,
    "userscore": 9.1,
    "review_label": "Overwhelmingly Positive",
    "review_positive_pctg": 96.0,
    "review_count": 46683,
    "genres": [
        "Action"
    ],
    "tags": [
        "FPS",
        "Action",
        "Sci-fi",
        "Singleplayer",
        "Shooter",
        "Multiplayer",
        "First-Person",
        "1990's",
        "Aliens",
        "Story Rich",
        "Silent Protagonist",
        "Adventure",
        "Atmospheric",
        "Moddable",
        "Action-Adventure",
        "Retro",
        "PvP",
        "Gore",
        "Difficult",
        "Linear"
    ],
    "features": [
 