In [1]:
import json
import re
import sys
from multiprocessing.dummy import Pool as ThreadPool
from threading import Lock

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from lxml import etree, html as lhtml
from tqdm.notebook import tqdm

In [2]:
# Shared lock; proceed_page holds it while printing diagnostics so that
# messages from different worker threads do not interleave.
lock = Lock()

In [3]:
#Multi Request
def multi_request(path, repeats=5, headers=None, timeout=30):
    """Send a GET request to ``path``, retrying until a 2xx response arrives.

    Args:
        path: URL to request.
        repeats: Maximum number of attempts. With a value below 1 no request
            is made and None is returned.
        headers: Optional dict of extra HTTP headers (None sends the defaults).
        timeout: Per-attempt timeout in seconds handed to ``requests.get``;
            pass None to wait indefinitely.

    Returns:
        The first response whose status code is in the 2xx range, or None
        when every attempt ended with a non-2xx status or a network error.
    """
    for _ in range(repeats):
        try:
            req = requests.get(path, headers=headers, timeout=timeout)
        except requests.RequestException:
            # Connection errors and timeouts count as a failed attempt and
            # are retried, just like a non-2xx status.
            continue
        if req.status_code // 100 == 2:
            return req
    return None


In [4]:
#Main Parser

# Price-history endpoint prefix; page_parser appends the game id and
# '/?hideKeyshops=0' to it.
p_hist = "https://gg.deals/ru/games/chartHistoricalData/"
# Listing search URL; the title to search for goes right after 'title='.
dlc_list = "https://gg.deals/games/?view=list&title="
# Site root, prepended to the relative hrefs found in the pages.
page_base = "https://gg.deals"

def _to_int(text):
    """Parse an integer from page text, ignoring commas and surrounding whitespace.

    Returns None when the text is not numeric.
    """
    try:
        return int(text.replace(',', '').strip())
    except ValueError:
        return None


def _find_path(node, *steps):
    """Follow a chain of ``find`` calls and return None as soon as one misses.

    Each step is a ``(tag_name, keyword_filters)`` pair passed to ``node.find``.
    """
    for tag_name, filters in steps:
        if node is None:
            return None
        node = node.find(tag_name, **filters)
    return node


def _parse_details(info, game_info):
    """Fill ``game_info`` from the 'game-info-content' block ``info``."""
    # release
    release = _find_path(
        info,
        ('div', {'class_': "game-info-details-section game-info-details-section-release"}),
        ('p', {}),
    )
    if release is not None:
        game_info['release date'] = release.text

    # developer
    dev_box = info.find('div', class_="game-info-details-section game-info-details-section-developer")
    if dev_box is not None:
        game_info['developer'] = [dev.text for dev in dev_box.find_all('p')]

    # meta_score
    meta = _find_path(
        info,
        ('a', {'class_': "score-circle score-metascore"}),
        ('span', {'class_': "overlay"}),
    )
    if meta is not None:
        meta_score = _to_int(meta.text)
        if meta_score is not None:
            game_info['metacritic_score'] = meta_score

    # user_score
    user = _find_path(
        info,
        ('a', {'class_': "score-circle score-userscore"}),
        ('span', {'class_': "overlay"}),
    )
    if user is not None:
        try:
            game_info['user_score'] = float(user.text)
        except ValueError:
            # Non-numeric text: leave the field out instead of aborting.
            pass

    # review_labels
    label = _find_path(
        info,
        ('div', {'class_': "score-col full"}),
        ('span', {'class_': "reviews-label"}),
    )
    if label is not None:
        positive = _to_int(label.get('title', '').partition('%')[0])
        if positive is not None:
            game_info["review_positive_pctg"] = positive

        # Accept any number of thousands separators ("1,234,567").
        count_match = re.search(r'\d[\d,]*', label.text)
        if count_match is not None:
            game_info["review_count"] = int(count_match[0].replace(',', ''))

        game_info["review_label"] = re.search(r'[a-zA-Z ]*', label.text)[0].rstrip()

    # genres, tags, features: each is a container of <a> elements
    link_sections = (
        ('genres', {'class_': "game-info-genres tags-container"}),
        ('tags', {'class_': "game-info-tags tags-container", 'id': "game-info-tags"}),
        ('features', {'class_': "game-info-tags tags-container", 'id': "game-info-features"}),
    )
    for key, filters in link_sections:
        box = info.find('div', **filters)
        if box is not None:
            game_info[key] = [link.text for link in box.find_all('a')]


def page_parser(ans, wbase="https://gg.deals"):
    """Extract game information from a fetched game page.

    Besides parsing ``ans`` itself, this issues further requests through
    ``multi_request``: the listing search (for packs/DLCs), the first
    official-store link (to resolve the final shop URL) and the price-history
    endpoint.

    Args:
        ans: Response object of the game page; its ``text`` and ``url``
            attributes are read.
        wbase: Site root prepended to relative hrefs found on the pages.

    Returns:
        dict that always holds 'url' and 'name' ('name' is None when the
        breadcrumb is missing). The remaining keys are added only when the
        corresponding markup is present and parseable: 'image',
        'wishlist_count', 'alert_count', 'owners_count', 'packs', 'dlcs',
        'market_url', 'pc_systems', 'release date', 'developer',
        'metacritic_score', 'user_score', 'review_positive_pctg',
        'review_count', 'review_label', 'genres', 'tags', 'features',
        'price_history'.
    """
    soup = BeautifulSoup(ans.text, 'html.parser')
    page_url = ans.url

    name_span = _find_path(
        soup,
        ('a', {'class_': 'active', 'itemprop': 'item'}),
        ('span', {}),
    )
    game_info = {
        'url': page_url,
        'name': name_span.text if name_span is not None else None,
    }

    # Image: first entry of the srcset attribute
    img = _find_path(
        soup,
        ('div', {'class_': 'game-card ab-alternative-2'}),
        ('img', {'loading': 'eager'}),
    )
    srcset = img.get('srcset') if img is not None else None
    if srcset is not None:
        game_info['image'] = re.split(r' ', srcset)[0]

    # WishList / Alerts / Owners counters share the same markup shape
    counters = (
        ('wishlist_count', 'wishlisted'),
        ('alert_count', 'alerted'),
        ('owners_count', 'own'),
    )
    for key, prefix in counters:
        count_span = _find_path(
            soup,
            ('div', {'class_': lambda s, p=prefix: s and s.startswith(p)}),
            ('span', {'class_': 'count'}),
        )
        if count_span is not None:
            count = _to_int(count_span.text)
            if count is not None:
                game_info[key] = count

    # DLCs and packs: search the listing by the page slug (the last path
    # segment, whether or not the URL ends with a slash)
    game_name = page_url.rstrip('/').rsplit('/', 1)[-1]
    listing = multi_request(dlc_list + game_name)
    if listing is not None:
        dlc_soup = BeautifulSoup(listing.text, 'html.parser')

        def related_links(kind):
            anchors = dlc_soup.find_all(
                'a',
                class_="ellipsis title",
                href=lambda s: s and game_name in s and kind in s,
            )
            return [wbase + anchor['href'] for anchor in anchors]

        game_info['packs'] = related_links('/pack/')
        game_info['dlcs'] = related_links('/dlc/')

    # market_url: follow the first official-store link to its final URL
    store_link = _find_path(
        soup,
        ('div', {'class_': "offer-section", 'id': "official-stores"}),
        ('a', {'class_': "full-link"}),
    )
    store_href = store_link.get('href') if store_link is not None else None
    if store_href is not None:
        seller = multi_request(wbase + store_href)
        if seller is not None:
            game_info['market_url'] = seller.url

    # pc_systems
    nav = _find_path(
        soup,
        ('div', {'class_': "game-section bg-light game-about"}),
        ('ul', {'class_': "nav"}),
    )
    if nav is not None:
        tabs = nav.find_all('a', id=lambda s: s and s.startswith('requirement-tab-trigger'))
        game_info['pc_systems'] = [tab.text for tab in tabs]

    # Additional info
    info = soup.find('div', class_="game-info-content")
    if info is not None:
        _parse_details(info, game_info)

    # Price history
    actions = soup.find(
        'div',
        class_=lambda s: s and s.startswith("game-collection-actions game-collection-actions"),
    )
    game_id = actions.get('data-game-id') if actions is not None else None
    if game_id is not None:
        history = multi_request(
            p_hist + game_id + '/?hideKeyshops=0',
            headers={'x-requested-with':'XMLHttpRequest '},
        )
        if history is not None:
            try:
                payload = history.json()
            except ValueError:
                # Body is not valid JSON: skip the price history.
                payload = None
            chart = payload.get('chartData') if isinstance(payload, dict) else None
            deals = chart.get('deals') if isinstance(chart, dict) else None
            if deals:
                game_info['price_history'] = [
                    {'ts': offer['x'], 'price': offer['y'], 'shop': offer['shop']}
                    for offer in deals
                ]

    return game_info
    

In [5]:
#Getting 300 top metascore urls
def get_urls(soup):
    """Collect absolute URLs of all 'ellipsis title' links in a parsed listing page."""
    links = []
    for anchor in soup.find_all('a', class_='ellipsis title'):
        links.append(page_base + anchor['href'])
    return links

# Listing pages 1 .. page_num - 1 are scraped; only the first `top_n` links are kept.
page_num = 14
top_n = 300

collected_urls = []

# Page 1 has no `page` query parameter; fetch it through the same retrying helper.
ans = multi_request("https://gg.deals/games/?sort=metascore")
if ans is None:
    print('Page cannot be reached with web-address: https://gg.deals/games/?sort=metascore')
else:
    collected_urls.extend(get_urls(BeautifulSoup(ans.text, 'html.parser')))

for i in tqdm(range(2, page_num)):
    listing_url = f"https://gg.deals/games/?sort=metascore&page={i}"
    ans = multi_request(listing_url)
    if ans is None:
        # Skip an unreachable listing page instead of crashing on `None.text`.
        print(f'Page cannot be reached with web-address: {listing_url}')
        continue
    collected_urls.extend(get_urls(BeautifulSoup(ans.text, 'html.parser')))

# Build the array once rather than re-concatenating on every iteration.
urls = np.asarray(collected_urls[:top_n])

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=12.0), HTML(value='')))




In [6]:
#Processing all pages with multithreading

def proceed_page(url):
    """Download one game page and parse it; intended to run in a worker thread.

    Args:
        url: Address of the game page, or None.

    Returns:
        The dict produced by ``page_parser``, or None when ``url`` is None or
        the page could not be fetched. Both failures are reported on stderr
        while holding the shared ``lock``.
    """
    if url is None:
        with lock:
            print('Url is incorrect', file=sys.stderr)
        return None
    ans = multi_request(url)
    if ans is None:
        with lock:
            print(f'Page cannot be reached with web-address: {url}', file=sys.stderr)
        return None
    page = page_parser(ans)

    #require './jsons' folder. Uncomment if you have one and need to save .json files
    #with open('./jsons/' + page['name'].replace(' ', '_').replace('/','_') + '.json', 'w') as out:
    #    json.dump(page, out)

    return page

# Fetch and parse every collected URL on 20 worker threads. `pool.map` returns
# the results in the order of `urls`; an entry is None when proceed_page failed.
with ThreadPool(20) as pool:
    pages = pool.map(proceed_page, urls)

In [7]:
# Only a subset of the scraped attributes is listed here: those that are
# convenient to show in a dataframe.
df = pd.DataFrame(
    columns=[
        'name',
        'release date',
        'metacritic_score',
        'user_score',
        'alert_count',
        'owners_count',
        'wishlist_count',
        'url',
        'market_url',
    ]
)

In [8]:
# One record per successfully parsed page, restricted to the dataframe columns.
# proceed_page returns None for pages it could not fetch; those are skipped.
records = [
    {key: value for key, value in page.items() if key in df.columns}
    for page in pages
    if page is not None
]

# Build the frame in one go (DataFrame.append was removed in pandas 2.0 and was
# quadratic when used row by row). dtype=object keeps the column dtypes of the
# original empty frame, so integers are not turned into floats by missing values.
df = pd.DataFrame(records, columns=df.columns, dtype=object)

In [9]:
# Preview the first 20 rows of the collected table.
df.head(20)

Unnamed: 0,name,release date,metacritic_score,user_score,alert_count,owners_count,wishlist_count,url,market_url
0,Grand Theft Auto V,14 Apr 2015,96,7.8,663,10123,3446,https://gg.deals/game/grand-theft-auto-v/,https://www.greenmangaming.com/games/grand-the...
1,The Orange Box,10 Oct 2007,96,9.2,39,50,34,https://gg.deals/pack/the-orange-box/,https://store.steampowered.com/sub/469/
2,Half-Life 2,16 Nov 2004,96,9.2,83,17130,519,https://gg.deals/game/half-life-2/,https://store.steampowered.com/app/220/
3,Half-Life,08 Nov 1998,96,9.1,75,12103,477,https://gg.deals/game/half-life/,https://store.steampowered.com/app/70/
4,BioShock,21 Aug 2007,96,8.6,47,15543,406,https://gg.deals/game/bioshock/,https://ru.gamersgate.com/DD-BIO-STEAM-RU-GN/b...
5,Baldur's Gate 2 Complete,24 Sep 2000,95,9.3,1,382,5,https://gg.deals/pack/baldur-s-gate-2-complete/,
6,Portal 2,18 Apr 2011,95,8.9,158,21134,654,https://gg.deals/game/portal-2/,https://store.steampowered.com/app/620/
7,Divinity: Original Sin - Enhanced Edition,27 Oct 2015,94,8.3,532,6544,3183,https://gg.deals/game/divinity-original-sin-en...,https://store.steampowered.com/app/373420/
8,The Elder Scrolls IV: Oblivion Game of the Yea...,16 Jun 2009,94,8.2,76,243,231,https://gg.deals/pack/the-elder-scrolls-iv-obl...,https://store.steampowered.com/app/900883/
9,The Elder Scrolls IV: Oblivion Game of the Yea...,11 Sep 2007,94,8.2,163,6446,1406,https://gg.deals/game/the-elder-scrolls-iv-obl...,https://store.steampowered.com/app/22330/
