In [1]:
import requests
from multiprocessing import Pool
import json
from lxml import etree, html as lhtml
import threading
import time
from tqdm import tqdm
import datetime
import pandas as pd
import threading

In [2]:
# Lock that get_page holds around its failure print().
mutex = threading.Lock()

In [3]:
def get_page(url, n_attempts=5, t_sleep=1, header=None, timeout=30):
    """Download ``url`` with retries and return the response, or None on failure.

    Parameters
    ----------
    url : str
        Address to fetch with an HTTP GET.
    n_attempts : int
        Maximum number of GET requests to issue.
    t_sleep : float
        Seconds to wait between two consecutive attempts.
    header : dict or None
        Optional request headers, passed to ``requests.get`` as ``headers``.
    timeout : float
        Per-request timeout in seconds, so a stalled connection cannot block
        the caller indefinitely.

    Returns
    -------
    The ``requests`` response of the first attempt answered with status 200,
    or None (after printing a message) when every attempt failed.
    """
    for attempt in range(n_attempts):
        try:
            r = requests.get(url, headers=header, timeout=timeout)
        except requests.RequestException:
            # Connection errors and timeouts count as a failed attempt.
            r = None
        if r is not None and r.status_code == 200:
            return r
        # No point in waiting once the last attempt has been used up.
        if attempt < n_attempts - 1:
            time.sleep(t_sleep)
    with mutex:
        print("Url: ", url, " isn't downloaded")
    return None

# 1) Get the URLs of the cards

In [4]:
# Number of card URLs to collect, and how many consecutive listing pages may
# fail to download or contain no card links before the crawl gives up
# (otherwise the loop would never end once the listing runs out).
N_CARDS = 300
MAX_EMPTY_PAGES = 3

cards_url = []
page_num = 1
empty_pages = 0
query = '//div[@class="list-items shadow-box-small-lighter"]//a[@class="full-link"]'
while len(cards_url) < N_CARDS and empty_pages < MAX_EMPTY_PAGES:
    url = f"https://gg.deals/games/?sort=metascore&type=1&page={page_num}"
    response = get_page(url)
    if response is None:
        # get_page already reported the failure; move on to the next page.
        empty_pages += 1
        page_num += 1
        continue
    tree = lhtml.fromstring(response.text)
    links = tree.xpath(query)
    empty_pages = 0 if links else empty_pages + 1
    # Take only as many links as are still missing to reach N_CARDS.
    for link in links[:N_CARDS - len(cards_url)]:
        cards_url.append("https://gg.deals" + link.attrib['href'])
    print("Page_num: ", page_num, " len(pages) = ", len(cards_url))
    page_num += 1

if len(cards_url) < N_CARDS:
    print("Stopped early: only", len(cards_url), "of", N_CARDS, "card urls collected")

Page_num:  1  len(pages) =  48
Page_num:  2  len(pages) =  96
Page_num:  3  len(pages) =  144
Page_num:  4  len(pages) =  192
Page_num:  5  len(pages) =  240
Page_num:  6  len(pages) =  288
Page_num:  7  len(pages) =  300


# 2) Download the card pages

In [5]:
# Fetch the HTML of every card page; pages that get_page could not
# download (it returned None) are left out of the list.
cards = []
for card_url in tqdm(cards_url):
    response = get_page(card_url)
    if response is None:
        continue
    cards.append(response.text)

100%|████████████████████████████████████████████████████████████████████████████████| 300/300 [03:05<00:00,  1.62it/s]


# 3) Parse cards

In [6]:
# XPath expressions, keyed by field name, that build_dict evaluates against a
# parsed card page.
queries = {
    'name': '//div[@class="game-heading"]//h1/text()',
    'image': '//div[@class="game-info-image"]//img/attribute::src',
    'release_date': '//div[@class="game-info-details-section game-info-details-section-release"]//p/text()',
    'developer': '//div[@class="game-info-details-section game-info-details-section-developer"]//p/text()',
    'metacritic_score': '//a[@class="score-circle score-metascore"]//span[@class="overlay"]/text()',
    'user_score': '//a[@class="score-circle score-userscore"]//span[@class="overlay"]/text()',
    'review_label': '//span[starts-with(@class, "reviews-label")]/text()',
    # Selects the element itself; build_dict reads its "title" attribute.
    'review_positive_pctg': '//a[@class="score-grade"]//span[starts-with(@class, "reviews-label")]',
    'review_count': '//span[starts-with(@class, "reviews-label")]/span/text()',
    'genres': '//div[@class="tags-list badges-container"]/a/text()',
    'tags': '//div[@class="game-info-tags game-info-badges"]/h4[text()="Tags"]/..//a/text()',
    'features': '//div[@class="game-info-tags game-info-badges"]/h4[text()="Features"]/..//a/text()',
    'platforms': '//div[@class="game-info-details-section game-info-details-section-platforms"]//svg/attribute::title',
    'wishlist_count': '//div[@class="game-header-box"]//span[text()="Wishlist it"]/../..//span[@class="count"]/text()',
    'alert_count': '//div[@class="game-header-box"]//span[text()="Create alert"]/../..//span[@class="count"]/text()',
    'owners_count': '//div[@class="game-header-box"]//span[text()="Own it"]/../..//span[@class="count"]/text()',
    'market_url': '//a[@class="game-link-widget"]/attribute::href',
    'dlcs': (
        '//section[@class="tab-menu-section offer-section"]//*[text()=" DLCs"]/../..'
        '//a[@class="full-link"]/attribute::href'
    ),
    'packs': (
        '//section[@class="tab-menu-section offer-section"]//*[text()=" Packs"]/../..'
        '//a[@class="full-link"]/attribute::href'
    ),
}

In [7]:
def _first(tree, query):
    """Return the first result of evaluating XPath ``query`` on ``tree``, or None if there is none."""
    found = tree.xpath(query)
    return found[0] if found else None


def _delta_encode(points):
    """Convert chart points into price-history entries.

    Each entry keeps the point's keys except 'x', 'y' and 'name', and gains
    'price' (the point's 'y') and 'ts': the point's own 'x' for the first
    entry, and the difference to the previous point's 'x' for all others.
    """
    history = []
    previous_x = None
    for point in points:
        entry = {k: v for k, v in point.items() if k not in ('x', 'y', 'name')}
        entry['ts'] = point['x'] if previous_x is None else point['x'] - previous_x
        entry['price'] = point['y']
        previous_x = point['x']
        history.append(entry)
    return history


def _fetch_price_history(tree):
    """Download and delta-encode the price chart of the card in ``tree``; None if unavailable."""
    holder = _first(tree, '//*[starts-with(@data-url, "/collection/single/?id")]')
    if holder is None:
        return None
    # The game id is whatever follows the last '=' of the data-url attribute.
    game_id = holder.attrib['data-url'].split('=')[-1]
    response = get_page(
        f"https://gg.deals/us/games/chartHistoricalData/{game_id}/?showKeyshops=1",
        header={'x-requested-with': 'XMLHttpRequest'},
    )
    if response is None:
        return None
    try:
        payload = response.json()
    except ValueError:
        # Body was not valid JSON.
        return None
    if not isinstance(payload, dict) or 'chartData' not in payload:
        return None
    chart = payload['chartData']
    deals = chart.get('deals') if isinstance(chart, dict) else None
    return _delta_encode(deals or [])


def build_dict(card):
    """Parse one card page into a dict of game attributes.

    Parameters
    ----------
    card : str
        HTML text of a game card page.

    Returns
    -------
    dict
        Keys are field names from ``queries`` plus 'price_history'. Fields that
        are missing from the page, or whose value is empty, are omitted.
        A price-history request is issued through ``get_page``.
    """
    dict_ = {}
    tree = lhtml.fromstring(card)

    name = _first(tree, queries['name'])
    if name is not None:
        # Drops the first 4 and last 3 characters of the heading text;
        # presumably a fixed prefix/suffix around the title. TODO confirm
        # against the page markup.
        dict_['name'] = name[4:-3]

    # Fields stored as the first XPath match, in output order.
    for field in ('image', 'market_url', 'wishlist_count', 'alert_count',
                  'owners_count', 'release_date', 'developer',
                  'metacritic_score', 'user_score', 'review_label'):
        value = _first(tree, queries[field])
        if value is not None:
            dict_[field] = value

    label_node = _first(tree, queries['review_positive_pctg'])
    if label_node is not None:
        title = label_node.attrib.get('title')
        if title:
            # Keep only what precedes the first '%' of the title attribute.
            dict_['review_positive_pctg'] = title.split('%')[0]

    review_count = _first(tree, queries['review_count'])
    if review_count is not None:
        # Strip surrounding whitespace, then the first and last character.
        dict_['review_count'] = review_count.strip()[1:-1]

    # Fields stored as the full list of XPath matches.
    for field in ('genres', 'tags', 'features', 'platforms', 'dlcs', 'packs'):
        dict_[field] = tree.xpath(queries[field])

    history = _fetch_price_history(tree)
    if history is not None:
        dict_['price_history'] = history

    # Drop every field whose value is empty.
    return {k: v for k, v in dict_.items() if v}

In [8]:
# Run build_dict over every downloaded card page, in order, with a progress bar.
parsed = [build_dict(card) for card in tqdm(cards)]

100%|████████████████████████████████████████████████████████████████████████████████| 300/300 [02:23<00:00,  2.09it/s]


In [9]:
import gzip
import json
import codecs

from multiprocessing.dummy import Pool, Queue

import os
from queue import Empty  # raised by get_nowait() once the shared queue is drained

# Number of worker threads; also the number of output part files.
N_WORKERS = 8

queue = Queue()

for card in cards:
    queue.put(card)


def process_page_wrapper(i):
    """Drain cards from the shared queue into one gzip-compressed JSON-lines file.

    Parameters
    ----------
    i : int
        Worker index, used to name the output file ``data/part_<i>.jsonl.gz``.

    Each card taken from ``queue`` is parsed with ``build_dict`` and written as
    one JSON line. A card whose processing raises is reported with ``print``
    and skipped; the progress bar advances either way.
    """
    with gzip.open('data/part_{:05d}.jsonl.gz'.format(i), mode='wt',
                   encoding='utf8', newline='\n') as f_json:
        while True:
            # get_nowait() instead of empty()+get(): another worker may take
            # the last item between the two calls, leaving get() blocked forever.
            try:
                card = queue.get_nowait()
            except Empty:
                break
            try:
                record = build_dict(card)
                record_str = json.dumps(record, ensure_ascii=False)
                print(record_str, file=f_json)
            except Exception as e:
                print(e)
            with lock:
                pbar.update(1)


# The workers open files under data/, so make sure the directory exists.
os.makedirs('data', exist_ok=True)

with Pool(processes=N_WORKERS) as pool, tqdm(total=queue.qsize()) as pbar:
    lock = pbar.get_lock()
    # map() blocks until every worker has returned, so no extra wait is needed.
    pool.map(process_page_wrapper, range(N_WORKERS))
    pool.close()
    pool.join()

100%|████████████████████████████████████████████████████████████████████████████████| 300/300 [00:23<00:00, 12.63it/s]


In [10]:
# Spot-check: display one parsed record.
parsed[50]

{'name': 'Crusader Kings III',
 'image': 'https://img.gg.deals/15/37/f17d91ce13bd6cf6c3b2a62b8524b7670e8f_307xt176.jpg',
 'market_url': 'https://gg.deals/redirect/51d0f5757a6a4112d131ea20389baee07cfacc08/?utm_source=games%2Fsingle',
 'wishlist_count': '4311',
 'alert_count': '542',
 'owners_count': '2848',
 'release_date': '01 Sep 2020',
 'developer': 'Paradox Development Studio',
 'metacritic_score': '91',
 'user_score': '8.3',
 'review_label': 'Very Positive',
 'review_positive_pctg': '93',
 'review_count': '48,988',
 'genres': ['RPG', 'Simulation', 'Strategy'],
 'tags': ['Strategy',
  'RPG',
  'Simulation',
  'Grand Strategy',
  'Medieval',
  'Historical',
  'Political',
  'War',
  'Management',
  'Character Customization',
  'Singleplayer',
  'Choices Matter',
  'Multiplayer',
  'Sandbox',
  'Real-Time with Pause',
  'Moddable',
  'Sexual Content',
  'PvP',
  'Nudity',
  'Dating Sim'],
 'features': ['Single-player',
  'Online PvP',
  'Steam Achievements',
  'Steam Trading Cards',
 

In [11]:
# Spot-check: display another parsed record.
parsed[167]

{'name': 'Pillars of Eternity II: Deadfire',
 'image': 'https://img.gg.deals/da/c9/2742ea6c8be597620effce70b4c1813c201a_307xt176.jpg',
 'market_url': 'https://gg.deals/redirect/429b749aeccfbf24f6a51fe46058cfe5aa5dfde1/?utm_source=games%2Fsingle',
 'wishlist_count': '4550',
 'alert_count': '534',
 'owners_count': '4452',
 'release_date': '08 May 2018',
 'developer': 'Obsidian Entertainment',
 'metacritic_score': '88',
 'user_score': '7.8',
 'review_label': 'Very Positive',
 'review_positive_pctg': '86',
 'review_count': '10,265',
 'genres': ['RPG'],
 'tags': ['RPG',
  'Party-Based RPG',
  'Isometric',
  'Story Rich',
  'Fantasy',
  'Singleplayer',
  'CRPG',
  'Character Customization',
  'Choices Matter',
  'Open World',
  'Real-Time with Pause',
  'Turn-Based',
  'Adventure',
  'Pirates',
  'Nudity',
  'Exploration',
  'Great Soundtrack',
  'Atmospheric',
  'Romance',
  'Female Protagonist'],
 'features': ['Single-player',
  'Steam Achievements',
  'Steam Trading Cards',
  'Steam Works