# Notebook for testing data collection with the BoardGameGeek (BGG) XML API

## Setup

In [1]:
import requests
import pandas as pd
import xmltodict
import time


## Import data

In [9]:
# Endpoint of the BGG XML API that returns details for one or more "things".
url_things = 'https://boardgamegeek.com/xmlapi2/thing?'

# The `id` column of the ranks file determines which games are requested.
game_ids = pd.read_csv('data/boardgames_ranks.csv')['id']

# Columns of the details table. Commented-out entries are fields that were
# considered but are deliberately not collected.
game_details_columns = [
    'id',
    #'alt_name', # maybe include, so the searching goes easier
    'description',
    'yearpublished',
    'minplayers',
    'maxplayers',
    'community_best_with',
    'community_recommended_with',
    'playingtime',
    'minplaytime',
    'maxplaytime',
    'minage',
    #'community_minage',
    #'language_dependency', # superfluous? mostly focus on english games
    'boardgamecategories',
    'boardgamemechanics',
    'boardgamefamilies',
    #'boardgameaccessories', # superfluous? probably has high correlation with the family and mechanics
    #'boardgameimplementations', # superfluous?
    'boardgamedesigners',
    'boardgameartists',
    #'boardgamepublishers', # superfluous?
    #'usersrated', # already in the dataset
    #'average', # already in the dataset
    #'bayesaverage', # already in the dataset
    #'ranks', # already in the dataset
    'stddev',
    'median',
    'owned',
    'trading',
    'wanting',
    'wishing',
    'numcomments',
    'numweights',
    'averageweight',
]

try:
    # Resume from the results of a previous run if they were saved.
    game_details = pd.read_csv('data/game_details.csv')
except (FileNotFoundError, pd.errors.EmptyDataError):
    # No usable cache yet: start with an empty table that has the expected
    # columns. Any other error (e.g. a corrupt file) is raised on purpose so
    # that previously collected data is not silently discarded.
    game_details = pd.DataFrame(columns=game_details_columns)
    


(162398,)

In [24]:
# Work out which ids still have to be fetched. Ids are compared as strings:
# ids parsed from the API response are strings while ids read from CSV are
# integers, so comparing the raw values would never match across both types
# and already collected games would be requested again.
all_game_ids = [str(game_id) for game_id in game_ids.to_list()]
fetched_game_ids = {str(game_id) for game_id in game_details['id'].to_list()}
print(len(all_game_ids))
print(len(fetched_game_ids))
# dict.fromkeys removes repeated ids while keeping the order of the ranks file
# (a plain set difference would return the ids in arbitrary order).
game_ids_unupdated = [
    game_id for game_id in dict.fromkeys(all_game_ids)
    if game_id not in fetched_game_ids
]
print(len(game_ids_unupdated))
# Sanity check: removing the fetched ids once more must not change the count.
print(len(set(game_ids_unupdated) - fetched_game_ids))

162398
133413
28985
28985


In [4]:
def _extract_details_from_link_element(link_element, detail_type):
    detail_list = []
    for item in link_element:
        if item['@type'] == detail_type:
            detail_list.append(item['@value'])

    return ','.join(detail_list)
    pass # these infos are all in the same element and can occur multiple times. The different detail_types are: '

In [5]:
def _extract_info_into_dataframe(df, item_dict):
    """Flatten one parsed API ``item`` into a dict of column values.

    Despite the name, nothing is written to ``df``; the parameter is not used
    and the caller is expected to add the returned dict to its table.

    Parameters
    ----------
    df : object
        Unused.
    item_dict : dict
        One parsed ``item`` of the API response, with attributes under
        ``'@...'`` keys.

    Returns
    -------
    dict
        Column name -> value, in the column order of the details table.

    Not collected on purpose: name, usersrated, average, bayesaverage and
    ranks (already in the dataset); alt_name, community_minage,
    language_dependency, boardgameaccessories, boardgameimplementations and
    boardgamepublishers (considered superfluous or a hassle to extract).
    """
    info = {
        'id': item_dict['@id'],
        'description': item_dict['description'],
    }

    # simple elements that carry their content in a ``value`` attribute
    for field in ('yearpublished', 'minplayers', 'maxplayers'):
        info[field] = item_dict[field]['@value']

    # the poll summary is read by position: first result, then second result
    poll_columns = ('community_best_with', 'community_recommended_with')
    for position, column in enumerate(poll_columns):
        info[column] = item_dict['poll-summary']['result'][position]['@value']

    for field in ('playingtime', 'minplaytime', 'maxplaytime', 'minage'):
        info[field] = item_dict[field]['@value']

    # link details: output column -> ``@type`` of the link entries to gather
    link_columns = (
        ('boardgamecategories', 'boardgamecategory'),
        ('boardgamemechanics', 'boardgamemechanic'),
        ('boardgamefamilies', 'boardgamefamily'),
        ('boardgamedesigners', 'boardgamedesigner'),
        ('boardgameartists', 'boardgameartist'),
    )
    for column, link_type in link_columns:
        info[column] = _extract_details_from_link_element(item_dict['link'], link_type)

    # rating statistics all live under statistics -> ratings
    rating_fields = (
        'stddev',
        'median',
        'owned',
        'trading',
        'wanting',
        'wishing',
        'numcomments',
        'numweights',
        'averageweight',
    )
    for field in rating_fields:
        info[field] = item_dict['statistics']['ratings'][field]['@value']

    return info
    

In [6]:
# prepare and execute the api call
form_values = {
    'id':'', # Specifies the id of the thing(s) to retrieve. To request multiple things with a single query, NNN can specify a comma-delimited list of ids. Maximum 20.
    #'type':'', # Specifies that, regardless of the type of thing asked for by id, the results are filtered by the THINGTYPE(s) specified. Multiple THINGTYPEs can be specified in a comma-delimited list.
    #'versions':'1', # Returns version info for the item.
    #'videos':'1', # Returns videos for the item.
    'stats':'1', # Returns ranking and rating stats for the item.
    #'historical':'1', # Not currently supported. Returns historical data over time. See page parameter.
    #'marketplace':'1', # Returns marketplace data.
    #'comments':'1', # Returns all comments about the item. Also includes ratings when commented. See page parameter.
    #'ratingcomments':'1', # Returns all ratings for the item. Also includes comments when rated. See page parameter. The ratingcomments and comments parameters cannot be used together, as the output always appears in the <comments> node of the XML; comments parameter takes precedence if both are specified. Ratings are sorted in descending rating value, based on the highest rating they have assigned to that item (each item in the collection can have a different rating).
    #'page':'1', # Defaults to 1, controls the page of data to see for historical info, comments, and ratings data.
    #'pagesize':'10', # Set the number of records to return in paging. Minimum is 10, maximum is 100.
    #'from':'', # Not currently supported.
    #'to':'' # Not currently supported.
}
max_ids_per_call = 20
retry_delay_seconds = 5         # pause after any failed request
request_timeout_seconds = 60    # never wait forever on a hanging connection
max_consecutive_failures = 20   # stop instead of retrying endlessly
counter = 0                     # number of 429 (rate limit) responses so far
consecutive_failures = 0
new_records = []                # extracted rows, merged into game_details once

try:
    while game_ids_unupdated:
        ids_batch = game_ids_unupdated[:max_ids_per_call]
        # update the API parameter to get the current id's
        form_values.update({'id': ",".join(map(str, ids_batch))})
        # get the info from BGG
        try:
            response = requests.get(url_things, params=form_values,
                                    timeout=request_timeout_seconds)
            status_code = response.status_code
        except requests.RequestException as error:
            print('Request failed:', error)
            status_code = None
        # The batch leaves the front of the queue only after the request has
        # returned, so interrupting a running request does not lose its ids.
        del game_ids_unupdated[:max_ids_per_call]

        if status_code != 200:
            # Re-queue the ids at the end. extend() keeps the queue a flat
            # list of ids; append() would insert one nested list, which later
            # breaks the ",".join above.
            game_ids_unupdated.extend(ids_batch)
            if status_code == 429:
                print('Too many requests!', counter)
                counter += 1
            elif status_code is not None:
                print('Unexpected status code:', status_code)
            consecutive_failures += 1
            if consecutive_failures >= max_consecutive_failures:
                print('Stopping after', consecutive_failures, 'failed requests in a row.')
                break
            time.sleep(retry_delay_seconds)
            continue

        consecutive_failures = 0
        info = xmltodict.parse(response.text)
        # The response may contain fewer items than requested; a single item
        # is parsed to a dict instead of a list, and none at all to nothing.
        items = (info.get('items') or {}).get('item') or []
        if isinstance(items, dict):
            items = [items]

        for item in items:
            try:
                info_current_game = _extract_info_into_dataframe(game_details, item)
                # store ids as integers, like the ids loaded from CSV
                info_current_game['id'] = int(info_current_game['id'])
            except (KeyError, IndexError, TypeError, ValueError) as error:
                # Only errors caused by an unexpectedly shaped item are
                # skipped; KeyboardInterrupt and real bugs still surface.
                print('Extraction failed.', item.get('@id'), repr(error))
                continue
            new_records.append(info_current_game)
finally:
    # Merge once instead of concatenating per game (which copies the whole
    # table every time). Runs on interruption too, so progress is kept.
    if new_records:
        new_details = pd.DataFrame(new_records)
        if game_details.empty:
            game_details = new_details
        else:
            game_details = pd.concat([game_details, new_details], ignore_index=True)


Too many requests! 0
Too many requests! 1
Too many requests! 2
Too many requests! 3
Too many requests! 4
Too many requests! 5
Too many requests! 6
Too many requests! 7
Too many requests! 8
Too many requests! 9
Too many requests! 10
Too many requests! 11
Too many requests! 12
Too many requests! 13
Too many requests! 14
Too many requests! 15
Too many requests! 16
Too many requests! 17
Too many requests! 18
Too many requests! 19
Too many requests! 20
Extraction failed.
Too many requests! 21
Extraction failed.
Too many requests! 22
Too many requests! 23
Too many requests! 24
Too many requests! 25
Too many requests! 26
Too many requests! 27
Too many requests! 28
Too many requests! 29
Too many requests! 30
Too many requests! 31
Too many requests! 32
Too many requests! 33
Too many requests! 34
Too many requests! 35
Too many requests! 36
Too many requests! 37
Too many requests! 38
Too many requests! 39
Too many requests! 40
Too many requests! 41
Too many requests! 42
Extraction failed.
Extract

KeyboardInterrupt: 

In [7]:
# (rows, columns) of the details table at this point in the notebook
game_details.shape

(193078, 25)

In [8]:
# The same game can be present more than once (the table has had more rows
# than unique ids), e.g. when a game is fetched again in a later run. Only the
# most recent row per id is written. Ids are compared as strings so that
# integer ids loaded from CSV and string ids parsed from the API count as the
# same game. game_details itself is left untouched.
is_older_duplicate = game_details['id'].astype(str).duplicated(keep='last')
game_details.loc[~is_older_duplicate].to_csv('data/game_details.csv', index=False)