# This is my Notebook to test collecting the data with the BGG API

## Setup

In [1]:
import requests
import pandas as pd
import xmltodict
import time


## Import data

In [2]:
url_things = 'https://boardgamegeek.com/xmlapi2/thing?'

all_games = pd.read_csv('data/boardgames_ranks.csv')
game_ids = all_games.query('rank > 0')['id'].sort_values()
game_ids

452           1
5178          2
273           3
6899          4
344           5
          ...  
11569    432536
12237    433099
22832    433444
12065    435979
4605     436126
Name: id, Length: 27925, dtype: int64

In [3]:

try:
    game_details = pd.read_csv('data/game_details.csv')
except:
    game_details = pd.DataFrame(columns=['game_id',
                                        #'alt_name', # maybe include, so the searching goes easier
                                        'description',
                                        'yearpublished',
                                        'minplayers',
                                        'maxplayers',
                                        'community_best_with',
                                        'community_recommended_with',
                                        'playingtime',
                                        'minplaytime',
                                        'maxplaytime',
                                        'minage',
                                        #'community_minage',
                                        #'language_dependency', # superfluous? mostly focus on english games
                                        'boardgamecategories', 
                                        'boardgamemechanics',
                                        'boardgamefamilies',
                                        #'boardgameaccessories', # superfluous? probably has high correlation with the family and mechanics
                                        #'boardgameimplementations', # superfluous?
                                        'boardgamedesigners',
                                        'boardgameartists',
                                        #'boardgamepublishers', # superfluous?
                                        #'usersrated', # already in the dataset
                                        #'average', # already in the dataset
                                        #'bayesaverage', # already in the dataset
                                        #'ranks', # already in the dataset
                                        'stddev',
                                        'median',
                                        'owned',
                                        'trading',
                                        'wanting',
                                        'wishing',
                                        'numcomments',
                                        'numweights',
                                        'averageweight'
                                         ])
    


In [4]:
game_details['game_id'].sort_values()

0             1
1             2
2             3
3             4
4             5
          ...  
15423    432487
15440    432527
15448    432536
15724    433444
16496    436126
Name: game_id, Length: 21297, dtype: int64

In [11]:
# convert the pandas series to a list for looping
game_ids_unupdated = [x for x in game_ids.to_list()]

game_ids_unupdated = list(set(game_ids_unupdated) - set(game_details['game_id'].to_list()))

game_ids_unupdated = list(map(str, game_ids_unupdated))
game_ids_unupdated.sort()
game_ids_unupdated


['1000',
 '1001',
 '1002',
 '100275',
 '100278',
 '1003',
 '1004',
 '1006',
 '1007',
 '100734',
 '1008',
 '10090',
 '10093',
 '10094',
 '10095',
 '10096',
 '10102',
 '10104',
 '10105',
 '1012',
 '1013',
 '101335',
 '10140',
 '101463',
 '101519',
 '101766',
 '101785',
 '101786',
 '101796',
 '101929',
 '101930',
 '10213',
 '10214',
 '102145',
 '102148',
 '10215',
 '102150',
 '102151',
 '102159',
 '102181',
 '10221',
 '10226',
 '10230',
 '10232',
 '10234',
 '102346',
 '10236',
 '10244',
 '102652',
 '102676',
 '102680',
 '102681',
 '102690',
 '102859',
 '102881',
 '103091',
 '103092',
 '103132',
 '10325',
 '10326',
 '103368',
 '10341',
 '10345',
 '10346',
 '10348',
 '10349',
 '103745',
 '103752',
 '103755',
 '103922',
 '103975',
 '104006',
 '104012',
 '104162',
 '104527',
 '104553',
 '10462',
 '10463',
 '10467',
 '10470',
 '10471',
 '10472',
 '104768',
 '104769',
 '104770',
 '104775',
 '104780',
 '10496',
 '10501',
 '10502',
 '10506',
 '105123',
 '105134',
 '105550',
 '105551',
 '10605',
 

In [12]:
def _extract_details_from_link_element(link_element, detail_type):
    detail_list = []
    for item in link_element:
        if item['@type'] == detail_type:
            detail_list.append(item['@value'])

    return ','.join(detail_list)
    pass # these infos are all in the same element and can occur multiple times. The different detail_types are: '

In [13]:
def _extract_info_into_dataframe(df, item_dict):
    # save the info into a dictionary
    info = {
        'game_id': item_dict['@id'],
        #'name': item_dict[item], # included in dataset
        #'alt_name', # maybe include, so the searching goes easier
        'description': item_dict['description'],
        'yearpublished': item_dict['yearpublished']['@value'],
        'minplayers': item_dict['minplayers']['@value'],
        'maxplayers': item_dict['maxplayers']['@value'],
        'community_best_with': item_dict['poll-summary']['result'][0]['@value'],
        'community_recommended_with': item_dict['poll-summary']['result'][1]['@value'],
        'playingtime': item_dict['playingtime']['@value'],
        'minplaytime': item_dict['minplaytime']['@value'],
        'maxplaytime': item_dict['maxplaytime']['@value'],
        'minage': item_dict['minage']['@value'],
        #'community_minage': item_dict[item], # hassle to extract
        #'language_dependency': item_dict[item], # superfluous? mostly focus on english games
        'boardgamecategories': _extract_details_from_link_element(item_dict['link'], 'boardgamecategory'), 
        'boardgamemechanics': _extract_details_from_link_element(item_dict['link'], 'boardgamemechanic'),
        'boardgamefamilies': _extract_details_from_link_element(item_dict['link'], 'boardgamefamily'),
        #'boardgameaccessories': _extract_details_from_link_element(item_dict['link'], 'boardgameaccessory'), # superfluous? probably has high correlation with the family and mechanics
        #'boardgameimplementations': _extract_details_from_link_element(item_dict['link'], 'boardgameimplementation'), # superfluous?
        'boardgamedesigners': _extract_details_from_link_element(item_dict['link'], 'boardgamedesigner'),
        'boardgameartists': _extract_details_from_link_element(item_dict['link'], 'boardgameartist'),
        #'boardgamepublishers': _extract_details_from_link_element(item_dict['link'], 'boardgamepublisher'), # superfluous?
        #'usersrated': item_dict[item], # already in the dataset
        #'average': item_dict[item], # already in the dataset
        #'bayesaverage': item_dict[item], # already in the dataset
        #'ranks': item_dict[item], # already in the dataset
        'stddev': item_dict['statistics']['ratings']['stddev']['@value'],
        'median': item_dict['statistics']['ratings']['median']['@value'],
        'owned': item_dict['statistics']['ratings']['owned']['@value'],
        'trading': item_dict['statistics']['ratings']['trading']['@value'],
        'wanting': item_dict['statistics']['ratings']['wanting']['@value'],
        'wishing': item_dict['statistics']['ratings']['wishing']['@value'],
        'numcomments': item_dict['statistics']['ratings']['numcomments']['@value'],
        'numweights': item_dict['statistics']['ratings']['numweights']['@value'],
        'averageweight': item_dict['statistics']['ratings']['averageweight']['@value']
    }
    return info
    

In [14]:
# prepare and execute the api call
form_values = {
    'id':'', # Specifies the id of the thing(s) to retrieve. To request multiple things with a single query, NNN can specify a comma-delimited list of ids. Maximum 20.
    #'type':'', # Specifies that, regardless of the type of thing asked for by id, the results are filtered by the THINGTYPE(s) specified. Multiple THINGTYPEs can be specified in a comma-delimited list.
    #'versions':'1', # Returns version info for the item.
    #'videos':'1', # Returns videos for the item.
    'stats':'1', # Returns ranking and rating stats for the item.
    #'historical':'1', # Not currently supported. Returns historical data over time. See page parameter.
    #'marketplace':'1', # Returns marketplace data.
    #'comments':'1', # Returns all comments about the item. Also includes ratings when commented. See page parameter.
    #'ratingcomments':'1', # Returns all ratings for the item. Also includes comments when rated. See page parameter. The ratingcomments and comments parameters cannot be used together, as the output always appears in the <comments> node of the XML; comments parameter takes precedence if both are specified. Ratings are sorted in descending rating value, based on the highest rating they have assigned to that item (each item in the collection can have a different rating).
    #'page':'1', # Defaults to 1, controls the page of data to see for historical info, comments, and ratings data.
    #'pagesize':'10', # Set the number of records to return in paging. Minimum is 10, maximum is 100.
    #'from':'', # Not currently supported.
    #'to':'' # Not currently supported.
}
max_ids_per_call = 20
counter = 0
# while game_ids_unupdated != []:
while game_ids_unupdated != []:
    ids_to_update = ",".join(game_ids_unupdated[:max_ids_per_call])  # Get first 20 items and join with ','
    del game_ids_unupdated[:max_ids_per_call]  # Remove them from the original list
    # update the API parameter to get the current id's
    form_values.update({'id':ids_to_update})
    # get the info from BGG
    response = requests.get(url_things, form_values)
    if response.status_code == 429:
        print('Too many requests!', counter, game_details.shape)
        counter +=1
        time.sleep(5)
    # save the info into the dataframe
    if response.status_code == 200:
        info = xmltodict.parse(response.text)  
    else:
        game_ids_unupdated.append(ids_to_update.split(','))
        continue
    for i in range(max_ids_per_call):
        try:
            info_current_game = _extract_info_into_dataframe(game_details, info['items']['item'][i])
            df_current_game = pd.DataFrame(info_current_game, index = [int(info_current_game['id'])])
            game_details = pd.concat([game_details, df_current_game])
        except:
            print('Extraction failed for game number: ', info['items']['item'][i]['@id'])


Extraction failed for game number:  1000
Extraction failed for game number:  1001
Extraction failed for game number:  1002
Extraction failed for game number:  100275
Extraction failed for game number:  100278
Extraction failed for game number:  1003
Extraction failed for game number:  1004
Extraction failed for game number:  1006
Extraction failed for game number:  1007
Extraction failed for game number:  100734
Extraction failed for game number:  1008
Extraction failed for game number:  10090
Extraction failed for game number:  10093
Extraction failed for game number:  10094
Extraction failed for game number:  10095
Extraction failed for game number:  10096
Extraction failed for game number:  10102
Extraction failed for game number:  10104
Extraction failed for game number:  10105
Extraction failed for game number:  1012
Extraction failed for game number:  1013
Extraction failed for game number:  101335
Extraction failed for game number:  10140
Extraction failed for game number:  1014

KeyboardInterrupt: 

In [9]:
print(game_details.shape)
game_details.drop_duplicates
print(game_details.shape)


(21297, 26)
(21297, 26)


In [10]:
game_details.to_csv('data/game_details.csv', index=False)