# Notebook for testing data collection with the BoardGameGeek (BGG) XML API

## Setup

In [1]:
import requests
import pandas as pd
import xmltodict
import time


## Import data

In [2]:
# Endpoint of the BGG XML API2 "thing" resource; query parameters are added per request.
url_things = 'https://boardgamegeek.com/xmlapi2/thing?'

# Keep only the rows of the ranking export whose rank is positive.
all_games_ranked = (
    pd.read_csv('data/boardgames_ranks.csv')
    .loc[lambda ranks: ranks['rank'] > 0]
)
# Ids of all ranked games, ascending.
game_ids = all_games_ranked['id'].sort_values()
len(game_ids)

27925

In [3]:

# Columns collected per game from the API.
# Deliberately left out for now:
#   alt_name (might make searching easier), yearpublished, community_minage,
#   language_dependency (focus is mostly on English games),
#   boardgameaccessories / boardgameimplementations / boardgamepublishers (probably superfluous),
#   usersrated / average / bayesaverage / ranks (already in the ranking dataset).
GAME_DETAIL_COLUMNS = [
    'game_id',
    'description',
    'minplayers',
    'maxplayers',
    'community_best_with',
    'community_recommended_with',
    'playingtime',
    'minplaytime',
    'maxplaytime',
    'minage',
    'boardgamecategories',
    'boardgamemechanics',
    'boardgamefamilies',
    'boardgamedesigners',
    'boardgameartists',
    'stddev',
    'median',
    'owned',
    'trading',
    'wanting',
    'wishing',
    'numcomments',
    'numweights',
    'averageweight',
]

# Resume from the cached download if there is one; otherwise start with an empty frame.
# Only a missing or empty cache file is treated as "no cache" -- any other error
# (corrupt file, permission problem, interrupt) should surface instead of silently
# discarding previously downloaded data.
try:
    game_details = pd.read_csv('data/game_details_raw.csv')
except (FileNotFoundError, pd.errors.EmptyDataError):
    game_details = pd.DataFrame(columns=GAME_DETAIL_COLUMNS)
    


In [4]:
# Display the ids of the games already present in game_details, in ascending order.
game_details['game_id'].sort_values()

450           1
5149          2
272           3
6862          4
343           5
          ...  
11487    432536
12150    433099
22669    433444
11980    435979
4578     436126
Name: game_id, Length: 27720, dtype: int64

In [5]:
# convert the pandas series to a list for looping
game_ids_unupdated = [x for x in game_ids.to_list()]

game_ids_unupdated = list(set(game_ids_unupdated) - set(game_details['game_id'].to_list()))

game_ids_unupdated = list(map(str, game_ids_unupdated))
game_ids_unupdated.sort()
len(game_ids_unupdated)


205

In [6]:
def _extract_details_from_link_element(link_element, detail_type):
    detail_list = []
    if type(link_element) == dict:
        if link_element['@type'] == detail_type:
            detail_list.append(item['@value'])
    else:
        for item in link_element:
            if item['@type'] == detail_type:
                detail_list.append(item['@value'])

    return ','.join(detail_list)
    pass # these infos are all in the same element and can occur multiple times. The different detail_types are: '

In [7]:
def _extract_info_into_dataframe(df, item_dict):
    """Flatten one parsed game item into a dict of column name -> value.

    Parameters
    ----------
    df : unused
        Not read by this function; kept so existing calls keep working.
    item_dict : mapping
        One parsed item. Attribute-style fields are read from the '@value'
        key of their sub-mapping.

    Returns
    -------
    dict
        One entry per collected column, keyed by column name.

    Fields deliberately not collected: name, yearpublished, usersrated, average,
    bayesaverage and ranks (already in the ranking dataset), alt_name,
    community_minage (a hassle to extract), language_dependency,
    boardgameaccessories, boardgameimplementations and boardgamepublishers.
    """
    links = item_dict['link']
    poll_results = item_dict['poll-summary']['result']
    ratings = item_dict['statistics']['ratings']

    def attribute(key):
        # Value of a top-level attribute-style field.
        return item_dict[key]['@value']

    def linked(detail_type):
        # Comma-joined values of all link entries of the given type.
        return _extract_details_from_link_element(links, detail_type)

    def rating(key):
        # Value of one field below statistics/ratings.
        return ratings[key]['@value']

    info = {
        'game_id': item_dict['@id'],
        'description': item_dict['description'],
        'minplayers': attribute('minplayers'),
        'maxplayers': attribute('maxplayers'),
        'community_best_with': poll_results[0]['@value'],
        'community_recommended_with': poll_results[1]['@value'],
        'playingtime': attribute('playingtime'),
        'minplaytime': attribute('minplaytime'),
        'maxplaytime': attribute('maxplaytime'),
        'minage': attribute('minage'),
        'boardgamecategories': linked('boardgamecategory'),
        'boardgamemechanics': linked('boardgamemechanic'),
        'boardgamefamilies': linked('boardgamefamily'),
        'boardgamedesigners': linked('boardgamedesigner'),
        'boardgameartists': linked('boardgameartist'),
    }
    for stat_name in ('stddev', 'median', 'owned', 'trading', 'wanting',
                      'wishing', 'numcomments', 'numweights', 'averageweight'):
        info[stat_name] = rating(stat_name)
    return info
    

In [8]:
# prepare and execute the api call
form_values = {
    'id':'', # Specifies the id of the thing(s) to retrieve. To request multiple things with a single query, NNN can specify a comma-delimited list of ids. Maximum 20.
    #'type':'', # Specifies that, regardless of the type of thing asked for by id, the results are filtered by the THINGTYPE(s) specified. Multiple THINGTYPEs can be specified in a comma-delimited list.
    #'versions':'1', # Returns version info for the item.
    #'videos':'1', # Returns videos for the item.
    'stats':'1', # Returns ranking and rating stats for the item.
    #'historical':'1', # Not currently supported. Returns historical data over time. See page parameter.
    #'marketplace':'1', # Returns marketplace data.
    #'comments':'1', # Returns all comments about the item. Also includes ratings when commented. See page parameter.
    #'ratingcomments':'1', # Returns all ratings for the item. Also includes comments when rated. See page parameter. The ratingcomments and comments parameters cannot be used together, as the output always appears in the <comments> node of the XML; comments parameter takes precedence if both are specified. Ratings are sorted in descending rating value, based on the highest rating they have assigned to that item (each item in the collection can have a different rating).
    #'page':'1', # Defaults to 1, controls the page of data to see for historical info, comments, and ratings data.
    #'pagesize':'10', # Set the number of records to return in paging. Minimum is 10, maximum is 100.
    #'from':'', # Not currently supported.
    #'to':'' # Not currently supported.
}
max_ids_per_call = 20
counter = 0
# while game_ids_unupdated != []:
while game_ids_unupdated != []:
    try:
        ids_to_update = ",".join(game_ids_unupdated[:max_ids_per_call])  # Get first 20 items and join with ','
        del game_ids_unupdated[:max_ids_per_call]  # Remove them from the original list
    except:
         print('Some ID is a list')
         print(game_ids_unupdated[:max_ids_per_call])
    # update the API parameter to get the current id's
    form_values.update({'id':ids_to_update})
    # get the info from BGG
    response = requests.get(url_things, form_values)
    if response.status_code == 429:
        print('Too many requests!', counter, game_details.shape)
        counter +=1
        time.sleep(5)
    # save the info into the dataframe
    if response.status_code == 200:
        info = xmltodict.parse(response.text)  
    else:
        game_ids_unupdated.append(ids_to_update.split(','))
        continue
    for game in info['items']['item']:
            info_current_game = _extract_info_into_dataframe(game_details, game)
            df_current_game = pd.DataFrame(info_current_game, index = [int(info_current_game['game_id'])])
            game_details = pd.concat([game_details, df_current_game])
        


In [9]:
# Remove fully identical rows, then store the ids as int64.
game_details = (
    game_details
    .drop_duplicates()
    .assign(game_id=lambda details: details['game_id'].astype('int64'))
)
game_details['game_id']

0        224517
1        161936
2        342942
3        174430
4        233078
          ...  
99935     99935
99949     99949
99975     99975
99976     99976
99992     99992
Name: game_id, Length: 27925, dtype: int64

In [10]:

# Only bring over the detail columns that the ranking frame does not already have.
cols_to_use = game_details.columns.difference(all_games_ranked.columns)
df = (
    pd.merge(
        all_games_ranked,
        game_details[cols_to_use],
        left_on='id',
        right_on='game_id',
        suffixes=('', ''),
    )
    .assign(description=lambda merged: merged['description'].fillna(''))
    .drop(columns=['id'])  # 'game_id' carries the same key after the merge
)
df.to_csv('data/game_details_raw.csv', index=False)

In [11]:
# Print the column overview (names, non-null counts, dtypes) of the merged frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27925 entries, 0 to 27924
Data columns (total 39 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   name                        27925 non-null  object 
 1   yearpublished               27925 non-null  int64  
 2   rank                        27925 non-null  int64  
 3   bayesaverage                27925 non-null  float64
 4   average                     27925 non-null  float64
 5   usersrated                  27925 non-null  int64  
 6   is_expansion                27925 non-null  int64  
 7   abstracts_rank              1441 non-null   float64
 8   cgs_rank                    367 non-null    float64
 9   childrensgames_rank         1082 non-null   float64
 10  familygames_rank            3380 non-null   float64
 11  partygames_rank             922 non-null    float64
 12  strategygames_rank          3044 non-null   float64
 13  thematic_rank               170

In [12]:
# Display the comma-joined mechanics strings.
df['boardgamemechanics']

0        Hand Management,Income,Loans,Market,Network an...
1        Action Points,Cooperative Game,Hand Management...
2        Action Queue,End Game Bonuses,Grid Coverage,Ha...
3        Action Queue,Action Retrieval,Campaign / Battl...
4        Action Drafting,Area-Impulse,Dice Rolling,Foll...
                               ...                        
27920       Betting and Bluffing,Bingo,Pattern Recognition
27921                                                  NaN
27922                                 Roll / Spin and Move
27923      Events,Race,Roll / Spin and Move,Track Movement
27924        Paper-and-Pencil,Pattern Building,Square Grid
Name: boardgamemechanics, Length: 27925, dtype: object

In [13]:
# Show the dtype of 'community_best_with'.
print(df['community_best_with'].dtype)

object


In [14]:
# 'community_best_with' is stored as text; keep the first run of digits as a float.
# The pattern is a raw string so that \d is not treated as a string escape, and
# expand=False yields a Series rather than a one-column frame.
if df['community_best_with'].dtype == 'object':
    df['community_best_with'] = (
        df['community_best_with'].str.extract(r'(\d+)', expand=False).astype(float)
    )

# Comma-joined multi-value columns to one-hot encode.
collective_columns = [#'boardgameartists',
                      'boardgamecategories',
                      #'boardgamedesigners',
                      #'boardgamefamilies',
                      'boardgamemechanics'
                      ]
# Assign the result back: fillna returns a new frame and does not modify df.
df[collective_columns] = df[collective_columns].fillna('')
for category in collective_columns:
    print(category)
    # One indicator column per distinct value; a name clash with an existing
    # column gets the source column name appended.
    df = df.join(df[category].str.get_dummies(sep=','), rsuffix=category)

boardgamecategories
boardgamemechanics


In [15]:
# Display the column index after the indicator columns were joined.
df.columns

Index(['name', 'yearpublished', 'rank', 'bayesaverage', 'average',
       'usersrated', 'is_expansion', 'abstracts_rank', 'cgs_rank',
       'childrensgames_rank',
       ...
       'Turn Order: Stat-Based', 'Turn Order: Time Track',
       'Variable Phase Order', 'Variable Player Powers', 'Variable Set-up',
       'Victory Points as a Resource', 'Voting', 'Worker Placement',
       'Worker Placement with Dice Workers', 'Zone of Control'],
      dtype='object', length=318)

In [16]:
# Write the frame to the cleaned CSV without the index column.
df.to_csv('data/game_details_cleaned.csv', index=False)