# Importing the Libraries

In [3]:
# !pip install selenium
# !apt-get install chromium-driver

In [2]:
from bs4 import BeautifulSoup                # the BeautifulSoup library for scraping
#from selenium import webdriver               # The Selenium library for scraping
#from selenium.webdriver.common.by import By  # For finding elements using the By method (didn't end up using)

import requests                              # Establish website connection using the requests library
import pandas as pd                          # For getting the data into a dataframe
import numpy as np                           # Standard maths
import re                                    # RegEx for pattern matching

# Retrieving the URL of each board game page

In [10]:
site = 'https://boardgamegeek.com' # The base url for BoardGameGeek
browse = 'https://boardgamegeek.com/browse/boardgame/page/{}' # The url for the browsing section of BoardGameGeek - {} for page number

game_links = []

# Collect the url of each specific board game from the first 10 browse pages
for page_no in range(1, 11):
    browse_page = browse.format(page_no)

    # A timeout keeps a stalled connection from hanging the notebook, and
    # raise_for_status() stops on an HTTP error instead of silently parsing
    # an error page (which would leave game_links short without any warning).
    page = requests.get(browse_page, timeout=30)
    page.raise_for_status()
    response = page.text
    soup = BeautifulSoup(response, 'html.parser')

    # Prefix the base url to the href of every game-title anchor on the page
    for link in soup.select('.collection_objectname a'):
        game_links.append(site + link['href'])

In [27]:
# Sanity check: show how many game urls were gathered
link_total = len(game_links)
print(link_total)

1000



# Retrieving the data from a single page using beautiful soup

In [20]:
# Connect to the board game "Dune: Imperium" from the BoardGameGeek website
site = 'https://boardgamegeek.com/boardgame/316554/dune-imperium/credits#boardgamemechanic'
# timeout bounds the wait so an unresponsive server cannot hang the cell indefinitely
response = requests.get(site, timeout=30)
soup = BeautifulSoup(response.text, 'html.parser')
# Last expression: display the response object so its status code is visible
response

<Response [200]>

In [20]:
# The third <script> element of the page is the text the extraction works on
script_tags = soup.select('script')
info = script_tags[2].text




In [11]:
# Using Regex to extract the important information on each board game
def _from_info(pattern):
    """Return capture group 1 of the first match of `pattern` in `info`."""
    return re.search(pattern, info).group(1)


# Single-valued fields
Name = _from_info(r'},"name":"(.*?)"')
Year = _from_info(r'"yearpublished":"(.*?)"')
MinPlayers = _from_info(r'"minplayers":"(.*?)"')
MaxPlayers = _from_info(r'"maxplayers":"(.*?)"')
MinPlayTime = _from_info(r'"minplaytime":"(.*?)"')
MaxPlayTime = _from_info(r'"maxplaytime":"(.*?)"')
MinAge = _from_info(r'"minage":"(.*?)"')
Complexity = _from_info(r'"averageweight":(.*?),')
BGGRating = _from_info(r'"average":"(.*?)",')
Description = _from_info(r'"short_description":"(.*?)"')

# Multi-valued fields: every (id, slug) pair captured from the category / mechanic hrefs
Category = re.findall(r'"href":"\\/boardgamecategory\\/(.*?)\\/(.*?)"', info)
Mechanic = re.findall(r'"href":"\\/boardgamemechanic\\/(.*?)\\/(.*?)"', info)

# Show everything that was extracted
print(f"Name: {Name}")
print(f"Year: {Year}")
print(f"MinPlayers: {MinPlayers}")
print(f"MaxPlayers: {MaxPlayers}")
print(f"MinPlayTime: {MinPlayTime}")
print(f"MaxPlayTime: {MaxPlayTime}")
print(f"MinAge: {MinAge}")
print(f"Complexity: {Complexity}")
print(f"BGGRating: {BGGRating}")
print(f"Description: {Description}")
print(Category)
print(Mechanic)

Name: Dune: Imperium
Year: 2020
MinPlayers: 1
MaxPlayers: 4
MinPlayTime: 60
MaxPlayTime: 120
MinAge: 14
Complexity: 3.059255079006772
BGGRating: 8.42716
Description: Influence, intrigue, and combat in the universe of Dune.
[('1064', 'movies-tv-radio-theme'), ('1093', 'novel-based'), ('1001', 'political'), ('1016', 'science-fiction')]
[('2857', 'card-play-conflict-resolution'), ('2664', 'deck-bag-and-pool-building'), ('2901', 'delayed-purchase'), ('2864', 'force-commitment'), ('2914', 'increase-value-of-unchosen-resources'), ('3099', 'multi-use-cards')]


---
### Why I couldn't use BeautifulSoup:
Unfortunately, BGG's pages have a very large HTML source, and the information I needed did not appear in the HTML that was returned to Python. As a workaround, I used Regex on a specific portion of the HTML called "GEEK.geekitemPreload".

This approach worked for most fields, since this portion contained the information I needed. However, when extracting the genre list I noticed that it was shorter than expected. That's when I realised that GEEK.geekitemPreload only contains the first 6 genres from BGG, in alphabetical order. So, for example, a genre like "worker placement" would never appear in my genre list, which would be a big limitation.

This led me to explore the HTML using Selenium instead.

---

# Retrieving the data from a single page using Selenium

In [5]:
# Retrieving the web driver into google colab
def web_driver():
  """Create and return a headless Chrome WebDriver.

  Returns:
    The `webdriver.Chrome` instance built from the options set below.
  """
  # Imported here because the notebook-level Selenium import is commented out;
  # without it this function raises NameError on a freshly started kernel.
  from selenium import webdriver

  options = webdriver.ChromeOptions()
  options.add_argument("--verbose")
  options.add_argument('--no-sandbox')
  options.add_argument('--headless')  # no visible browser window
  options.add_argument('--disable-gpu')
  options.add_argument("--window-size=1920, 1200")
  options.add_argument('--disable-dev-shm-usage')
  driver = webdriver.Chrome(options=options)
  return driver

# Start a browser session; the page-loading cell below drives it through `driver`
driver = web_driver()

In [17]:
# Load the "Dune: Imperium" page from BoardGameGeek in the browser and keep its page source
dune_credits_url = 'https://boardgamegeek.com/boardgame/316554/dune-imperium/credits#boardgamemechanic'
driver.get(dune_credits_url)
source = driver.page_source

In [16]:
# Using Regex to extract the important information on each board game
def _from_source(pattern):
  """Return capture group 1 of the first match of `pattern` in `source`."""
  return re.search(pattern, source).group(1)


# Single-valued fields
Name = _from_source(r'},"name":"(.*?)"')
Year = _from_source(r'"yearpublished":"(.*?)"')
MinPlayers = _from_source(r'"minplayers":"(.*?)"')
MaxPlayers = _from_source(r'"maxplayers":"(.*?)"')
MinPlayTime = _from_source(r'"minplaytime":"(.*?)"')
MaxPlayTime = _from_source(r'"maxplaytime":"(.*?)"')
MinAge = _from_source(r'"minage":"(.*?)"')
Complexity = _from_source(r'"averageweight":(.*?),')
BGGRating = _from_source(r'"average":"(.*?)",')
Description = _from_source(r'"short_description":"(.*?)"')

# Multi-valued fields: every (id, slug) pair captured from the category / mechanic anchors
Category = re.findall(r'<a ng-href="/boardgamecategory/(.*?)/(.*?)"', source)
Mechanic = re.findall(r'<a ng-href="/boardgamemechanic/(.*?)/(.*?)"', source)

# Show everything that was extracted
print(f"Name: {Name}")
print(f"Year: {Year}")
print(f"MinPlayers: {MinPlayers}")
print(f"MaxPlayers: {MaxPlayers}")
print(f"MinPlayTime: {MinPlayTime}")
print(f"MaxPlayTime: {MaxPlayTime}")
print(f"MinAge: {MinAge}")
print(f"Complexity: {Complexity}")
print(f"BGGRating: {BGGRating}")
print(f"Description: {Description}")
print(Category)
print(Mechanic)

Name: Dune: Imperium
Year: 2020
MinPlayers: 1
MaxPlayers: 4
MinPlayTime: 60
MaxPlayTime: 120
MinAge: 14
Complexity: 3.059255079006772
BGGRating: 8.42716
Description: Influence, intrigue, and combat in the universe of Dune.
[('1064', 'movies-tv-radio-theme'), ('1093', 'novel-based'), ('1001', 'political'), ('1016', 'science-fiction')]
[('2857', 'card-play-conflict-resolution'), ('2664', 'deck-bag-and-pool-building'), ('2901', 'delayed-purchase'), ('2864', 'force-commitment'), ('2914', 'increase-value-of-unchosen-resources'), ('3099', 'multi-use-cards'), ('2041', 'open-drafting'), ('2876', 'race'), ('2819', 'solo-solitaire-game'), ('3100', 'tags'), ('2686', 'take-that'), ('2828', 'turn-order-progressive'), ('2015', 'variable-player-powers'), ('2082', 'worker-placement')]


# Retrieving the data from each game into one dataframe

In [7]:
# Retrieving the web driver into google colab.
def web_driver():
  """Create and return a headless Chrome WebDriver.

  Returns:
    The `webdriver.Chrome` instance built from the options set below.
  """
  # Imported here because the notebook-level Selenium import is commented out;
  # without it this function raises NameError on a freshly started kernel.
  from selenium import webdriver

  options = webdriver.ChromeOptions()
  options.add_argument("--verbose")
  options.add_argument('--no-sandbox')
  options.add_argument('--headless')  # no visible browser window
  options.add_argument('--disable-gpu')
  options.add_argument("--window-size=1920, 1200")
  options.add_argument('--disable-dev-shm-usage')
  driver = webdriver.Chrome(options=options)
  return driver
# If `driver` is already bound to a browser session (for example from the
# single-page section, or a previous run of this cell), shut it down before
# replacing it so its Chrome process is not left running in the background.
_previous_driver = globals().get("driver")
if _previous_driver is not None:
  _previous_driver.quit()
driver = web_driver()

In [10]:
# One empty list per dataset column; the generator yields a separate list object for each name
(Name, Release_Year, Description,
 Minimum_Players, Maximum_Players,
 Minimum_Time, Maximum_Time,
 Age_Rating, Complexity_Score, BBG_Rating,
 Game_Categories, Game_Mechanics) = ([] for _ in range(12))

# Using Selenium to connect to the URL of each game made in game_links

# Contents of a double-quoted JSON string: ordinary characters or backslash escapes.
# Unlike a lazy (.*?)" capture, an escaped quote inside the value does not end the match early.
_JSON_STRING = r'"((?:[^"\\]|\\.)*)"'


def _first_group(pattern, text, default="Unknown"):
  """Return capture group 1 of the first match of `pattern` in `text`, or `default` if nothing matches."""
  found = re.search(pattern, text)
  return found.group(1) if found else default


def _json_string_field(prefix, text, suffix="", default="Unknown"):
  """Return the decoded JSON string value that follows `prefix` in `text`.

  Args:
    prefix: Regex for the text immediately before the value's opening quote.
    text: Page source to search.
    suffix: Regex that must follow the value's closing quote.
    default: Value returned when the field is not found.

  Returns:
    The value with its JSON escapes (escaped quotes, escaped slashes, unicode
    escape sequences) decoded; the raw captured text if it cannot be decoded;
    or `default` if there is no match.
  """
  import json  # function-scope import: the notebook's import cell does not import json

  raw = _first_group(prefix + _JSON_STRING + suffix, text, default=None)
  if raw is None:
    return default
  try:
    return json.loads('"' + raw + '"')
  except ValueError:
    return raw


for game in game_links:
  driver.get(game)
  source = driver.page_source

  # Using Regex to extract the important information on each board game.
  # Any field that cannot be found is recorded as "Unknown" instead of stopping the collection.
  Title = _json_string_field(r'},"name":', source)
  Year = _json_string_field(r'"yearpublished":', source)
  Desc = _json_string_field(r'"short_description":', source)
  MinPlayers = _json_string_field(r'"minplayers":', source)
  MaxPlayers = _json_string_field(r'"maxplayers":', source)
  MinPlayTime = _json_string_field(r'"minplaytime":', source)
  MaxPlayTime = _json_string_field(r'"maxplaytime":', source)
  MinAge = _json_string_field(r'"minage":', source)
  Complexity = _first_group(r'"averageweight":(.*?),', source)  # this value is not quoted in the page data
  BGGRating = _json_string_field(r'"average":', source, suffix=',')

  # Only the ID of the first category / mechanic link on the page is kept
  Category = _first_group(r'<a ng-href="/boardgamecategory/(.*?)/(.*?)"', source)
  Mechanic = _first_group(r'<a ng-href="/boardgamemechanic/(.*?)/(.*?)"', source)

  Name.append(Title)
  Release_Year.append(Year)
  Description.append(Desc)
  Minimum_Players.append(MinPlayers)
  Maximum_Players.append(MaxPlayers)
  Minimum_Time.append(MinPlayTime)
  Maximum_Time.append(MaxPlayTime)
  Age_Rating.append(MinAge)
  Complexity_Score.append(Complexity)
  BBG_Rating.append(BGGRating)
  Game_Categories.append(Category)
  Game_Mechanics.append(Mechanic)

# Creating a dataframe and converting it to a .csv

In [14]:
# Map each output column name to the list collected for it, then build the frame in one step
column_data = {
    "Name": Name,
    "Release_Year": Release_Year,
    "Description": Description,
    "Minimum_Players": Minimum_Players,
    "Maximum_Players": Maximum_Players,
    "Minimum_Time": Minimum_Time,
    "Maximum_Time": Maximum_Time,
    "Age_Rating": Age_Rating,
    "Complexity_Score": Complexity_Score,
    "BBG_Rating": BBG_Rating,
    "Game_Categories": Game_Categories,
    "Game_Mechanics": Game_Mechanics,
}
df = pd.DataFrame(column_data)

In [15]:
# Preview the first five rows to confirm the columns were filled in
df.head(n=5)

Unnamed: 0,Name,Release_Year,Description,Minimum_Players,Maximum_Players,Minimum_Time,Maximum_Time,Age_Rating,Complexity_Score,BBG_Rating,Game_Categories,Game_Mechanics
995,Union Pacific,1999,Will you increase the value of railways you ha...,2,6,90,90,12,2.5458422174840085,7.19338,1021,2081
996,Istanbul: The Dice Game,2017,Roll worker dice to collect and trade resource...,2,4,20,40,8,1.790909090909091,7.14751,1017,2001
997,Pictomania,2011,Pictionary chaos. Everyone draws and guesses ...,3,6,25,25,9,1.5254237288135593,7.26297,1030,3096
998,Palm Island,2018,Develop and gather resources to grow an island...,1,2,15,15,10,1.7452229299363058,7.06717,1002,2023
999,Spyfall,2014,Players ask each other probing questions to de...,3,8,15,15,13,1.2355212355212355,6.75935,1023,2073


In [17]:
# Write the dataframe to a csv file, leaving out the row index
output_csv = 'Board Game Data.csv'
df.to_csv(output_csv, index=False)