# Data Retrieval and exploration

In [None]:
import os
import requests
from tqdm.auto import tqdm
import zipfile
import kagglehub
import polars as pl
from pathlib import Path

DATA_DIR = Path("data")
DOWNLOAD_DIR = DATA_DIR / "download"
RAW_DIR = DATA_DIR / "raw"
GEN_DIR = DATA_DIR / "gen"

# Ensure directories exist
for d in [DOWNLOAD_DIR, RAW_DIR, GEN_DIR]:
    d.mkdir(parents=True, exist_ok=True)


DATASET_URL = 'https://www.kaggle.com/api/v1/datasets/download/jvanelteren/boardgamegeek-reviews'
DESTINATION_PATH = DOWNLOAD_DIR / 'raw.zip'
DATA_PATH = RAW_DIR

In [None]:
os.makedirs(os.path.dirname(DESTINATION_PATH), exist_ok=True)

response = requests.get(DATASET_URL, stream=True)

if response.status_code == 200:
    total_size = int(response.headers.get('content-length', 0))
    with open(DESTINATION_PATH, 'wb') as f:
        progress_bar = tqdm(total=total_size, unit='B', unit_scale=True, unit_divisor=1024)
        for data in response.iter_content(chunk_size=1024):
            f.write(data)
            progress_bar.update(len(data))
        progress_bar.close()
    print("Dataset downloaded successfully.")
else:
    print(f"Failed to download dataset: {response.status_code} - {response.text}")

100%|██████████| 1.61G/1.61G [00:48<00:00, 35.6MB/s] 

Dataset downloaded successfully.





In [None]:
os.makedirs(DATA_PATH, exist_ok=True)

try:
    with zipfile.ZipFile(DESTINATION_PATH, 'r') as zip_ref:
        zip_ref.extractall(DATA_PATH)
    print("Dataset extracted successfully.")
except zipfile.BadZipFile:
    print(f"Error: The file {DESTINATION_PATH} is not a valid ZIP file.")
except Exception as e:
    print(f"An error occurred: {e}")

Dataset extracted successfully.


In [12]:
# Download latest version
#path = kagglehub.dataset_download("jvanelteren/boardgamegeek-reviews") #info <- for me speicifing filename here was failing the download?
path = Path("./data/raw/")


print("Path to dataset files:", path)

Path to dataset files: data\raw


In [14]:
data = pl.read_csv(path / 'bgg-15m-reviews.csv')
print(data.head)

<bound method DataFrame.head of shape: (15_823_269, 6)
┌──────────┬─────────────────┬────────┬───────────────────────────────┬────────┬───────────────────┐
│          ┆ user            ┆ rating ┆ comment                       ┆ ID     ┆ name              │
│ ---      ┆ ---             ┆ ---    ┆ ---                           ┆ ---    ┆ ---               │
│ i64      ┆ str             ┆ f64    ┆ str                           ┆ i64    ┆ str               │
╞══════════╪═════════════════╪════════╪═══════════════════════════════╪════════╪═══════════════════╡
│ 0        ┆ Torsten         ┆ 10.0   ┆ null                          ┆ 30549  ┆ Pandemic          │
│ 1        ┆ mitnachtKAUBO-I ┆ 10.0   ┆ Hands down my favorite new    ┆ 30549  ┆ Pandemic          │
│          ┆                 ┆        ┆ gam…                          ┆        ┆                   │
│ 2        ┆ avlawn          ┆ 10.0   ┆ I tend to either love or      ┆ 30549  ┆ Pandemic          │
│          ┆                 ┆      

In [9]:
# print where comment is not null
print(data.shape)
print(data.filter(pl.col('comment').is_not_null()).shape)

(15823269, 6)
(2995023, 6)


In [10]:
# print length of unique users
print(data['user'].unique().shape)

# print length of unique games
print(data['ID'].unique().shape)



(351049,)
(19330,)
