In [3]:
# Collecting RAWG data via the RAWG API
!pip install -q pandas requests python-dotenv

import os
import time
import random
import requests
import pandas as pd
from dotenv import load_dotenv

# Load variables from a .env file (if present), then read the RAWG key from
# the environment. Stop immediately when the key is missing or empty.
load_dotenv()
API_KEY = os.getenv("RAWG_API_KEY")
if not API_KEY:
    missing_key_message = (
        "RAWG_API_KEY not found. Create a .env file in your project root with:\n"
        "RAWG_API_KEY=your_key_here"
    )
    raise RuntimeError(missing_key_message)

# Accumulates one dict per collected game across all buckets.
games = []
# Number of games wanted from each release-date bucket.
target_per_bucket = 800
# Release-date windows as (start, end) ISO date strings, both inclusive in
# the printed labels: four 5-year windows, then 2020-2025.
buckets = [
    (f"{first_year}-01-01", f"{last_year}-12-31")
    for first_year, last_year in (
        (2000, 2004),
        (2005, 2009),
        (2010, 2014),
        (2015, 2019),
        (2020, 2025),
    )
]

# Endpoint queried for every page, and one HTTP session reused for all requests.
BASE = "https://api.rawg.io/api/games"
session = requests.Session()

print("Starting RAWG data collection (2000–2025)…\n")

# Fill `games` with up to `target_per_bucket` records per release-date bucket
# by requesting randomly chosen result pages from the RAWG games endpoint.
for start, end in buckets:
    print(f"Collecting for {start[:4]}–{end[:4]}…")
    collected = 0
    attempts = 0

    # Highest page number to try; smaller for the buckets that start before 2010.
    page_limit = 200 if int(start[:4]) < 2010 else 800

    # Draw page numbers WITHOUT replacement so the same page is never fetched
    # twice within a bucket (a repeated page would append the same games
    # again). At most 200 pages are drawn, which keeps the cap of 200
    # requests per bucket.
    candidate_pages = random.sample(range(1, page_limit + 1), min(200, page_limit))

    for page in candidate_pages:
        if collected >= target_per_bucket:
            break
        attempts += 1

        params = {
            "key": API_KEY,
            "page_size": 100,
            "page": page,
            "dates": f"{start},{end}",
        }

        try:
            r = session.get(BASE, params=params, timeout=20)
        except requests.RequestException:
            # Network-level failure: pause briefly and move on to the next page.
            time.sleep(1)
            continue

        if r.status_code != 200:
            print(f"  Skipped page {page} (error {r.status_code})")
            time.sleep(1)
            continue

        # A 200 response with a body that is not JSON must not abort the run.
        try:
            payload = r.json()
        except ValueError:
            print(f"  Skipped page {page} (invalid JSON)")
            time.sleep(1)
            continue

        data = payload.get("results") if isinstance(payload, dict) else None
        if not data:
            time.sleep(0.5)
            continue

        for g in data:
            # Flatten the nested platform entries into one comma-separated string.
            platforms = g.get("platforms") or []
            platform_names = ", ".join(
                entry["platform"]["name"]
                for entry in platforms
                if entry and entry.get("platform") and entry["platform"].get("name")
            )

            # Same flattening for genres.
            genres_list = g.get("genres") or []
            genre_names = ", ".join(
                genre["name"]
                for genre in genres_list
                if genre and genre.get("name")
            )

            games.append({
                "name": g.get("name"),
                "rating": g.get("rating"),
                "released": g.get("released"),
                # Empty strings are stored as None so they become missing values.
                "platforms": platform_names if platform_names else None,
                "genres": genre_names if genre_names else None,
            })

            collected += 1
            if collected >= target_per_bucket:
                break

        if collected % 40 == 0:
            print(f"  {collected} games collected so far for {start[:4]}–{end[:4]}")
        # Pause between successful requests to stay gentle on the API.
        time.sleep(1)

# Build the final table from the collected records and save it as CSV.
# Fail early with a clear message when nothing was collected: an empty list
# yields a DataFrame without a "released" column, so the lookup below would
# otherwise raise a bare KeyError.
if not games:
    raise RuntimeError(
        "No games were collected from RAWG; nothing to save. "
        "Check the API key, network access and the messages printed above."
    )

df = pd.DataFrame(games)
# Missing or unparseable release dates are coerced to NaT, so their year is NaN.
df["year"] = pd.to_datetime(df["released"], errors="coerce").dt.year

print("\nCollection complete.")
print("Total collected:", len(df))
# Number of collected games per release year, in chronological order.
print(df["year"].value_counts().sort_index())

os.makedirs("data", exist_ok=True)
out_path = "data/rawg_balanced_2000_2025.csv"
df.to_csv(out_path, index=False)
print("Saved dataset:", out_path)

Starting RAWG data collection (2000–2025)…

Collecting for 2000–2004…
  Skipped page 195 (error 404)
  40 games collected so far for 2000–2004
  Skipped page 188 (error 404)
  80 games collected so far for 2000–2004
  Skipped page 191 (error 404)
  120 games collected so far for 2000–2004
  160 games collected so far for 2000–2004
  200 games collected so far for 2000–2004
  Skipped page 182 (error 404)
  240 games collected so far for 2000–2004
  280 games collected so far for 2000–2004
  320 games collected so far for 2000–2004
  360 games collected so far for 2000–2004
  Skipped page 194 (error 404)
  400 games collected so far for 2000–2004
  Skipped page 199 (error 404)
  440 games collected so far for 2000–2004
  Skipped page 144 (error 404)
  480 games collected so far for 2000–2004
  Skipped page 162 (error 404)
  Skipped page 200 (error 404)
  520 games collected so far for 2000–2004
  560 games collected so far for 2000–2004
  600 games collected so far for 2000–2004
  Skippe

In [1]:
# Collecting raw Steam data and raw video game sales data via the Kaggle API
!pip install -q kaggle

import os
import shutil
from kaggle import api
import pandas as pd

# Make sure the output folder and the per-user Kaggle config folder exist.
os.makedirs("data", exist_ok=True)

home = os.path.expanduser("~")
kaggle_dir = os.path.join(home, ".kaggle")
os.makedirs(kaggle_dir, exist_ok=True)

# If a kaggle.json sits in the working directory, install it as the
# user-level credentials file (overwriting any existing one).
# NOTE(review): `from kaggle import api` runs before this setup. If the kaggle
# package authenticates at import time, this copy happens too late on a
# machine without ~/.kaggle/kaggle.json - confirm, and consider importing
# kaggle only after this block.
src_cfg = "kaggle.json"
dst_cfg = os.path.join(kaggle_dir, "kaggle.json")
if os.path.exists(src_cfg):
    shutil.copyfile(src_cfg, dst_cfg)

# Restrict the credentials file to its owner on POSIX systems. This stays
# best-effort: a failure is reported instead of silently swallowed, and it
# does not stop the notebook. Nothing is attempted when the file is absent.
if os.name == "posix" and os.path.exists(dst_cfg):
    try:
        os.chmod(dst_cfg, 0o600)
    except OSError as exc:
        print(f"Warning: could not restrict permissions on {dst_cfg}: {exc}")

def fetch_kaggle_dataset(dataset_slug: str, dest_map: dict) -> None:
    """
    Download a Kaggle dataset and copy selected files out of it.

    dataset_slug: e.g., 'nikdavis/steam-store-games'
    dest_map: {'expected_file_name_in_zip.csv': 'data/target_name.csv'}
        File names are matched case-insensitively against the top level of
        the unzipped download. Missing target directories are created.

    Raises FileNotFoundError if an expected file is not in the download.
    The scratch directory `_tmp_<dataset name>` (created in the current
    working directory) is removed on success and on failure.
    """
    tmp = f"_tmp_{dataset_slug.split('/')[-1]}"
    # Start from a clean scratch directory so files left by an earlier,
    # interrupted run cannot be mistaken for this download.
    shutil.rmtree(tmp, ignore_errors=True)
    os.makedirs(tmp, exist_ok=True)

    try:
        api.dataset_download_files(dataset_slug, path=tmp, unzip=True)

        # List the download once; map lower-cased names to the real names,
        # keeping the first entry if two names differ only by case.
        available = sorted(os.listdir(tmp))
        by_lower_name = {}
        for fname in available:
            by_lower_name.setdefault(fname.lower(), fname)

        for expected_name, out_path in dest_map.items():
            match = by_lower_name.get(expected_name.lower())
            if match is None:
                raise FileNotFoundError(
                    f"Could not find '{expected_name}' in {tmp}. "
                    f"Available: {available[:10]}"
                )
            # Create the target folder if needed so the copy cannot fail on
            # a missing parent directory.
            out_dir = os.path.dirname(out_path)
            if out_dir:
                os.makedirs(out_dir, exist_ok=True)
            shutil.copyfile(os.path.join(tmp, match), out_path)
    finally:
        # Always remove the scratch directory, including when the download
        # or the file lookup raised.
        shutil.rmtree(tmp, ignore_errors=True)

# Download both Kaggle datasets, copying the one CSV needed from each into
# data/, then load the two files into DataFrames.
for slug, wanted_files in (
    ("nikdavis/steam-store-games", {"steam.csv": "data/steam.csv"}),
    ("gregorut/videogamesales", {"vgsales.csv": "data/vgsales.csv"}),
):
    fetch_kaggle_dataset(slug, dest_map=wanted_files)

steam = pd.read_csv("data/steam.csv")
sales = pd.read_csv("data/vgsales.csv")

Dataset URL: https://www.kaggle.com/datasets/nikdavis/steam-store-games
Dataset URL: https://www.kaggle.com/datasets/gregorut/videogamesales


In [2]:
# Quick sanity check of both loaded tables: dimensions first, then column names.
for label, frame in (("steam.csv shape:", steam), ("vgsales.csv shape:", sales)):
    print(label, frame.shape)
print("steam columns (first 15):", steam.columns[:15].tolist())
print("vgsales columns:", sales.columns.tolist())

steam.csv shape: (27075, 18)
vgsales.csv shape: (16598, 11)
steam columns (first 15): ['appid', 'name', 'release_date', 'english', 'developer', 'publisher', 'platforms', 'required_age', 'categories', 'genres', 'steamspy_tags', 'achievements', 'positive_ratings', 'negative_ratings', 'average_playtime']
vgsales columns: ['Rank', 'Name', 'Platform', 'Year', 'Genre', 'Publisher', 'NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales', 'Global_Sales']
