In [2]:
pip install requests networkx


Note: you may need to restart the kernel to use updated packages.


# Bước 1: Chuẩn bị môi trường & API key

In [3]:
# %% 
import os
from pathlib import Path
import json
import time
import hashlib
from itertools import islice
import requests
import networkx as nx
from dotenv import load_dotenv

# Read the Steam Web API key from the environment (a local .env file is loaded first).
load_dotenv()
API_KEY = os.getenv("STEAM_API_KEY")
if API_KEY is None or API_KEY == "":
    raise SystemExit("Please set STEAM_API_KEY in environment or .env file")

# API root and the directory used for cached responses.
BASE = "https://api.steampowered.com"
CACHE_DIR = Path("steam_cache")
os.makedirs(CACHE_DIR, exist_ok=True)

# Crawl tuning values.
DELAY_BETWEEN_REQUESTS, BATCH_SIZE = 0.5, 50
MAX_ACHIEVEMENTS_APPS, REQUEST_TIMEOUT = 3, 15


# STEAM CRAWLER - full functions (cache, API, endpoints, BFS crawler, save outputs)
- Cache để tránh request trùng
- API request với backoff, skip private
- Các endpoint Steam: summaries, friends, owned games, recently played, groups, bans, achievements
- BFS crawl nhiều batch, skip user/private, merge cuối cùng
- Safe với NetworkX GEXF (loại bỏ NoneType)

In [6]:
# %% 
"""
STEAM CRAWLER - SAFE THROTTLED + CHECKPOINT
--------------------------------------------
- Cache tránh request trùng
- API request với backoff & throttle
- BFS crawl nhiều batch
- Lưu checkpoint mỗi N user
- In progress để theo dõi tiến trình
"""

import os, json, time, hashlib, random
from itertools import islice
import requests
import networkx as nx
from pathlib import Path

# ================================
# Cache utils
# ================================
# Cached API responses are stored as JSON files under this directory.
CACHE_DIR = Path("steam_cache")
os.makedirs(CACHE_DIR, exist_ok=True)

def cache_key(endpoint, params):
    """Return the SHA-1 hex digest that identifies an (endpoint, params) request.

    The digest covers the endpoint string followed by `params` serialised as
    JSON with sorted keys, so the key order of `params` does not matter.
    """
    serialized_params = json.dumps(params, sort_keys=True)
    payload = "".join((endpoint, serialized_params))
    return hashlib.sha1(payload.encode("utf-8")).hexdigest()

def cache_load(endpoint, params):
    """Return the cached response for this request, or None when unavailable.

    None is returned both when no cache file exists and when the file exists
    but cannot be read or parsed.
    """
    cache_file = CACHE_DIR / (cache_key(endpoint, params) + ".json")
    if not cache_file.exists():
        return None
    try:
        raw = cache_file.read_text(encoding="utf-8")
        return json.loads(raw)
    except Exception:
        # Best effort: an unreadable cache entry is treated as a cache miss.
        return None

def cache_save(endpoint, params, data):
    """Write `data` as indented UTF-8 JSON to the cache file of this request."""
    target = CACHE_DIR / (cache_key(endpoint, params) + ".json")
    serialized = json.dumps(data, ensure_ascii=False, indent=2)
    target.write_text(serialized, encoding="utf-8")

# ================================
# Config crawl
# ================================
# Bounds of the random pause taken after each successful request.
DELAY_MIN, DELAY_MAX = 1.0, 2.0
# Number of SteamIDs sent per batched request.
BATCH_SIZE = 50
MAX_APPS_PER_USER = 3
# Passed as `timeout` to requests.get.
REQUEST_TIMEOUT = 15
# A checkpoint is written whenever the visited count is a multiple of this.
CHECKPOINT_EVERY = 50
BASE = "https://api.steampowered.com"
API_KEY = os.environ.get("STEAM_API_KEY")

# ================================
# API request with backoff & throttle
# ================================
def api_get(endpoint, params, use_cache=True):
    """GET a Steam Web API endpoint with caching, throttling and retries.

    Args:
        endpoint: Path appended to BASE, e.g. "ISteamUser/GetFriendList/v0001/".
        params: Query parameters; copied, then the API key is added as "key".
        use_cache: When true, return a previously cached response if present.

    Returns:
        The decoded JSON object on success, {"private": True} on HTTP 401/403,
        or None on any other HTTP status, on a 200 response whose body is not
        a JSON object, or after six failed attempts.
    """
    params = dict(params)
    params["key"] = API_KEY
    # The API key must never reach notebook output: log caller parameters only.
    loggable = {k: v for k, v in params.items() if k != "key"}

    if use_cache:
        cached = cache_load(endpoint, params)
        # Callers use dict methods on the result, so anything else found in
        # the cache (e.g. a raw-text body stored by an earlier run) is
        # ignored and the request is made again.
        if isinstance(cached, dict):
            return cached

    url = f"{BASE}/{endpoint}"
    backoff = 1.0
    for _ in range(6):
        try:
            r = requests.get(url, params=params, timeout=REQUEST_TIMEOUT)
        except requests.RequestException as e:
            # The exception text can include the request URL with its query
            # string, so mask the key before printing.
            message = str(e)
            if API_KEY:
                message = message.replace(str(API_KEY), "***")
            print(f"[WARN] Request exception {message}, backoff {backoff}s")
            time.sleep(backoff + random.uniform(0.1, 0.5))
            backoff *= 2
            continue

        if r.status_code == 200:
            try:
                data = r.json()
            except ValueError:
                data = None
            if not isinstance(data, dict):
                # Returning the raw text would crash callers that call .get().
                print(f"[WARN] non-JSON body from {url} params={loggable}")
                return None
            cache_save(endpoint, params, data)
            # Throttle only after a real network hit; cache hits return early.
            time.sleep(random.uniform(DELAY_MIN, DELAY_MAX))
            return data
        if r.status_code in (429, 503):
            # Rate limited or temporarily unavailable: exponential backoff with jitter.
            time.sleep(backoff + random.uniform(0.1, 0.5))
            backoff *= 2
        elif r.status_code in (401, 403):
            print(f"[PRIVATE] skip: {loggable}")
            return {"private": True}
        else:
            print(f"[WARN] {r.status_code} from {url} params={loggable}")
            return None
    print("[ERROR] Max retries exceeded for", url)
    return None

# ================================
# Helper chunks
# ================================
def chunks(iterable, n):
    """Yield successive lists of at most `n` items taken from `iterable`."""
    source = iter(iterable)
    # iter(callable, sentinel) stops as soon as an empty list is produced.
    for batch in iter(lambda: list(islice(source, n)), []):
        yield batch

# ================================
# Steam endpoints
# ================================
def get_player_summaries(steamid_list):
    """Fetch player summaries for the given SteamIDs, BATCH_SIZE ids per request.

    Batches whose request failed or was flagged private contribute nothing.
    Returns the concatenated "players" lists of all successful batches.
    """
    endpoint = "ISteamUser/GetPlayerSummaries/v0002/"
    players = []
    for batch in chunks(steamid_list, BATCH_SIZE):
        data = api_get(endpoint, {"steamids": ",".join(batch)})
        if not data or data.get("private"):
            continue
        players.extend(data.get("response", {}).get("players", []))
    return players

def get_friend_list(steamid):
    """Return the "steamid" of every friend of `steamid`.

    An empty list is returned when the request failed or was flagged private.
    """
    data = api_get(
        "ISteamUser/GetFriendList/v0001/",
        {"steamid": steamid, "relationship": "friend"},
    )
    if not data or data.get("private"):
        return []
    friends = data.get("friendslist", {}).get("friends", [])
    return [entry.get("steamid") for entry in friends]

def get_owned_games(steamid):
    """Return the "response" object of GetOwnedGames for `steamid`.

    An empty dict is returned when the request failed or was flagged private.
    """
    query = {
        "steamid": steamid,
        "include_appinfo": 1,
        "include_played_free_games": 1,
        "format": "json",
    }
    data = api_get("IPlayerService/GetOwnedGames/v0001/", query)
    if not data or data.get("private"):
        return {}
    return data.get("response", {})

def get_recently_played(steamid):
    """Return the "response" object of GetRecentlyPlayedGames for `steamid`.

    An empty dict is returned when the request failed or was flagged private.
    """
    data = api_get("IPlayerService/GetRecentlyPlayedGames/v0001/", {"steamid": steamid})
    if not data or data.get("private"):
        return {}
    return data.get("response", {})

def get_user_groups(steamid):
    """Return the "groups" list of GetUserGroupList for `steamid`.

    An empty list is returned when the request failed or was flagged private.
    """
    data = api_get("ISteamUser/GetUserGroupList/v1/", {"steamid": steamid})
    if not data or data.get("private"):
        return []
    return data.get("response", {}).get("groups", [])

def get_player_bans(steamid_list):
    """Fetch ban records for the given SteamIDs, BATCH_SIZE ids per request.

    Batches whose request failed or was flagged private contribute nothing.
    Returns the concatenated top-level "players" lists of all successful batches.
    """
    endpoint = "ISteamUser/GetPlayerBans/v1/"
    ban_records = []
    for batch in chunks(steamid_list, BATCH_SIZE):
        data = api_get(endpoint, {"steamids": ",".join(batch)})
        if not data or data.get("private"):
            continue
        ban_records.extend(data.get("players", []))
    return ban_records

def get_player_achievements(steamid, appid):
    """Return the "playerstats" object of GetPlayerAchievements for one user and app.

    An empty dict is returned when the request failed or was flagged private.
    """
    data = api_get(
        "ISteamUserStats/GetPlayerAchievements/v1/",
        {"steamid": steamid, "appid": appid},
    )
    if not data or data.get("private"):
        return {}
    return data.get("playerstats", {})

def resolve_vanity(name):
    """Resolve a vanity URL name to its "steamid", or None when not resolved.

    None is returned when the request failed or the response's "success"
    field is not 1.
    """
    data = api_get("ISteamUser/ResolveVanityURL/v0001/", {"vanityurl": name})
    if not data:
        return None
    response = data.get("response", {})
    if response.get("success") != 1:
        return None
    return response.get("steamid")

# ================================
# Load existing profiles
# ================================
def load_existing_profiles(json_path):
    """Load a saved profiles file.

    Returns:
        (ids, profiles): the set of top-level keys and the decoded object, or
        (set(), {}) when `json_path` does not exist.
    """
    if not Path(json_path).exists():
        return set(), {}
    with open(json_path, "r", encoding="utf-8") as handle:
        profiles = json.load(handle)
    known_ids = set(profiles.keys())
    return known_ids, profiles

# ================================
# Save outputs
# ================================
def save_outputs(G, profiles, out_prefix="steam_output"):
    """Sanitise node attributes, then write the profiles (JSON) and graph (GEXF).

    Node attributes of `G` are rewritten in place before export: None becomes
    "" and list/dict values become their str() form. Output goes to
    "<out_prefix>_profiles1.json" and "<out_prefix>_graph1.gexf".
    """
    # Replace None and container values on every node before the GEXF export.
    for _, attrs in G.nodes(data=True):
        for key in list(attrs):
            value = attrs[key]
            if value is None:
                attrs[key] = ""
            elif isinstance(value, (list, dict)):
                attrs[key] = str(value)

    json_path = f"{out_prefix}_profiles1.json"
    gexf_path = f"{out_prefix}_graph1.gexf"

    with open(json_path, "w", encoding="utf-8") as f:
        json.dump(profiles, f, indent=2, ensure_ascii=False)
    nx.write_gexf(G, gexf_path)

    print(f"[Saved] profiles -> {json_path}")
    print(f"[Saved] graph -> {gexf_path}")

# ================================
# BFS crawler with checkpoint
# ================================
def _annotate_user_node(G, steamid, max_apps_per_user, fetch_achievements):
    """Attach game, group and (optionally) achievement attributes to a user's node."""
    owned = get_owned_games(steamid)
    recent = get_recently_played(steamid)
    groups = get_user_groups(steamid)

    node = G.nodes[steamid]
    node["owned_game_count"] = owned.get("game_count", 0)
    node["recent_count"] = recent.get("total_count", 0)
    # NOTE(review): GetUserGroupList appears to return the id under "gid";
    # "groupid64" is still preferred when present so existing data is unchanged.
    node["groups"] = [g.get("groupid64") or g.get("gid") for g in (groups or [])]

    # Keep only the most-played games.
    apps = owned.get("games", [])
    top_apps = sorted(apps, key=lambda app: app.get("playtime_forever", 0), reverse=True)[:max_apps_per_user]
    node["top_apps"] = [app.get("appid") for app in top_apps]

    # Achievements are optional because they cost one request per kept game.
    if fetch_achievements:
        node["achievements_sample"] = {
            app.get("appid"): get_player_achievements(steamid, app.get("appid"))
            for app in top_apps
        }


def crawl(seed_ids, depth=1, max_users=1000, max_apps_per_user=3, visited_global=None, save_prefix="steam_output", fetch_achievements=True):
    """Breadth-first crawl of the Steam friend graph starting from `seed_ids`.

    Args:
        seed_ids: SteamID strings (all digits) or vanity names to start from.
        depth: Maximum friend distance from a seed that is crawled (seeds are 0).
        max_users: The loop stops once `visited_global` holds this many ids.
        max_apps_per_user: Number of most-played owned games kept per user.
        visited_global: Optional set of ids treated as already crawled; it is
            updated in place. A new set is created only when None is passed.
        save_prefix: Prefix for the files written by save_outputs; checkpoints
            use "<save_prefix>_partial".
        fetch_achievements: When true, fetch achievements for the kept games.

    Returns:
        (G, all_profiles, visited_global): the graph, the summaries of users
        crawled in this call keyed by id, and the updated visited set.
    """
    from collections import deque  # local import keeps this block self-contained

    G = nx.Graph()
    # `visited_global or set()` would silently replace an empty set passed by
    # the caller, so test for None explicitly.
    if visited_global is None:
        visited_global = set()
    queue = deque((s, 0) for s in seed_ids)  # deque: O(1) pops from the front
    queued = set()  # friend ids already scheduled, so each is enqueued once
    all_profiles = {}
    stats = {"public": 0, "private": 0, "skipped": 0}

    while queue and len(visited_global) < max_users:
        steamid, d = queue.popleft()
        if steamid in visited_global or d > depth:
            continue
        if not steamid.isdigit():
            steamid = resolve_vanity(steamid) or steamid
            if not steamid.isdigit():
                stats["skipped"] += 1
                continue
            # The resolved id may already have been crawled in numeric form.
            if steamid in visited_global:
                continue

        print(f"Crawling {steamid} (depth {d}) | Visited {len(visited_global)}/{max_users}")

        players = get_player_summaries([steamid])
        visited_global.add(steamid)
        if not players:
            stats["private"] += 1
        else:
            player = players[0]
            stats["public"] += 1
            all_profiles[steamid] = player

            G.add_node(steamid, label=player.get("personaname", ""),
                       avatar=player.get("avatarfull", ""),
                       country=player.get("loccountrycode", ""))

            for f in get_friend_list(steamid):
                if not f:
                    # A friend entry without an id cannot be used as a node.
                    continue
                G.add_edge(steamid, f)
                if d + 1 <= depth and f not in visited_global and f not in queued:
                    queued.add(f)
                    queue.append((f, d + 1))

            _annotate_user_node(G, steamid, max_apps_per_user, fetch_achievements)

        # Checked for every newly visited user (public or private) so that a
        # checkpoint is not skipped when the count lands on a private profile.
        if len(visited_global) % CHECKPOINT_EVERY == 0:
            save_outputs(G, all_profiles, out_prefix=f"{save_prefix}_partial")
            print(f"[Checkpoint] Saved after {len(visited_global)} users")

    # Ban info is only stored on graph nodes, so ask only for visited ids that
    # are in the graph; sorting keeps batches (and cache keys) stable.
    ban_targets = sorted(n for n in G.nodes if n in visited_global)
    bans = get_player_bans(ban_targets)
    bans_map = {b["SteamId"]: b for b in bans if "SteamId" in b}
    for node in G.nodes:
        G.nodes[node]["ban_info"] = bans_map.get(node, {})

    save_outputs(G, all_profiles, out_prefix=save_prefix)
    print(f"Stats: {stats}")
    return G, all_profiles, visited_global

# ================================
# Example run
# ================================
# Resume from an earlier profile dump when it exists.
visited_prev, profiles_prev = load_existing_profiles("steam_output_test_profiles1.json")
SEEDS = [
    "76561198294300457",
    "76561197973974836",
]  # Replace with your own SteamIDs

G_new, profiles_new, visited_new = crawl(
    SEEDS, depth=2, max_users=5000, max_apps_per_user=3,
    visited_global=visited_prev, save_prefix="steam_output",
    fetch_achievements=False,  # Quick test run first
)


Crawling 76561198294300457 (depth 0) | Visited 0/5000
Crawling 76561197973974836 (depth 0) | Visited 1/5000
Crawling 76561197993609350 (depth 1) | Visited 2/5000
Crawling 76561198046562139 (depth 1) | Visited 3/5000
Crawling 76561198061953477 (depth 1) | Visited 4/5000
Crawling 76561198062226249 (depth 1) | Visited 5/5000
Crawling 76561198076766555 (depth 1) | Visited 6/5000
Crawling 76561198077186687 (depth 1) | Visited 7/5000
Crawling 76561198097199951 (depth 1) | Visited 8/5000
Crawling 76561198109776027 (depth 1) | Visited 9/5000
Crawling 76561198110181817 (depth 1) | Visited 10/5000
Crawling 76561198116685130 (depth 1) | Visited 11/5000
Crawling 76561198120611517 (depth 1) | Visited 12/5000
Crawling 76561198128339070 (depth 1) | Visited 13/5000
Crawling 76561198134815732 (depth 1) | Visited 14/5000
Crawling 76561198135035013 (depth 1) | Visited 15/5000
Crawling 76561198138224835 (depth 1) | Visited 16/5000
[PRIVATE] skip: {'steamid': '76561198138224835', 'relationship': 'friend', 

KeyboardInterrupt: 

# Kết hợp 2 file graph với graph1; profile với profile1

In [8]:
import json
import networkx as nx

# ---- 1. Merge Profiles ----
with open("steam_output_partial_profiles.json", "r", encoding="utf-8") as f1, \
     open("steam_output_partial_profiles1.json", "r", encoding="utf-8") as f2:
    profiles_a = json.load(f1)
    profiles_b = json.load(f2)

# Combine both dicts; entries from profiles_b win when a steamid is in both.
merged_profiles = {
    uid: profile
    for source in (profiles_a, profiles_b)
    for uid, profile in source.items()
}

with open("merged_profiles.json", "w", encoding="utf-8") as f:
    json.dump(merged_profiles, f, indent=2, ensure_ascii=False)

print(f"✅ Merged profiles: {len(merged_profiles)} users")


# ---- 2. Merge Graphs ----
G_a, G_b = (
    nx.read_gexf(path)
    for path in ("steam_output_partial_graph.gexf", "steam_output_partial_graph1.gexf")
)

# Combine the two graphs into one and export it.
G_merged = nx.compose(G_a, G_b)
nx.write_gexf(G_merged, "merged_graph.gexf")

print(f"✅ Merged graph: {G_merged.number_of_nodes()} nodes, {G_merged.number_of_edges()} edges")


✅ Merged profiles: 1949 users
✅ Merged graph: 180904 nodes, 274531 edges


# Phân loại category cho steam_cache (game, group, friends, recently_played, summaries, bans, achievements, vanity, others)

In [4]:
import os
import json

def merge_json_files(input_dir, output_file):
    """Merge every *.json file in `input_dir` into one JSON object at `output_file`.

    Each payload is stored under its top-level "steamid" (or "id") value when
    the payload is a JSON object carrying a string or integer there; otherwise
    it is stored under the file name. Files that cannot be read or parsed are
    reported and skipped. When two files map to the same key, the later file
    (in sorted file-name order) wins.

    Args:
        input_dir: Directory scanned (non-recursively) for ".json" files.
        output_file: Path of the combined JSON file to write.
    """
    merged_data = {}

    # Sorted so the merged file has the same key order on every run.
    for filename in sorted(os.listdir(input_dir)):
        if not filename.endswith(".json"):
            continue
        file_path = os.path.join(input_dir, filename)
        try:
            with open(file_path, "r", encoding="utf-8") as f:
                data = json.load(f)
        except json.JSONDecodeError:
            print(f"⚠️ Lỗi đọc JSON: {file_path}, bỏ qua...")
            continue
        except Exception as e:
            print(f"⚠️ Lỗi khác với file {file_path}: {e}")
            continue

        # Only JSON objects can carry an id; lists and scalars used to raise
        # AttributeError here and were dropped, so key them by file name.
        key = filename
        if isinstance(data, dict):
            candidate = data.get("steamid") or data.get("id")
            # Accept only plain string/integer ids as dictionary keys.
            if isinstance(candidate, (str, int)) and not isinstance(candidate, bool):
                key = str(candidate)
        merged_data[key] = data

    # Write everything into a single JSON file.
    with open(output_file, "w", encoding="utf-8") as out:
        json.dump(merged_data, out, ensure_ascii=False, indent=2)

    print(f"✅ Đã gộp {len(merged_data)} file từ {input_dir} vào {output_file}")

# Merge each cache category directory into its own combined file.
for category in ("games", "summaries", "achievements", "friends", "groups"):
    merge_json_files(f"steam_cache/{category}", f"merged_{category}.json")


✅ Đã gộp 1502 file từ steam_cache/games vào merged_games.json
✅ Đã gộp 1951 file từ steam_cache/summaries vào merged_summaries.json
✅ Đã gộp 491 file từ steam_cache/achievements vào merged_achievements.json
✅ Đã gộp 1583 file từ steam_cache/friends vào merged_friends.json
✅ Đã gộp 1800 file từ steam_cache/groups vào merged_groups.json


## Gộp 2 file profiles và summaries (trùng thì lấy bên nào có trường dữ liệu nhiều hơn)

In [5]:
import json

def merge_profiles_and_summaries(file_profiles, file_summaries, output_file):
    """Merge two user-keyed JSON files and write the result to `output_file`.

    Every user from `file_profiles` is kept. A user from `file_summaries` is
    added when missing, and replaces the profile entry only when the summary
    has strictly more fields; on a tie the profile entry stays.
    """
    # Load both inputs.
    with open(file_profiles, "r", encoding="utf-8") as f:
        profiles = json.load(f)
    with open(file_summaries, "r", encoding="utf-8") as f:
        summaries = json.load(f)

    # Start from the profiles, then fold the summaries in.
    merged = {uid: pdata for uid, pdata in profiles.items()}
    for uid, sdata in summaries.items():
        # New user, or the summary carries more fields than what is stored.
        if uid not in merged or len(sdata.keys()) > len(merged[uid].keys()):
            merged[uid] = sdata

    # Write the merged result.
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(merged, f, ensure_ascii=False, indent=2)

    print(f"✅ Đã gộp {len(merged)} user vào {output_file}")

merge_profiles_and_summaries(
    file_profiles="steam_cache/profiles.json",
    file_summaries="steam_cache/summaries.json",
    output_file="steam_cache/profiles_summaries_merged.json",
)


✅ Đã gộp 3900 user vào steam_cache/profiles_summaries_merged.json


## Xóa 3 trường dữ liệu: avatar, avatarmedium, avatarhash vì chúng không cần thiết cho bài toán

In [7]:
import json

def clean_profiles(input_file, output_file, drop_fields=("avatar", "avatarmedium", "avatarhash")):
    """Copy a user-keyed profiles JSON file while dropping unwanted fields.

    Args:
        input_file: Path of the JSON object mapping user id -> profile record.
        output_file: Destination path; may be the same path as `input_file`.
        drop_fields: Iterable of field names removed from every profile record
            (other fields, e.g. "avatarfull", are kept).

    Records that are not JSON objects are copied through unchanged.
    """
    import os  # local import keeps this block self-contained

    with open(input_file, "r", encoding="utf-8") as f:
        profiles = json.load(f)

    unwanted = set(drop_fields)
    cleaned = {}
    for uid, data in profiles.items():
        if isinstance(data, dict):
            cleaned[uid] = {k: v for k, v in data.items() if k not in unwanted}
        else:
            cleaned[uid] = data

    # The output may be the input file itself, so write to a sibling temp file
    # and swap it in: a failed dump then never truncates the only copy.
    tmp_file = f"{output_file}.tmp"
    try:
        with open(tmp_file, "w", encoding="utf-8") as f:
            json.dump(cleaned, f, ensure_ascii=False, indent=2)
        os.replace(tmp_file, output_file)
    finally:
        # Only present here when the dump or the swap failed.
        if os.path.exists(tmp_file):
            os.remove(tmp_file)

    print(f"  Đã lưu file đã làm sạch -> {output_file}")


# Example run: the file is cleaned in place (same input and output path).
clean_profiles(
    input_file="steam_cache/profiles_users.json",
    output_file="steam_cache/profiles_users.json",
)


  Đã lưu file đã làm sạch -> steam_cache/profiles_users.json
