In [11]:
import pandas as pd

# Load the two raw inputs.
# low_memory=False disables pandas' chunked parsing, which can otherwise
# produce mixed-type columns on large files.
df_metrics = pd.read_csv('../data/raw/all_data.csv', low_memory=False)      # all_data.csv
df_info = pd.read_csv('../data/raw/steam_app_data.csv', low_memory=False)   # steam_app_data

# Strip stray leading/trailing whitespace from the column names.
df_metrics.columns = df_metrics.columns.str.strip()
df_info.columns = df_info.columns.str.strip()

# all_data.csv calls the key 'appid'; rename it to 'steam_appid' so both
# frames share the same join-key name.
df_metrics = df_metrics.rename(columns={'appid': 'steam_appid'})

print(f"all_data 資料大小: {df_metrics.shape}")
print(f"steam_app_data 資料大小: {df_info.shape}")

# Count rows whose steam_appid has already appeared earlier in the frame.
dupes_metrics = df_metrics.duplicated(subset=['steam_appid']).sum()
dupes_info = df_info.duplicated(subset=['steam_appid']).sum()
print(f"all_data 重複 ID 數量: {dupes_metrics}")
print(f"steam_app_data 重複 ID 數量: {dupes_info}")

# De-duplicate on the key.
# keep='first': for a repeated ID, keep the first row and drop the later ones.
df_metrics_clean = df_metrics.drop_duplicates(subset=['steam_appid'], keep='first')
df_info_clean = df_info.drop_duplicates(subset=['steam_appid'], keep='first')

print(f"去重後 - all_data 筆數: {df_metrics_clean.shape[0]}")
print(f"去重後 - steam_app_data 筆數: {df_info_clean.shape[0]}")

# Merge the two datasets.
# INNER JOIN: keep only the games present on both sides.
# validate='one_to_one' makes pandas raise a MergeError if steam_appid is not
# unique in either frame, so a broken de-duplication step cannot silently
# multiply rows in the merged result.
df_merged = pd.merge(
    df_metrics_clean,
    df_info_clean,
    on='steam_appid',
    how='inner',
    suffixes=('_metrics', '_info'),  # disambiguate column names present in both frames
    validate='one_to_one',
)

print(f"合併後資料大小: {df_merged.shape}")

# Inspect the first few merged rows.
display(df_merged.head(3))

all_data 資料大小: (86538, 18)
steam_app_data 資料大小: (86538, 39)
all_data 重複 ID 數量: 4038
steam_app_data 重複 ID 數量: 4080
去重後 - all_data 筆數: 82500
去重後 - steam_app_data 筆數: 82458
合併後資料大小: (82413, 56)


Unnamed: 0.1,Unnamed: 0,steam_appid,name_metrics,developer,publisher,score_rank,positive,negative,userscore,owners,...,categories,genres,screenshots,movies,recommendations,achievements,release_date,support_info,background,content_descriptors
0,0,10,Counter-Strike,Valve,Valve,,243818,6427,0,"10,000,000 .. 20,000,000",...,"[{'id': 1, 'description': 'Multi-player'}, {'i...","[{'id': '1', 'description': 'Action'}]","[{'id': 0, 'path_thumbnail': 'https://shared.a...",,{'total': 162153},,"{'coming_soon': False, 'date': '1 Nov, 2000'}","{'url': 'http://steamcommunity.com/app/10', 'e...",https://store.akamai.steamstatic.com/images/st...,"{'ids': [2, 5], 'notes': 'Includes intense vio..."
1,1,20,Team Fortress Classic,Valve,Valve,,7602,1136,0,"1,000,000 .. 2,000,000",...,"[{'id': 1, 'description': 'Multi-player'}, {'i...","[{'id': '1', 'description': 'Action'}]","[{'id': 0, 'path_thumbnail': 'https://shared.a...",,{'total': 6647},,"{'coming_soon': False, 'date': '1 Apr, 1999'}","{'url': '', 'email': ''}",https://store.akamai.steamstatic.com/images/st...,"{'ids': [2, 5], 'notes': 'Includes intense vio..."
2,2,30,Day of Defeat,Valve,Valve,,6414,688,0,"5,000,000 .. 10,000,000",...,"[{'id': 1, 'description': 'Multi-player'}, {'i...","[{'id': '1', 'description': 'Action'}]","[{'id': 0, 'path_thumbnail': 'https://shared.a...",,{'total': 4318},,"{'coming_soon': False, 'date': '1 May, 2003'}","{'url': '', 'email': ''}",https://store.akamai.steamstatic.com/images/st...,"{'ids': [2, 5], 'notes': 'This game includes f..."


In [12]:
import os

# Destination for the merged-but-not-yet-cleaned table.
save_path = '../data/interim/steam_merged_raw.csv'

# Create the target directory if needed; exist_ok=True keeps re-runs from failing.
os.makedirs(os.path.dirname(save_path), exist_ok=True)

# Write the merged frame without the positional index column.
df_merged.to_csv(save_path, index=False)