### 데이터 로드 + 기본구조 확인

In [1]:
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt

# DL (PyTorch)
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

# preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix

from tqdm.auto import tqdm
import os
import joblib
import itertools
from tqdm.auto import tqdm



# Plot styling: draw text with the Malgun Gothic font (presumably chosen so the
# Korean labels render) and keep the ASCII minus sign on the axes.
plt.rcParams['font.family'] = 'Malgun Gothic'
plt.rcParams['axes.unicode_minus'] = False

# Reproducibility: seed NumPy and PyTorch (CPU, plus every CUDA device if present).
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)
torch.manual_seed(RANDOM_STATE)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(RANDOM_STATE)

# Use the GPU when one is visible, otherwise the CPU.
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print("DEVICE:", DEVICE)


DEVICE: cpu


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the raw Steam review export.
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, so a column cannot end up with mixed types
# (the situation pandas reports as a DtypeWarning on this line).
df = pd.read_csv('./data/steam_reviews_last365d.csv', low_memory=False)

  df = pd.read_csv('./data/steam_reviews_last365d.csv')


In [3]:
# List the column names of the raw review frame.
df.columns

Index(['appid', 'recommendationid', 'steamid', 'num_games_owned',
       'num_reviews_author', 'playtime_forever', 'playtime_last_two_weeks',
       'playtime_at_review', 'deck_playtime_at_review', 'last_played',
       'language', 'review', 'timestamp_created', 'timestamp_updated',
       'voted_up', 'votes_up', 'votes_funny', 'weighted_vote_score',
       'comment_count', 'steam_purchase', 'received_for_free',
       'written_during_early_access', 'developer_response',
       'timestamp_dev_responded', 'primarily_steam_deck'],
      dtype='object')

In [4]:
# Keep only the reviews of the 50 appids that have the most reviews.
appid_counts = df['appid'].value_counts()            # sorted by count, descending
top50_appids = appid_counts.index[:50].tolist()
df_top50 = df.loc[df['appid'].isin(top50_appids)].copy()

# Missing values per column
df_top50.isna().sum()

appid                                0
recommendationid                     0
steamid                              0
num_games_owned                      0
num_reviews_author                   0
playtime_forever                     0
playtime_last_two_weeks              0
playtime_at_review                   0
deck_playtime_at_review        4736661
last_played                          0
language                             0
review                           15761
timestamp_created                    0
timestamp_updated                    0
voted_up                             0
votes_up                             0
votes_funny                          0
weighted_vote_score                  0
comment_count                        0
steam_purchase                       0
received_for_free                    0
written_during_early_access          0
developer_response             4824716
timestamp_dev_responded        4824716
primarily_steam_deck                 0
dtype: int64

In [5]:
# Drop the three columns that are missing for nearly every row.
# `review` also has missing values but is kept; those rows are removed later.
df_model = df_top50.drop(
    ['deck_playtime_at_review', 'developer_response', 'timestamp_dev_responded'],
    axis=1,
)
df_model.isna().sum()

appid                              0
recommendationid                   0
steamid                            0
num_games_owned                    0
num_reviews_author                 0
playtime_forever                   0
playtime_last_two_weeks            0
playtime_at_review                 0
last_played                        0
language                           0
review                         15761
timestamp_created                  0
timestamp_updated                  0
voted_up                           0
votes_up                           0
votes_funny                        0
weighted_vote_score                0
comment_count                      0
steam_purchase                     0
received_for_free                  0
written_during_early_access        0
primarily_steam_deck               0
dtype: int64

In [6]:
# Preview the working frame (the rendered output shows only the first and last rows).
df_model

Unnamed: 0,appid,recommendationid,steamid,num_games_owned,num_reviews_author,playtime_forever,playtime_last_two_weeks,playtime_at_review,last_played,language,...,timestamp_updated,voted_up,votes_up,votes_funny,weighted_vote_score,comment_count,steam_purchase,received_for_free,written_during_early_access,primarily_steam_deck
27798,2139460,215256415,76561198092089560,0,1,1659,1659,1628.0,1767647101,english,...,1767645356,True,0,0,0.50000,0,False,False,False,False
27799,2139460,215256182,76561197995642012,0,4,370,367,339.0,1767646994,french,...,1767645190,True,0,0,0.50000,0,False,False,False,False
27800,2139460,215249671,76561198217416651,0,32,552,552,401.0,1767648127,greek,...,1767640383,True,0,0,0.50000,0,False,False,False,False
27801,2139460,215246874,76561198111964424,0,16,146,0,146.0,1765836608,russian,...,1767638383,False,0,0,0.50000,0,False,False,False,False
27802,2139460,215244452,76561198000529800,85,1,47556,276,47519.0,1767638299,english,...,1767636538,True,0,0,0.50000,0,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5447568,570,192652367,76561199824228518,79,7,49329,9264,5671.0,1767667012,russian,...,1757669172,True,0,0,0.50000,0,False,True,False,False
5447569,570,192652315,76561199509986279,0,1,129565,591,79079.0,1767661292,russian,...,1744542216,False,0,0,0.50000,0,False,False,False,False
5447570,570,192652264,76561199699553706,21,4,36710,548,28793.0,1767315372,russian,...,1762351962,True,0,0,0.50000,0,False,False,False,False
5447571,570,192652232,76561199815094533,2,1,3633,0,1422.0,1766312795,russian,...,1744542138,True,1,0,0.52381,1,False,False,False,False


In [7]:
# Hand-assigned game_style for each of the top-50 appids.
# The style selects the churn window later in the notebook, so every appid
# present in df_model needs an entry here.
STYLE_MAP = {
    3241660: "online",  # R.E.P.O
    2807960: "online",  # Battlefield™ 6
    730:     "online",  # Counter-Strike 2
    1808500: "online",  # ARC Raiders
    1030300: "story",   # Hollow Knight: Silksong
    570:     "online",  # Dota 2
    578080:  "online",  # PUBG
    2246340: "video",   # Monster Hunter Wilds
    2592160: "story",   # Dispatch
    553850:  "online",  # HELLDIVERS™ 2
    3240220: "online",  # Grand Theft Auto V Enhanced
    1091500: "story",   # Cyberpunk 2077
    1903340: "video",   # Clair Obscur: Expedition 33
    2001120: "story",   # Split Fiction
    1245620: "video",   # Elden Ring
    1086940: "video",   # Baldur's Gate 3
    1144200: "online",  # Ready or Not
    3167020: "video",   # Escape From Duckov
    3564740: "online",  # Where Winds Meet
    227300:  "video",   # Euro Truck Simulator 2
    108600:  "video",   # Project Zomboid
    413150:  "video",   # Stardew Valley
    1771300: "video",   # Kingdom Come 2
    3489700: "story",   # Stellar Blade™
    1172470: "online",  # Apex
    1222140: "story",   # Detroit: Become Human
    1326470: "video",   # Sons Of The Forest
    990080:  "story",   # Hogwarts Legacy
    1551360: "video",   # Forza Horizon 5
    1623730: "video",   # Palworld
    1145350: "video",   # Hades II
    2183900: "story",   # Space Marine AE
    230410:  "online",  # Warframe
    2139460: "online",  # Once Human
    236390:  "online",  # War Thunder
    440:     "online",  # Team Fortress 2
    1973530: "online",  # Limbus Company
    394360:  "video",   # Hearts of Iron IV
    3932890: "online",  # Escape from Tarkov
    526870:  "video",   # Satisfactory
    3513350: "online",  # Wuthering Waves
    3405690: "online",  # EA SPORTS FC™ 26
    2622380: "video",   # ELDEN RING NIGHTREIGN
    814380:  "video",   # Sekiro™: Shadows Die Twice - GOTY Edition
    648800:  "video",   # Raft
    3159330: "story",   # Assassin’s Creed Shadows
    3527290: "video",   # PEAK
    2651280: "story",   # Spider-Man 2
    294100:  "video",   # RimWorld
    1222670: "video",   # The Sims 4
}

# Create the game_style column
df_model["game_style"] = df_model["appid"].map(STYLE_MAP)

# Fail fast when the data-derived top-50 contains an appid that STYLE_MAP does
# not know: .map() would silently leave NaN, which only surfaces later as an
# obscure integer-cast error when the churn window is computed.
unmapped_appids = df_model.loc[df_model["game_style"].isna(), "appid"].unique()
if len(unmapped_appids) > 0:
    raise ValueError(
        f"STYLE_MAP has no game_style for appid(s): {sorted(unmapped_appids.tolist())}"
    )

df_model["game_style"].value_counts()


game_style
online    2467704
video     1427383
story      935797
Name: count, dtype: int64

In [8]:
# Number of missing reviews per appid
review_na_by_app = (
    df_model.groupby("appid")["review"]
            .apply(lambda reviews: reviews.isna().sum())
            .to_frame("review_na_cnt")
)

# Total number of rows per appid
total_by_app = df_model.groupby("appid").size().to_frame("total_cnt")

# Combine both counts and add the missing ratio
review_na_stats = total_by_app.join(review_na_by_app, how="left")
review_na_stats["review_na_cnt"] = review_na_stats["review_na_cnt"].fillna(0)
review_na_stats["review_na_ratio"] = review_na_stats["review_na_cnt"].div(
    review_na_stats["total_cnt"]
)

# Show the appids with the most missing reviews first
review_na_stats.sort_values("review_na_cnt", ascending=False).iloc[:30]


Unnamed: 0_level_0,total_cnt,review_na_cnt,review_na_ratio
appid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
3241660,341851,1067,0.003121
2807960,301260,1051,0.003489
1808500,253341,991,0.003912
3240220,139431,675,0.004841
730,273327,668,0.002444
1091500,120404,566,0.004701
553850,148578,510,0.003433
578080,194464,500,0.002571
1030300,239171,493,0.002061
1086940,105688,474,0.004485


In [9]:
# Remove the rows whose review text is missing and report how many were dropped
before = len(df_model)
df_model = df_model.dropna(subset=["review"])
after = len(df_model)

print(f"[drop NaN review] before={before:,} -> after={after:,} (dropped {before-after:,})")


[drop NaN review] before=4,830,884 -> after=4,815,123 (dropped 15,761)


In [10]:
# Confirm that no missing review text is left after the drop.
df_model[['review']].isna().sum()

review    0
dtype: int64

In [11]:
# True where the review is empty or consists of whitespace only
blank_mask = df_model["review"].astype(str).str.strip() == ""

# Per appid: number of reviews and number of blank reviews
blank_by_appid = blank_mask.groupby(df_model["appid"]).agg(total_cnt="size", blank_cnt="sum")
blank_by_appid["blank_ratio"] = blank_by_appid["blank_cnt"].div(blank_by_appid["total_cnt"])

# Only appids that have at least one blank review, largest blank_cnt first
blank_by_appid_nonzero = (
    blank_by_appid.loc[blank_by_appid["blank_cnt"].gt(0)]
                  .sort_values("blank_cnt", ascending=False)
)

display(blank_by_appid_nonzero.head(50))
print("공백 리뷰 총 개수:", int(blank_mask.sum()))


Unnamed: 0_level_0,total_cnt,blank_cnt,blank_ratio
appid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
730,272659,186,0.000682
3241660,340784,159,0.000467
3240220,138756,130,0.000937
2807960,300209,121,0.000403
578080,193964,111,0.000572
570,204123,94,0.000461
227300,92225,74,0.000802
1808500,252350,73,0.000289
1030300,238678,59,0.000247
553850,148068,56,0.000378


공백 리뷰 총 개수: 1902


### 데이터 전처리

In [12]:
# Remove reviews that are empty or whitespace-only and report how many were dropped
before = len(df_model)

blank_mask = df_model["review"].astype(str).str.strip().str.len() == 0
df_model = df_model.loc[~blank_mask].copy()

after = len(df_model)
print(f"[drop blank review] before={before:,} -> after={after:,} (dropped {before-after:,})")

[drop blank review] before=4,815,123 -> after=4,813,221 (dropped 1,902)


- churn 라벨 생성

- 리뷰 작성 후 마지막 플레이까지의 일수(days_after_review)가 game_style별 기준일(online 7일 / video 10일 / story 5일)보다 짧으면 churn=1로 표시

In [13]:
# Churn window (in days) for each game_style
STYLE_WINDOW_DAYS = {
    "online": 7,
    "video": 10,
    "story": 5,
}

# Review time / last-played time (epoch seconds -> datetime; unparseable values become NaT)
review_dt = pd.to_datetime(df_model["timestamp_created"], unit="s", errors="coerce")
last_dt   = pd.to_datetime(df_model["last_played"], unit="s", errors="coerce")

# Whole days between writing the review and the last recorded play session
df_model["days_after_review"] = (last_dt - review_dt).dt.days

# Window that applies to each row, looked up from its game_style
df_model["churn_window_days"] = df_model["game_style"].map(STYLE_WINDOW_DAYS)

# A game_style without a window would leave NaN here; report it by name
# instead of failing with an anonymous integer-cast error.
unknown_styles = df_model.loc[df_model["churn_window_days"].isna(), "game_style"].unique()
if len(unknown_styles) > 0:
    raise ValueError(f"No churn window defined for game_style value(s): {list(unknown_styles)}")

# churn is True when the player stopped within the window after the review.
# The two extra rules (never played / last played before the review) are kept
# from the original labelling. All rules are combined into ONE boolean mask:
# writing the integer 1 into a bool column through .loc is an incompatible-dtype
# assignment that pandas warns about and will reject in a future version.
is_churn = (
    (df_model["days_after_review"] < df_model["churn_window_days"])
    | (df_model["last_played"] == 0)
    | (df_model["days_after_review"] < 0)
)
df_model["churn"] = is_churn

print(df_model["churn"].value_counts(dropna=False).sort_index())

# Store the target as int (0/1) for the model
df_model["churn"] = df_model["churn"].astype(int)


  df_model.loc[df_model["last_played"] == 0, "churn"] = 1


churn
False    3432753
True     1380468
Name: count, dtype: int64


- 리뷰 언어(language) 고유값 개수 확인 (developer_response 컬럼은 앞에서 제거했고, review가 NaN인 행도 이미 드롭함)

In [14]:
# Number of distinct review languages in the working frame.
df_model[['language']].nunique()


language    30
dtype: int64

- 수치형 컬럼 기술 통계 확인 (df_model.describe())

In [15]:
# Summary statistics of the numeric columns (including the new churn columns).
df_model.describe()


Unnamed: 0,appid,recommendationid,steamid,num_games_owned,num_reviews_author,playtime_forever,playtime_last_two_weeks,playtime_at_review,last_played,timestamp_created,timestamp_updated,votes_up,votes_funny,weighted_vote_score,comment_count,days_after_review,churn_window_days,churn
count,4813221.0,4813221.0,4813221.0,4813221.0,4813221.0,4813221.0,4813221.0,4813221.0,4813221.0,4813221.0,4813221.0,4813221.0,4813221.0,4813221.0,4813221.0,4813221.0,4813221.0,4813221.0
mean,1637917.0,203442200.0,7.65612e+16,72.24943,10.93352,13984.06,434.6475,10673.46,1761394000.0,1756335000.0,1756654000.0,1.038864,0.1820704,0.5029349,0.06856697,58.06457,7.498542,0.2868075
std,1156278.0,8342337.0,619088600.0,249.208,63.1823,40662.21,1151.922,38109.09,11456370.0,8849875.0,8803059.0,30.84613,6.968658,0.02391883,2.379588,136.8769,1.784596,0.4522709
min,440.0,184884000.0,7.65612e+16,0.0,1.0,0.0,0.0,5.0,0.0,1736122000.0,1736122000.0,0.0,0.0,0.02213739,0.0,-20441.0,5.0,0.0
25%,578080.0,196754900.0,7.65612e+16,0.0,2.0,1500.0,0.0,649.0,1759989000.0,1749423000.0,1749911000.0,0.0,0.0,0.5,0.0,4.0,7.0,0.0
50%,1551360.0,205936800.0,7.65612e+16,0.0,4.0,4166.0,0.0,2093.0,1765025000.0,1759660000.0,1760116000.0,0.0,0.0,0.5,0.0,34.0,7.0,0.0
75%,2807960.0,210284600.0,7.65612e+16,69.0,10.0,10341.0,233.0,6120.0,1767091000.0,1764041000.0,1764091000.0,0.0,0.0,0.5,0.0,102.0,10.0,1.0
max,3932890.0,215279200.0,7.65612e+16,38168.0,19748.0,2919994.0,36647.0,2789459.0,1767667000.0,1767666000.0,1767666000.0,20098.0,6372.0,0.9958196,4500.0,364.0,10.0,1.0


- good_review 생성 (언어별 긍정/부정 키워드 사전 기반)

In [16]:
# 1) Per-language keyword lexicon.
# Entries are looked up by the value of the `language` column. Each entry provides:
#   "phrases":  regex fragments counted as positive hits, never wrapped in word boundaries
#   "words":    regex fragments counted as positive hits, wrapped in \b...\b when "boundary" is True
#   "neg":      regex fragments that cancel a positive hit, never wrapped in word boundaries
#   "boundary": whether the "words" fragments get \b word-boundary anchors
# The patterns are searched in casefolded review text (see add_good_flag_multilang),
# so they have to be written the way the text looks after str.casefold().
# NOTE(review): the German word r"großartig" contains "ß", which str.casefold()
# turns into "ss" — confirm that this pattern can still match.
# "spanish"/"latam" and "portuguese"/"brazilian" currently carry identical entries.
LEXICON = {
    "english": {
        "phrases": [
            r"highly recommend(?:ed)?",
            r"definitely recommend",
            r"worth (?:buying|it|the money|the time)",
            r"great game",
            r"amazing game",
            r"awesome game",
            r"best game(?:s)?",
        ],
        "words": [
            r"awesome", r"amazing", r"great", r"excellent", r"fantastic", r"incredible",
            r"masterpiece", r"perfect", r"love", r"fun", r"enjoy", r"recommend", r"worth",
        ],
        "neg": [
            r"not\s+good", r"not\s+great", r"not\s+worth",
            r"(?:do\s*not|don't|dont)\s+recommend",
            r"(?:do\s*not|don't|dont)\s+buy",
            r"can't\s+recommend|cant\s+recommend",
            r"avoid\b", r"refund\b",
        ],
        "boundary": True,
    },
    "spanish": {
        "phrases": [r"muy bueno", r"vale la pena", r"lo recomiendo", r"recomendad[oa]"],
        "words": [r"genial", r"excelente", r"buen[oa]", r"incre[ií]ble", r"recomiendo", r"recomendar"],
        "neg": [r"no\s+recomiendo", r"no\s+vale\s+la\s+pena", r"no\s+es\s+buen[oa]", r"no\s+merece\s+la\s+pena", r"no\s+compr(?:es|ar)"],
        "boundary": True,
    },
    "latam": {
        "phrases": [r"muy bueno", r"vale la pena", r"lo recomiendo", r"recomendad[oa]"],
        "words": [r"genial", r"excelente", r"buen[oa]", r"incre[ií]ble", r"recomiendo", r"recomendar"],
        "neg": [r"no\s+recomiendo", r"no\s+vale\s+la\s+pena", r"no\s+es\s+buen[oa]", r"no\s+merece\s+la\s+pena", r"no\s+compr(?:es|ar)"],
        "boundary": True,
    },
    "portuguese": {
        "phrases": [r"vale a pena", r"recomendo", r"muito bom", r"jogo (?:muito )?bom"],
        "words": [r"ótimo", r"excelente", r"incr[ií]vel", r"perfeito", r"divertido", r"recomendar"],
        "neg": [r"não\s+recomendo", r"nao\s+recomendo", r"não\s+vale\s+a\s+pena", r"nao\s+vale\s+a\s+pena", r"não\s+é\s+bom", r"nao\s+e\s+bom", r"não\s+compr(?:e|ar)", r"nao\s+compr(?:e|ar)"],
        "boundary": True,
    },
    "brazilian": {
        "phrases": [r"vale a pena", r"recomendo", r"muito bom", r"jogo (?:muito )?bom"],
        "words": [r"ótimo", r"excelente", r"incr[ií]vel", r"perfeito", r"divertido", r"recomendar"],
        "neg": [r"não\s+recomendo", r"nao\s+recomendo", r"não\s+vale\s+a\s+pena", r"nao\s+vale\s+a\s+pena", r"não\s+é\s+bom", r"nao\s+e\s+bom", r"não\s+compr(?:e|ar)", r"nao\s+compr(?:e|ar)"],
        "boundary": True,
    },
    "german": {
        "phrases": [r"sehr gut", r"klare(?:s)? empfehlung", r"lohnt sich", r"absolut empfehl"],
        "words": [r"genial", r"toll", r"super", r"großartig", r"exzellent", r"empfehle", r"empfehlenswert"],
        "neg": [r"nicht\s+empfehl", r"lohnt\s+sich\s+nicht", r"nicht\s+gut", r"kau(?:f|ft)\s+nicht", r"kein\s+kauf"],
        "boundary": True,
    },
    "french": {
        "phrases": [r"je recommande", r"vaut le coup", r"tr[eè]s bon", r"excellent jeu"],
        "words": [r"g[eé]nial", r"excellent", r"super", r"incroyable", r"parfait", r"recommande"],
        "neg": [r"je\s+ne\s+recommande\s+pas", r"ne\s+vaut\s+pas\s+le\s+coup", r"pas\s+bon", r"n['’]achetez\s+pas", r"n['’]ach[eè]te\s+pas"],
        "boundary": True,
    },
    "italian": {
        "phrases": [r"lo consiglio", r"vale la pena", r"molto bello", r"gioco (?:molto )?bello"],
        "words": [r"fantastico", r"ottimo", r"eccellente", r"stupendo", r"divertente", r"consiglio", r"consigliare"],
        "neg": [r"non\s+lo\s+consiglio", r"non\s+vale\s+la\s+pena", r"non\s+[eè]\s+bello", r"non\s+compr(?:are|atelo)"],
        "boundary": True,
    },
    "dutch": {
        "phrases": [r"zeker aanraden", r"de moeite waard", r"heel goed", r"geweldig spel"],
        "words": [r"geweldig", r"fantastisch", r"super", r"leuk", r"aanraden", r"aanbevelen", r"waarde"],
        "neg": [r"niet\s+aanrad", r"niet\s+de\s+moeite\s+waard", r"niet\s+goed", r"koop\s+niet"],
        "boundary": True,
    },
    "swedish": {
        "phrases": [r"rekommenderar", r"värt det", r"jättebra", r"riktigt bra"],
        "words": [r"fantastisk", r"grym", r"suverän", r"toppen", r"kul", r"rekommendera", r"värd"],
        "neg": [r"rekommenderar\s+inte", r"inte\s+värt", r"inte\s+bra", r"köp\s+inte"],
        "boundary": True,
    },
    "norwegian": {
        "phrases": [r"anbefaler", r"verdt det", r"kjempebra", r"veldig bra"],
        "words": [r"fantastisk", r"råbra", r"suveren", r"gøy", r"anbefale", r"verdt"],
        "neg": [r"anbefaler\s+ikke", r"ikke\s+verdt", r"ikke\s+bra", r"ikke\s+kjøp"],
        "boundary": True,
    },
    "danish": {
        "phrases": [r"anbefaler", r"v[æa]rd at", r"mega god", r"rigtig god"],
        "words": [r"fantastisk", r"fremragende", r"super", r"sjov", r"anbefale", r"v[æa]rd"],
        "neg": [r"anbefaler\s+ikke", r"ikke\s+v[æa]rd", r"ikke\s+god", r"k[oø]b\s+ikke"],
        "boundary": True,
    },
    "finnish": {
        "phrases": [r"suosittelen", r"todella hyv[äa]", r"sen arvoinen", r"hyv[äa] peli"],
        "words": [r"loistava", r"mahtava", r"erinomainen", r"hauska", r"suositella", r"arvoinen"],
        "neg": [r"en\s+suosittele", r"ei\s+kannata", r"ei\s+hyv[äa]", r"älä\s+osta"],
        "boundary": True,
    },
    "polish": {
        "phrases": [r"polecam", r"warto", r"świetna gra", r"bardzo dobra"],
        "words": [r"świetn[aey]", r"super", r"rewelacyjna", r"doskonała", r"polecić", r"warto"],
        "neg": [r"nie\s+polecam", r"nie\s+warto", r"nie\s+jest\s+dobr", r"nie\s+kupuj"],
        "boundary": True,
    },
    "czech": {
        "phrases": [r"doporu[čc]uji", r"stoj[ií]\s+za\s+to", r"skv[ěe]l[aá]", r"v[ýy]born[aá]"],
        "words": [r"super", r"skv[ěe]l", r"v[ýy]born", r"bav[ií]", r"doporu[čc]it"],
        "neg": [r"nedoporu[čc]uji", r"nestoj[ií]\s+za\s+to", r"nen[ií]\s+dobr", r"nekupuj"],
        "boundary": True,
    },
    "romanian": {
        "phrases": [r"recomand", r"merit[ăa]", r"foarte bun", r"joc (?:foarte )?bun"],
        "words": [r"excelent", r"minunat", r"super", r"recomanda", r"merit"],
        "neg": [r"nu\s+recomand", r"nu\s+merit[ăa]", r"nu\s+e\s+bun", r"nu\s+cump[ăa]ra"],
        "boundary": True,
    },
    "hungarian": {
        "phrases": [r"aj[aá]nlom", r"meg[eé]ri", r"nagyon j[oó]", r"szuper j[aá]t[eé]k"],
        "words": [r"szuper", r"fantasztikus", r"kiv[aá]l[oó]", r"nagyon", r"aj[aá]nlani", r"meg[eé]r"],
        "neg": [r"nem\s+aj[aá]nlom", r"nem\s+[eé]ri\s+meg", r"nem\s+j[oó]", r"ne\s+vedd\s+meg"],
        "boundary": True,
    },
    "bulgarian": {
        "phrases": [r"препоръч", r"много добра", r"страхотна", r"заслужава си"],
        "words": [r"страхот", r"отлич", r"супер", r"препоръч", r"шедьовър"],
        "neg": [r"не\s+препоръч", r"не\s+си\s+струва", r"не\s+е\s+доб", r"не\s+купувай"],
        "boundary": False,
    },
    "greek": {
        "phrases": [r"το\s+προτείν", r"αξίζει", r"πολύ\s+καλ", r"εξαιρετικ"],
        "words": [r"τέλει", r"φοβε", r"εξαιρετικ", r"καταπληκτικ", r"προτείν", r"αξίζ"],
        "neg": [r"δεν\s+προτείν", r"δεν\s+αξίζ", r"δεν\s+είναι\s+καλ", r"μην\s+αγοράσ"],
        "boundary": False,
    },
    "ukrainian": {
        "phrases": [r"рекоменд", r"дуже\s+хорош", r"варто", r"чудов"],
        "words": [r"відмін", r"класн", r"шедевр", r"рекоменд", r"варто"],
        "neg": [r"не\s+рекоменд", r"не\s+варто", r"не\s+хорош", r"не\s+купуй"],
        "boundary": False,
    },
    "russian": {
        "phrases": [r"рекоменд", r"очень\s+хорош", r"стоит", r"шедевр"],
        "words": [r"отлич", r"классн", r"супер", r"шедевр", r"рекоменд", r"стоит"],
        "neg": [r"не\s+рекоменд", r"не\s+стоит", r"плох", r"не\s+покупай", r"не\s+берите"],
        "boundary": False,
    },
    "turkish": {
        "phrases": [r"kesinlikle tavsiye", r"tavsiye ederim", r"çok iyi", r"mükemmel", r"harika"],
        "words": [r"güzel", r"mükemmel", r"harika", r"şahane", r"tavsiye", r"değer"],
        "neg": [r"tavsiye etmem", r"tavsiye etmiyorum", r"iyi değil", r"alma", r"almayın", r"değmez"],
        "boundary": True,
    },
    "koreana": {
        "phrases": [r"강추", r"완전 추천", r"강력 추천", r"갓겜", r"명작", r"존잼", r"개꿀잼", r"재밌", r"재미있"],
        "words": [r"추천", r"최고", r"꿀잼", r"재미", r"좋다", r"훌륭", r"완벽", r"감동"],
        "neg": [r"비추", r"추천\s*안", r"추천\s*하지", r"재미없", r"별로", r"최악", r"사지\s*마", r"사지마", r"환불"],
        "boundary": False,
    },
    "japanese": {
        "phrases": [r"おすすめ", r"オススメ", r"最高", r"神ゲー", r"買う価値", r"面白い", r"楽しい"],
        "words": [r"おすすめ", r"最高", r"神", r"面白", r"楽しい", r"良い", r"素晴らしい"],
        "neg": [r"おすすめしない", r"買わない方が", r"つまらない", r"面白くない", r"最悪", r"返品"],
        "boundary": False,
    },
    "schinese": {
        "phrases": [r"强烈推荐", r"非常推荐", r"值得买", r"值得入", r"很值得", r"很好玩", r"神作", r"精品"],
        "words": [r"推荐", r"值得", r"好玩", r"很好", r"优秀", r"完美", r"喜欢"],
        "neg": [r"不推荐", r"不值得", r"不好玩", r"垃圾", r"别买", r"千万别买", r"退款"],
        "boundary": False,
    },
    "tchinese": {
        "phrases": [r"強烈推薦", r"非常推薦", r"值得買", r"值得入", r"很值得", r"很好玩", r"神作", r"精品"],
        "words": [r"推薦", r"值得", r"好玩", r"很好", r"優秀", r"完美", r"喜歡"],
        "neg": [r"不推薦", r"不值得", r"不好玩", r"垃圾", r"別買", r"千萬別買", r"退款"],
        "boundary": False,
    },
    "arabic": {
        "phrases": [r"أنصح", r"ممتاز", r"رائع", r"يستحق", r"لعبة رائعة", r"ممتعة"],
        "words": [r"ممتاز", r"رائع", r"جميل", r"ممتع", r"يستحق", r"أنصح"],
        "neg": [r"لا\s+أنصح", r"لا\s+يستحق", r"سيئ", r"لا\s+تشتري", r"استرجاع"],
        "boundary": False,
    },
    "thai": {
        "phrases": [r"แนะนำ", r"ดีมาก", r"สุดยอด", r"คุ้มค่า", r"สนุกมาก", r"โคตรสนุก"],
        "words": [r"แนะนำ", r"ดี", r"สนุก", r"สุดยอด", r"คุ้ม", r"ชอบ"],
        "neg": [r"ไม่แนะนำ", r"ไม่คุ้ม", r"ไม่ดี", r"แย่", r"อย่าซื้อ", r"ขอคืนเงิน"],
        "boundary": False,
    },
    "vietnamese": {
        "phrases": [r"rất hay", r"tuyệt vời", r"đáng mua", r"đáng tiền", r"nên mua", r"khuyên dùng"],
        "words": [r"hay", r"tuyệt", r"xuất sắc", r"đáng", r"thích", r"khuyên", r"nên"],
        "neg": [r"không\s+khuyên", r"không\s+đáng", r"đừng\s+mua", r"tệ", r"chán", r"hoàn tiền"],
        "boundary": True,
    },
    "indonesian": {
        "phrases": [r"sangat bagus", r"rekomendasi", r"worth it", r"layak dibeli", r"seru banget"],
        "words": [r"bagus", r"keren", r"mantap", r"seru", r"rekomend", r"layak"],
        "neg": [r"tidak\s+rekomend", r"jangan\s+beli", r"tidak\s+layak", r"jelek", r"buruk", r"refund"],
        "boundary": True,
    },
}

# Lexicon used for rows whose language has no entry of its own.
DEFAULT_LANG = "english"

def _compile_lexicon(cfg):
    boundary = cfg.get("boundary", True)

    parts_good = []
    for p in cfg.get("phrases", []):
        parts_good.append(f"(?:{p})")
    for w in cfg.get("words", []):
        if boundary:
            parts_good.append(rf"\b{w}\b")
        else:
            parts_good.append(f"(?:{w})")

    good_pat = "|".join(parts_good) if parts_good else r"$^"
    good_re = re.compile(good_pat, flags=re.UNICODE)

    neg_parts = [f"(?:{p})" for p in cfg.get("neg", [])]
    neg_pat = "|".join(neg_parts) if neg_parts else r"$^"
    neg_re = re.compile(neg_pat, flags=re.UNICODE)

    return good_re, neg_re

# Compile every lexicon entry once; the fallback language is guaranteed to be present.
_COMPILED = {lang: _compile_lexicon(cfg) for lang, cfg in LEXICON.items()}
_COMPILED.setdefault(DEFAULT_LANG, _compile_lexicon(LEXICON["english"]))

def add_good_flag_multilang(df_model, text_col="review", lang_col="language"):
    """Return a copy of ``df_model`` with a 0/1 ``good_review`` column.

    Each review is casefolded and searched with the compiled lexicon of its
    language (falling back to DEFAULT_LANG when the language has no lexicon).
    A review is flagged 1 when a positive pattern matches and no negative
    pattern does.

    Args:
        df_model: frame holding the review text and language columns.
        text_col: name of the review text column.
        lang_col: name of the language column.

    Returns:
        A new DataFrame; the input frame is not modified.
    """
    result = df_model.copy()

    folded_text = result[text_col].fillna("").astype(str).str.casefold()
    lang_codes = result[lang_col].fillna(DEFAULT_LANG).astype(str)

    has_good = pd.Series(False, index=result.index)
    has_neg = pd.Series(False, index=result.index)

    # Process one language at a time so each subset uses its own patterns.
    for code in lang_codes.unique():
        rows = lang_codes.eq(code)
        good_re, neg_re = _COMPILED.get(code, _COMPILED[DEFAULT_LANG])
        subset = folded_text.loc[rows]

        has_good.loc[rows] = subset.str.contains(good_re, regex=True)
        has_neg.loc[rows] = subset.str.contains(neg_re, regex=True)

    result["good_review"] = (has_good & ~has_neg).astype(int)
    return result

# Add the good_review flag to the working frame and print its class counts.
df_model = add_good_flag_multilang(df_model, text_col="review", lang_col="language")
print(df_model["good_review"].value_counts())


good_review
0    3525940
1    1287281
Name: count, dtype: int64


- 파생 변수 생성 (플레이타임 · 유저 성향 · 사회적 반응)

In [17]:
import numpy as np
import pandas as pd

def _numeric_f32(column):
    """Return df_model[column] as a float32 array; non-numeric or missing values become 0."""
    return pd.to_numeric(df_model[column], errors="coerce").fillna(0).to_numpy(dtype=np.float32)


# Raw counters as float32 arrays (float32 keeps the temporary arrays small)
pt = _numeric_f32("playtime_at_review")
ng = _numeric_f32("num_games_owned")
nr = _numeric_f32("num_reviews_author")

# 1. Playtime concentration at review time
df_model["playtime_per_game"] = np.log1p(pt / (ng + 1)).astype(np.float32)

# 2. Engagement stage at review time: log playtime plus three one-hot flags.
#    No categorical stage column is created; the flags are written directly.
#    Bins on log_playtime: (-inf, 4], (4, 8], (8, inf)
df_model["log_playtime"] = np.log1p(pt).astype(np.float32)
lp = df_model["log_playtime"].to_numpy(dtype=np.float32)

df_model["is_short_play"] = np.less_equal(lp, 4).astype(np.int8)
df_model["is_mid_play"]   = np.logical_and(lp > 4, lp <= 8).astype(np.int8)
df_model["is_long_play"]  = np.greater(lp, 8).astype(np.int8)

# 3. Reviewer tendency
df_model["reviews_per_game"] = np.log1p(nr / (ng + 1)).astype(np.float32)

# 4. Experienced vs. inexperienced users
df_model["log_num_games_owned"] = np.log1p(ng).astype(np.float32)
df_model["log_num_reviews_author"] = np.log1p(nr).astype(np.float32)

log_ng = df_model["log_num_games_owned"].to_numpy(dtype=np.float32)
log_nr = df_model["log_num_reviews_author"].to_numpy(dtype=np.float32)
df_model["is_heavy_user"] = np.logical_and(log_ng > 5.0, log_nr > 3.5).astype(np.int8)
df_model["is_light_user"] = np.logical_and(log_ng < 2.0, log_nr < 1.0).astype(np.int8)

# 5. Sentiment combined with behaviour
gr = pd.to_numeric(df_model["good_review"], errors="coerce").fillna(0).to_numpy(dtype=np.int8)

df_model["positive_but_short_play"] = np.logical_and(gr == 1, pt < 60).astype(np.int8)
df_model["negative_but_long_play"]  = np.logical_and(gr == 0, pt > 1200).astype(np.int8)

# 6. Social reaction
vu = _numeric_f32("votes_up")
vf = _numeric_f32("votes_funny")
cc = _numeric_f32("comment_count")

df_model["has_votes"]   = np.greater(vu + vf, 0).astype(np.int8)
df_model["has_comment"] = np.greater(cc, 0).astype(np.int8)

# 7. Whether the review was edited after it was created
df_model["is_updated_review"] = (
    df_model["timestamp_created"].ne(df_model["timestamp_updated"]).astype(np.int8)
)

# 8. Votes and comments relative to the author's number of reviews
df_model["social_density"] = np.log1p((vu + cc) / (nr + 1)).astype(np.float32)

# Safety net: replace inf/NaN with 0 in the three ratio features only
# (deliberately not a frame-wide replace).
fix_cols = ["playtime_per_game", "reviews_per_game", "social_density"]
for name in fix_cols:
    numeric = pd.to_numeric(df_model[name], errors="coerce")
    df_model[name] = numeric.replace([np.inf, -np.inf], np.nan).fillna(0).astype(np.float32)

df_model.head(3)


Unnamed: 0,appid,recommendationid,steamid,num_games_owned,num_reviews_author,playtime_forever,playtime_last_two_weeks,playtime_at_review,last_played,language,...,log_num_games_owned,log_num_reviews_author,is_heavy_user,is_light_user,positive_but_short_play,negative_but_long_play,has_votes,has_comment,is_updated_review,social_density
27798,2139460,215256415,76561198092089560,0,1,1659,1659,1628.0,1767647101,english,...,0.0,0.693147,0,1,0,1,0,0,0,0.0
27799,2139460,215256182,76561197995642012,0,4,370,367,339.0,1767646994,french,...,0.0,1.609438,0,0,0,0,0,0,0,0.0
27800,2139460,215249671,76561198217416651,0,32,552,552,401.0,1767648127,greek,...,0.0,3.496508,0,0,0,0,0,0,0,0.0


In [18]:
# Keep the 10 most frequent review languages; every other language becomes 'other'
top_n = 10
top_langs = df_model['language'].value_counts().index[:top_n]

is_rare_lang = ~df_model['language'].isin(top_langs)
df_model['language'] = df_model['language'].mask(is_rare_lang, 'other')

# One-hot encode both categorical columns; drop_first removes the first level of each
encode_cols = ['language', 'game_style']
df_model = pd.get_dummies(df_model, columns=encode_cols, drop_first=True)


- 원핫 인코딩으로 생성된 컬럼 확인

In [19]:
# Show the dummy columns produced by the one-hot encoding
for prefix in ('language_', 'game_style_'):
    print([col for col in df_model.columns if col.startswith(prefix)])


['language_english', 'language_french', 'language_german', 'language_koreana', 'language_other', 'language_polish', 'language_russian', 'language_schinese', 'language_spanish', 'language_turkish']
['game_style_story', 'game_style_video']


### good_review가 생성된 df_model을 최종 학습용 데이터로 확정

In [20]:
# Cast one-hot / boolean columns to int so the later tensor conversion is straightforward
bool_cols = ['voted_up', 'steam_purchase', 'received_for_free', 'written_during_early_access', 'primarily_steam_deck',
            'language_english', 'language_french', 'language_german','language_koreana','language_other','language_polish','language_russian','language_schinese','language_spanish','language_turkish',
            'game_style_story', 'game_style_video'
            ]

existing_cols = set(df_model.columns)

for col in bool_cols:
    if col not in existing_cols:
        continue  # columns that do not exist in this run are skipped
    df_model[col] = df_model[col].astype(int)

# The target and the derived binary flags are forced to int as well (non-numeric -> 0)
binary_cols = (
    "churn", "good_review",
    "is_short_play", "is_mid_play", "is_long_play",
    "is_heavy_user", "is_light_user",
    "positive_but_short_play", "negative_but_long_play",
    "has_votes", "has_comment", "is_updated_review",
)
for c in binary_cols:
    if c not in existing_cols:
        continue
    df_model[c] = pd.to_numeric(df_model[c], errors="coerce").fillna(0).astype(int)

df_model.dtypes.head(30)


appid                            int64
recommendationid                 int64
steamid                          int64
num_games_owned                  int64
num_reviews_author               int64
playtime_forever                 int64
playtime_last_two_weeks          int64
playtime_at_review             float64
last_played                      int64
review                          object
timestamp_created                int64
timestamp_updated                int64
voted_up                         int64
votes_up                         int64
votes_funny                      int64
weighted_vote_score            float64
comment_count                    int64
steam_purchase                   int64
received_for_free                int64
written_during_early_access      int64
primarily_steam_deck             int64
days_after_review                int64
churn_window_days                int64
churn                            int64
good_review                      int64
playtime_per_game        

- 타깃(TARGET) 및 입력 피처(FEATURES) 정의

In [21]:
# Name of the label column
TARGET = "churn"

# Model inputs, in the order they are fed to the network
FEATURES = [
    # review / purchase flags
    "voted_up", "steam_purchase", "received_for_free",
    "written_during_early_access", "primarily_steam_deck",
    # one-hot language
    "language_english", "language_french", "language_german", "language_koreana",
    "language_other", "language_polish", "language_russian", "language_schinese",
    "language_spanish", "language_turkish",
    # one-hot game style
    "game_style_story", "game_style_video",
    # engineered features
    "weighted_vote_score",
    "playtime_per_game",
    "is_short_play", "is_mid_play", "is_long_play",
    "reviews_per_game",
    "is_heavy_user",
    "positive_but_short_play",
    "negative_but_long_play",
    "has_votes",
    "has_comment",
    "social_density",
    "is_updated_review",
]

# Keep only the features that exist in df_model (missing ones are dropped silently)
available_cols = set(df_model.columns)
FEATURES = [name for name in FEATURES if name in available_cols]

# The 50 appids with the most rows, largest first
rows_per_app = df_model.groupby("appid").size()
top_appids = rows_per_app.sort_values(ascending=False).iloc[:50].index.tolist()

print("n_features:", len(FEATURES))
print("top_appids(50) sample:", top_appids[:5])


n_features: 30
top_appids(50) sample: [3241660, 2807960, 730, 1808500, 1030300]


- Dataset 정의

In [None]:
# Same device selection as in the setup cell at the top of the notebook, repeated here.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("DEVICE:", DEVICE)

# 1. Dataset 정의
class TabDataset(Dataset):
    """In-memory dataset pairing a feature matrix with its labels.

    Both inputs are copied into float32 tensors once, at construction time.
    Labels are stored as float (0.0 / 1.0) because the training code feeds them
    to BCEWithLogitsLoss, which expects floating-point targets.

    Args:
        X: array-like of features, one row per sample.
        y: array-like of labels, one entry per sample.

    Raises:
        ValueError: if X and y do not contain the same number of samples.
    """

    def __init__(self, X, y):
        self.X = torch.tensor(X, dtype=torch.float32)
        self.y = torch.tensor(y, dtype=torch.float32)

        # Fail fast: a mismatch would otherwise surface later as an index error
        # inside a DataLoader worker, or silently ignore trailing rows of X.
        if len(self.X) != len(self.y):
            raise ValueError(
                f"X and y must have the same number of samples, "
                f"got {len(self.X)} and {len(self.y)}"
            )

    def __len__(self):
        # Dataset size is defined by the number of labels.
        return len(self.y)

    def __getitem__(self, idx):
        # Returns a (features, label) pair for the requested position.
        return self.X[idx], self.y[idx]


DEVICE: cpu


### MLP 모델 정의
- MLP는 완전연결층을 여러 개 쌓은 신경망
- 표 형태 데이터에서 CNN/RNN보다 흔히 쓰이는 딥러닝 모델

In [40]:
# 2. MLP 모델 정의
class MLP(nn.Module):
    """Fully connected binary classifier that outputs one raw logit per sample.

    Args:
        input_dim: number of input features.
        hidden_dims: width of each hidden layer, in order.
        dropout: dropout probability applied after every hidden activation.
    """

    def __init__(self, input_dim, hidden_dims=(128, 64), dropout=0.2):
        super().__init__()

        # Widths of consecutive layers: input first, then every hidden layer.
        widths = [input_dim, *hidden_dims]

        stack = []
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            # One hidden block = affine map -> ReLU -> dropout.
            stack += [nn.Linear(fan_in, fan_out), nn.ReLU(), nn.Dropout(dropout)]

        # Output head: a single neuron producing a logit. No sigmoid here,
        # because BCEWithLogitsLoss applies it internally in a numerically
        # stable way.
        stack.append(nn.Linear(widths[-1], 1))

        # Wrap the layers so forward() can run them with one call.
        self.net = nn.Sequential(*stack)

    def forward(self, x):
        logits = self.net(x)        # (batch, 1)
        return logits.squeeze(1)    # (batch,)


In [None]:
# 3. 확률 예측 함수 (torch -> numpy proba)
def predict_proba_torch(model, loader):
    """Run `model` over every batch of `loader` and return sigmoid probabilities.

    The model is switched to eval mode and gradients are disabled. Labels
    yielded by the loader are ignored. Per-batch results are concatenated along
    the first axis into one NumPy array.
    """
    model.eval()  # inference behaviour for layers such as dropout
    with torch.no_grad():
        batch_probs = [
            # logits -> probabilities, then back to host memory as NumPy
            torch.sigmoid(model(features.to(DEVICE))).detach().cpu().numpy()
            for features, _ in loader
        ]
    return np.concatenate(batch_probs, axis=0)


In [None]:
# 4 threshold grid + best threshold 찾기(F1 최대)
# Candidate decision thresholds: 0.10, 0.12, ..., 0.90.
THR_GRID = np.round(np.arange(0.10, 0.91, 0.02), 2)

def best_threshold_by_f1(y_true, proba, thr_grid=THR_GRID):
    """Pick the threshold from `thr_grid` that maximises F1.

    Args:
        y_true: ground-truth 0/1 labels.
        proba: predicted probabilities; must support elementwise `>=`.
        thr_grid: iterable of thresholds to try.

    Returns:
        (best_threshold, best_f1) as Python floats. On ties the earliest
        threshold in the grid wins. An empty grid yields (0.5, -1.0).
    """
    # Score every candidate: label as positive when proba >= threshold.
    scored = [
        (f1_score(y_true, (proba >= thr).astype(int), zero_division=0), float(thr))
        for thr in thr_grid
    ]
    if not scored:
        return 0.5, -1.0

    # max() keeps the first maximal element, i.e. the earliest threshold on ties.
    top_f1, top_thr = max(scored, key=lambda pair: pair[0])
    return top_thr, float(top_f1)

- MLP는 기본적으로 확률 또는 logit 출력

In [None]:
# 5. proba -> metric 계산
def eval_from_proba(y_true, proba, thr):
    """Turn probabilities into labels at `thr` and compute classification metrics.

    Args:
        y_true: ground-truth 0/1 labels.
        proba: predicted probabilities; must support elementwise `>=`.
        thr: decision threshold; samples with proba >= thr are predicted 1.

    Returns:
        dict with acc, roc_auc (NaN unless y_true contains exactly two classes),
        precision, recall, f1, pred_pos_rate (share of samples predicted 1) and
        the confusion-matrix counts tn / fp / fn / tp.
    """
    pred = (proba >= thr).astype(int)

    # labels=[0, 1] pins the matrix to 2x2 even when only one class appears in
    # y_true or pred, so ravel() always yields exactly (tn, fp, fn, tp). No
    # fallback branch is needed.
    tn, fp, fn, tp = confusion_matrix(y_true, pred, labels=[0, 1]).ravel()

    # ROC AUC is undefined unless both classes are present in y_true.
    has_both_classes = len(np.unique(y_true)) == 2

    return {
        "acc": accuracy_score(y_true, pred),
        "roc_auc": roc_auc_score(y_true, proba) if has_both_classes else np.nan,
        "precision": precision_score(y_true, pred, zero_division=0),
        "recall": recall_score(y_true, pred, zero_division=0),
        "f1": f1_score(y_true, pred, zero_division=0),
        # How aggressively the model predicts the positive class.
        "pred_pos_rate": float(np.mean(pred == 1)),
        "tn": int(tn), "fp": int(fp), "fn": int(fn), "tp": int(tp),
    }


In [None]:
# 6. 1개 게임 학습 함수 (early stopping: val_f1)
def train_mlp_one_game(
    X_train, y_train, X_val, y_val, *,  # everything after * is keyword-only to prevent argument-order mistakes
    hidden_dims=(128, 64), dropout=0.2,
    lr=1e-3, weight_decay=1e-4,  # optimizer hyperparameters
    batch_size=2048, max_epochs=30, patience=5
):
    """Train one MLP on a single game's data with early stopping on validation F1.

    Args:
        X_train, y_train: training features and 0/1 labels (NumPy arrays).
        X_val, y_val: validation features and labels used for early stopping.
        hidden_dims, dropout: architecture settings forwarded to MLP.
        lr, weight_decay: AdamW settings.
        batch_size: mini-batch size for both loaders.
        max_epochs: upper bound on training epochs.
        patience: stop after this many consecutive epochs without a
            validation-F1 improvement.

    Returns:
        (model, pos_weight, best_epoch, best_val_f1). The model carries the
        weights of the best epoch. best_epoch is -1 and best_val_f1 is -1.0 if
        no epoch ran.
    """
    # Class-imbalance correction: pos_weight scales the loss of positive
    # samples by neg/pos, so the rarer the positive class, the more a missed
    # positive (FN) costs. max(pos, 1) avoids division by zero.
    neg = int((y_train == 0).sum())
    pos = int((y_train == 1).sum())
    pos_weight = neg / max(pos, 1)

    # Datasets and loaders. Only the training loader is shuffled.
    train_ds = TabDataset(X_train, y_train)
    val_ds   = TabDataset(X_val, y_val)

    train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True, drop_last=False)
    val_loader   = DataLoader(val_ds, batch_size=batch_size, shuffle=False, drop_last=False)

    model = MLP(input_dim=X_train.shape[1], hidden_dims=hidden_dims, dropout=dropout).to(DEVICE)
    # AdamW applies decoupled weight decay as regularisation.
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)
    # BCEWithLogitsLoss applies the sigmoid internally, so the model outputs raw logits.
    criterion = nn.BCEWithLogitsLoss(
        pos_weight=torch.tensor([pos_weight], dtype=torch.float32, device=DEVICE)
    )

    # Early-stopping bookkeeping.
    best_state = None
    best_val_f1 = -1.0
    best_epoch = -1
    wait = 0

    for epoch in range(1, max_epochs + 1):
        # predict_proba_torch() leaves the model in eval mode, so training
        # mode has to be restored at the start of every epoch.
        model.train()

        for xb, yb in train_loader:
            xb = xb.to(DEVICE)
            yb = yb.to(DEVICE)

            optimizer.zero_grad()           # reset gradients
            logits = model(xb)              # forward pass
            loss = criterion(logits, yb)    # loss
            loss.backward()                 # backward pass
            optimizer.step()                # parameter update

        # Validation F1 drives early stopping.
        # predict_proba_torch: sigmoid probabilities for the whole validation set.
        val_proba = predict_proba_torch(model, val_loader)
        # best_threshold_by_f1: threshold that maximises validation F1, plus that F1.
        thr, val_f1 = best_threshold_by_f1(y_val, val_proba)

        if val_f1 > best_val_f1 + 1e-6:
            # Improvement: snapshot the weights on the CPU and reset patience.
            best_val_f1 = val_f1
            best_epoch = epoch
            best_state = {k: v.detach().cpu().clone() for k, v in model.state_dict().items()}
            wait = 0
        else:
            wait += 1
            if wait >= patience:
                break

    # Restore the weights of the best epoch.
    if best_state is not None:
        model.load_state_dict(best_state)

    return model, float(pos_weight), int(best_epoch), float(best_val_f1)

In [28]:
# 7. 저장 폴더 생성 + 기본 파라미터
# Split settings shared by every per-game run.
RANDOM_STATE = 42           # seed handed to train_test_split
TEST_SIZE = 0.20            # share of a game's rows held out as the test set
VAL_SIZE_IN_TRAIN = 0.20    # share of the remaining rows used for validation

# Output folder for the baseline models. exist_ok makes re-running this cell safe.
os.makedirs("dl_model", exist_ok=True)

print("TEST_SIZE:", TEST_SIZE, "VAL_SIZE_IN_TRAIN:", VAL_SIZE_IN_TRAIN)

TEST_SIZE: 0.2 VAL_SIZE_IN_TRAIN: 0.2


- 필수 변수 준비 체크

In [29]:
# df_model, FEATURES, TARGET, top_appids (50개)
# Confirm the inputs the training loop needs are defined and look sane.
print(
    f"TARGET: {TARGET}",
    f"n_features: {len(FEATURES)}",
    f"top_appids len: {len(top_appids)}",
    f"sample appids: {top_appids[:5]}",
    sep="\n",
)

TARGET: churn
n_features: 30
top_appids len: 50
sample appids: [3241660, 2807960, 730, 1808500, 1030300]


In [30]:
# 9. Top50 전체(appid 50개) 딥러닝 학습 + 저장 (메인 루프)
rows = []

# Fixed architecture / optimizer settings of the baseline run. Kept in one
# place so the values used for training are the values saved with the model.
base_hparams = {
    "hidden_dims": (128, 64),
    "dropout": 0.25,
    "lr": 1e-3,
    "weight_decay": 1e-4,
    "batch_size": 2048,
}

pbar = tqdm(top_appids, desc="DL Games", unit="game")

for appid in pbar:
    # gdf: rows of df_model that belong to this game only.
    gdf = df_model[df_model["appid"] == appid].copy()
    gdf = gdf.replace([np.inf, -np.inf], 0)

    # A single-class target cannot be trained on or stratified.
    if gdf[TARGET].nunique() < 2:
        continue

    # Features as float32 (non-numeric / missing values become 0), labels as int64.
    X = gdf[FEATURES].apply(pd.to_numeric, errors="coerce").fillna(0).to_numpy(dtype=np.float32)
    y = gdf[TARGET].astype(int).to_numpy(dtype=np.int64)

    # Stratified split: hold out the test set first, then carve validation out of the rest.
    X_train_full, X_test, y_train_full, y_test = train_test_split(
        X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE, stratify=y
    )
    X_train, X_val, y_train, y_val = train_test_split(
        X_train_full, y_train_full,
        test_size=VAL_SIZE_IN_TRAIN, random_state=RANDOM_STATE, stratify=y_train_full
    )

    # Per-game scaling. The scaler is fitted on the training split only, so no
    # validation / test statistics leak into training.
    scaler = StandardScaler()
    X_train_s = scaler.fit_transform(X_train)
    X_val_s   = scaler.transform(X_val)
    X_test_s  = scaler.transform(X_test)

    # Train.
    pbar.set_postfix_str(f"appid={appid}")
    model, pos_weight, best_epoch, best_val_f1 = train_mlp_one_game(
        X_train_s, y_train, X_val_s, y_val,
        **base_hparams,
        max_epochs=30,
        patience=5
    )

    # Choose the decision threshold on the validation split.
    val_loader = DataLoader(TabDataset(X_val_s, y_val), batch_size=4096, shuffle=False)
    val_proba = predict_proba_torch(model, val_loader)
    best_thr, _ = best_threshold_by_f1(y_val, val_proba)

    # Evaluate on the held-out test split with that threshold.
    test_loader = DataLoader(TabDataset(X_test_s, y_test), batch_size=4096, shuffle=False)
    test_proba = predict_proba_torch(model, test_loader)
    m = eval_from_proba(y_test, test_proba, best_thr)

    # Baseline: predict churn for everyone. A positive gain means the model
    # beats that trivial strategy on F1.
    churn_rate = float(y.mean())
    all1_f1 = f1_score(y_test, np.ones_like(y_test), zero_division=0)
    gain_vs_all1 = float(m["f1"] - all1_f1)

    # One result row per game.
    row = {
        "appid": int(appid),
        "n_rows": int(len(gdf)),
        "churn_rate": churn_rate,
        "pos_weight": float(pos_weight),
        "best_valid_f1": float(best_val_f1),
        "best_epoch": int(best_epoch),
        "best_thr": float(best_thr),

        "test_f1": float(m["f1"]),
        "test_precision": float(m["precision"]),
        "test_recall": float(m["recall"]),
        "test_roc_auc": float(m["roc_auc"]),
        "test_pred_pos_rate": float(m["pred_pos_rate"]),

        "tn": int(m["tn"]), "fp": int(m["fp"]), "fn": int(m["fn"]), "tp": int(m["tp"]),
        "all1_f1": float(all1_f1),
        "gain_vs_all1": float(gain_vs_all1),
    }
    rows.append(row)

    # Persist everything needed to score new data. "hparams" is stored (same
    # keys as the tuned models' payload) because the state_dict alone cannot be
    # loaded without knowing the layer sizes it was trained with.
    payload = {
        "model_state_dict": {k: v.cpu() for k, v in model.state_dict().items()},
        "scaler": scaler,
        "features": FEATURES,
        "best_thr": float(best_thr),
        "hparams": dict(base_hparams),
        "meta": {
            "appid": int(appid),
            "pos_weight": float(pos_weight),
            "best_valid_f1": float(best_val_f1),
            "best_epoch": int(best_epoch),
        }
    }
    joblib.dump(payload, f"dl_model/model_{int(appid)}.pkl")

DL Games: 100%|██████████| 50/50 [23:38<00:00, 28.36s/game, appid=1222670]  


In [31]:
# 10 결과 DF 만들기 + 상위 확인
# One row per trained game, best test F1 first (test recall breaks ties).
results_df = pd.DataFrame(rows)
results_df = results_df.sort_values(by=["test_f1", "test_recall"], ascending=False)
results_df = results_df.reset_index(drop=True)

print("done. games:", len(results_df))
display(results_df.head(50))

done. games: 50


Unnamed: 0,appid,n_rows,churn_rate,pos_weight,best_valid_f1,best_epoch,best_thr,test_f1,test_precision,test_recall,test_roc_auc,test_pred_pos_rate,tn,fp,fn,tp,all1_f1,gain_vs_all1
0,3527290,31721,0.699001,0.430585,0.823557,3,0.18,0.823288,0.701429,0.996392,0.63228,0.992908,29,1881,16,4419,0.82282,0.000468
1,1222140,64366,0.648681,0.541596,0.788677,21,0.32,0.787336,0.651553,0.994611,0.58359,0.990213,81,4442,45,8306,0.786902,0.000434
2,2592160,150629,0.594527,0.682027,0.770578,30,0.34,0.765926,0.665063,0.902853,0.688863,0.80711,4071,8144,1740,16171,0.745717,0.020209
3,2001120,109621,0.562949,0.776371,0.739562,30,0.32,0.73989,0.607041,0.947177,0.699665,0.878404,2014,7568,652,11691,0.720381,0.019509
4,1903340,119542,0.546661,0.829281,0.709035,4,0.34,0.707433,0.558304,0.965264,0.625107,0.945125,858,9981,454,12616,0.706888,0.000545
5,3167020,96654,0.487222,1.052424,0.670666,5,0.36,0.666091,0.528583,0.900297,0.689701,0.829807,2351,7562,939,8479,0.655188,0.010903
6,648800,34646,0.489638,1.042373,0.659837,9,0.32,0.65724,0.49111,0.993221,0.625994,0.990188,45,3492,23,3370,0.657367,-0.000127
7,1145350,51398,0.410814,1.434249,0.658013,5,0.44,0.654708,0.508038,0.920436,0.712488,0.744261,2293,3764,336,3887,0.582362,0.072345
8,1326470,59319,0.456481,1.190652,0.633636,8,0.36,0.631787,0.474426,0.945347,0.625029,0.909643,776,5672,296,5120,0.626852,0.004935
9,1771300,80174,0.424252,1.357067,0.605381,5,0.4,0.605669,0.457654,0.895193,0.629807,0.829872,2015,7217,713,6090,0.595761,0.009907


In [32]:
# 11. best_by_game(게임별 1개)
# Keep a single row per appid: the one with the highest test F1 (then test
# recall). The survivors are then ranked by test F1.
best_by_game = (
    results_df
    .sort_values(by=["appid", "test_f1", "test_recall"], ascending=[True, False, False])
    .drop_duplicates(subset="appid", keep="first")   # first row of each appid after the sort
    .sort_values(by="test_f1", ascending=False)
    .reset_index(drop=True)
)

display(best_by_game.head(50))

Unnamed: 0,appid,n_rows,churn_rate,pos_weight,best_valid_f1,best_epoch,best_thr,test_f1,test_precision,test_recall,test_roc_auc,test_pred_pos_rate,tn,fp,fn,tp,all1_f1,gain_vs_all1
0,3527290,31721,0.699001,0.430585,0.823557,3,0.18,0.823288,0.701429,0.996392,0.63228,0.992908,29,1881,16,4419,0.82282,0.000468
1,1222140,64366,0.648681,0.541596,0.788677,21,0.32,0.787336,0.651553,0.994611,0.58359,0.990213,81,4442,45,8306,0.786902,0.000434
2,2592160,150629,0.594527,0.682027,0.770578,30,0.34,0.765926,0.665063,0.902853,0.688863,0.80711,4071,8144,1740,16171,0.745717,0.020209
3,2001120,109621,0.562949,0.776371,0.739562,30,0.32,0.73989,0.607041,0.947177,0.699665,0.878404,2014,7568,652,11691,0.720381,0.019509
4,1903340,119542,0.546661,0.829281,0.709035,4,0.34,0.707433,0.558304,0.965264,0.625107,0.945125,858,9981,454,12616,0.706888,0.000545
5,3167020,96654,0.487222,1.052424,0.670666,5,0.36,0.666091,0.528583,0.900297,0.689701,0.829807,2351,7562,939,8479,0.655188,0.010903
6,648800,34646,0.489638,1.042373,0.659837,9,0.32,0.65724,0.49111,0.993221,0.625994,0.990188,45,3492,23,3370,0.657367,-0.000127
7,1145350,51398,0.410814,1.434249,0.658013,5,0.44,0.654708,0.508038,0.920436,0.712488,0.744261,2293,3764,336,3887,0.582362,0.072345
8,1326470,59319,0.456481,1.190652,0.633636,8,0.36,0.631787,0.474426,0.945347,0.625029,0.909643,776,5672,296,5120,0.626852,0.004935
9,1771300,80174,0.424252,1.357067,0.605381,5,0.4,0.605669,0.457654,0.895193,0.629807,0.829872,2015,7217,713,6090,0.595761,0.009907


In [33]:
# 12. 튜닝 대상 선정(과대예측/저성능 게임)
# Selection thresholds.
F1_LOW = 0.50        # games with test F1 below this are tuned
POS_GAP = 0.25       # over-prediction: predicted-positive rate exceeds churn rate by more than this
POS_MULT = 1.8       # severe over-prediction: predicted-positive rate is more than this multiple of churn rate
MAX_TUNE_GAMES = 20  # cap on the number of games sent to tuning

# Over-prediction indicators, added on a copy of the per-game results:
#   pos_gap  = predicted-positive rate - actual positive rate
#   pos_mult = predicted-positive rate / actual positive rate
best_df = best_by_game.assign(
    pos_gap=lambda d: d["test_pred_pos_rate"] - d["churn_rate"],
    pos_mult=lambda d: d["test_pred_pos_rate"] / (d["churn_rate"] + 1e-12),
)

# A game qualifies when any one of the three conditions holds.
low_f1_mask = best_df["test_f1"].lt(F1_LOW)
wide_gap_mask = best_df["pos_gap"].gt(POS_GAP)
rare_overcalled_mask = best_df["churn_rate"].lt(0.35) & best_df["pos_mult"].gt(POS_MULT)

# Worst test F1 first, truncated to the cap.
need_tune = (
    best_df[low_f1_mask | wide_gap_mask | rare_overcalled_mask]
    .copy()
    .sort_values("test_f1", ascending=True)
    .head(MAX_TUNE_GAMES)
)

tune_appids = need_tune["appid"].astype(int).tolist()

print("튜닝 대상 appid 개수:", len(tune_appids))

display_cols = [
    "appid", "n_rows", "churn_rate",
    "best_valid_f1", "best_thr",
    "test_precision", "test_recall", "test_f1", "test_pred_pos_rate",
    "pos_gap", "pos_mult",
]
display(need_tune[display_cols])

튜닝 대상 appid 개수: 20


Unnamed: 0,appid,n_rows,churn_rate,best_valid_f1,best_thr,test_precision,test_recall,test_f1,test_pred_pos_rate,pos_gap,pos_mult
49,730,272473,0.184881,0.347161,0.46,0.228729,0.697469,0.344486,0.563758,0.378877,3.049308
48,553850,148012,0.139239,0.364004,0.58,0.302763,0.425279,0.353713,0.195588,0.05635,1.404698
47,3405690,36162,0.17405,0.370243,0.52,0.273344,0.544083,0.363878,0.346468,0.172417,1.99062
46,1808500,252277,0.18666,0.384392,0.5,0.309792,0.476003,0.375319,0.286804,0.100144,1.536508
45,1973530,43061,0.084206,0.34613,0.68,0.306452,0.497931,0.379401,0.13677,0.052564,1.624229
44,1551360,53036,0.21723,0.3925,0.48,0.288229,0.568576,0.382538,0.42845,0.21122,1.972336
43,3240220,138626,0.219338,0.400922,0.46,0.262139,0.717316,0.383962,0.600159,0.38082,2.736223
42,227300,92151,0.227171,0.404417,0.46,0.270296,0.741103,0.396119,0.622864,0.395693,2.741832
41,294100,30831,0.23311,0.402726,0.44,0.273776,0.742698,0.400075,0.63256,0.399451,2.713576
40,3241660,340625,0.219902,0.422256,0.5,0.334178,0.571858,0.421843,0.376308,0.156407,1.711257


In [34]:
# 13. 튜닝 설정 + 저장 폴더 준비
# Tuned models go to their own folder so the baseline models are not overwritten.
os.makedirs("dl_model_tuned", exist_ok=True)

# Same split settings as the baseline run.
RANDOM_STATE = 42
TEST_SIZE = 0.20
VAL_SIZE_IN_TRAIN = 0.20

# Search space, deliberately small so the grid does not explode.
HIDDEN_CANDS = [              # model capacity
    (128, 64),
    (256, 128),
    (256, 128, 64),
]
DROPOUT_CANDS = [0.15, 0.25, 0.35]   # regularisation strength
LR_CANDS = [3e-4, 1e-3, 2e-3]        # learning rate
WD_CANDS = [0.0, 1e-5, 1e-4]         # weight decay
BATCH_CANDS = [1024, 2048, 4096]     # mini-batch size

# Training length, slightly more generous than the baseline run.
MAX_EPOCHS = 40
PATIENCE = 6

# Same threshold grid as before: 0.10, 0.12, ..., 0.90.
THR_GRID = np.round(np.arange(0.10, 0.91, 0.02), 2)

print("tune_appids:", len(tune_appids))


tune_appids: 20


In [35]:
# 14. threshold 선택 함수를 튜닝용으로 하나 더 추가
#   recall이 너무 높게 나오는(=양성 과대예측) 걸 줄이려면
#   F1 최대 대신 precision 최소조건 걸고 F1 최대를 택하는 방식이 도움이 됨.
def best_threshold_by_f1_min_precision(y_true, proba, thr_grid=THR_GRID, min_precision=0.55):
    """Pick the F1-maximising threshold among those that reach a precision floor.

    Thresholds whose precision is below `min_precision` are discarded, and the
    best F1 among the rest wins (earliest threshold on ties). If no threshold
    qualifies, the unconstrained `best_threshold_by_f1` result is used instead.

    Returns:
        (threshold, f1, precision) for the selected threshold.
    """
    eligible = []  # (f1, threshold, precision) for every threshold passing the floor
    for thr in thr_grid:
        labels = (proba >= thr).astype(int)
        prec = precision_score(y_true, labels, zero_division=0)
        if not (prec < min_precision):
            eligible.append((f1_score(y_true, labels, zero_division=0), float(thr), float(prec)))

    if eligible:
        # max() returns the first maximal item, so ties keep the earliest threshold.
        top_f1, top_thr, top_prec = max(eligible, key=lambda cand: cand[0])
        return top_thr, top_f1, top_prec

    # Nothing met the precision floor: fall back to plain F1 maximisation.
    fallback_thr, fallback_f1 = best_threshold_by_f1(y_true, proba, thr_grid=thr_grid)
    fallback_pred = (proba >= fallback_thr).astype(int)
    return (
        float(fallback_thr),
        float(fallback_f1),
        float(precision_score(y_true, fallback_pred, zero_division=0)),
    )

In [None]:
# 15 튜닝 1회(하이퍼파라미터 1세트) 실행 함수
# val 기준 thr 선택 -> test 평가까지 한 번에 리턴
def run_one_config_for_game(gdf, *, hidden_dims, dropout, lr, weight_decay, batch_size,
                            use_min_precision=True, min_precision=0.55):
    """Train and evaluate one hyperparameter configuration on one game's rows.

    Pipeline: build X/y -> stratified train/val/test split -> scale (fit on
    train only) -> train with early stopping -> choose the threshold on the
    validation split -> evaluate on the test split.

    Args:
        gdf: DataFrame holding a single game's rows; must contain FEATURES and TARGET.
        hidden_dims, dropout, lr, weight_decay, batch_size: forwarded to
            train_mlp_one_game.
        use_min_precision: if True, the threshold is the F1-best one among
            those with validation precision >= min_precision; otherwise the
            plain F1-best threshold.
        min_precision: precision floor used when use_min_precision is True.

    Returns:
        (row, payload): `row` is a flat dict of metadata, hyperparameters and
        validation / test metrics; `payload` holds the CPU state_dict, fitted
        scaler, feature list, threshold and hyperparameters for saving.
    """
    # Features as float32; non-numeric and missing values become 0.
    X = gdf[FEATURES].apply(pd.to_numeric, errors="coerce").fillna(0).to_numpy(dtype=np.float32)
    # fillna() does not touch +/-inf, which StandardScaler rejects. Zero them
    # here so the function does not depend on the caller having cleaned gdf.
    X[~np.isfinite(X)] = 0.0
    y = gdf[TARGET].astype(int).to_numpy(dtype=np.int64)

    # Stratified split: test first, then validation out of the remainder.
    X_train_full, X_test, y_train_full, y_test = train_test_split(
        X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE, stratify=y
    )
    X_train, X_val, y_train, y_val = train_test_split(
        X_train_full, y_train_full,
        test_size=VAL_SIZE_IN_TRAIN, random_state=RANDOM_STATE, stratify=y_train_full
    )

    # Per-game scaling, fitted on the training split only.
    scaler = StandardScaler()
    X_train_s = scaler.fit_transform(X_train)
    X_val_s   = scaler.transform(X_val)
    X_test_s  = scaler.transform(X_test)

    # Train with the shared utility.
    model, pos_weight, best_epoch, best_val_f1 = train_mlp_one_game(
        X_train_s, y_train, X_val_s, y_val,
        hidden_dims=hidden_dims,
        dropout=dropout,
        lr=lr,
        weight_decay=weight_decay,
        batch_size=batch_size,
        max_epochs=MAX_EPOCHS,
        patience=PATIENCE
    )

    # Validation probabilities -> threshold selection.
    val_loader  = DataLoader(TabDataset(X_val_s, y_val), batch_size=4096, shuffle=False)
    val_proba   = predict_proba_torch(model, val_loader)

    if use_min_precision:
        # F1-best threshold among those meeting the precision floor.
        best_thr, val_f1, val_prec = best_threshold_by_f1_min_precision(
            y_val, val_proba, thr_grid=THR_GRID, min_precision=min_precision
        )
    else:
        # Plain F1-best threshold.
        best_thr, val_f1 = best_threshold_by_f1(y_val, val_proba, thr_grid=THR_GRID)
        val_prec = float(precision_score(y_val, (val_proba>=best_thr).astype(int), zero_division=0))

    # Test evaluation at the chosen threshold.
    test_loader = DataLoader(TabDataset(X_test_s, y_test), batch_size=4096, shuffle=False)
    test_proba  = predict_proba_torch(model, test_loader)
    m = eval_from_proba(y_test, test_proba, best_thr)

    # Comparison with the "predict 1 for everyone" baseline.
    churn_rate = float(y.mean())
    all1_f1 = f1_score(y_test, np.ones_like(y_test), zero_division=0)
    gain_vs_all1 = float(m["f1"] - all1_f1)

    row = {
        # data metadata
        "n_rows": int(len(gdf)),
        "churn_rate": churn_rate,
        "pos_weight": float(pos_weight),  # class-imbalance correction used in the loss

        # hyperparameters of this run
        "hidden_dims": str(hidden_dims),
        "dropout": float(dropout),
        "lr": float(lr),
        "weight_decay": float(weight_decay),
        "batch_size": int(batch_size),

        # early-stopping outcome
        "best_epoch": int(best_epoch),
        "best_valid_f1": float(best_val_f1),

        # validation metrics at the selected threshold
        "best_thr": float(best_thr),
        "val_f1": float(val_f1),
        "val_precision": float(val_prec),

        # test metrics at the selected threshold
        "test_f1": float(m["f1"]),
        "test_precision": float(m["precision"]),
        "test_recall": float(m["recall"]),
        "test_roc_auc": float(m["roc_auc"]),
        "test_pred_pos_rate": float(m["pred_pos_rate"]),
        "tn": int(m["tn"]), "fp": int(m["fp"]), "fn": int(m["fn"]), "tp": int(m["tp"]),  # confusion matrix
        "all1_f1": float(all1_f1),
        "gain_vs_all1": float(gain_vs_all1),
    }

    payload = {
        "model_state_dict": {k: v.cpu() for k, v in model.state_dict().items()},
        "scaler": scaler,
        "features": FEATURES,
        "best_thr": float(best_thr),
        "hparams": {
            "hidden_dims": hidden_dims,
            "dropout": float(dropout),
            "lr": float(lr),
            "weight_decay": float(weight_decay),
            "batch_size": int(batch_size),
        },
        "meta": {
            "pos_weight": float(pos_weight),
            "best_epoch": int(best_epoch),
            "best_valid_f1": float(best_val_f1),
        }
    }

    return row, payload

### 튜닝 후보 조합 만들기 + 게임당 5개(MAX_CONFIGS_PER_GAME) 샘플링

In [None]:
# 16. 튜닝 후보 조합 만들기 (너무 많으면 제한 걸기)
# 전체 조합이 크면 일부만 샘플링해서 쓰자.

# hidden 3 × dropout 3 × lr 3 × wd 3 × batch 3 = 243
# itertools.product는 모든 조합을 생성
# Full Cartesian grid over the five candidate lists (3 x 3 x 3 x 3 x 3 = 243 tuples),
# each tuple ordered as (hidden_dims, dropout, lr, weight_decay, batch_size).
all_configs = [
    *itertools.product(HIDDEN_CANDS, DROPOUT_CANDS, LR_CANDS, WD_CANDS, BATCH_CANDS)
]
print("total config candidates:", len(all_configs))

# Number of configurations drawn from the grid. A larger value searches more
# broadly but takes proportionally longer.
MAX_CONFIGS_PER_GAME = 5
# Dedicated seeded generator so the drawn subset is reproducible.
rng = np.random.RandomState(42)

# k: number of configurations to draw.
# When there are k or fewer configurations, all of them are used (nothing is sampled).
def sample_configs(configs, k, random_state=None):
    """Draw `k` distinct configurations uniformly at random.

    Args:
        configs: list of candidate configurations.
        k: number of configurations to draw.
        random_state: optional generator exposing `choice(n, size=..., replace=...)`
            (e.g. `np.random.RandomState`). When omitted, the module-level `rng`
            is used, so results then depend on how often that generator has
            already been consumed.

    Returns:
        `configs` itself when it holds k or fewer items, otherwise a new list
        of k distinct items from it.
    """
    if len(configs) <= k:
        return configs
    generator = rng if random_state is None else random_state
    idx = generator.choice(len(configs), size=k, replace=False)
    return [configs[i] for i in idx]

# Drawn once here; the tuning loop reuses this same subset for every game.
sampled_configs = sample_configs(configs=all_configs, k=MAX_CONFIGS_PER_GAME)
print("sampled per-game configs:", len(sampled_configs))


total config candidates: 243
sampled per-game configs: 5


In [38]:
# 17. Tuning main loop (tune_appids only)
# Selection rule per game: highest validation F1; ties are broken by higher
# validation precision (fewer false positives, i.e. less over-prediction).
# Test metrics are recorded for reporting but never used to choose a
# configuration, since that would leak the test set into model selection.

tuned_rows = []
pbar = tqdm(tune_appids, desc="DL Tuning", unit="game")

# Precision floor applied when choosing the decision threshold (loop-invariant).
MIN_PREC = 0.55

for appid in pbar:
    gdf = df_model[df_model["appid"] == appid].copy()
    gdf = gdf.replace([np.inf, -np.inf], 0)

    # A single-class target cannot be trained on or stratified.
    if gdf[TARGET].nunique() < 2:
        continue

    best_row = None       # metrics of the best configuration seen so far
    best_payload = None   # matching model payload

    # Only the sampled configurations are run.
    for (hidden_dims, dropout, lr, wd, bs) in sampled_configs:
        pbar.set_postfix_str(f"appid={appid} hd={hidden_dims} dr={dropout} lr={lr} wd={wd} bs={bs}")

        # One configuration: split / scale / train / pick threshold / test evaluation.
        row, payload = run_one_config_for_game(
            gdf,
            hidden_dims=hidden_dims,
            dropout=dropout,
            lr=lr,
            weight_decay=wd,
            batch_size=bs,
            use_min_precision=True,
            min_precision=MIN_PREC
        )

        row["appid"] = int(appid)
        # Record the game in the saved payload too, as the baseline payload does.
        payload["meta"]["appid"] = int(appid)

        # Validation-only comparison key.
        key_new = (row["val_f1"], row["val_precision"])
        if best_row is None or key_new > (best_row["val_f1"], best_row["val_precision"]):
            best_row, best_payload = row, payload

    # Nothing was run (empty sampled_configs): skip this game.
    if best_row is None:
        continue

    # Keep the best row for the results table.
    tuned_rows.append(best_row)

    # Persist the best model of this game.
    joblib.dump(best_payload, f"dl_model_tuned/model_{int(appid)}.pkl")


DL Tuning: 100%|██████████| 20/20 [1:27:47<00:00, 263.36s/game, appid=3159330 hd=(256, 128, 64) dr=0.35 lr=0.002 wd=1e-05 bs=2048] 


In [39]:
# 18. 튜닝 결과 테이블 + 기존(best_df)와 비교

# Best configuration per tuned game, ranked by test F1.
tuned_df = pd.DataFrame(tuned_rows)
tuned_df = tuned_df.sort_values(by="test_f1", ascending=False)
tuned_df = tuned_df.reset_index(drop=True)

print("튜닝 완료 게임 수:", len(tuned_df))
display(tuned_df.head(30))

# # 기존 결과(best_df)에서 튜닝 대상만 뽑아서 비교
# base_part = best_df[best_df["appid"].isin(tune_appids)].copy()
# base_part = base_part[[
#     "appid","n_rows","churn_rate",
#     "best_valid_f1","best_thr",
#     "test_precision","test_recall","test_f1","test_pred_pos_rate"
# ]].rename(columns={
#     "best_valid_f1":"base_best_valid_f1",
#     "best_thr":"base_best_thr",
#     "test_precision":"base_test_precision",
#     "test_recall":"base_test_recall",
#     "test_f1":"base_test_f1",
#     "test_pred_pos_rate":"base_test_pred_pos_rate"
# })

# # appid 기준으로 base와 tuned를 붙임
# # left join이라 base에 있는 appid를 유지, tuned가 없는 경우 NaN
# cmp = base_part.merge(
#     tuned_df[[
#         "appid","best_valid_f1","best_thr",
#         "test_precision","test_recall","test_f1","test_pred_pos_rate",
#         "hidden_dims","dropout","lr","weight_decay","batch_size"
#     ]],
#     on="appid",
#     how="left"
# )

# # 개선량 계산
# # delta_f1 > 0이면 튜닝으로 F1 개선
# # delta_pos_rate < 0이면 과대예측(예측 양성 비율)이 줄어든 것
# cmp["delta_f1"] = cmp["test_f1"] - cmp["base_test_f1"]
# cmp["delta_pos_rate"] = cmp["test_pred_pos_rate"] - cmp["base_test_pred_pos_rate"]

# cmp = cmp.sort_values("delta_f1", ascending=False).reset_index(drop=True)
# display(cmp.head(30))


튜닝 완료 게임 수: 20


Unnamed: 0,n_rows,churn_rate,pos_weight,hidden_dims,dropout,lr,weight_decay,batch_size,best_epoch,best_valid_f1,...,test_recall,test_roc_auc,test_pred_pos_rate,tn,fp,fn,tp,all1_f1,gain_vs_all1,appid
0,37631,0.179878,4.559326,"(256, 128, 64)",0.35,0.002,1e-05,2048,15,0.448649,...,0.338996,0.734422,0.110004,5804,369,895,459,0.304921,0.115794,3513350
1,48982,0.204279,3.895066,"(128, 64)",0.15,0.0003,0.0001,1024,13,0.472914,...,0.325337,0.740511,0.116873,7302,494,1350,651,0.33921,0.074649,230410
2,41971,0.188463,4.306203,"(256, 128, 64)",0.35,0.002,1e-05,2048,14,0.445074,...,0.292035,0.71935,0.102204,6417,396,1120,462,0.317129,0.061559,394360
3,36162,0.17405,4.745531,"(128, 64)",0.15,0.002,0.0001,1024,19,0.377222,...,0.570294,0.681688,0.369556,4019,1955,541,718,0.296514,0.068694,3405690
4,300088,0.15778,5.338063,"(256, 128)",0.35,0.001,0.0001,4096,24,0.446716,...,0.269166,0.757892,0.07891,48361,2187,6921,2549,0.272565,0.086297,2807960
5,32102,0.222946,3.48559,"(256, 128, 64)",0.35,0.001,0.0001,2048,19,0.483516,...,0.268156,0.721557,0.117427,4619,370,1048,384,0.364701,-0.013375,3159330
6,78701,0.268586,2.723241,"(128, 64)",0.15,0.002,0.0001,1024,18,0.498222,...,0.24929,0.686852,0.127374,10562,951,3174,1054,0.423456,-0.085256,3489700
7,193853,0.17309,4.777452,"(256, 128)",0.35,0.001,0.0001,4096,27,0.440779,...,0.227388,0.73939,0.069898,30876,1184,5185,1526,0.295106,0.028851,578080
8,43061,0.084206,10.87333,"(256, 128, 64)",0.35,0.002,1e-05,2048,18,0.356185,...,0.194483,0.785045,0.027052,7796,92,584,141,0.15528,0.139084,1973530
9,83054,0.287054,2.483681,"(128, 64)",0.15,0.0003,0.0001,1024,16,0.470781,...,0.191695,0.651476,0.094275,11191,652,3854,914,0.446045,-0.157444,108600
