In [None]:
# Install necessary libraries (Clean install for Colab)
!pip install -q kaggle optuna catboost lightgbm xgboost scikit-learn pandas numpy

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m404.7/404.7 kB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import os
import json
import time
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# ML Libraries
import optuna
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from google.colab import userdata

# Global Configuration
# Show every column when a DataFrame is displayed (no column truncation).
pd.set_option('display.max_columns', None)
# NOTE: this silences ALL Python warnings for the rest of the session, not only
# the feature-name warnings it was added for.
warnings.filterwarnings('ignore')

In [None]:
# Setup Kaggle API
try:
    kaggle_username = userdata.get('KAGGLE_USERNAME')
    kaggle_key = userdata.get('KAGGLE_API_KEY')
    kaggle_json = {"username": kaggle_username, "key": kaggle_key}

    os.makedirs('/root/.kaggle', exist_ok=True)
    with open('/root/.kaggle/kaggle.json', 'w') as f:
        json.dump(kaggle_json, f)
    os.chmod('/root/.kaggle/kaggle.json', 600)

    # Download Dataset
    !kaggle competitions download -c cpe342-karena
    !unzip -q -o cpe342-karena.zip -d /content/
    print("Dataset downloaded and extracted.")

except Exception as e:
    print(f"Error setting up Kaggle: {e}")
    print("Please ensure KAGGLE_USERNAME and KAGGLE_API_KEY are set in Colab Secrets.")

Downloading cpe342-karena.zip to /content
 97% 897M/922M [00:07<00:00, 108MB/s] 
100% 922M/922M [00:08<00:00, 115MB/s]
Dataset downloaded and extracted.


In [None]:
# --- 1. YOUR NEW FEATURE ENGINEERING ---
def _player_id_to_hex(player_id):
    """Encode one player id as a hex string, two hex characters per input character.

    Decimal digits map to their numeric value (0-9); every other character
    maps to the placeholder byte 0x0F.
    """
    # isdecimal() is true only for characters int() can parse; isdigit() also
    # accepts characters such as '²' that make int() raise ValueError.
    encoded = bytes(int(ch) if ch.isdecimal() else 0xF for ch in player_id)
    return encoded.hex()


def _density_from_hex(hex_seq):
    """Turn the hex encoding back into a numeric score.

    Bytes are read right-to-left as decimal digits (units, tens, hundreds, ...),
    placeholder bytes (0x0F) contribute nothing, and ``len(hex_seq) * 1e-6`` is
    added so the length of the encoding shows up in the fractional part.
    """
    total = 0
    for power, byte_val in enumerate(reversed(bytes.fromhex(hex_seq))):
        if byte_val != 0xF:
            total += byte_val * 10 ** power
    return total + len(hex_seq) * 1e-6


def engineer_id_features(df):
    """
    Applies the custom latent vector and density score logic
    derived from player_id.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``player_id`` column of strings.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` (the input is not modified) with two extra columns:
        ``latent_vector_seq`` (hex string) and ``density_score`` (float).
    """
    df_eng = df.copy()

    # Generate Hex String (Latent Vector)
    df_eng['latent_vector_seq'] = df_eng['player_id'].apply(_player_id_to_hex)

    # Generate Density Score (Numeric Feature)
    df_eng['density_score'] = df_eng['latent_vector_seq'].apply(_density_from_hex)

    return df_eng

# --- 2. SMART CLEANING ---
def smart_fill_data(df):
    """Return a cleaned copy of ``df`` with noise columns removed and NaNs filled.

    Steps, in order (columns that are absent are skipped):
      1. drop the ``random_metric_*`` columns;
      2. fill the social / activity count columns with 0;
      3. fill spending, skill and habit columns with the median of their group
         (``vip_tier``, ``skill_tier`` and ``engagement_level`` respectively);
      4. fill whatever is still missing in float64/int64 columns with the
         column median;
      5. fill missing values in object columns with the string 'Missing'.

    The input frame is not modified.
    """
    filled = df.copy()

    # 1. Drop Noise
    noise_names = ('random_metric_1', 'random_metric_2', 'random_metric_3')
    present_noise = [name for name in noise_names if name in filled.columns]
    filled = filled.drop(columns=present_noise, errors='ignore')

    # 2. Logic-Zero Fills
    for name in (
        'friend_count', 'friend_invites_sent', 'gifts_sent_received',
        'Tournament_entries', 'chat_activity_score', 'rare_items_count',
    ):
        if name not in filled.columns:
            continue
        filled[name] = filled[name].fillna(0)

    # 3. Grouped Imputation: (column to fill, column defining the groups)
    grouped_plan = [
        ('total_spending_thb', 'vip_tier'),
        ('avg_monthly_spending', 'vip_tier'),
        ('win_rate_ranked', 'skill_tier'),
        ('achievement_completion_rate', 'skill_tier'),
    ]
    grouped_plan.extend(
        (habit, 'engagement_level')
        for habit in ('play_frequency', 'avg_session_duration', 'total_playtime_hours',
                      'days_since_last_login', 'login_streak')
    )
    for target, key in grouped_plan:
        if target not in filled.columns or key not in filled.columns:
            continue
        group_median = filled.groupby(key)[target].transform('median')
        filled[target] = filled[target].fillna(group_median)

    # 4. Fallback Fill
    for name in filled.select_dtypes(include=['float64', 'int64']).columns:
        filled[name] = filled[name].fillna(filled[name].median())

    # 5. Categorical Fill
    for name in filled.select_dtypes(include=['object']).columns:
        filled[name] = filled[name].fillna('Missing')

    return filled

In [None]:
# Load Data
# Reads the task-2 train/test CSVs, adds the ID-derived features, imputes
# missing values, and builds the scaled / one-hot encoded matrices used by the
# tuning and stacking cells below.
# (stdlib modules imported here rather than in the notebook's main import cell)
import binascii
import struct
print("Loading data...")
df_train = pd.read_csv('/content/public_dataset/task2/train.csv')
df_test = pd.read_csv('/content/public_dataset/task2/test.csv')

# 1. Apply your New Feature Engineering (Must be done before dropping IDs)
print("Applying ID Feature Engineering...")
df_train = engineer_id_features(df_train)
df_test = engineer_id_features(df_test)

# 2. Apply Smart Fill
# NOTE: smart_fill_data is called on each frame separately, so the medians used
# to impute the test set are computed from the test set itself, not from train.
print("Applying Smart Cleaning...")
X_train_clean = smart_fill_data(df_train)
X_test_clean = smart_fill_data(df_test)

# Define Targets and IDs
target_col = 'segment'
id_col = 'player_id'

y = X_train_clean[target_col]

# Drop the target, the ID columns and the intermediate string column
# 'latent_vector_seq'. Of the two ID-derived columns, only 'density_score'
# stays in the feature matrix.
# errors='ignore' lets the drop succeed when a listed column (e.g. 'id') is absent.
cols_to_drop = [target_col, 'id', id_col, 'latent_vector_seq']
X = X_train_clean.drop(columns=cols_to_drop, errors='ignore')

# Keep the test player ids for the submission file before dropping them.
test_ids = X_test_clean[id_col]
X_test_final = X_test_clean.drop(columns=['id', id_col, 'latent_vector_seq'], errors='ignore')

# Identify Features
# Only object / int64 / float64 columns are listed; a column of any other dtype
# ends up in neither group.
categorical_features = X.select_dtypes(include=['object']).columns.tolist()
numerical_features = X.select_dtypes(include=['int64', 'float64']).columns.tolist()

print(f"Features included: {len(X.columns)}")
print(f"New feature 'density_score' included? {'density_score' in numerical_features}")

# Preprocessing Pipeline
# handle_unknown='ignore': categories seen only in the test set do not raise.
categorical_transformer = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
numeric_transformer = StandardScaler()

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ],
    verbose_feature_names_out=False
)

# Fit on the training features only, then apply the fitted transform to test.
print("Preprocessing features...")
X_processed = preprocessor.fit_transform(X)
X_test_processed = preprocessor.transform(X_test_final)

# Convert to numpy for Optuna (the tuning loop indexes rows by integer arrays)
X_tune = np.asarray(X_processed)
y_tune = np.asarray(y)

Loading data...
Applying ID Feature Engineering...
Applying Smart Cleaning...
Features included: 42
New feature 'density_score' included? True
Preprocessing features...


In [None]:
# Model families optimize_model knows how to build.
_TUNABLE_MODELS = ('lgbm', 'xgb', 'cat')


def _build_trial_model(trial, model_name):
    """Sample hyperparameters from ``trial`` and return an unfitted classifier."""
    if model_name == 'lgbm':
        params = {
            'objective': 'multiclass',
            'metric': 'multi_logloss',
            'verbosity': -1,
            'n_estimators': trial.suggest_int('n_estimators', 500, 1500),
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1),
            'num_leaves': trial.suggest_int('num_leaves', 20, 100),
            'max_depth': trial.suggest_int('max_depth', 5, 12),
            # NOTE(review): LightGBM applies `subsample` (bagging) only when
            # `subsample_freq` > 0, which is not set here - confirm whether
            # this parameter is meant to have an effect.
            'subsample': trial.suggest_float('subsample', 0.6, 1.0),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
            'class_weight': 'balanced',
            'device': 'gpu',
            'gpu_platform_id': 0,
            'gpu_device_id': 0
        }
        return lgb.LGBMClassifier(**params, random_state=42)

    if model_name == 'xgb':
        params = {
            'eval_metric': 'mlogloss',
            'n_estimators': trial.suggest_int('n_estimators', 500, 1500),
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1),
            'max_depth': trial.suggest_int('max_depth', 4, 10),
            'subsample': trial.suggest_float('subsample', 0.6, 1.0),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
            'device': 'cuda',
            'tree_method': 'hist',
        }
        return xgb.XGBClassifier(**params, random_state=42)

    if model_name == 'cat':
        params = {
            'iterations': trial.suggest_int('iterations', 500, 1500),
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1),
            'depth': trial.suggest_int('depth', 4, 10),
            'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1, 10),
            'auto_class_weights': 'Balanced',
            'verbose': 0,
            'allow_writing_files': False,
            'task_type': 'GPU',
            'devices': '0'
        }
        return CatBoostClassifier(**params, random_state=42)

    raise ValueError(f"Unknown model_name {model_name!r}; expected one of {_TUNABLE_MODELS}")


def optimize_model(model_name, n_trials=15):
    """Tune one model family with Optuna and return its best hyperparameters.

    Each trial is scored by the mean weighted F1 over a 3-fold stratified split
    of the module-level arrays ``X_tune`` / ``y_tune``.

    Parameters
    ----------
    model_name : str
        One of 'lgbm', 'xgb' or 'cat'.
    n_trials : int
        Number of Optuna trials to run.

    Returns
    -------
    dict
        ``study.best_params``: only the sampled hyperparameters, not the fixed
        ones (objective, device, class weighting, ...).

    Raises
    ------
    ValueError
        If ``model_name`` is not a supported model family.
    """
    # Fail fast with a clear message instead of an UnboundLocalError raised
    # from inside the first trial.
    if model_name not in _TUNABLE_MODELS:
        raise ValueError(
            f"Unknown model_name {model_name!r}; expected one of {_TUNABLE_MODELS}"
        )

    print(f"\n--- Starting Optuna Study for {model_name} ---")

    # The splitter is seeded, so every trial is evaluated on the same folds;
    # compute them once instead of once per trial.
    cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
    folds = list(cv.split(X_tune, y_tune))

    def objective(trial):
        model = _build_trial_model(trial, model_name)
        f1_scores = []

        for train_idx, val_idx in folds:
            X_tr, X_val = X_tune[train_idx], X_tune[val_idx]
            y_tr, y_val = y_tune[train_idx], y_tune[val_idx]

            model.fit(X_tr, y_tr)
            y_pred = model.predict(X_val)
            f1_scores.append(f1_score(y_val, y_pred, average='weighted'))

        return np.mean(f1_scores)

    # Seed the sampler so the sequence of suggested hyperparameters is
    # repeatable across runs.
    study = optuna.create_study(
        direction='maximize',
        sampler=optuna.samplers.TPESampler(seed=42),
    )
    study.optimize(objective, n_trials=n_trials)

    print(f"{model_name} Best F1: {study.best_value:.4f}")
    return study.best_params

# Run one 10-trial Optuna study per model family. Only the best sampled
# hyperparameters are kept; the final stacking cell unpacks these dicts when it
# rebuilds the models.
best_params_xgb = optimize_model('xgb', n_trials=10)
best_params_cat = optimize_model('cat', n_trials=10)
best_params_lgbm = optimize_model('lgbm', n_trials=10)

[I 2025-11-25 07:39:03,298] A new study created in memory with name: no-name-fa4ef52f-9845-4a4d-8074-06d2c95f9df7



--- Starting Optuna Study for xgb ---


[I 2025-11-25 07:40:30,587] Trial 0 finished with value: 0.9682712322240884 and parameters: {'n_estimators': 1111, 'learning_rate': 0.016176927184383995, 'max_depth': 8, 'subsample': 0.8844104391417233, 'colsample_bytree': 0.7722915869064056}. Best is trial 0 with value: 0.9682712322240884.
[I 2025-11-25 07:41:01,040] Trial 1 finished with value: 0.9679266872639043 and parameters: {'n_estimators': 1281, 'learning_rate': 0.040166697932915545, 'max_depth': 4, 'subsample': 0.9373702007668332, 'colsample_bytree': 0.7731302633251657}. Best is trial 0 with value: 0.9682712322240884.
[I 2025-11-25 07:43:28,963] Trial 2 finished with value: 0.9682610342791335 and parameters: {'n_estimators': 1442, 'learning_rate': 0.03136175877258498, 'max_depth': 10, 'subsample': 0.8847639133260263, 'colsample_bytree': 0.9826541490750784}. Best is trial 0 with value: 0.9682712322240884.
[I 2025-11-25 07:43:58,600] Trial 3 finished with value: 0.9680735325182225 and parameters: {'n_estimators': 675, 'learning_

xgb Best F1: 0.9685

--- Starting Optuna Study for cat ---


[I 2025-11-25 07:50:00,629] Trial 0 finished with value: 0.9671325649670601 and parameters: {'iterations': 710, 'learning_rate': 0.056469382283277685, 'depth': 6, 'l2_leaf_reg': 3.706400336779396}. Best is trial 0 with value: 0.9671325649670601.
[I 2025-11-25 07:50:41,412] Trial 1 finished with value: 0.9670327190895134 and parameters: {'iterations': 1423, 'learning_rate': 0.04114068417651309, 'depth': 5, 'l2_leaf_reg': 2.784613374821821}. Best is trial 0 with value: 0.9671325649670601.
[I 2025-11-25 07:51:17,290] Trial 2 finished with value: 0.9667636553648554 and parameters: {'iterations': 1460, 'learning_rate': 0.021128860109811763, 'depth': 4, 'l2_leaf_reg': 4.292726763175385}. Best is trial 0 with value: 0.9671325649670601.
[I 2025-11-25 07:51:47,469] Trial 3 finished with value: 0.9672123940641121 and parameters: {'iterations': 587, 'learning_rate': 0.060832086238503225, 'depth': 7, 'l2_leaf_reg': 4.3678076993670745}. Best is trial 3 with value: 0.9672123940641121.
[I 2025-11-25 

cat Best F1: 0.9672

--- Starting Optuna Study for lgbm ---


[I 2025-11-25 08:01:08,114] Trial 0 finished with value: 0.9684837819082966 and parameters: {'n_estimators': 1129, 'learning_rate': 0.0815808818253678, 'num_leaves': 62, 'max_depth': 10, 'subsample': 0.9550959586709007, 'colsample_bytree': 0.9140116948134649}. Best is trial 0 with value: 0.9684837819082966.
[I 2025-11-25 08:03:11,094] Trial 1 finished with value: 0.9686260917437287 and parameters: {'n_estimators': 597, 'learning_rate': 0.016314617319672958, 'num_leaves': 56, 'max_depth': 8, 'subsample': 0.9814676042344688, 'colsample_bytree': 0.8980535233054161}. Best is trial 1 with value: 0.9686260917437287.
[I 2025-11-25 08:07:28,825] Trial 2 finished with value: 0.9685720670204248 and parameters: {'n_estimators': 1496, 'learning_rate': 0.053129699164801966, 'num_leaves': 42, 'max_depth': 10, 'subsample': 0.9698511696023542, 'colsample_bytree': 0.7387106115215616}. Best is trial 1 with value: 0.9686260917437287.
[I 2025-11-25 08:10:50,010] Trial 3 finished with value: 0.968503587874

lgbm Best F1: 0.9687


In [None]:
print("\n--- Building Final Stacking Model ---")

# Re-instantiate models with the tuned hyperparameters plus the fixed settings
# (objective, device, class weighting) that are not part of best_params_*.
lgbm_final = lgb.LGBMClassifier(
    **best_params_lgbm, objective='multiclass', class_weight='balanced',
    device='gpu', verbosity=-1, random_state=42
)

xgb_final = xgb.XGBClassifier(
    **best_params_xgb, eval_metric='mlogloss', device='cuda',
    tree_method='hist', random_state=42
)

cat_final = CatBoostClassifier(
    **best_params_cat, auto_class_weights='Balanced', task_type='GPU',
    verbose=0, allow_writing_files=False, random_state=42
)

estimators = [
    ('lgbm', lgbm_final),
    ('xgb', xgb_final),
    ('cat', cat_final)
]

# The three boosters feed a logistic-regression meta-learner; cv=3 controls the
# internal folds used to produce the meta-learner's training inputs.
stacking_model = StackingClassifier(
    estimators=estimators,
    final_estimator=LogisticRegression(max_iter=1000),
    n_jobs=1,
    cv=3,
    verbose=1
)

print("Fitting Stack on full dataset...")
stacking_model.fit(X_processed, y)

print("Predicting Test set...")
y_test_pred = stacking_model.predict(X_test_processed)

submission = pd.DataFrame({
    'player_id': test_ids,
    'segment': y_test_pred
})

# Write the local copy FIRST so the result of the long fit is never lost to a
# failure of the optional Drive copy.
submission.to_csv('submission_task2_stacking_eng.csv', index=False)
print("Submission saved: submission_task2_stacking_eng.csv")

# The Drive copy is best-effort: this notebook never mounts Drive or creates
# the folder, so only write there when the folder actually exists.
drive_dir = '/content/drive/MyDrive/ML19'
if os.path.isdir(drive_dir):
    submission.to_csv(os.path.join(drive_dir, 'submission_task2_stacking_eng.csv'), index=False)
else:
    print(f"Drive folder not found ({drive_dir}); skipped the Drive copy.")


--- Building Final Stacking Model ---
Fitting Stack on full dataset...


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:  2.8min finished
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   23.0s finished
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   44.0s finished


Predicting Test set...
Submission saved: submission_task2_stacking_eng.csv


In [None]:
# Completion marker: its output shows that this cell was reached and executed.
print("done")

done


### more optuna


In [1]:
import pandas as pd


In [7]:
# Load the answer sheet that receives the task-2 column below.
# (the contents of almost_final.csv are not shown in this notebook)
t1 = pd.read_csv("/content/almost_final.csv")

In [8]:
# Load the task-2 predictions. The displayed frame has the columns
# 'Unnamed: 0', 'player_id' and 'segment'; 'Unnamed: 0' looks like a row index
# that was written into the CSV - TODO confirm.
task2 = pd.read_csv("/content/final_task2.csv")

In [9]:
# Display the loaded predictions for a quick visual check.
task2

Unnamed: 0,player_id,segment
0,P106074,2
1,P024878,0
2,P033678,0
3,P020935,0
4,P049711,0
...,...,...
25884,P027029,0
25885,P084134,1
25886,P072499,1
25887,P011845,0


In [10]:
# The assignment aligns rows by index (both frames carry the default 0..N-1
# index from read_csv), NOT by player_id. Guard against a row-count mismatch,
# which would otherwise silently produce NaN or dropped predictions.
# NOTE(review): this also assumes both files list players in the same order -
# confirm against the id column of almost_final.csv.
if len(t1) != len(task2):
    raise ValueError(
        f"Row count mismatch: t1 has {len(t1)} rows, task2 has {len(task2)} rows"
    )
t1["task2"] = task2["segment"]

In [12]:
# Write the combined answers to the working directory; index=False keeps the
# row index out of the file.
t1.to_csv("final_ans.csv", index=False)