# Load dataset

In [1]:
!pip install kaggle



In [2]:
import os
import zipfile

def download_data_from_kaggle():
    try:
        from google.colab import files
        uploaded = files.upload()
    except ImportError:
        print("Running outside of Colab. Please ensure your kaggle.json is in ~/.kaggle/")

    if 'kaggle.json' in os.listdir('.'):
        !mkdir -p ~/.kaggle
        !mv kaggle.json ~/.kaggle/
        !chmod 600 ~/.kaggle/kaggle.json
    else:
        print("kaggle.json not found. Please upload it or place it in the correct directory.")

    if not os.path.exists('cpe342-karena.zip'):
        print("Downloading data from Kaggle competition 'cpe342-karena'...")
        !kaggle competitions download -c cpe342-karena
    else:
        print("Data already downloaded.")

    if os.path.exists('cpe342-karena.zip'):
        print("Unzipping data...")
        try:
            with zipfile.ZipFile('cpe342-karena.zip', 'r') as zip_ref:
                zip_ref.extractall('.')
            print("Data unzipped.")
        except zipfile.BadZipFile:
            print("Error: Downloaded file is not a valid zip file.")
        except Exception as e:
            print(f"An error occurred during unzipping: {e}")
    else:
        print("Zip file not found, cannot unzip.")

In [3]:
download_data_from_kaggle()

Saving kaggle.json to kaggle.json
Downloading data from Kaggle competition 'cpe342-karena'...
Downloading cpe342-karena.zip to /content
 90% 830M/920M [00:07<00:01, 57.5MB/s]
100% 920M/920M [00:07<00:00, 131MB/s] 
Unzipping data...
Data unzipped.


# Data preprocessing

In [4]:
import pandas as pd
from sklearn.impute import SimpleImputer

df = pd.read_csv('task2/train.csv')
df = df.drop(['id','player_id'], axis=1)

# Continuous features and missing counts
continuous_features = [
    'play_frequency', 'avg_session_duration', 'total_playtime_hours',
    'login_streak', 'days_since_last_login', 'total_spending_thb',
    'avg_monthly_spending', 'spending_frequency', 'friend_count',
    'team_play_percentage', 'chat_activity_score', 'friend_invites_sent',
    'gifts_sent_received', 'ranked_participation_rate', 'tournament_entries',
    'competitive_rank', 'win_rate_ranked', 'watches_esports',
    'achievement_completion_rate', 'collection_progress', 'rare_items_count',
    'speed_of_progression', 'item_type_preference_cosmetic',
    'item_type_preference_performance', 'item_type_preference_social',
    'account_age_days', 'vip_tier', 'responds_to_discounts',
    'preferred_game_mode', 'avg_match_length', 'peak_concurrent_hours',
    'random_metric_1', 'random_metric_2', 'random_metric_3'
]

# Categorical features
categorical_features = [
    'region', 'platform', 'device_type', 'payment_method',
    'language', 'account_status', 'player_type_tag',
    'engagement_level', 'loyalty_tier', 'skill_tier'
]


In [5]:
# Compute missing percentages
missing_cont = df[continuous_features].isnull().mean() * 100

# Features <30% missing → median impute
cont_to_impute = missing_cont[missing_cont < 30].index.tolist()

imputer = SimpleImputer(strategy='median')
df[cont_to_impute] = imputer.fit_transform(df[cont_to_impute])

# Confirm no nulls remain in continuous features
print(df[cont_to_impute].isnull().sum().sum())

0


In [6]:
for col in categorical_features:
    missing_pct = df[col].isnull().mean() * 100
    if missing_pct < 20:
        df[col] = df[col].fillna(df[col].mode()[0])
    else:
        df[col] = df[col].fillna('Unknown')

# Check
df[categorical_features].isnull().sum()

Unnamed: 0,0
region,0
platform,0
device_type,0
payment_method,0
language,0
account_status,0
player_type_tag,0
engagement_level,0
loyalty_tier,0
skill_tier,0


## Feature Engineering

In [7]:
import pandas as pd
import numpy as np

# --- Copy of original df ---
df_fe = df.copy()

# --- Continuous features ---
continuous_features = [
    'play_frequency', 'avg_session_duration', 'total_playtime_hours',
    'login_streak', 'days_since_last_login', 'total_spending_thb',
    'avg_monthly_spending', 'spending_frequency', 'friend_count',
    'team_play_percentage', 'chat_activity_score', 'friend_invites_sent',
    'gifts_sent_received', 'ranked_participation_rate', 'tournament_entries',
    'competitive_rank', 'win_rate_ranked', 'watches_esports',
    'achievement_completion_rate', 'collection_progress', 'rare_items_count',
    'speed_of_progression', 'item_type_preference_cosmetic',
    'item_type_preference_performance', 'item_type_preference_social',
    'account_age_days', 'vip_tier', 'responds_to_discounts',
    'avg_match_length', 'peak_concurrent_hours', 'random_metric_1',
    'random_metric_2', 'random_metric_3'
]

# --- Categorical features ---
categorical_features = [
    'region', 'platform', 'device_type', 'payment_method',
    'language', 'account_status', 'player_type_tag',
    'engagement_level', 'loyalty_tier', 'skill_tier'
]

# --- Derived Features ---

# 1. Time & Play Patterns
df_fe['freq_per_day'] = df_fe['play_frequency'] / df_fe['account_age_days']
df_fe['avg_session_per_play'] = df_fe['avg_session_duration'] / df_fe['play_frequency']
df_fe['playtime_per_login'] = df_fe['total_playtime_hours'] / df_fe['login_streak']
df_fe['consistency_score'] = df_fe['login_streak'] / df_fe['account_age_days']
df_fe['recency_days'] = df_fe['days_since_last_login']

# 2. Spending Behavior
df_fe['total_avg_ratio'] = df_fe['total_spending_thb'] / (df_fe['avg_monthly_spending'] + 1e-6)
df_fe['spend_per_playtime'] = df_fe['total_spending_thb'] / (df_fe['total_playtime_hours'] + 1e-6)
df_fe['spending_per_freq'] = df_fe['spending_frequency'] / (df_fe['play_frequency'] + 1e-6)
df_fe['discount_effect'] = df_fe['responds_to_discounts'] * df_fe['total_spending_thb']

# 3. Social & Team Engagement
df_fe['friends_per_play'] = df_fe['friend_count'] / (df_fe['play_frequency'] + 1e-6)
df_fe['social_score'] = df_fe['friend_invites_sent'] + df_fe['gifts_sent_received']
df_fe['teamplay_ratio'] = df_fe['team_play_percentage'] / 100
df_fe['ranked_per_hour'] = df_fe['ranked_participation_rate'] / (df_fe['total_playtime_hours'] + 1e-6)
df_fe['tournament_per_hour'] = df_fe['tournament_entries'] / (df_fe['total_playtime_hours'] + 1e-6)

# 4. Progression & Achievement
df_fe['achievements_per_hour'] = df_fe['achievement_completion_rate'] / (df_fe['total_playtime_hours'] + 1e-6)
df_fe['collection_per_day'] = df_fe['collection_progress'] / df_fe['account_age_days']
df_fe['rare_items_per_playtime'] = df_fe['rare_items_count'] / (df_fe['total_playtime_hours'] + 1e-6)
df_fe['progress_speed_per_session'] = df_fe['speed_of_progression'] / (df_fe['avg_session_duration'] + 1e-6)

# 5. Engagement Intensity
df_fe['playtime_per_day'] = df_fe['total_playtime_hours'] / df_fe['account_age_days']
df_fe['avg_match_hours'] = df_fe['avg_match_length'] * df_fe['play_frequency']
df_fe['peak_intensity'] = df_fe['peak_concurrent_hours'] / (df_fe['avg_session_duration'] + 1e-6)

# 6. Interaction Features
df_fe['spend_vip'] = df_fe['total_spending_thb'] * df_fe['vip_tier']
df_fe['friends_team_interaction'] = df_fe['friend_count'] * df_fe['team_play_percentage']
df_fe['achievements_collection'] = df_fe['achievement_completion_rate'] * df_fe['collection_progress']

# --- Handle Categorical Features ---
df_fe = pd.get_dummies(df_fe, columns=categorical_features, dummy_na=True)

# --- Drop redundant continuous features ---
# Compute correlation matrix
corr_matrix = df_fe[continuous_features].corr().abs()

# Keep track of features to drop
to_drop = set()
threshold = 0.9  # correlation threshold

for i in range(len(corr_matrix.columns)):
    for j in range(i):
        if corr_matrix.iloc[i, j] > threshold:
            colname = corr_matrix.columns[i]
            to_drop.add(colname)

print(f"Dropping {len(to_drop)} highly correlated continuous features:", to_drop)
df_fe.drop(columns=list(to_drop), inplace=True)

Dropping 0 highly correlated continuous features: set()


# Modeling

In [8]:
!pip install catboost optuna

Collecting catboost
  Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl.metadata (1.2 kB)
Collecting optuna
  Downloading optuna-4.6.0-py3-none-any.whl.metadata (17 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.10.1-py3-none-any.whl.metadata (11 kB)
Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading optuna-4.6.0-py3-none-any.whl (404 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m404.7/404.7 kB[0m [31m32.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.10.1-py3-none-any.whl (11 kB)
Installing collected packages: colorlog, optuna, catboost
Successfully installed catboost-1.2.8 colorlog-6.10.1 optuna-4.6.0


In [9]:
import joblib
from google.colab import files
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import f1_score, classification_report
from sklearn.ensemble import VotingClassifier
from imblearn.over_sampling import SMOTE
import optuna

# Models
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostClassifier

In [10]:
# df_fe: feature-engineered & imputed dataframe
X = df_fe.drop(columns=['segment'])
y = df_fe['segment']

# Replace infinite values with NaN and then fill any remaining NaNs
# This step is crucial before SMOTE, as it cannot handle inf values.
X.replace([np.inf, -np.inf], np.nan, inplace=True)
X.fillna(X.median(), inplace=True)

In [11]:
# Stratified train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=42, stratify=y
)

In [12]:
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)
print("Class distribution after SMOTE:\n", pd.Series(y_train_res).value_counts())

Class distribution after SMOTE:
 segment
2    36058
1    36058
0    36058
3    36058
Name: count, dtype: int64


In [13]:
skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

# --- XGBoost Objective ---
def objective_xgb(trial):
    params = {
        'objective': 'multi:softmax',
        'num_class': 4,
        'use_label_encoder': False,
        'eval_metric': 'mlogloss',
        'random_state': 42,
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 1e-3, 0.3, log=True),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'gamma': trial.suggest_float('gamma', 1e-8, 1.0, log=True),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
    }

    model = xgb.XGBClassifier(**params)

    f1_scores = []
    for train_idx, val_idx in skf.split(X_train_res, y_train_res):
        X_train_fold, X_val_fold = X_train_res.iloc[train_idx], X_train_res.iloc[val_idx]
        y_train_fold, y_val_fold = y_train_res.iloc[train_idx], y_train_res.iloc[val_idx]

        model.fit(X_train_fold, y_train_fold)
        y_pred_val = model.predict(X_val_fold)
        f1_scores.append(f1_score(y_val_fold, y_pred_val, average='macro'))

    return np.mean(f1_scores)

# --- LightGBM Objective ---
def objective_lgb(trial):
    params = {
        'objective': 'multiclass',
        'num_class': 4,
        'random_state': 42,
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'max_depth': trial.suggest_int('max_depth', 3, 15),
        'learning_rate': trial.suggest_float('learning_rate', 1e-3, 0.3, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 2, 256),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 10.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 10.0, log=True),
    }

    model = lgb.LGBMClassifier(**params)

    f1_scores = []
    for train_idx, val_idx in skf.split(X_train_res, y_train_res):
        X_train_fold, X_val_fold = X_train_res.iloc[train_idx], X_train_res.iloc[val_idx]
        y_train_fold, y_val_fold = y_train_res.iloc[train_idx], y_train_res.iloc[val_idx]

        model.fit(X_train_fold, y_train_fold)
        y_pred_val = model.predict(X_val_fold)
        f1_scores.append(f1_score(y_val_fold, y_pred_val, average='macro'))

    return np.mean(f1_scores)

# --- CatBoost Objective ---
def objective_cat(trial):
    params = {
        'loss_function': 'MultiClass',
        'verbose': 0,
        'random_seed': 42,
        'iterations': trial.suggest_int('iterations', 100, 1000),
        'learning_rate': trial.suggest_float('learning_rate', 1e-3, 0.3, log=True),
        'depth': trial.suggest_int('depth', 3, 10),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1e-8, 10.0, log=True),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 1, 100),
        'bootstrap_type': 'Bernoulli' # Added to support subsample
    }

    model = CatBoostClassifier(**params)

    f1_scores = []
    for train_idx, val_idx in skf.split(X_train_res, y_train_res):
        X_train_fold, X_val_fold = X_train_res.iloc[train_idx], X_train_res.iloc[val_idx]
        y_train_fold, y_val_fold = y_train_res.iloc[train_idx], y_train_res.iloc[val_idx]

        model.fit(X_train_fold, y_train_fold)
        y_pred_val = model.predict(X_val_fold)
        f1_scores.append(f1_score(y_val_fold, y_pred_val, average='macro'))

    return np.mean(f1_scores)

In [None]:
# --- Run Optuna Studies ---

# XGBoost
print("Optimizing XGBoost with Optuna...")
study_xgb = optuna.create_study(direction='maximize', study_name='xgb_optimization')
study_xgb.optimize(objective_xgb, n_trials=10, n_jobs=-1) # Increased trials for better search
best_xgb_params = study_xgb.best_params
best_xgb_model = xgb.XGBClassifier(**best_xgb_params, objective='multi:softmax', num_class=4, use_label_encoder=False, eval_metric='mlogloss', random_state=42)
joblib.dump(best_xgb_model, 'xgboost_model.joblib')
files.download('xgboost_model.joblib')
print("Best XGB params:", best_xgb_params)

In [None]:
# LightGBM
print("Optimizing LightGBM with Optuna...")
study_lgb = optuna.create_study(direction='maximize', study_name='lgb_optimization')
study_lgb.optimize(objective_lgb, n_trials=10, n_jobs=-1) # Increased trials for better search
best_lgb_params = study_lgb.best_params
best_lgb_model = lgb.LGBMClassifier(**best_lgb_params, objective='multiclass', num_class=4, random_state=42)
joblib.dump(best_lgb_model, 'lgboost_model.joblib')
files.download('lgboost_model.joblib')
print("Best LGB params:", best_lgb_params)

In [None]:
# CatBoost
print("Optimizing CatBoost with Optuna...")
study_cat = optuna.create_study(direction='maximize', study_name='cat_optimization')
study_cat.optimize(objective_cat, n_trials=10, n_jobs=-1) # Increased trials for better search
best_cat_params = study_cat.best_params
best_cat_model = CatBoostClassifier(**best_cat_params, loss_function='MultiClass', verbose=0, random_seed=42, bootstrap_type='Bernoulli') # Added bootstrap_type
print("Best CatBoost params:", best_cat_params)
joblib.dump(best_cat_model, 'catboost_model.joblib')
files.download('catboost_model.joblib')

[I 2025-11-20 17:57:21,008] A new study created in memory with name: cat_optimization


Optimizing CatBoost with Optuna...


[I 2025-11-20 18:05:23,768] Trial 0 finished with value: 0.8093549170835398 and parameters: {'iterations': 700, 'learning_rate': 0.042281946735354334, 'depth': 5, 'l2_leaf_reg': 0.020788913969190862, 'subsample': 0.5280029156999297, 'min_data_in_leaf': 58}. Best is trial 0 with value: 0.8093549170835398.
[I 2025-11-20 18:23:34,970] Trial 2 finished with value: 0.7875626981497698 and parameters: {'iterations': 341, 'learning_rate': 0.01280220710332645, 'depth': 8, 'l2_leaf_reg': 1.4095458380416873e-05, 'subsample': 0.785185008704786, 'min_data_in_leaf': 70}. Best is trial 0 with value: 0.8093549170835398.
[I 2025-11-20 18:25:18,976] Trial 3 finished with value: 0.802677137052003 and parameters: {'iterations': 133, 'learning_rate': 0.11224154837957007, 'depth': 5, 'l2_leaf_reg': 4.487084515553473e-06, 'subsample': 0.5799198641541911, 'min_data_in_leaf': 56}. Best is trial 0 with value: 0.8093549170835398.
[I 2025-11-20 18:26:27,096] Trial 4 finished with value: 0.7928302845213903 and par

Best CatBoost params: {'iterations': 496, 'learning_rate': 0.26890211382487456, 'depth': 9, 'l2_leaf_reg': 0.027237014731997772, 'subsample': 0.8974746718718253, 'min_data_in_leaf': 98}


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [15]:
best_xgb_model = joblib.load('xgboost_model.joblib')
best_lgb_model = joblib.load('lgboost_model.joblib')
best_cat_model = joblib.load('catboost_model.joblib')

print("Models loaded successfully!")

Models loaded successfully!


In [16]:
voting_clf = VotingClassifier(
    estimators=[
        ('xgb', best_xgb_model),
        ('lgb', best_lgb_model),
        ('cat', best_cat_model)
    ],
    voting='soft',  # use 'soft' for probability averaging
    n_jobs=-1
)

# Train ensemble
voting_clf.fit(X_train_res, y_train_res)

# Predictions
y_pred = voting_clf.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.78      0.85      0.81      4006
           1       0.73      0.72      0.72      2540
           2       0.74      0.68      0.71      2055
           3       0.75      0.70      0.73      1565

    accuracy                           0.76     10166
   macro avg       0.75      0.74      0.74     10166
weighted avg       0.76      0.76      0.76     10166



In [17]:
import joblib
from google.colab import files

joblib.dump(voting_clf, 'voting_clf.joblib')
files.download('voting_clf.joblib')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# Testset

## Load Test Data


In [18]:
test_df = pd.read_csv('task2/test.csv')
test_ids = test_df['id']
test_df.head()

Unnamed: 0,id,player_id,play_frequency,avg_session_duration,total_playtime_hours,login_streak,days_since_last_login,total_spending_thb,avg_monthly_spending,spending_frequency,...,region,platform,device_type,payment_method,language,account_status,player_type_tag,engagement_level,loyalty_tier,skill_tier
0,ANS00001,P106074,4.917599,50.951821,1610.535142,3.0,40.047516,61031.190124,5403.618682,21.099955,...,,PC,Tablet,,EN,Active,Competitive,Mid,Diamond,Gold
1,ANS00002,P024878,8.060471,81.376671,543.088681,104.0,37.296412,10862.656232,,17.909547,...,EU,Mobile,Laptop,Card,ES,Active,,Mid,Silver,Bronze
2,ANS00003,P033678,,27.707037,721.533684,14.0,52.484579,27515.141077,1098.230071,26.183854,...,,PC,Phone,Wallet,ES,Dormant,Collector,Low,Bronze,Platinum
3,ANS00004,P020935,27.002787,26.859972,1442.810933,62.0,43.851594,5299.499711,,3.228531,...,APAC,PC,Phone,Wallet,ES,Dormant,Competitive,,Bronze,Silver
4,ANS00005,P049711,6.188164,49.545383,2039.185739,8.0,15.519366,7491.446985,1645.853549,27.231039,...,LATAM,Console,Desktop,Wallet,ES,Dormant,Competitive,Low,Silver,Platinum


In [19]:
import pandas as pd
from sklearn.impute import SimpleImputer

test_df = test_df.drop(['id','player_id'], axis=1)

# Continuous features and missing counts
continuous_features = [
    'play_frequency', 'avg_session_duration', 'total_playtime_hours',
    'login_streak', 'days_since_last_login', 'total_spending_thb',
    'avg_monthly_spending', 'spending_frequency', 'friend_count',
    'team_play_percentage', 'chat_activity_score', 'friend_invites_sent',
    'gifts_sent_received', 'ranked_participation_rate', 'tournament_entries',
    'competitive_rank', 'win_rate_ranked', 'watches_esports',
    'achievement_completion_rate', 'collection_progress', 'rare_items_count',
    'speed_of_progression', 'item_type_preference_cosmetic',
    'item_type_preference_performance', 'item_type_preference_social',
    'account_age_days', 'vip_tier', 'responds_to_discounts',
    'preferred_game_mode', 'avg_match_length', 'peak_concurrent_hours',
    'random_metric_1', 'random_metric_2', 'random_metric_3'
]

# Categorical features
categorical_features = [
    'region', 'platform', 'device_type', 'payment_method',
    'language', 'account_status', 'player_type_tag',
    'engagement_level', 'loyalty_tier', 'skill_tier'
]

# Compute missing percentages
missing_cont = test_df[continuous_features].isnull().mean() * 100

# Features <30% missing → median impute
cont_to_impute = missing_cont[missing_cont < 30].index.tolist()

imputer = SimpleImputer(strategy='median')
test_df[cont_to_impute] = imputer.fit_transform(test_df[cont_to_impute])


In [20]:
for col in categorical_features:
    missing_pct = test_df[col].isnull().mean() * 100
    if missing_pct < 20:
        test_df[col] = test_df[col].fillna(test_df[col].mode()[0])
    else:
        test_df[col] = test_df[col].fillna('Unknown')

# Check
test_df[categorical_features].isnull().sum()

Unnamed: 0,0
region,0
platform,0
device_type,0
payment_method,0
language,0
account_status,0
player_type_tag,0
engagement_level,0
loyalty_tier,0
skill_tier,0


In [21]:
import pandas as pd
import numpy as np

# --- Copy of original df ---
test_df_fe = test_df.copy()

# --- Continuous features ---
continuous_features = [
    'play_frequency', 'avg_session_duration', 'total_playtime_hours',
    'login_streak', 'days_since_last_login', 'total_spending_thb',
    'avg_monthly_spending', 'spending_frequency', 'friend_count',
    'team_play_percentage', 'chat_activity_score', 'friend_invites_sent',
    'gifts_sent_received', 'ranked_participation_rate', 'tournament_entries',
    'competitive_rank', 'win_rate_ranked', 'watches_esports',
    'achievement_completion_rate', 'collection_progress', 'rare_items_count',
    'speed_of_progression', 'item_type_preference_cosmetic',
    'item_type_preference_performance', 'item_type_preference_social',
    'account_age_days', 'vip_tier', 'responds_to_discounts',
    'avg_match_length', 'peak_concurrent_hours', 'random_metric_1',
    'random_metric_2', 'random_metric_3'
]

# --- Categorical features ---
categorical_features = [
    'region', 'platform', 'device_type', 'payment_method',
    'language', 'account_status', 'player_type_tag',
    'engagement_level', 'loyalty_tier', 'skill_tier'
]

# --- Derived Features ---

# 1. Time & Play Patterns
test_df_fe['freq_per_day'] = test_df_fe['play_frequency'] / test_df_fe['account_age_days']
test_df_fe['avg_session_per_play'] = test_df_fe['avg_session_duration'] / test_df_fe['play_frequency']
test_df_fe['playtime_per_login'] = test_df_fe['total_playtime_hours'] / test_df_fe['login_streak']
test_df_fe['consistency_score'] = test_df_fe['login_streak'] / test_df_fe['account_age_days']
test_df_fe['recency_days'] = test_df_fe['days_since_last_login']

# 2. Spending Behavior
test_df_fe['total_avg_ratio'] = test_df_fe['total_spending_thb'] / (test_df_fe['avg_monthly_spending'] + 1e-6)
test_df_fe['spend_per_playtime'] = test_df_fe['total_spending_thb'] / (test_df_fe['total_playtime_hours'] + 1e-6)
test_df_fe['spending_per_freq'] = test_df_fe['spending_frequency'] / (test_df_fe['play_frequency'] + 1e-6)
test_df_fe['discount_effect'] = test_df_fe['responds_to_discounts'] * test_df_fe['total_spending_thb']

# 3. Social & Team Engagement
test_df_fe['friends_per_play'] = test_df_fe['friend_count'] / (test_df_fe['play_frequency'] + 1e-6)
test_df_fe['social_score'] = test_df_fe['friend_invites_sent'] + test_df_fe['gifts_sent_received']
test_df_fe['teamplay_ratio'] = test_df_fe['team_play_percentage'] / 100
test_df_fe['ranked_per_hour'] = test_df_fe['ranked_participation_rate'] / (test_df_fe['total_playtime_hours'] + 1e-6)
test_df_fe['tournament_per_hour'] = test_df_fe['tournament_entries'] / (test_df_fe['total_playtime_hours'] + 1e-6)

# 4. Progression & Achievement
test_df_fe['achievements_per_hour'] = test_df_fe['achievement_completion_rate'] / (test_df_fe['total_playtime_hours'] + 1e-6)
test_df_fe['collection_per_day'] = test_df_fe['collection_progress'] / test_df_fe['account_age_days']
test_df_fe['rare_items_per_playtime'] = test_df_fe['rare_items_count'] / (test_df_fe['total_playtime_hours'] + 1e-6)
test_df_fe['progress_speed_per_session'] = test_df_fe['speed_of_progression'] / (test_df_fe['avg_session_duration'] + 1e-6)

# 5. Engagement Intensity
test_df_fe['playtime_per_day'] = test_df_fe['total_playtime_hours'] / test_df_fe['account_age_days']
test_df_fe['avg_match_hours'] = test_df_fe['avg_match_length'] * test_df_fe['play_frequency']
test_df_fe['peak_intensity'] = test_df_fe['peak_concurrent_hours'] / (test_df_fe['avg_session_duration'] + 1e-6)

# 6. Interaction Features
test_df_fe['spend_vip'] = test_df_fe['total_spending_thb'] * test_df_fe['vip_tier']
test_df_fe['friends_team_interaction'] = test_df_fe['friend_count'] * test_df_fe['team_play_percentage']
test_df_fe['achievements_collection'] = test_df_fe['achievement_completion_rate'] * test_df_fe['collection_progress']

# --- Handle Categorical Features ---
test_df_fe = pd.get_dummies(test_df_fe, columns=categorical_features, dummy_na=True)

In [22]:
# Ensure the test features match the training features
missing_cols = set(X_train_res.columns) - set(test_df_fe.columns)
for col in missing_cols:
    test_df_fe[col] = 0  # add missing dummy columns with 0

# Align column order
test_df_fe = test_df_fe[X_train_res.columns]

# Generate predictions
y_test_pred = voting_clf.predict(test_df_fe)


In [23]:
submission = pd.DataFrame({
    'id': test_ids,
    'segment': y_test_pred
})

# Save CSV
submission.to_csv('submission.csv', index=False)
print("Submission file saved!")


Submission file saved!
