# Player Segment Classification

In [None]:
# Install packages used by this notebook: the Kaggle CLI (invoked when downloading
# the competition data), CatBoost and Optuna (both imported in the next cell).
!pip install kaggle catboost optuna

In [None]:
import os
import zipfile
import joblib
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import f1_score, classification_report
from sklearn.ensemble import VotingClassifier
from imblearn.over_sampling import SMOTE
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostClassifier
import optuna

try:
    from google.colab import files
    COLAB_ENV = True
except ImportError:
    COLAB_ENV = False

In [None]:
# Configuration
# Seed passed to train_test_split, SMOTE and the optimize_* functions.
RANDOM_STATE = 42
# Number of Optuna trials requested from each optimize_* call.
N_TRIALS = 10
# Fraction of the training data held out by train_test_split for the final report.
TEST_SIZE = 0.1
# Number of StratifiedKFold splits used inside every Optuna objective.
CV_FOLDS = 3

## Data Loading Functions

In [None]:
def download_kaggle_data():
    """Download and extract Kaggle competition data."""
    if COLAB_ENV:
        try:
            uploaded = files.upload()
        except Exception as e:
            print(f"File upload failed: {e}")
    else:
        print("Running outside of Colab. Ensure kaggle.json is in ~/.kaggle/")

    if 'kaggle.json' in os.listdir('.'):
        !mkdir -p ~/.kaggle
        !mv kaggle.json ~/.kaggle/
        !chmod 600 ~/.kaggle/kaggle.json
    else:
        print("kaggle.json not found.")

    if not os.path.exists('cpe342-karena.zip'):
        print("Downloading data...")
        !kaggle competitions download -c cpe342-karena
    else:
        print("Data already downloaded.")

    if os.path.exists('cpe342-karena.zip'):
        print("Extracting data...")
        with zipfile.ZipFile('cpe342-karena.zip', 'r') as zip_ref:
            zip_ref.extractall('.')
        print("Data extracted.")

## Data Preprocessing Functions

In [None]:
def load_and_preprocess_data(file_path, is_training=True):
    """Load a player CSV, drop the identifier columns and impute missing values.

    Imputation rules:

    * Continuous columns with less than 30% missing values are filled with
      their median; columns at or above that threshold are left untouched.
    * Categorical columns with less than 20% missing values are filled with
      their most frequent value; the others are filled with ``'Unknown'``.

    All statistics (missing rates, medians, modes) are computed from the file
    that is passed in, so a test file is imputed with its own statistics.

    Args:
        file_path: Path of the CSV file. It must contain ``id``, ``player_id``
            and every column named in the two feature lists below.
        is_training: When true only the frame is returned; otherwise the
            ``id`` column is returned as well.

    Returns:
        ``df`` when ``is_training`` is true, else the tuple ``(df, ids)``
        where ``ids`` is a copy of the original ``id`` column.
    """
    df = pd.read_csv(file_path)

    # Keep the row ids (needed for the submission) before they are dropped.
    ids = None if is_training else df['id'].copy()
    df = df.drop(['id', 'player_id'], axis=1)

    continuous_features = [
        'play_frequency', 'avg_session_duration', 'total_playtime_hours',
        'login_streak', 'days_since_last_login', 'total_spending_thb',
        'avg_monthly_spending', 'spending_frequency', 'friend_count',
        'team_play_percentage', 'chat_activity_score', 'friend_invites_sent',
        'gifts_sent_received', 'ranked_participation_rate', 'tournament_entries',
        'competitive_rank', 'win_rate_ranked', 'watches_esports',
        'achievement_completion_rate', 'collection_progress', 'rare_items_count',
        'speed_of_progression', 'item_type_preference_cosmetic',
        'item_type_preference_performance', 'item_type_preference_social',
        'account_age_days', 'vip_tier', 'responds_to_discounts',
        'preferred_game_mode', 'avg_match_length', 'peak_concurrent_hours',
        'random_metric_1', 'random_metric_2', 'random_metric_3'
    ]

    categorical_features = [
        'region', 'platform', 'device_type', 'payment_method',
        'language', 'account_status', 'player_type_tag',
        'engagement_level', 'loyalty_tier', 'skill_tier'
    ]

    # Impute continuous features
    missing_cont = df[continuous_features].isnull().mean() * 100
    cont_to_impute = missing_cont[missing_cont < 30].index.tolist()
    # The selection is empty when every column is at/above the threshold or
    # the file has no rows; fitting an imputer on zero columns would raise.
    if cont_to_impute:
        imputer = SimpleImputer(strategy='median')
        df[cont_to_impute] = imputer.fit_transform(df[cont_to_impute])

    # Impute categorical features
    for col in categorical_features:
        missing_pct = df[col].isnull().mean() * 100
        if missing_pct < 20:
            # Below 20% missing the column has observed values, so a mode exists.
            df[col] = df[col].fillna(df[col].mode().iloc[0])
        else:
            df[col] = df[col].fillna('Unknown')

    if is_training:
        return df
    return df, ids

In [None]:
def engineer_features(df):
    """Create derived features and one-hot encode the categorical columns.

    The input frame is copied, so ``df`` itself is not modified. Ratio, sum
    and product features are appended in a fixed order, the categorical
    columns are replaced by dummy columns (``dummy_na=True`` adds a
    ``<col>_nan`` indicator for each), and finally infinities and missing
    values in numeric columns are replaced by the column median.

    Args:
        df: Frame containing the continuous and categorical columns
            referenced below. Any other column (for example a target column)
            is carried through unchanged.

    Returns:
        A new DataFrame with the derived features and dummy columns.
    """
    df_fe = df.copy()
    eps = 1e-6  # keeps the guarded ratios finite when the denominator is 0

    categorical_features = [
        'region', 'platform', 'device_type', 'payment_method',
        'language', 'account_status', 'player_type_tag',
        'engagement_level', 'loyalty_tier', 'skill_tier'
    ]

    # Time & Play Patterns
    # These ratios are unguarded: a zero denominator yields inf/NaN, which is
    # replaced by the column median at the end of the function.
    df_fe['freq_per_day'] = df_fe['play_frequency'] / df_fe['account_age_days']
    df_fe['avg_session_per_play'] = df_fe['avg_session_duration'] / df_fe['play_frequency']
    df_fe['playtime_per_login'] = df_fe['total_playtime_hours'] / df_fe['login_streak']
    df_fe['consistency_score'] = df_fe['login_streak'] / df_fe['account_age_days']

    # Spending Behavior
    df_fe['total_avg_ratio'] = df_fe['total_spending_thb'] / (df_fe['avg_monthly_spending'] + eps)
    df_fe['spend_per_playtime'] = df_fe['total_spending_thb'] / (df_fe['total_playtime_hours'] + eps)
    df_fe['spending_per_freq'] = df_fe['spending_frequency'] / (df_fe['play_frequency'] + eps)
    df_fe['discount_effect'] = df_fe['responds_to_discounts'] * df_fe['total_spending_thb']

    # Social & Team Engagement
    df_fe['friends_per_play'] = df_fe['friend_count'] / (df_fe['play_frequency'] + eps)
    df_fe['social_score'] = df_fe['friend_invites_sent'] + df_fe['gifts_sent_received']
    df_fe['teamplay_ratio'] = df_fe['team_play_percentage'] / 100
    df_fe['ranked_per_hour'] = df_fe['ranked_participation_rate'] / (df_fe['total_playtime_hours'] + eps)
    df_fe['tournament_per_hour'] = df_fe['tournament_entries'] / (df_fe['total_playtime_hours'] + eps)

    # Progression & Achievement
    df_fe['achievements_per_hour'] = df_fe['achievement_completion_rate'] / (df_fe['total_playtime_hours'] + eps)
    df_fe['collection_per_day'] = df_fe['collection_progress'] / df_fe['account_age_days']
    df_fe['rare_items_per_playtime'] = df_fe['rare_items_count'] / (df_fe['total_playtime_hours'] + eps)
    df_fe['progress_speed_per_session'] = df_fe['speed_of_progression'] / (df_fe['avg_session_duration'] + eps)

    # Engagement Intensity
    df_fe['playtime_per_day'] = df_fe['total_playtime_hours'] / df_fe['account_age_days']
    df_fe['avg_match_hours'] = df_fe['avg_match_length'] * df_fe['play_frequency']
    df_fe['peak_intensity'] = df_fe['peak_concurrent_hours'] / (df_fe['avg_session_duration'] + eps)

    # Interaction Features
    df_fe['spend_vip'] = df_fe['total_spending_thb'] * df_fe['vip_tier']
    df_fe['friends_team_interaction'] = df_fe['friend_count'] * df_fe['team_play_percentage']
    df_fe['achievements_collection'] = df_fe['achievement_completion_rate'] * df_fe['collection_progress']

    # Handle categorical features
    df_fe = pd.get_dummies(df_fe, columns=categorical_features, dummy_na=True)

    # Handle infinite values: turn them into NaN first so they do not distort
    # the medians, then fill every numeric gap with the column median.
    df_fe = df_fe.replace([np.inf, -np.inf], np.nan)
    # numeric_only=True skips non-numeric columns (e.g. a string label);
    # without it pandas >= 2.0 raises TypeError when such a column is present.
    df_fe = df_fe.fillna(df_fe.median(numeric_only=True))

    return df_fe

## Model Optimization Functions

In [None]:
def optimize_xgboost(X, y, n_trials=10, random_state=42):
    """Tune XGBoost hyperparameters with Optuna, maximising macro F1.

    Every trial is scored by stratified cross-validation (``CV_FOLDS``
    shuffled splits) on ``X`` / ``y``; the trial value is the mean macro F1
    over the folds.

    Args:
        X: Feature table; rows are selected with ``.iloc``.
        y: Target values aligned with ``X``; also indexed with ``.iloc``.
            The objective is configured for 4 classes (``num_class``).
        n_trials: Number of Optuna trials to run.
        random_state: Seed for the fold split, the model and the Optuna
            sampler, so repeated calls explore the same trials.

    Returns:
        ``study.best_params``: the tuned values of the best trial. Only the
        parameters suggested through ``trial`` are included, not the fixed
        settings such as ``objective`` or ``random_state``.
    """
    skf = StratifiedKFold(n_splits=CV_FOLDS, shuffle=True, random_state=random_state)

    def objective(trial):
        params = {
            'objective': 'multi:softmax',
            'num_class': 4,
            'use_label_encoder': False,
            'eval_metric': 'mlogloss',
            'random_state': random_state,
            'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
            'max_depth': trial.suggest_int('max_depth', 3, 10),
            'learning_rate': trial.suggest_float('learning_rate', 1e-3, 0.3, log=True),
            'subsample': trial.suggest_float('subsample', 0.5, 1.0),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
            'gamma': trial.suggest_float('gamma', 1e-8, 1.0, log=True),
            'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        }

        model = xgb.XGBClassifier(**params)
        f1_scores = []

        for train_idx, val_idx in skf.split(X, y):
            X_train_fold, X_val_fold = X.iloc[train_idx], X.iloc[val_idx]
            y_train_fold, y_val_fold = y.iloc[train_idx], y.iloc[val_idx]

            model.fit(X_train_fold, y_train_fold)
            y_pred_val = model.predict(X_val_fold)
            f1_scores.append(f1_score(y_val_fold, y_pred_val, average='macro'))

        return np.mean(f1_scores)

    # Seed the sampler (TPE is Optuna's default); an unseeded study proposes
    # different trials on every run even though the model and folds are seeded.
    sampler = optuna.samplers.TPESampler(seed=random_state)
    study = optuna.create_study(direction='maximize', sampler=sampler)
    study.optimize(objective, n_trials=n_trials)
    return study.best_params


def optimize_lightgbm(X, y, n_trials=10, random_state=42):
    """Tune LightGBM hyperparameters with Optuna, maximising macro F1.

    Every trial is scored by stratified cross-validation (``CV_FOLDS``
    shuffled splits) on ``X`` / ``y``; the trial value is the mean macro F1
    over the folds.

    Args:
        X: Feature table; rows are selected with ``.iloc``.
        y: Target values aligned with ``X``; also indexed with ``.iloc``.
            The objective is configured for 4 classes (``num_class``).
        n_trials: Number of Optuna trials to run.
        random_state: Seed for the fold split, the model and the Optuna
            sampler, so repeated calls explore the same trials.

    Returns:
        ``study.best_params``: the tuned values of the best trial. Only the
        parameters suggested through ``trial`` are included, not the fixed
        settings such as ``objective`` or ``random_state``.
    """
    skf = StratifiedKFold(n_splits=CV_FOLDS, shuffle=True, random_state=random_state)

    def objective(trial):
        params = {
            'objective': 'multiclass',
            'num_class': 4,
            'random_state': random_state,
            'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
            'max_depth': trial.suggest_int('max_depth', 3, 15),
            'learning_rate': trial.suggest_float('learning_rate', 1e-3, 0.3, log=True),
            'num_leaves': trial.suggest_int('num_leaves', 2, 256),
            'subsample': trial.suggest_float('subsample', 0.5, 1.0),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
            'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 10.0, log=True),
            'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 10.0, log=True),
        }

        model = lgb.LGBMClassifier(**params)
        f1_scores = []

        for train_idx, val_idx in skf.split(X, y):
            X_train_fold, X_val_fold = X.iloc[train_idx], X.iloc[val_idx]
            y_train_fold, y_val_fold = y.iloc[train_idx], y.iloc[val_idx]

            model.fit(X_train_fold, y_train_fold)
            y_pred_val = model.predict(X_val_fold)
            f1_scores.append(f1_score(y_val_fold, y_pred_val, average='macro'))

        return np.mean(f1_scores)

    # Seed the sampler (TPE is Optuna's default); an unseeded study proposes
    # different trials on every run even though the model and folds are seeded.
    sampler = optuna.samplers.TPESampler(seed=random_state)
    study = optuna.create_study(direction='maximize', sampler=sampler)
    study.optimize(objective, n_trials=n_trials)
    return study.best_params


def optimize_catboost(X, y, n_trials=10, random_state=42):
    """Tune CatBoost hyperparameters with Optuna, maximising macro F1.

    Every trial is scored by stratified cross-validation (``CV_FOLDS``
    shuffled splits) on ``X`` / ``y``; the trial value is the mean macro F1
    over the folds.

    Args:
        X: Feature table; rows are selected with ``.iloc``.
        y: Target values aligned with ``X``; also indexed with ``.iloc``.
        n_trials: Number of Optuna trials to run.
        random_state: Seed for the fold split, the model and the Optuna
            sampler, so repeated calls explore the same trials.

    Returns:
        ``study.best_params``: the tuned values of the best trial. Only the
        parameters suggested through ``trial`` are included; the fixed
        settings (``loss_function``, ``verbose``, ``random_seed`` and
        ``bootstrap_type``) are not part of the returned dict.
    """
    skf = StratifiedKFold(n_splits=CV_FOLDS, shuffle=True, random_state=random_state)

    def objective(trial):
        params = {
            'loss_function': 'MultiClass',
            'verbose': 0,
            'random_seed': random_state,
            'iterations': trial.suggest_int('iterations', 100, 1000),
            'learning_rate': trial.suggest_float('learning_rate', 1e-3, 0.3, log=True),
            'depth': trial.suggest_int('depth', 3, 10),
            'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1e-8, 10.0, log=True),
            'subsample': trial.suggest_float('subsample', 0.5, 1.0),
            'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 1, 100),
            'bootstrap_type': 'Bernoulli'
        }

        model = CatBoostClassifier(**params)
        f1_scores = []

        for train_idx, val_idx in skf.split(X, y):
            X_train_fold, X_val_fold = X.iloc[train_idx], X.iloc[val_idx]
            y_train_fold, y_val_fold = y.iloc[train_idx], y.iloc[val_idx]

            model.fit(X_train_fold, y_train_fold)
            y_pred_val = model.predict(X_val_fold)
            f1_scores.append(f1_score(y_val_fold, y_pred_val, average='macro'))

        return np.mean(f1_scores)

    # Seed the sampler (TPE is Optuna's default); an unseeded study proposes
    # different trials on every run even though the model and folds are seeded.
    sampler = optuna.samplers.TPESampler(seed=random_state)
    study = optuna.create_study(direction='maximize', sampler=sampler)
    study.optimize(objective, n_trials=n_trials)
    return study.best_params

In [None]:
def create_ensemble(X_train, y_train, best_xgb_params, best_lgb_params,
                    best_cat_params, random_state=42):
    """Build and fit a soft-voting ensemble of XGBoost, LightGBM and CatBoost.

    The parameter dicts are expected to come from the ``optimize_*``
    functions, which return only the tuned values. The fixed settings used
    while tuning are therefore restored here, without overriding anything the
    caller passed explicitly:

    * a seed for all three models,
    * ``verbose=0`` for CatBoost,
    * ``bootstrap_type='Bernoulli'`` for CatBoost whenever ``subsample`` is
      given.

    Args:
        X_train: Training features.
        y_train: Training labels.
        best_xgb_params: Keyword arguments for ``xgb.XGBClassifier``.
        best_lgb_params: Keyword arguments for ``lgb.LGBMClassifier``.
        best_cat_params: Keyword arguments for ``CatBoostClassifier``.
        random_state: Seed applied to each model that has none configured.

    Returns:
        The fitted ``VotingClassifier``.
    """
    # Work on copies so the callers' dicts are left untouched.
    xgb_params = dict(best_xgb_params)
    xgb_params.setdefault('random_state', random_state)

    lgb_params = dict(best_lgb_params)
    lgb_params.setdefault('random_state', random_state)

    cat_params = dict(best_cat_params)
    # CatBoost rejects several aliases of one option being set together, so
    # a default is only added when none of the aliases is present.
    if not any(key in cat_params for key in ('random_seed', 'random_state')):
        cat_params['random_seed'] = random_state
    if not any(key in cat_params
               for key in ('verbose', 'silent', 'logging_level', 'verbose_eval')):
        cat_params['verbose'] = 0
    if 'subsample' in cat_params:
        # The tuning objective always paired `subsample` with Bernoulli
        # bootstrap; CatBoost only accepts `subsample` with the Bernoulli,
        # Poisson or MVS bootstrap types.
        cat_params.setdefault('bootstrap_type', 'Bernoulli')

    xgb_model = xgb.XGBClassifier(**xgb_params)
    lgb_model = lgb.LGBMClassifier(**lgb_params)
    cat_model = CatBoostClassifier(**cat_params)

    voting_clf = VotingClassifier(
        estimators=[
            ('xgb', xgb_model),
            ('lgb', lgb_model),
            ('cat', cat_model)
        ],
        voting='soft',
        n_jobs=-1
    )

    voting_clf.fit(X_train, y_train)
    return voting_clf

# Training Pipeline

In [None]:
# Download data
# Fetches the competition archive (if it is not already present) and extracts
# it into the working directory; the cells below read 'task2/train.csv' and
# 'task2/test.csv'.
download_kaggle_data()

## Training Data Preprocessing

In [None]:
# Load and preprocess training data
df = load_and_preprocess_data('task2/train.csv', is_training=True)
df_fe = engineer_features(df)

# Prepare features and target
# 'segment' is the label; every other column of df_fe is used as a feature.
X = df_fe.drop(columns=['segment'])
y = df_fe['segment']

# Train/test split
# Stratified hold-out: a TEST_SIZE fraction of the rows is kept for the final report.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE, stratify=y
)

# Apply SMOTE for class balancing
# Only the training part is resampled; X_test / y_test are left as split.
# NOTE(review): the resampled data is later passed to the optimize_* functions,
# which cross-validate on it, so synthetic samples can land in validation folds
# and inflate the CV scores - consider resampling inside each fold instead.
smote = SMOTE(random_state=RANDOM_STATE)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

print(f"Training data shape: {X_train_res.shape}")
print(f"Class distribution after SMOTE: {pd.Series(y_train_res).value_counts()}")

## Model Training and Optimization

In [None]:
# Run N_TRIALS Optuna trials for XGBoost on the SMOTE-resampled training data.
print("Optimizing XGBoost...")
best_xgb_params = optimize_xgboost(X_train_res, y_train_res, N_TRIALS, RANDOM_STATE)
print(f"Best XGBoost params: {best_xgb_params}")

In [None]:
# Run N_TRIALS Optuna trials for LightGBM on the SMOTE-resampled training data.
print("Optimizing LightGBM...")
best_lgb_params = optimize_lightgbm(X_train_res, y_train_res, N_TRIALS, RANDOM_STATE)
print(f"Best LightGBM params: {best_lgb_params}")

In [None]:
# Run N_TRIALS Optuna trials for CatBoost on the SMOTE-resampled training data.
print("Optimizing CatBoost...")
best_cat_params = optimize_catboost(X_train_res, y_train_res, N_TRIALS, RANDOM_STATE)
print(f"Best CatBoost params: {best_cat_params}")

In [None]:
# Create ensemble
# Fit the soft-voting ensemble on the resampled training data with the tuned parameters.
voting_clf = create_ensemble(X_train_res, y_train_res, best_xgb_params, best_lgb_params, best_cat_params)

# Evaluate on test set
# X_test / y_test are the hold-out rows from train_test_split; SMOTE was not applied to them.
y_pred = voting_clf.predict(X_test)
print(classification_report(y_test, y_pred))

In [None]:
# Save models and preprocessors
# The training column list is stored next to the model so the prediction
# pipeline can rebuild a feature matrix with the same columns in the same order.
joblib.dump(voting_clf, 'voting_clf.joblib')
joblib.dump(X_train_res.columns.tolist(), 'feature_columns.joblib')

# In Colab, also offer both artifacts as browser downloads.
if COLAB_ENV:
    files.download('voting_clf.joblib')
    files.download('feature_columns.joblib')

# Prediction Pipeline

## Load and Preprocess Test Data

In [None]:
def preprocess_test_data(test_path, feature_columns):
    """Build a test feature matrix whose columns mirror the training matrix.

    The test file goes through the same loading/imputation and feature
    engineering functions as the training file. The result is then aligned
    to ``feature_columns``: columns absent from the test data are created
    and filled with 0, columns not listed are dropped, and the listed order
    is applied.

    Args:
        test_path: Path of the test CSV file.
        feature_columns: Column names, in order, expected by the model.

    Returns:
        Tuple ``(features, ids)`` with the aligned frame and the ``id``
        column of the test file.
    """
    raw_df, row_ids = load_and_preprocess_data(test_path, is_training=False)
    engineered = engineer_features(raw_df)

    # A single reindex adds the missing columns (as 0), drops the extra ones
    # and puts everything in the training order.
    aligned = engineered.reindex(columns=feature_columns, fill_value=0)

    return aligned, row_ids

In [None]:
# Load saved models and feature columns
# Both files are the ones written by the "Save models and preprocessors" cell.
voting_clf = joblib.load('voting_clf.joblib')
feature_columns = joblib.load('feature_columns.joblib')

# Preprocess test data
# test_ids keeps the 'id' column of the test file for the submission.
test_df_processed, test_ids = preprocess_test_data('task2/test.csv', feature_columns)

## Generate Predictions and Create Submission

In [None]:
# Make predictions
predictions = voting_clf.predict(test_df_processed)

# Create submission file
# One row per test id, paired with the predicted segment.
submission = pd.DataFrame({
    'id': test_ids,
    'segment': predictions
})

submission.to_csv('submission.csv', index=False)
print(f"Submission created with {len(submission)} predictions")
print(f"Prediction distribution: {submission['segment'].value_counts().to_dict()}")

# In Colab, also offer the submission file as a browser download.
if COLAB_ENV:
    files.download('submission.csv')