# Player Monthly Spending Prediction - Refactored

Clean, standardized implementation of the player spending prediction pipeline.

In [None]:
# Install third-party packages with pip through an IPython shell escape.
# NOTE(review): lightgbm, scikit-learn, pandas, numpy and joblib are imported by this
# notebook but are not installed here — presumably the runtime provides them; confirm.
!pip install kaggle optuna catboost

In [None]:
import os
import zipfile
import joblib
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_absolute_error
from sklearn.calibration import CalibratedClassifierCV
import lightgbm as lgb
from catboost import CatBoostClassifier, CatBoostRegressor
import optuna

try:
    from google.colab import files
    COLAB_ENV = True
except ImportError:
    COLAB_ENV = False

In [None]:
# Configuration
# Seed passed to the train/holdout split, the hyperparameter searches and the final models.
RANDOM_STATE = 42
# Number of Optuna trials requested for each hyperparameter search.
N_TRIALS = 40
# Fraction of the training data reserved as the final holdout set.
TEST_SIZE = 0.15
# NOTE(review): not referenced anywhere in the visible part of this notebook — confirm it is still needed.
CV_FOLDS = 5

## Data Loading Functions

In [None]:
def download_kaggle_data():
    """Download and extract the Kaggle competition data into the working directory.

    Steps, in order:

    1. In Colab, open the file-upload prompt (intended for ``kaggle.json``).
    2. If ``kaggle.json`` is in the current directory, move it to
       ``~/.kaggle/kaggle.json`` and restrict its mode to 600.
    3. Download ``cpe342-karena.zip`` with the Kaggle CLI unless the archive
       already exists.
    4. Extract the archive into the current directory if it exists.

    Upload and CLI problems are reported with ``print`` and do not raise;
    errors while extracting the archive propagate to the caller.
    """
    # Function-scope imports keep this cell runnable on its own; plain Python
    # calls replace the IPython ``!`` shell escapes so the function also works
    # outside a notebook.
    import shutil
    import subprocess

    archive_name = 'cpe342-karena.zip'
    local_credentials = 'kaggle.json'
    kaggle_dir = os.path.expanduser('~/.kaggle')
    installed_credentials = os.path.join(kaggle_dir, 'kaggle.json')

    if COLAB_ENV:
        try:
            # The upload lands in the current directory; the return value is not needed.
            files.upload()
        except Exception as e:
            # Best-effort step: credentials may already be installed.
            print(f"File upload failed: {e}")
    else:
        print("Running outside of Colab. Ensure kaggle.json is in ~/.kaggle/")

    if os.path.exists(local_credentials):
        os.makedirs(kaggle_dir, exist_ok=True)
        shutil.move(local_credentials, installed_credentials)
        # The Kaggle CLI warns about credentials readable by other users.
        os.chmod(installed_credentials, 0o600)
    elif not os.path.exists(installed_credentials):
        # Only complain when no credentials are available at all; on a re-run
        # the file has already been moved out of the working directory.
        print("kaggle.json not found.")

    if not os.path.exists(archive_name):
        print("Downloading data...")
        try:
            result = subprocess.run(
                ['kaggle', 'competitions', 'download', '-c', 'cpe342-karena'],
                capture_output=True,
                text=True,
                check=False,  # a failed download is reported, not raised
            )
        except FileNotFoundError:
            print("kaggle command not found. Install it with: pip install kaggle")
        else:
            cli_output = (result.stdout + result.stderr).strip()
            if cli_output:
                print(cli_output)
    else:
        print("Data already downloaded.")

    if os.path.exists(archive_name):
        print("Extracting data...")
        with zipfile.ZipFile(archive_name, 'r') as zip_ref:
            zip_ref.extractall('.')
        print("Data extracted.")

## Data Preprocessing Functions

In [None]:
def load_and_preprocess_data(file_path, is_training=True):
    """Read a player spending CSV and prepare it for modelling.

    Args:
        file_path: Path (or file-like object) handed to ``pandas.read_csv``.
        is_training: Selects the return shape, see below.

    Returns:
        When ``is_training`` is true: a DataFrame without the ``id`` and
        ``player_id`` columns, with ``will_spend`` (1 if ``spending_30d`` > 0,
        else 0) and ``log_spend`` (``log1p`` of ``spending_30d``) added,
        duplicate rows removed and the index reset.

        Otherwise: a tuple ``(features, ids)`` where ``ids`` is a copy of the
        ``id`` column and ``features`` is the frame without ``id`` and
        ``player_id``. No rows are removed in this mode.
    """
    identifier_cols = ['id', 'player_id']
    frame = pd.read_csv(file_path)

    if not is_training:
        # Keep the ids for the submission file before they are dropped.
        ids = frame['id'].copy()
        return frame.drop(columns=identifier_cols, errors='ignore'), ids

    frame = frame.drop(columns=identifier_cols, errors='ignore')

    # Targets for the two stages: did the player spend, and how much (log scale).
    spend = frame['spending_30d']
    frame['will_spend'] = spend.gt(0).astype(int)
    frame['log_spend'] = np.log1p(spend)

    # Rows that are identical once the identifiers are gone carry no extra information.
    return frame.drop_duplicates().reset_index(drop=True)

In [None]:
def create_preprocessor():
    """Build the unfitted transformer pipelines for numeric and categorical columns.

    No columns are selected here; the caller decides which columns each
    pipeline is applied to.

    Returns:
        tuple: ``(numeric_transformer, categorical_transformer)``.
    """
    # Numeric columns: fill gaps with the median, then standardize.
    numeric_steps = [
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ]

    # Categorical columns: missing values become their own category, then
    # categories are mapped to integer codes (unseen categories map to -1).
    categorical_steps = [
        ('imputer', SimpleImputer(strategy='constant', fill_value='__missing__')),
        ('ordinal', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)),
    ]

    return Pipeline(numeric_steps), Pipeline(categorical_steps)

## Model Optimization Functions

In [None]:
def optimize_classifier(X, y, preprocessor, n_trials=30, random_state=42):
    """Tune CatBoost classifier hyperparameters with Optuna.

    A stratified 85/15 train/validation split is taken from ``X``/``y``. Each
    trial fits ``preprocessor`` followed by a ``CatBoostClassifier`` on the
    training part and is scored by log loss on the validation part.

    Args:
        X: Feature table accepted by ``preprocessor``.
        y: Binary (0/1) class labels aligned with ``X``.
        preprocessor: Transformer used as the first pipeline step. It is not
            cloned here, so every trial refits this same object.
        n_trials: Number of Optuna trials to run.
        random_state: Seed for the data split, CatBoost and the Optuna sampler.

    Returns:
        dict: Best values found for ``iterations``, ``depth``,
        ``learning_rate`` and ``l2_leaf_reg``.
    """
    # Imported once per call instead of once per trial.
    from sklearn.metrics import log_loss

    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=0.15, random_state=random_state, stratify=y
    )

    def objective(trial):
        params = {
            'iterations': trial.suggest_int('iterations', 200, 1500),
            'depth': trial.suggest_int('depth', 4, 10),
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.2, log=True),
            'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1e-3, 10.0, log=True),
            'random_seed': random_state,
            'verbose': 0
        }

        clf = Pipeline([
            ('preproc', preprocessor),
            ('clf', CatBoostClassifier(**params))
        ])

        clf.fit(X_train, y_train)
        p = clf.predict_proba(X_val)[:, 1]

        # labels is given explicitly so the score is defined even if the
        # validation part happens to contain a single class.
        return log_loss(y_val, p, labels=[0, 1])

    # Without a seeded sampler the search proposes different trials on every
    # run, so random_state alone would not make the result reproducible.
    sampler = optuna.samplers.TPESampler(seed=random_state)
    study = optuna.create_study(direction='minimize', sampler=sampler)
    study.optimize(objective, n_trials=n_trials)
    return study.best_params

In [None]:
def optimize_regressor(X, y, preprocessor, n_trials=40, random_state=42):
    """Tune LightGBM regressor hyperparameters with Optuna.

    An 85/15 train/validation split is taken from ``X``/``y``. Each trial fits
    ``preprocessor`` followed by an ``LGBMRegressor`` on the training part.
    Predictions and validation targets are both passed through ``expm1``
    before scoring, so ``y`` is expected on the ``log1p`` scale and the
    objective is the mean absolute error on the original scale.

    Args:
        X: Feature table accepted by ``preprocessor``.
        y: ``log1p``-transformed regression target aligned with ``X``.
        preprocessor: Transformer used as the first pipeline step. It is not
            cloned here, so every trial refits this same object.
        n_trials: Number of Optuna trials to run.
        random_state: Seed for the data split, LightGBM and the Optuna sampler.

    Returns:
        dict: Best values of the tuned hyperparameters plus
        ``subsample_freq``; the dict can be unpacked directly into
        ``LGBMRegressor``.
    """
    # LightGBM only applies row subsampling when subsample_freq > 0; with the
    # default of 0 the tuned 'subsample' value would be silently ignored.
    fixed_params = {'subsample_freq': 1}

    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=0.15, random_state=random_state
    )

    def objective(trial):
        params = {
            'n_estimators': trial.suggest_int('n_estimators', 200, 2000),
            'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.2, log=True),
            'num_leaves': trial.suggest_int('num_leaves', 16, 256),
            'min_child_samples': trial.suggest_int('min_child_samples', 5, 200),
            'subsample': trial.suggest_float('subsample', 0.4, 1.0),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.4, 1.0),
            'reg_alpha': trial.suggest_float('reg_alpha', 0.0, 10.0),
            'reg_lambda': trial.suggest_float('reg_lambda', 0.0, 10.0),
            'random_state': random_state,
            **fixed_params,
        }

        reg = Pipeline([
            ('preproc', preprocessor),
            ('reg', lgb.LGBMRegressor(**params))
        ])

        reg.fit(X_train, y_train)
        # Score on the original scale, not the log scale.
        pred = np.expm1(reg.predict(X_val))
        true = np.expm1(y_val)

        return mean_absolute_error(true, pred)

    # Without a seeded sampler the search proposes different trials on every
    # run, so random_state alone would not make the result reproducible.
    sampler = optuna.samplers.TPESampler(seed=random_state)
    study = optuna.create_study(direction='minimize', sampler=sampler)
    study.optimize(objective, n_trials=n_trials)

    # Return the fixed setting too, so a model built from this dict matches
    # the configuration that was actually evaluated.
    return {**study.best_params, **fixed_params}

In [None]:
def create_two_stage_model(X_train, y_cls, y_spend, best_clf_params, best_reg_params, preprocessor):
    """Fit the classifier and the amount regressor of the two-stage model.

    Args:
        X_train: Training feature table.
        y_cls: Binary labels for the classifier, aligned with ``X_train``.
        y_spend: Spending amounts on the original scale, aligned with
            ``X_train``; only rows with a value above 0 are used for the
            regressor.
        best_clf_params: Keyword arguments for ``CatBoostClassifier``.
        best_reg_params: Keyword arguments for ``LGBMRegressor``.
        preprocessor: Transformer template. Each stage receives its own clone,
            so the object passed in is not fitted or modified.

    Returns:
        tuple: ``(clf, reg)`` fitted pipelines. ``reg`` predicts ``log1p`` of
        the spending amount.
    """
    from sklearn.base import clone

    # Stage 1: Classifier.
    # A Pipeline fits its steps in place. If both stages shared one
    # preprocessor, fitting the regressor on the positive-spender subset would
    # overwrite the imputation/scaling/encoding state the classifier was
    # trained with, so each stage gets an independent copy.
    clf = Pipeline([
        ('preproc', clone(preprocessor)),
        ('clf', CatBoostClassifier(verbose=0, random_seed=RANDOM_STATE, **best_clf_params))
    ])
    clf.fit(X_train, y_cls)

    # Stage 2: Regressor (train only on positive spenders, target on log scale)
    pos_mask = y_spend > 0
    X_train_reg = X_train[pos_mask]
    y_train_reg = np.log1p(y_spend[pos_mask])

    reg = Pipeline([
        ('preproc', clone(preprocessor)),
        ('reg', lgb.LGBMRegressor(random_state=RANDOM_STATE, **best_reg_params))
    ])
    reg.fit(X_train_reg, y_train_reg)

    return clf, reg

# Training Pipeline

In [None]:
# Fetch the competition archive (skipped if the zip is already present) and extract it
# into the working directory.
download_kaggle_data()

## Training Data Preprocessing

In [None]:
# Read the training CSV; the loader also derives the target columns
df = load_and_preprocess_data('task3/train.csv', is_training=True)

# Every column that is not a target is used as a model feature
feature_cols = [
    col for col in df.columns
    if col not in {'spending_30d', 'will_spend', 'log_spend'}
]
X = df.loc[:, feature_cols].copy()
y_cls, y_spend = df['will_spend'], df['spending_30d']

# Route columns to a transformer according to their dtype
num_features = list(X.select_dtypes(include=['int64', 'float64']).columns)
cat_features = list(X.select_dtypes(include=['object', 'bool']).columns)

print(f"Numeric features: {len(num_features)}")
print(f"Categorical features: {len(cat_features)}")

# Combine both transformers; columns of any other dtype are dropped
numeric_transformer, categorical_transformer = create_preprocessor()
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, num_features),
        ('cat', categorical_transformer, cat_features),
    ],
    remainder='drop',
    sparse_threshold=0,
)

# Hold out a stratified slice for the final evaluation
(
    X_train, X_holdout,
    y_train_cls, y_holdout_cls,
    y_train_spend, y_holdout_spend,
) = train_test_split(
    X, y_cls, y_spend,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
    stratify=y_cls,
)

## Model Training and Optimization

In [None]:
# Search the stage-1 (will the player spend?) hyperparameters on the training split
print("Optimizing classifier...")
best_clf_params = optimize_classifier(
    X_train,
    y_train_cls,
    preprocessor,
    n_trials=N_TRIALS,
    random_state=RANDOM_STATE,
)
print(f"Best classifier params: {best_clf_params}")

In [None]:
print("Optimizing regressor...")
# The amount model is tuned on players who actually spent, with the target on log scale
pos_mask = y_train_spend.gt(0)
X_train_reg = X_train.loc[pos_mask]
y_train_reg = np.log1p(y_train_spend.loc[pos_mask])

best_reg_params = optimize_regressor(
    X_train_reg,
    y_train_reg,
    preprocessor,
    n_trials=N_TRIALS,
    random_state=RANDOM_STATE,
)
print(f"Best regressor params: {best_reg_params}")

In [None]:
# Fit both stages on the training split using the tuned hyperparameters
clf_final, reg_final = create_two_stage_model(
    X_train=X_train,
    y_cls=y_train_cls,
    y_spend=y_train_spend,
    best_clf_params=best_clf_params,
    best_reg_params=best_reg_params,
    preprocessor=preprocessor,
)

In [None]:
# Evaluate on holdout: expected spend = P(spend) * predicted amount
p_holdout = clf_final.predict_proba(X_holdout)[:, 1]
pred_log = reg_final.predict(X_holdout)
# expm1 of a negative log-scale prediction is negative, but spending cannot be;
# flooring at 0 can only reduce the absolute error against non-negative targets.
pred_spend = np.clip(np.expm1(pred_log), 0, None)
final_pred = p_holdout * pred_spend

mae = mean_absolute_error(y_holdout_spend, final_pred)
# Small epsilon guards against division by zero when the holdout mean is 0
normalized_mae = mae / (y_holdout_spend.mean() + 1e-9)

print(f"Holdout MAE: {mae:.4f}")
print(f"Holdout Normalized MAE: {normalized_mae:.4f}")

In [None]:
# Persist the fitted pipelines, the preprocessor object and the feature list
artifacts = {
    'classifier.joblib': clf_final,
    'regressor.joblib': reg_final,
    'preprocessor.joblib': preprocessor,
    'feature_cols.joblib': feature_cols,
}
for artifact_path, artifact in artifacts.items():
    joblib.dump(artifact, artifact_path)

# In Colab, also offer every saved file as a browser download
if COLAB_ENV:
    for artifact_path in artifacts:
        files.download(artifact_path)

# Prediction Pipeline

## Load and Preprocess Test Data

In [None]:
def preprocess_test_data(test_path, feature_cols):
    """Load the test CSV and keep exactly the training feature columns.

    Args:
        test_path: Path to the test CSV file.
        feature_cols: Column names to select, in the order to return them.

    Returns:
        tuple: ``(X_test, test_ids)`` — a copy of the selected feature columns
        and the ``id`` column of the test file.
    """
    raw_features, test_ids = load_and_preprocess_data(test_path, is_training=False)

    # Selecting by name raises if the test file lacks a training feature
    X_test = raw_features.loc[:, feature_cols].copy()
    return X_test, test_ids

In [None]:
def make_two_stage_predictions(clf, reg, X_test):
    """Combine both stages into an expected spending amount per row.

    Args:
        clf: Fitted classifier exposing ``predict_proba``; column 1 is taken
            as the probability of spending.
        reg: Fitted regressor exposing ``predict``; its output is treated as
            ``log1p`` of the spending amount.
        X_test: Feature table passed unchanged to both models.

    Returns:
        Array of non-negative predictions: probability of spending multiplied
        by the predicted amount.
    """
    # Stage 1: Probability of spending
    p_spend = clf.predict_proba(X_test)[:, 1]

    # Stage 2: Amount prediction (log scale -> original scale).
    # A negative log-scale prediction would map to a negative amount, which
    # is impossible for spending, so amounts are floored at 0.
    pred_log = reg.predict(X_test)
    pred_amount = np.clip(np.expm1(pred_log), 0, None)

    # Final prediction: probability * amount
    final_pred = p_spend * pred_amount

    return final_pred

In [None]:
# Restore the persisted pipelines and the training feature list
clf_final, reg_final, feature_cols = (
    joblib.load(artifact_path)
    for artifact_path in (
        'classifier.joblib',
        'regressor.joblib',
        'feature_cols.joblib',
    )
)

In [None]:
# Load the test CSV, keep the training feature columns and remember the row ids
# needed for the submission file.
X_test, test_ids = preprocess_test_data('task3/test.csv', feature_cols)

## Generate Predictions and Create Submission

In [None]:
# Score the test set with the two-stage model
predictions = make_two_stage_predictions(clf_final, reg_final, X_test)

# Write one row per test id with its predicted spending
submission = pd.DataFrame({'id': test_ids, 'task3': predictions})
submission.to_csv('submission.csv', index=False)

# Quick sanity summary of the prediction distribution
zero_count = (predictions == 0).sum()
print(f"Submission created with {len(submission)} predictions")
print("Prediction statistics:")
print(f"  Mean: {predictions.mean():.2f}")
print(f"  Median: {np.median(predictions):.2f}")
print(f"  Max: {predictions.max():.2f}")
print(f"  Zero predictions: {zero_count}")

# In Colab, offer the submission file as a browser download
if COLAB_ENV:
    files.download('submission.csv')