In [None]:
!pip install -q kaggle optuna catboost
!pip install -q xgboost --upgrade
!pip install -q lightgbm --upgrade


# Imports

In [None]:
import os
import json
import numpy as np
import pandas as pd
import warnings
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostClassifier, CatBoostRegressor
from sklearn.ensemble import VotingClassifier, StackingRegressor
from sklearn.model_selection import train_test_split
from google.colab import userdata
warnings.filterwarnings('ignore')


# Preprocessing

In [None]:
def preprocess(df_input):
    """
    Impute missing values, add engineered features, and log-transform skewed columns.

    The function works on a copy, so ``df_input`` is left unmodified. All
    imputation statistics (median / mode) are computed from the frame that is
    passed in.

    Steps, in order:
      A. Numeric columns (float64 / int64, excluding the ``spending_30d``
         target) have missing values replaced by the column median.
      B. Object columns have missing values replaced by the column mode and
         are cast to ``category``. A column with no observed value has no
         mode; it is left all-missing instead of raising.
      C. Three product / ratio features are added. A missing source column is
         treated as the scalar 0.
      D. ``log1p`` copies (``log_<name>``) of selected skewed columns are
         added for those columns that exist.

    Parameters
    ----------
    df_input : pandas.DataFrame
        Raw frame. ``id`` and ``player_id`` are dropped when present.

    Returns
    -------
    pandas.DataFrame
        New frame with imputed values and the additional feature columns.
    """
    df = df_input.copy()

    # Identifier columns carry no signal for training; callers that need them
    # must keep their own copy before calling this function.
    df = df.drop(columns=['id', 'player_id'], errors='ignore')

    # Identify column types
    numeric_cols = df.select_dtypes(include=['float64', 'int64']).columns
    categorical_cols = df.select_dtypes(include=['object']).columns

    # Never impute the target itself
    if 'spending_30d' in numeric_cols:
        numeric_cols = numeric_cols.drop('spending_30d')

    # A. Impute numerical columns with the median
    for col in numeric_cols:
        if df[col].isnull().any():
            df[col] = df[col].fillna(df[col].median())

    # B. Impute categorical columns with the mode
    for col in categorical_cols:
        if df[col].isnull().any():
            modes = df[col].mode()
            # mode() is empty when the column holds no observed value at all;
            # indexing it would raise, so such a column is left untouched.
            if not modes.empty:
                df[col] = df[col].fillna(modes.iloc[0])
        # Ensure proper type
        df[col] = df[col].astype('category')

    # C. Feature engineering
    # .get() falls back to 0 when a source column is absent
    df['engagement_intensity'] = df.get('sessions_per_week', 0) * df.get('avg_session_length', 0)

    # 1e-5 keeps the denominator non-zero for players with zero playtime
    df['spending_efficiency'] = df.get('historical_spending', 0) / (df.get('total_playtime_hours', 0) + 1e-5)

    # +1 keeps the denominator non-zero for players who logged in today
    df['conversion_strength'] = df.get('total_transactions', 0) / (df.get('days_since_last_login', 0) + 1)

    # D. Log-transform skewed features (originals are kept alongside)
    skewed_feats = ['total_playtime_hours', 'historical_spending', 'avg_transaction_value']
    for feat in skewed_feats:
        if feat in df.columns:
            df[f'log_{feat}'] = np.log1p(df[feat])

    return df

In [None]:
print("Loading and processing data")
train_file_path = "/content/task3/train.csv"
test_file_path = "/content/task3/test.csv"

df_train = pd.read_csv(train_file_path)
df_test = pd.read_csv(test_file_path)
# preprocess() drops player_id, so keep the test ids for the submission file.
test_player_ids = df_test['player_id']

# Impute / engineer features on each split independently.
df_train_processed, df_test_processed = preprocess(df_train), preprocess(df_test)

target_class, target_reg = 'will_spend', 'log_target'

# Stage-1 target: did the player spend anything? Stage-2 target: log1p(amount).
df_train_processed[target_class] = df_train_processed['spending_30d'].gt(0).astype(int)
df_train_processed[target_reg] = np.log1p(df_train_processed['spending_30d'])

# Source columns withheld from the regressor (their dummy columns go too).
features_to_drop_regressor = [
    'achievement_completion_rate',
    'competitive_rank',
    'games_played',
    'days_since_last_login',
    'cross_game_activity',
    'seasonal_spending_pattern',
    'account_age_days',
    'primary_game',
    'guild_membership',
]

# Everything in the processed training frame except the raw and derived targets.
_non_feature_cols = {'spending_30d', target_class, target_reg}
all_features = [name for name in df_train_processed.columns
                if name not in _non_feature_cols]

print("Encoding categorical features...")
# Train and test are stacked before one-hot encoding so that both halves end
# up with the same dummy columns.
combined = pd.concat(
    [frame[all_features] for frame in (df_train_processed, df_test_processed)],
    axis=0, ignore_index=True,
)
combined_encoded = pd.get_dummies(combined, drop_first=True)

# Split the encoded frame back at the train/test boundary.
X_all = combined_encoded
_n_train = len(df_train_processed)
X_class = X_all.iloc[:_n_train].reset_index(drop=True)
X_test_all = X_all.iloc[_n_train:].reset_index(drop=True)

y_class = df_train_processed[target_class]
y_reg_full = df_train_processed[target_reg]

print("Preparing regressor feature set...")
# Prefix matching removes an excluded column together with any dummy columns
# derived from it.
_excluded_prefixes = tuple(features_to_drop_regressor)
regressor_cols = [name for name in X_class.columns
                  if not name.startswith(_excluded_prefixes)]

features_classifier = X_class.columns.tolist()
features_regressor = regressor_cols

# The regressor is trained on spenders only.
spenders_mask = y_class.eq(1)
X_reg = X_class.loc[spenders_mask, features_regressor].reset_index(drop=True)
y_reg = y_reg_full.loc[spenders_mask].reset_index(drop=True)

X_test_reg = X_test_all[features_regressor].reset_index(drop=True)
print("preprocess done...")

In [None]:
# Soft Voting
# Stage 1: three gradient-boosted classifiers combined by soft voting to
# estimate the probability that a player spends at all.
print("\nTraining Classification Voting Ensemble...")
clf_xgb = xgb.XGBClassifier(
    n_estimators=500, learning_rate=0.05, max_depth=6,
    subsample=0.8, colsample_bytree=0.8,
    eval_metric='logloss',
    device="cuda",
    random_state=42, n_jobs=-1
)

clf_lgb = lgb.LGBMClassifier(
    n_estimators=600, learning_rate=0.05, max_depth=-1, num_leaves=63,
    # LightGBM ignores `subsample` unless `subsample_freq` is non-zero; with
    # the default of 0 the 0.8 row subsampling would silently not happen.
    subsample=0.8, subsample_freq=1, colsample_bytree=0.8,
    device_type="gpu",
    random_state=42, n_jobs=-1, verbose=-1
)

clf_cat = CatBoostClassifier(
    n_estimators=500, learning_rate=0.05, depth=6,
    loss_function='Logloss',
    task_type="GPU",
    devices='0',
    random_state=42, verbose=0
)

voting_clf = VotingClassifier(
    estimators=[('xgb', clf_xgb), ('lgb', clf_lgb), ('cat', clf_cat)],
    voting='soft',
    # Fit the members one after another: all three are configured to train on
    # the GPU, and fitting them in parallel worker processes would have them
    # compete for the same device. This matches the n_jobs=1 used by the
    # stacking regressor in this notebook.
    n_jobs=1
)

voting_clf.fit(X_class, y_class)
print("Classifier trained.")

# Regression Model Stacking - Only on Spenders
# Stage 2: predict the log1p spending amount. X_reg / y_reg contain only the
# training rows whose will_spend label is 1.
print("\nTraining Regression Stacking Ensemble...")

reg_xgb = xgb.XGBRegressor(
    n_estimators=700, learning_rate=0.03, max_depth=6,
    subsample=0.8, colsample_bytree=0.8,
    objective='reg:squarederror', eval_metric='rmse',
    device="cuda",
    random_state=42, n_jobs=-1
)

reg_lgb = lgb.LGBMRegressor(
    n_estimators=800, learning_rate=0.03, max_depth=-1, num_leaves=63,
    # LightGBM ignores `subsample` unless `subsample_freq` is non-zero; with
    # the default of 0 the 0.8 row subsampling would silently not happen.
    subsample=0.8, subsample_freq=1, colsample_bytree=0.8,
    device_type="gpu",
    random_state=42, n_jobs=-1, verbose=-1
)

reg_cat = CatBoostRegressor(
    n_estimators=700, learning_rate=0.03, depth=6,
    loss_function='RMSE',
    task_type="GPU",
    devices='0',
    random_state=42, verbose=0
)

# Final Estimator for Stacking
final_reg = xgb.XGBRegressor(
    n_estimators=500, learning_rate=0.03, max_depth=6,
    subsample=0.8, colsample_bytree=0.8,
    objective='reg:squarederror',
    device="cuda",
    random_state=42, n_jobs=-1
)

stack_reg = StackingRegressor(
    estimators=[('xgb', reg_xgb), ('lgb', reg_lgb), ('cat', reg_cat)],
    final_estimator=final_reg,
    # Base models are fitted sequentially; each one is configured for the GPU.
    n_jobs=1,
    # passthrough=True feeds the original features to the final estimator in
    # addition to the base-model predictions.
    passthrough=True
)

stack_reg.fit(X_reg, y_reg)
print("Regressor trained.")



In [None]:

print("\n final predictions...")

# Stage 1: probability of the positive class (column 1) for every test row.
pred_class_proba = voting_clf.predict_proba(X_test_all)[:, 1]

# Stage 2: log-scale amount, predicted for every test row regardless of stage 1.
pred_reg_log = stack_reg.predict(X_test_reg)

# Invert the log1p transform and floor at zero in one step; a negative
# log-scale prediction would otherwise map to a negative amount.
pred_reg_amount = np.maximum(np.expm1(pred_reg_log), 0)

# Expected spend = P(spend) * predicted amount
final_predictions = np.multiply(pred_class_proba, pred_reg_amount)

# Assemble the submission frame: one row per test player.
submission = pd.DataFrame({'player_id': test_player_ids}).assign(
    spending_30d=final_predictions
)

submission.to_csv('submission.csv', index=False)
print("Success! 'submission.csv' has been created.")
print(submission.head())