# Load dataset

In [1]:
!pip install kaggle



In [2]:
import os
import zipfile

def download_data_from_kaggle():
    """
    Set up Kaggle credentials, download the 'cpe342-karena' competition archive and unzip it.

    Steps, in order:
      1. In Colab, call ``files.upload()`` so a ``kaggle.json`` can be uploaded;
         outside Colab (ImportError) only a hint is printed.
      2. If ``kaggle.json`` is in the current directory, move it to ``~/.kaggle/``
         and restrict its permissions to the owner.
      3. Download ``cpe342-karena.zip`` unless it already exists in the current directory.
      4. Extract the archive into the current directory.

    Problems are reported with ``print`` rather than raised, and nothing is returned.
    The ``!`` lines are IPython shell escapes, so this function only runs inside a
    notebook/IPython session.
    """
    try:
        from google.colab import files
        uploaded = files.upload()
    except ImportError:
        print("Running outside of Colab. Please ensure your kaggle.json is in ~/.kaggle/")

    # Only the current directory is inspected here.
    # NOTE(review): when the credentials already live in ~/.kaggle/ the "not found"
    # message below is still printed, although the download can proceed.
    if 'kaggle.json' in os.listdir('.'):
        !mkdir -p ~/.kaggle
        !mv kaggle.json ~/.kaggle/
        # Owner-only read/write on the credentials file.
        !chmod 600 ~/.kaggle/kaggle.json
    else:
        print("kaggle.json not found. Please upload it or place it in the correct directory.")

    # Skip the download when the archive is already present, so re-running the cell is cheap.
    if not os.path.exists('cpe342-karena.zip'):
        print("Downloading data from Kaggle competition 'cpe342-karena'...")
        !kaggle competitions download -c cpe342-karena
    else:
        print("Data already downloaded.")

    # Re-check existence: the download above may have failed without raising.
    if os.path.exists('cpe342-karena.zip'):
        print("Unzipping data...")
        try:
            with zipfile.ZipFile('cpe342-karena.zip', 'r') as zip_ref:
                zip_ref.extractall('.')
            print("Data unzipped.")
        except zipfile.BadZipFile:
            print("Error: Downloaded file is not a valid zip file.")
        except Exception as e:
            # Best-effort: any other extraction failure is reported, not re-raised.
            print(f"An error occurred during unzipping: {e}")
    else:
        print("Zip file not found, cannot unzip.")

In [3]:
download_data_from_kaggle()

Saving kaggle.json to kaggle.json
Downloading data from Kaggle competition 'cpe342-karena'...
Downloading cpe342-karena.zip to /content
 95% 876M/920M [00:08<00:00, 134MB/s]
100% 920M/920M [00:10<00:00, 92.3MB/s]
Unzipping data...
Data unzipped.


In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.preprocessing import StandardScaler, PowerTransformer
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

# Modeling

In [19]:
!pip install catboost lightgbm xgboost

Collecting catboost
  Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8


In [20]:
# --- Import Libraries ---
import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
import catboost as cb
import lightgbm as lgb
import xgboost as xgb
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import classification_report, roc_auc_score
from scipy.stats import uniform, randint
import joblib
from google.colab import files

# --- Constants ---
# Absolute pairwise correlation above which a feature is dropped
# (passed as `threshold` to remove_highly_correlated_features).
CORRELATION_THRESHOLD = 0.8
# Number of top-ranked features kept by select_important_features.
TOP_N_FEATURES = 20
# Seed for the stochastic steps; equal to the random_state=42 default of the helper functions.
RANDOM_STATE = 42

In [21]:
# @title
def cleaning_data(df, knn_cols, knn_imputer=None, median_imputer=None, is_training=True):
    """
    Impute missing values: KNN for `knn_cols`, median for the other numeric features.

    Imputers are fitted only on training data and reused unchanged on test data.
    The median imputer is fitted on every numeric feature that is not KNN-imputed
    (not just the ones that happen to contain NaN in the training frame), so a
    column that is complete in training but has gaps in the test set can still be
    filled, and the same column set is used for `fit` and `transform`.

    Parameters
    ----------
    df : pd.DataFrame
        Input dataframe (not modified; a copy is returned)
    knn_cols : list
        List of columns to apply KNN imputation
    knn_imputer : fitted KNNImputer (optional)
        Required when is_training=False
    median_imputer : fitted SimpleImputer (optional)
        Required when is_training=False
    is_training : bool
        True for training set (fits imputers, drops id columns and rows without
        a target), False for test (reuses imputers, keeps all rows and columns)

    Returns
    -------
    df_imputed : pd.DataFrame
        Imputed dataframe
    knn_imputer : fitted KNNImputer
    median_imputer : fitted SimpleImputer

    Raises
    ------
    ValueError
        If is_training=False and an imputer is missing, or the frame lacks a
        column the median imputer was fitted on.
    """
    target_col = 'is_cheater'

    df_imputed = df.copy()

    if is_training:
        # --- Remove id cols (tolerate frames where they were already dropped) ---
        df_imputed = df_imputed.drop(columns=['id', 'player_id'], errors='ignore')

        knn_imputer = KNNImputer(n_neighbors=5)
        median_imputer = SimpleImputer(strategy='median')

        # --- KNN imputation ---
        df_imputed[knn_cols] = knn_imputer.fit_transform(df_imputed[knn_cols])

        # --- Median imputation for every other numeric feature ---
        # Fitting on the full set (not only columns with NaN right now) keeps the
        # fitted column list stable between training and inference.
        median_cols = [
            col for col in df_imputed.select_dtypes(include='number').columns
            if col != target_col and col not in knn_cols
        ]
        if median_cols:
            df_imputed[median_cols] = median_imputer.fit_transform(df_imputed[median_cols])

        # --- Drop rows with missing target (if any) ---
        df_imputed = df_imputed[df_imputed[target_col].notna()].reset_index(drop=True)

    else:
        if knn_imputer is None or median_imputer is None:
            raise ValueError("Pre-fitted imputers must be provided for the test set (is_training=False).")

        # --- Apply trained imputers ---
        df_imputed[knn_cols] = knn_imputer.transform(df_imputed[knn_cols])

        # Transform exactly the columns the median imputer was fitted on; deriving
        # the list from the NaN pattern of the test frame would break whenever that
        # pattern differs from the training one.
        fitted_cols = getattr(median_imputer, 'feature_names_in_', None)
        if fitted_cols is not None:
            median_cols = list(fitted_cols)
            missing = [col for col in median_cols if col not in df_imputed.columns]
            if missing:
                raise ValueError(f"Columns the median imputer was fitted on are missing: {missing}")
        else:
            # Imputer carries no column names: fall back to the columns that
            # currently contain NaN (target excluded).
            median_cols = [
                col for col in df_imputed.columns[df_imputed.isnull().any()]
                if col != target_col
            ]

        if median_cols:
            df_imputed[median_cols] = median_imputer.transform(df_imputed[median_cols])

    return df_imputed, knn_imputer, median_imputer

def feature_extractions(df):
    """
    Derive efficiency, stability and behavioural ratio features.

    Parameters
    ----------
    df : pd.DataFrame
        Input DataFrame (after imputation); it is left untouched

    Returns
    -------
    df_feature_extracted : pd.DataFrame
        Copy of the input with ten engineered columns appended
    """
    eps = 1e-6  # keeps the ratios finite when a denominator is zero

    df_feature_extracted = df.copy()

    # Pull the raw inputs once so the formulas below read like the definitions.
    kdr = df_feature_extracted["kill_death_ratio"]
    accuracy = df_feature_extracted["accuracy_score"]
    account_age = df_feature_extracted["account_age_days"]

    engineered = {}

    # --- Efficiency & Aim Features ---
    engineered["kill_efficiency"] = kdr * accuracy
    engineered["headshot_ratio_to_accuracy"] = df_feature_extracted["headshot_percentage"] / (accuracy + eps)
    engineered["reaction_accuracy_ratio"] = accuracy / (df_feature_extracted["reaction_time_ms"] + eps)
    engineered["damage_efficiency"] = (
        df_feature_extracted["damage_per_round"] / (df_feature_extracted["survival_time_avg"] + eps)
    )

    # --- Stability & Consistency (account age is shifted by one day to avoid /0) ---
    engineered["reports_per_day"] = df_feature_extracted["reports_received"] / (account_age + 1)
    engineered["device_change_rate"] = df_feature_extracted["device_changes_count"] / (account_age + 1)
    engineered["session_intensity"] = (
        df_feature_extracted["sessions_per_day"] * df_feature_extracted["avg_session_length_min"]
    )

    # --- Behavioral Ratios ---
    engineered["performance_per_account_age"] = (
        (kdr + accuracy + df_feature_extracted["win_rate"]) / (account_age + 1)
    )
    engineered["input_to_accuracy_ratio"] = df_feature_extracted["input_consistency_score"] / (accuracy + eps)
    # Built on the freshly derived reports_per_day, not on a raw column.
    engineered["friendliness_ratio"] = (
        df_feature_extracted["communication_rate"] * df_feature_extracted["team_play_score"]
        / (engineered["reports_per_day"] + eps)
    )

    # Append in definition order so the column layout matches the formulas above.
    for feature_name, values in engineered.items():
        df_feature_extracted[feature_name] = values

    return df_feature_extracted

def standardize_features(df, scaler=None, exclude_cols=None, is_training=True):
    """
    Standardizes numerical features (mean=0, std=1) for consistent scaling.

    Parameters
    ----------
    df : pd.DataFrame
        Input dataframe (after imputation & feature engineering); not modified
    scaler : fitted StandardScaler, optional
        Used when is_training=False to apply the same scaling to test data
    exclude_cols : list, optional
        Columns to exclude from scaling (IDs, target, binary flags).
        Defaults to ['id', 'player_id', 'is_cheater'] when None.
    is_training : bool
        If True, fits new scaler; else reuses existing one

    Returns
    -------
    df_scaled : pd.DataFrame
        Scaled dataframe (same shape, same column names)
    scaler : fitted StandardScaler

    Raises
    ------
    ValueError
        If is_training=False and no scaler is supplied.
    """

    df_scaled = df.copy()

    # Honour the caller's list; only fall back to the default when nothing was passed.
    # (An explicit empty list means "exclude nothing".)
    if exclude_cols is None:
        exclude_cols = ['id', 'player_id', 'is_cheater']

    # Only float64/int64 columns are scaled; other dtypes pass through untouched.
    num_cols = df_scaled.select_dtypes(include=['float64', 'int64']).columns.tolist()
    num_cols = [col for col in num_cols if col not in exclude_cols]

    if is_training:
        scaler = StandardScaler()
        df_scaled[num_cols] = scaler.fit_transform(df_scaled[num_cols])
    else:
        if scaler is None:
            raise ValueError("Pre-fitted scaler must be provided for test data (is_training=False).")
        df_scaled[num_cols] = scaler.transform(df_scaled[num_cols])

    return df_scaled, scaler

def remove_highly_correlated_features(df, threshold=0.9, exclude_cols=None, flag_dropped=False):
    """
    Drop numeric features that are strongly correlated with an earlier feature.

    Parameters
    ----------
    df : pd.DataFrame
        Input dataframe (after standardization); not modified
    threshold : float
        Absolute correlation above which a feature is considered redundant
    exclude_cols : list
        Columns left out of the correlation check (IDs, target, etc.);
        defaults to ['id', 'player_id', 'is_cheater']
    flag_dropped : bool
        If True, adds a constant-True column `dropped_<feature>` for every removed feature

    Returns
    -------
    df_reduced : pd.DataFrame
        DataFrame without the redundant features
    dropped_features : list
        Names of the removed features, in column order
    """
    protected = exclude_cols or ['id', 'player_id', 'is_cheater']
    df_reduced = df.copy()

    # Candidates: float64/int64 columns that are not protected.
    candidates = [
        name
        for name in df_reduced.select_dtypes(include=['float64', 'int64']).columns.tolist()
        if name not in protected
    ]

    abs_corr = df_reduced[candidates].corr().abs()

    # Keep only the strict upper triangle, so each pair is looked at once and a
    # feature is compared only against the columns that precede it.
    above_diagonal = np.triu(np.ones(abs_corr.shape), k=1).astype(bool)
    upper = abs_corr.where(above_diagonal)

    to_drop = []
    for name in upper.columns:
        if (upper[name] > threshold).any():
            to_drop.append(name)

    if flag_dropped:
        for name in to_drop:
            df_reduced[f'dropped_{name}'] = True

    df_reduced = df_reduced.drop(columns=to_drop)

    return df_reduced, to_drop

def select_important_features(df, target_col='is_cheater', n_top_features=None, importance_threshold=None, random_state=42):
    """
    Rank features with a RandomForestClassifier and keep the most important ones.

    Parameters
    ----------
    df : pd.DataFrame
        Processed dataframe holding the features and the target
    target_col : str
        Name of target column
    n_top_features : int, optional
        Keep this many top-ranked features (takes precedence over importance_threshold)
    importance_threshold : float, optional
        Keep features whose importance is at least this value (0-1)
    random_state : int
        Random state for reproducibility

    Returns
    -------
    X_selected : pd.DataFrame
        Copy of the feature matrix restricted to the selected features
    important_features : list
        Selected feature names
    feature_importances : pd.Series
        Importances of all features, sorted descending
    """
    features = df.drop(columns=[target_col])
    labels = df[target_col]

    # Fit the ranking forest on every non-target column.
    forest = RandomForestClassifier(n_estimators=200, random_state=random_state)
    forest.fit(features, labels)

    feature_importances = pd.Series(
        forest.feature_importances_, index=features.columns
    ).sort_values(ascending=False)

    if n_top_features is None and importance_threshold is None:
        # No criterion given: keep everything, in the original column order.
        important_features = features.columns.tolist()
    elif n_top_features is not None:
        top_ranked = feature_importances.head(n_top_features)
        important_features = top_ranked.index.tolist()
    else:
        passes_threshold = feature_importances >= importance_threshold
        important_features = feature_importances[passes_threshold].index.tolist()

    X_selected = features[important_features].copy()

    return X_selected, important_features, feature_importances

def randomized_search_gb_ensemble(X_train, y_train, n_iter=20, cv=3, scoring='roc_auc', random_state=42):
    """
    Perform RandomizedSearchCV on CatBoost + XGBoost + LightGBM ensemble with class imbalance handling.

    The three boosters are combined in a soft-voting ensemble and the search
    samples depth/leaves and learning rate for each of them jointly.

    Parameters
    ----------
    X_train : pd.DataFrame
        Training features
    y_train : pd.Series
        Target (the XGBoost weighting below assumes binary labels 0/1)
    n_iter : int
        Number of parameter combinations to try
    cv : int
        Number of cross-validation folds
    scoring : str
        Scoring metric
    random_state : int
        Random seed

    Returns
    -------
    best_model : VotingClassifier
        Ensemble trained with best found parameters
    random_search : RandomizedSearchCV
        Fitted RandomizedSearchCV object (for inspection)
    """

    # --- Compute class weights for imbalance ---
    from sklearn.utils.class_weight import compute_class_weight
    classes = np.unique(y_train)
    class_weights = compute_class_weight(class_weight='balanced', classes=classes, y=y_train)
    class_weight_dict = dict(zip(classes, class_weights))

    # --- Base models with class imbalance handling ---
    clf_cb = cb.CatBoostClassifier(
        iterations=300,
        verbose=0,
        random_seed=random_state,
        class_weights=class_weight_dict
    )

    # `use_label_encoder` is intentionally not passed: current XGBoost releases
    # ignore it and emit a "Parameters ... are not used" warning on every fit.
    clf_xgb = xgb.XGBClassifier(
        n_estimators=300,
        eval_metric='auc',
        random_state=random_state,
        # Ratio of the balanced weights = (#negatives / #positives); falls back
        # to 1 when a class label is absent from the weight dict.
        scale_pos_weight=class_weight_dict.get(1, 1) / class_weight_dict.get(0, 1)
    )

    clf_lgb = lgb.LGBMClassifier(
        n_estimators=300,
        random_state=random_state,
        class_weight=class_weight_dict
    )

    # --- Ensemble ---
    ensemble = VotingClassifier(
        estimators=[('catboost', clf_cb), ('xgboost', clf_xgb), ('lightgbm', clf_lgb)],
        voting='soft'
    )

    # --- Parameter distributions ---
    # scipy conventions: uniform(loc, scale) samples from [loc, loc + scale], so the
    # learning rates range over [0.03, 0.10]; randint(low, high) excludes `high`,
    # so depths are 4..7 and num_leaves 31..69.
    param_dist = {
        'catboost__depth': randint(4, 8),
        'catboost__learning_rate': uniform(0.03, 0.07),
        'xgboost__max_depth': randint(4, 8),
        'xgboost__learning_rate': uniform(0.03, 0.07),
        'lightgbm__num_leaves': randint(31, 70),
        'lightgbm__learning_rate': uniform(0.03, 0.07)
    }

    # --- Randomized Search ---
    random_search = RandomizedSearchCV(
        estimator=ensemble,
        param_distributions=param_dist,
        n_iter=n_iter,
        cv=cv,
        scoring=scoring,
        verbose=2,
        random_state=random_state,
        n_jobs=-1
    )

    random_search.fit(X_train, y_train)
    best_model = random_search.best_estimator_

    print("Best params:", random_search.best_params_)
    # Label the score with the metric actually used instead of assuming ROC-AUC.
    print(f"Best score ({scoring}):", random_search.best_score_)

    return best_model, random_search

# Columns handed to cleaning_data for KNN imputation; the same list must be used
# for the training and the test frame because the fitted KNN imputer is reused.
knn_cols = ['kill_death_ratio', 'headshot_percentage', 'accuracy_score', 'damage_per_round']

In [22]:
# --- Load data ---
# Path is relative to the working directory (presumably where the Kaggle archive
# was extracted — verify).
df = pd.read_csv('task1/train.csv')
# Keep labelled rows only; the index is reset so it stays contiguous.
df = df[df['is_cheater'].notna()].reset_index(drop=True)

In [23]:
# --- Cleaning / Imputation ---
# Training mode: fits both imputers and returns them so they can be reused on the test set.
df_clean, knn_imputer, median_imputer = cleaning_data(df, knn_cols, is_training=True)

In [24]:
# --- Feature Engineering ---
# Appends the engineered ratio columns; df_clean itself is not modified (the helper works on a copy).
df_feat = feature_extractions(df_clean)

In [25]:
# --- Standardization ---
# Training mode: fits a new scaler, returned for reuse on the test set.
df_scaled, scaler = standardize_features(df_feat, is_training=True)

In [26]:
# --- Correlation Filtering ---
# dropped_features is kept so the same columns can be removed from the test frame.
df_final, dropped_features = remove_highly_correlated_features(df_scaled, threshold=CORRELATION_THRESHOLD)

In [27]:
# --- Feature Selection (RandomForest) ---
# Rank features with a random forest and keep the TOP_N_FEATURES most important ones.
# The seed is passed explicitly so RANDOM_STATE actually controls this step
# instead of the function's built-in default.
X_selected, important_features, _ = select_important_features(
    df_final, n_top_features=TOP_N_FEATURES, random_state=RANDOM_STATE
)
# Target taken from the same frame so its rows stay aligned with X_selected.
y = df_final['is_cheater']

In [28]:
# --- RandomizedSearchCV Ensemble ---
ensemble_model, random_search = randomized_search_gb_ensemble(X_selected, y, n_iter=20, cv=4)

Fitting 4 folds for each of 20 candidates, totalling 80 fits


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[LightGBM] [Info] Number of positive: 34129, number of negative: 63619
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005286 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4872
[LightGBM] [Info] Number of data points in the train set: 97748, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000
Best params: {'catboost__depth': 6, 'catboost__learning_rate': np.float64(0.0857580090802163), 'lightgbm__learning_rate': np.float64(0.042840435290631466), 'lightgbm__num_leaves': 38, 'xgboost__learning_rate': np.float64(0.07190609389379257), 'xgboost__max_depth': 6}
Best score (ROC-AUC): 0.8967961891100169


In [29]:
# Save models and imputers
# Every fitted object needed to reproduce inference is written with joblib and
# then offered as a browser download (Colab).
model_filename = 'ensemble_model.joblib'
knn_imputer_filename = 'knn_imputer.joblib'
median_imputer_filename = 'median_imputer.joblib'
scaler_filename = 'scaler.joblib'
dropped_features_filename = 'dropped_features.joblib'
important_features_filename = 'important_features.joblib'
random_search_filename = 'random_search.joblib'

# (label used in the log line, object to persist, destination file)
artifacts = [
    ("Model", ensemble_model, model_filename),
    ("KNN Imputer", knn_imputer, knn_imputer_filename),
    ("Median Imputer", median_imputer, median_imputer_filename),
    ("Scaler", scaler, scaler_filename),
    ("Dropped Features", dropped_features, dropped_features_filename),
    ("Important Features", important_features, important_features_filename),
    ("Random Search", random_search, random_search_filename),
]

# First pass: write everything to disk.
for artifact_label, artifact, artifact_path in artifacts:
    joblib.dump(artifact, artifact_path)
    print(f"{artifact_label} saved as {artifact_path}")

# Second pass: trigger the downloads only after all files exist.
for artifact_label, artifact, artifact_path in artifacts:
    files.download(artifact_path)

Model saved as ensemble_model.joblib
KNN Imputer saved as knn_imputer.joblib
Median Imputer saved as median_imputer.joblib
Scaler saved as scaler.joblib
Dropped Features saved as dropped_features.joblib
Important Features saved as important_features.joblib
Random Search saved as random_search.joblib


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [30]:
# Load the saved models and imputers
# Reloading lets the inference cells below run from the files alone, without retraining.
# SECURITY: joblib.load unpickles the file and can execute arbitrary code —
# only load artifacts produced by this notebook or another trusted source.
ensemble_model = joblib.load('ensemble_model.joblib')
knn_imputer = joblib.load('knn_imputer.joblib')
median_imputer = joblib.load('median_imputer.joblib')
scaler = joblib.load('scaler.joblib')
dropped_features = joblib.load('dropped_features.joblib')
important_features = joblib.load('important_features.joblib')
random_search = joblib.load('random_search.joblib')

print("Models and imputers loaded successfully.")

Models and imputers loaded successfully.


In [31]:
# --- Prepare Kaggle Test Data ---
df_test_kaggle = pd.read_csv('task1/test.csv')
# Keep the ids aside: they are needed for the submission file but are not model inputs.
ids = df_test_kaggle['id']
# cleaning_data drops the id columns only when is_training=True, so they are removed here.
df_test_kaggle = df_test_kaggle.drop(['id', 'player_id'], axis=1)

In [32]:
# Inference mode: reuse the imputers fitted on the training data (nothing is refitted).
df_test_kaggle_clean, _, _ = cleaning_data(df_test_kaggle, knn_cols, knn_imputer, median_imputer, is_training=False)

In [33]:
# Same engineered features as for the training frame.
df_test_kaggle_feat = feature_extractions(df_test_kaggle_clean)

In [34]:
# Inference mode: apply the scaler fitted on the training data.
df_test_kaggle_scaled, _ = standardize_features(df_test_kaggle_feat, scaler=scaler, is_training=False)

In [35]:
# Remove the features that the training-time correlation filter dropped
# (names absent from the test frame are skipped).
df_test_kaggle_final = df_test_kaggle_scaled.drop(columns=[f for f in dropped_features if f in df_test_kaggle_scaled.columns])

In [36]:
# Restrict to the features selected during training, in the same order the model was fitted on.
X_test_kaggle_selected = df_test_kaggle_final[important_features]

In [37]:
# --- Predict for submission ---
# predict() yields class labels, not probabilities.
# NOTE(review): if `is_cheater` was read as float in training, the labels here are
# presumably 0.0/1.0 — confirm the format the competition expects.
y_pred_kaggle = ensemble_model.predict(X_test_kaggle_selected)

# --- create submission DataFrame ---
# `ids` was taken from the test file before the id columns were dropped.
submission = pd.DataFrame({
    'id': ids,
    'is_cheater': y_pred_kaggle
})

In [38]:
# Write the submission without the DataFrame index, then offer it as a browser download (Colab).
submission.to_csv('submission.csv', index=False)
files.download('submission.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>