# TPS February 2022: LGBM Solution

Creating several models that I hope to blend near the end of the competition.

## Models

- [👽 TPS Feb 22: XGBoost+Optuna+CV+OOF](https://www.kaggle.com/mmellinger66/tps-feb-22-xgboost-cv-oof/)
- [👽 TPS Feb 22: ExtraTreeClassifier + CV + OOF](https://www.kaggle.com/mmellinger66/tps-feb-22-extratreeclassifier-cv-oof/)
- [👽TPS Feb 22: EDA+LGBM+Optuna+CV+OOF](https://www.kaggle.com/mmellinger66/tps-feb-22-eda-lgbm-cv-oof)

## Problem Type

Multi-class Classification

## Metric

$$Accuracy = \frac{Number\; of\; correct\; predictions}{Total\; number\; of\; predictions } = \frac{TP+TN}{TP+FP+FN+TN}$$

- [Accuracy](https://developers.google.com/machine-learning/crash-course/classification/accuracy)

<div style="background-color:rgba(128, 0, 128, 0.6);border-radius:5px;display:fill"><h1 style="text-align: center;padding: 12px 0px 12px 0px;">Import Libraries</h1>
</div>

In [None]:
import os
from pathlib import Path
import time
import gc

import pandas as pd
import numpy as np
import datatable as dt  # Fast table loading

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold, StratifiedKFold

# from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score
from scipy.stats import mode

import lightgbm as lgb

import optuna
from optuna.visualization import plot_optimization_history, plot_param_importances


# Visualization Libraries

import matplotlib.pylab as plt
import seaborn as sns

import warnings

warnings.filterwarnings("ignore")

plt.style.use("fivethirtyeight")  # ggplot fivethirtyeight bmh
pd.options.display.max_columns = 500

<div style="background-color:rgba(128, 0, 128, 0.6);border-radius:5px;display:fill"><h1 style="text-align: center;padding: 12px 0px 12px 0px;">Configuration</h1>
</div>

In [None]:
# Black formatter https://black.readthedocs.io/en/stable/

# ! pip install nb-black > /dev/null

# %load_ext lab_black

In [None]:
# Used in Message, pred_{MODEL}
MODEL = "lgbm1"

In [None]:
class Config:
    """Run settings for this notebook, read as class attributes."""

    # Identification and switches
    competition = "TPS_202202"
    debug = False
    optimize = False  # True -> run the Optuna study and train with its best params

    # Reproducibility
    seed = 42
    SEED_LENGTH = 1  # 5,10

    # Training size
    N_FOLDS = 5  # 5,10,15
    N_ESTIMATORS = 20  # 30, 1000, 3000, 5000

In [None]:
TARGET = "target"
TARGET_ENC = "target_num"

In [None]:
# Change for every competition
data_dir = Path("../input/tabular-playground-series-feb-2022")

In [None]:
# Output directory for the OOF / test prediction files written later on.
# exist_ok=True makes re-running the cell a no-op and avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs("results", exist_ok=True)

<div style="background-color:rgba(128, 0, 128, 0.6);border-radius:5px;display:fill"><h1 style="text-align: center;padding: 12px 0px 12px 0px;">Load Train/Test Data</h1>
</div>

Use fast table loading:
- [Tutorial on reading datasets](https://www.kaggle.com/hiro5299834/tutorial-on-reading-datasets)


In [None]:
%%time
#train_df = pd.read_csv(data_dir / "train.csv")
#test_df = pd.read_csv(data_dir / "test.csv")
train_df = dt.fread(data_dir / "train.csv").to_pandas()
test_df = dt.fread(data_dir / "test.csv").to_pandas()

submission_df = pd.read_csv(data_dir / "sample_submission.csv")

print(f"train data: Rows={train_df.shape[0]}, Columns={train_df.shape[1]}")
print(f"test data : Rows={test_df.shape[0]}, Columns={test_df.shape[1]}")

In [None]:
# Check pandas memory usage
start_memory = train_df.memory_usage().sum() / 1024 ** 2
print(f"Memory usage: {start_memory:0.2f} MB")

## Create kfolds

In [None]:
def create_folds(df, n_folds=5, seed=42):
    """Tag every row of ``df`` with the K-fold split it is validated in.

    A ``fold`` column is added to ``df`` in place (the same object is also
    returned). Each row receives the number of the fold whose validation
    part contains it.

    Args:
        df: DataFrame to split; any index is accepted.
        n_folds: number of splits passed to ``KFold``.
        seed: ``random_state`` of the shuffled ``KFold``.

    Returns:
        ``df`` with the extra integer ``fold`` column.
    """
    df["fold"] = -1
    fold_col = df.columns.get_loc("fold")

    kf = KFold(n_splits=n_folds, shuffle=True, random_state=seed)

    for fold, (_, valid_idx) in enumerate(kf.split(df)):
        # KFold yields *positional* row numbers, so write by position; a
        # label-based .loc is only correct when the index is 0..n-1.
        df.iloc[valid_idx, fold_col] = fold

    return df

In [None]:
train_df = create_folds(train_df)

<div style="background-color:rgba(128, 0, 128, 0.6);border-radius:5px;display:fill"><h1 style="text-align: center;padding: 12px 0px 12px 0px;">Explore the Data</h1>
</div>

In [None]:
train_df.columns

In [None]:
train_df.head()

## Missing Values

There are no missing values.

In [None]:
n = train_df.isna().sum().sum()
print(f"Number values missing: {n}")

## Identify Categorical and Continuous Features

Note: some categorical features can look continuous, so each feature should be verified (for example, by checking that it has more than 20 unique values).

In [None]:
continuous_features = []
cat_features = []

In [None]:
continuous_features = [
    f for f in train_df.columns if f not in ("row_id", TARGET, "fold")
]

In [None]:
# " - ".join(continuous_features)

In [None]:
# ["count","mean","std"]
# Per-feature summary statistics, most variable feature first.
# Styler.set_precision() was deprecated in pandas 1.3 and removed in 2.0;
# Styler.format(precision=...) is its replacement.
train_df[continuous_features].describe().T.sort_values(
    by="std", ascending=False
).style.background_gradient(cmap="coolwarm").format(precision=4)

In [None]:
len(continuous_features), len(continuous_features) / 4

## Feature Analysis

In [None]:
# plt.figure(figsize=(12, 8))
# One log-scaled histogram per feature on a 4-column grid. The row count is
# derived from the number of features so the grid always has room for every
# plot instead of relying on a hand-computed constant.
n_hist_cols = 4
n_hist_rows = -(-len(continuous_features) // n_hist_cols)  # ceiling division
train_df[continuous_features].hist(
    bins=5,
    alpha=0.5,
    layout=(n_hist_rows, n_hist_cols),
    log=True,
    figsize=(20, 260),
)
plt.show()

In [None]:
# train_df["A0T0G9C1"].unique()

## Target Analysis

In [None]:
# https://drawingfromdata.com/seaborn/matplotlib/visualization/rotate-axis-labels-matplotlib-seaborn.html
# plt.subplots() already creates the figure; a separate plt.figure() call
# would only leave an extra empty figure in the cell output.
fig, ax = plt.subplots(1, 1, figsize=(10, 4))

# Draw on the axes whose tick labels are rotated below.
sns.countplot(x=TARGET, data=train_df, ax=ax)
ax.set_xticklabels(ax.get_xticklabels(), rotation=45, horizontalalignment="right")
plt.show()

## Label Encode Target Manually

In [None]:
targets = train_df[TARGET].unique()
target_dict = {t: i for i, t in enumerate(targets)}  # Use {} not []
target_dict

In [None]:
# Need the inverse dictionary for the submission file
target_dict_inv = {v: k for k, v in target_dict.items()}
target_dict_inv

In [None]:
train_df[TARGET_ENC] = train_df[TARGET].map(target_dict)

In [None]:
train_df = train_df.drop(["target"], axis=1)

In [None]:
end_mem = train_df.memory_usage().sum() / 1024 ** 2
end_mem

### Now using the numerical `target_num` for the TARGET

In [None]:
train_df.head()

In [None]:
# FEATURES = continuous_features[:20]
FEATURES = continuous_features
# df = train_df.copy()

<div style="background-color:rgba(128, 0, 128, 0.6);border-radius:5px;display:fill"><h1 style="text-align: center;padding: 12px 0px 12px 0px;">Feature Engineering</h1>
</div>

Features from @lucamassaron

https://www.kaggle.com/lucamassaron/basic-eda-and-model-to-start#Feature-engineering

In [None]:
def create_features(df, features):
    """Add row-wise summary statistics of ``features`` to ``df``.

    New columns are written into ``df`` in place and ``df`` itself is
    returned. All statistics are computed from a single selection of the
    ``features`` columns taken before any new column is added.

    Args:
        df: DataFrame holding the ``features`` columns.
        features: list of column names to aggregate across (per row).

    Returns:
        ``df`` with the added columns: mean, median, eight quantiles, max,
        min, std, range, iqr, several ratios, skew, kurt and a few
        differences between the above.

    Note:
        ``tails``, ``dispersion`` and ``dispersion_2`` are plain divisions, so
        a zero denominator yields ``inf``/``NaN`` in those columns.
    """
    block = df[features]

    df['mean'] = block.mean(axis=1)
    df['median'] = block.median(axis=1)

    # All quantile levels in one row-wise pass instead of one pass per level.
    quantile_levels = {
        'q01': 0.01,
        'q05': 0.05,
        'q10': 0.10,
        'q25': 0.25,
        'q75': 0.75,
        'q90': 0.90,
        'q95': 0.95,
        'q99': 0.99,
    }
    # Result has one row per level and one column per row of ``df``.
    quantiles = block.quantile(q=list(quantile_levels.values()), axis=1)
    for position, name in enumerate(quantile_levels):
        df[name] = quantiles.iloc[position]

    df['max'] = block.max(axis=1)
    df['min'] = block.min(axis=1)

    df['std'] = block.std(axis=1)
    df['range'] = df['max'] - df['min']
    df['iqr'] = df['q75'] - df['q25']
    df['tails'] = df['range'] / df['iqr']
    df['dispersion'] = df['std'] / df['mean']
    df['dispersion_2'] = df['iqr'] / df['median']
    df['skew'] = block.skew(axis=1)
    df['kurt'] = block.kurt(axis=1)

    df['median-max'] = df['median'] - df['max']
    df['median-min'] = df['median'] - df['min']
    df['q99-q95'] = df['q99'] - df['q95']
    df['q99-q90'] = df['q99'] - df['q90']
    df['q01-q05'] = df['q01'] - df['q05']
    df['q01-q10'] = df['q01'] - df['q10']

    return df

## Combine Train and Test to Create Features in Both

- tt - train/test combined
- Use isTrain flag to track so after we combine, we can separate the data again


In [None]:
train_df["isTrain"] = True
test_df["isTrain"] = False

# Columns present only in train (at this point the fold number and the encoded
# target) are NaN-filled for the test rows by the concat, which silently turns
# them into floats. Remember their dtypes so the split below can restore them.
train_only_dtypes = {
    col: dtype for col, dtype in train_df.dtypes.items() if col not in test_df.columns
}

tt = pd.concat([train_df, test_df]).reset_index(drop=True).copy()
# tt = create_features(tt, FEATURES)
# tt = create_features_other(tt)

# Split back using the isTrain flag: train gets its original dtypes again and
# test drops the all-NaN train-only columns it picked up from the concat.
train_df = tt.query("isTrain").reset_index(drop=True).astype(train_only_dtypes)
test_df = (
    tt.query("isTrain == False")
    .reset_index(drop=True)
    .drop(columns=list(train_only_dtypes))
)

del tt
gc.collect()

## GCD Clustering Idea

Idea from [@ambrosm](https://www.kaggle.com/ambrosm)

- [TPSFEB22-03 Clustering Improves the Predictions](https://www.kaggle.com/ambrosm/tpsfeb22-03-clustering-improves-the-predictions)

In [None]:
# [
#     e
#     for e in train_df.columns
#     if e.startswith("A")
# ]

In [None]:
from math import factorial

# elements = [
#     e
#     for e in train_df.columns
#     if e not in ["row_id", TARGET, TARGET_ENC, "fold", "gcd", "isTrain"]
# ]

elements = [
    e
    for e in train_df.columns
    if e.startswith("A")
]
def bias(w, x, y, z):
    """Return ``10! / (w! * x! * y! * z! * 4**10)`` as a float.

    Args:
        w, x, y, z: non-negative integer counts.

    Returns:
        The multinomial coefficient for the four counts divided by ``4**10``.
    """
    arrangements = factorial(w) * factorial(x) * factorial(y) * factorial(z)
    return factorial(10) / (arrangements * 4 ** 10)


def bias_of(s):
    """Return ``bias(w, x, y, z)`` for a column name such as ``"A0T0G9C1"``.

    The name is read as ``<char><w>T<x>G<y>C<z>``: the first character is
    skipped and the integers before ``T``, before ``G``, before ``C`` and
    after ``C`` become ``w``, ``x``, ``y`` and ``z``.

    Args:
        s: column name in the format described above.

    Returns:
        The value of :func:`bias` for the four parsed counts.

    Raises:
        ValueError: if a separator is missing or a count is not an integer.
    """
    counts = []
    rest = s[1:]  # skip the leading letter
    for separator in ("T", "G", "C"):
        head, found, rest = rest.partition(separator)
        if not found:
            raise ValueError(f"column name {s!r} has no {separator!r} separator")
        counts.append(int(head))
    counts.append(int(rest))
    # Single source of truth for the formula instead of a second copy here.
    return bias(*counts)


def _scaled_int_frame(frame):
    """Build the integer frame used for the GCD feature.

    For every column named in ``elements``: add that column's ``bias_of``
    value, multiply by 1,000,000, round and cast to ``int``.
    """
    scaled = {}
    for name in elements:
        shifted = frame[name] + bias_of(name)
        scaled[name] = (shifted * 1000000).round().astype(int)
    return pd.DataFrame(scaled)


train_i = _scaled_int_frame(train_df)

test_i = _scaled_int_frame(test_df)


def gcd_of_all(df_i, columns=None):
    """Return the row-wise greatest common divisor across integer columns.

    Args:
        df_i: DataFrame of integer columns.
        columns: names of the columns to reduce over. Defaults to the
            module-level ``elements`` list, which is what the original
            one-argument call used.

    Returns:
        ``np.gcd`` folded over the selected columns, one value per row of
        ``df_i``. With a single column, that column is returned as is.
    """
    cols = list(elements if columns is None else columns)
    gcd = df_i[cols[0]]
    for col in cols[1:]:
        gcd = np.gcd(gcd, df_i[col])
    return gcd


train_df["gcd"] = gcd_of_all(train_i)
test_df["gcd"] = gcd_of_all(test_i)

In [None]:
original_features = FEATURES[:]
FEATURES = [col for col in train_df.columns if col not in ['row_id', 'fold', TARGET, TARGET_ENC, 'isTrain']]
# FEATURESX

In [None]:
# FEATURES = continuous_features
# FEATURES

In [None]:
train_df.columns

<div style="background-color:rgba(128, 0, 128, 0.6);border-radius:5px;display:fill"><h1 style="text-align: center;padding: 12px 0px 12px 0px;">Optuna Hyperparameter Optimization</h1>
</div>

- [LGBM GPU Hyperparameters with Optuna & Dummies](https://www.kaggle.com/tunguz/lgbm-gpu-hyperparameters-with-optuna-dummies)

In [None]:
def objective(trial, X_train, X_valid, y_train, y_valid):
    """Optuna objective: fit one LightGBM classifier and score it.

    Hyperparameters are sampled from ``trial``, an ``LGBMClassifier`` is fit
    on the training split with the validation split as an evaluation set, and
    the accuracy of its predictions on the validation split is returned.

    Args:
        trial: Optuna trial used to sample hyperparameters.
        X_train, y_train: training features and labels.
        X_valid, y_valid: validation features and labels.

    Returns:
        Validation accuracy (the study should maximize it).
    """
    params = {
        "boosting_type": "gbdt",
        "n_estimators": trial.suggest_int("n_estimators", 700, 1000),
        "importance_type": "gain",
        "reg_alpha": trial.suggest_float("reg_alpha", 1e-8, 10.0, log=True),
        "reg_lambda": trial.suggest_float("reg_lambda", 1e-8, 10.0, log=True),
        "num_leaves": trial.suggest_int("num_leaves", 2, 1000),
        # colsample_bytree / subsample are LightGBM aliases of feature_fraction /
        # bagging_fraction. Sampling both names adds a search dimension that the
        # model ignores, so only the canonical names are tuned.
        "feature_fraction": trial.suggest_float("feature_fraction", 0.1, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.1, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 0, 15),
        "min_child_samples": trial.suggest_int("min_child_samples", 1, 300),
        "learning_rate": trial.suggest_float("learning_rate", 1e-2, 0.25, log=True),
        "max_depth": trial.suggest_int("max_depth", 1, 100),
        # Single-choice categoricals keep these fixed values in best_trial.params.
        "random_state": trial.suggest_categorical("random_state", [42]),
        "n_jobs": trial.suggest_categorical("n_jobs", [4]),
    }

    model = lgb.LGBMClassifier(**params)
    model.fit(
        X_train,
        y_train,
        eval_set=[(X_train, y_train), (X_valid, y_valid)],
        callbacks=[
            lgb.log_evaluation(500),
            lgb.early_stopping(
                stopping_rounds=500, first_metric_only=False, verbose=True
            ),
        ],
    )

    valid_preds = model.predict(X_valid)

    return accuracy_score(y_valid, valid_preds)

In [None]:
# Setting optuna verbosity to show only warning messages
# If the line below stays commented out, the results of each trial will be shown
# optuna.logging.set_verbosity(optuna.logging.WARNING)

X = train_df[FEATURES].copy()
y = train_df[TARGET_ENC]

X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [None]:
time_limit = 3600 * 3

if Config.optimize:
    study = optuna.create_study(direction="maximize")
    study.optimize(
        lambda trial: objective(trial, X_train, X_valid, y_train, y_valid),        
        n_trials=2,
        # timeout=time_limit,  # this or n_trials
    )

## Show Optimization Results

In [None]:
if Config.optimize:
    print("Number of finished trials:", len(study.trials))
    print("Best trial parameters:", study.best_trial.params)
    print("Best score:", study.best_value)

In [None]:
# Optimization history.
# A figure returned inside an `if` body is not the cell's last expression, so
# the notebook would not render it; show it explicitly.
if Config.optimize:
    plot_optimization_history(study).show()

In [None]:
# Hyperparameter importance.
# A figure returned inside an `if` body is not the cell's last expression, so
# the notebook would not render it; show it explicitly.
if Config.optimize:
    plot_param_importances(study).show()

In [None]:
def get_seed_list(low=0, high=1000, length=5, seed=42):
    """Return ``length`` reproducible random integers in ``[low, high)``.

    A private ``RandomState`` is used, so the global NumPy random state is
    left untouched. With the default ``seed`` the values equal those of
    ``np.random.seed(42)`` followed by ``np.random.randint(...)``.

    Args:
        low: inclusive lower bound.
        high: exclusive upper bound.
        length: number of integers to draw.
        seed: seed of the private generator.

    Returns:
        1-D NumPy integer array with ``length`` entries.
    """
    rng = np.random.RandomState(seed)
    return rng.randint(low=low, high=high, size=length)

In [None]:
# https://lightgbm.readthedocs.io/en/latest/Parameters.html

lgbm_params = {
    "n_estimators": Config.N_ESTIMATORS,
    #     "device_type": "gpu",
    "objective": "multiclass",
    "metric": "multi_logloss",
    "lambda_l1": 0.009130931198077825,
    "lambda_l2": 3.530680683338868e-05,
    #     "reg_alpha": 0.009130931198077825,
    #     "reg_lambda": 3.530680683338868e-05,
    "num_leaves": 430,
    "importance_type": "split",
    #     "learning_rate": 0.029330486500731102,
    "learning_rate": 0.1,
    "feature_fraction": 0.8757445736567416,
    "bagging_fraction": 0.9989307214277753,
    "bagging_freq": 10,
    "min_child_samples": 20,
    "random_state": 42,
    "n_jobs": -1,
}

## Use Best Hyperparameters

In [None]:
if Config.optimize:
    lgbm_params = study.best_trial.params

In [None]:
# Take the length from Config so the setting lives in exactly one place.
# NOTE(review): the train_model call visible below is passed Config.seed, not
# seed_list - confirm whether multi-seed training is still intended.
SEED_LENGTH = Config.SEED_LENGTH
seed_list = get_seed_list(length=SEED_LENGTH)
seed_list

In [None]:
def train_model(df, test, FEATURES, TARGET, params, n_folds=5, seed=42):
    """Train one LightGBM classifier per fold and collect its predictions.

    For each fold number ``0 .. n_folds - 1`` the rows of ``df`` whose ``fold``
    column equals that number form the validation set and all other rows the
    training set.

    Args:
        df: training frame with ``row_id``, ``fold``, ``TARGET`` and the
            ``FEATURES`` columns.
        test: test frame containing the ``FEATURES`` columns.
        FEATURES: list of feature column names.
        TARGET: name of the label column in ``df``.
        params: keyword arguments for ``lgb.LGBMClassifier``; must contain
            ``n_estimators``. The dict is not modified.
        n_folds: number of folds to iterate over.
        seed: value used as the model's ``random_state``.

    Returns:
        Tuple ``(model, feature_importance_lst, fold_scores,
        final_valid_predictions, final_test_predictions)`` where ``model`` is
        the model of the last fold (``None`` if ``n_folds`` is 0),
        ``feature_importance_lst`` holds one importance DataFrame per fold,
        ``fold_scores`` one validation accuracy per fold,
        ``final_valid_predictions`` maps ``row_id`` to the out-of-fold
        prediction, and ``final_test_predictions`` holds one test prediction
        array per fold.
    """
    final_test_predictions = []
    final_valid_predictions = {}
    fold_scores = []
    feature_importance_lst = []
    model = None

    print(f"\n===== Estimators: {params['n_estimators']}, Random State: {seed} =====")
    # Override the seed on a copy so the caller's dict stays untouched.
    params = {**params, "random_state": seed}

    # Select the test features once; predict() is only given this frame to
    # read, so no per-fold copy is made.
    xtest = test[FEATURES]

    for fold in range(n_folds):

        print(10 * "=", f"Fold {fold+1}/{n_folds}", 10 * "=")
        start_time = time.time()

        # Everything outside the current fold trains; the fold itself validates.
        train_part = df[df.fold != fold].reset_index(drop=True)
        valid_part = df[df.fold == fold].reset_index(drop=True)

        ytrain = train_part[TARGET]
        yvalid = valid_part[TARGET]

        # Ids of the validation rows, used as keys of the OOF dictionary.
        valid_ids = valid_part.row_id.values.tolist()

        xtrain = train_part[FEATURES]
        xvalid = valid_part[FEATURES]

        model = lgb.LGBMClassifier(**params)

        model.fit(
            xtrain,
            ytrain,
            eval_set=[(xvalid, yvalid)],
            callbacks=[
                lgb.log_evaluation(0),
                lgb.early_stopping(
                    stopping_rounds=500, first_metric_only=False, verbose=True
                ),
            ],
        )

        # Feature importance of this fold, one column named "<fold>_importance".
        fi = pd.DataFrame(
            index=model.feature_name_,
            data=model.feature_importances_,
            columns=[f"{fold}_importance"],
        )
        feature_importance_lst.append(fi)

        # Out-of-fold predictions for the validation rows.
        preds_valid = model.predict(xvalid)
        final_valid_predictions.update(dict(zip(valid_ids, preds_valid)))

        # Validation score.
        fold_score = accuracy_score(yvalid, preds_valid)
        fold_scores.append(fold_score)

        # Test predictions of this fold's model.
        final_test_predictions.append(model.predict(xtest))

        run_time = time.time() - start_time

        print(f"---> fold: {fold+1}, Accuracy: {fold_score}, Run Time: {run_time:.2f}")

    return (
        model,
        feature_importance_lst,
        fold_scores,
        final_valid_predictions,
        final_test_predictions,
    )

In [None]:
(
    model,
    feature_importance_lst,
    fold_scores,
    final_valid_predictions,
    final_test_predictions,
) = train_model(
    train_df, test_df, FEATURES, TARGET_ENC, lgbm_params, Config.N_FOLDS, Config.seed
)

<div style="background-color:rgba(128, 0, 128, 0.6);border-radius:5px;display:fill"><h1 style="text-align: center;padding: 12px 0px 12px 0px;">Scores</h1>
</div>

In [None]:
cv_score = np.mean(fold_scores)  # Used in filename
print(f"scores -> mean: {cv_score:0.6f}, std: {np.std(fold_scores):0.6f}")

<div style="background-color:rgba(128, 0, 128, 0.6);border-radius:5px;display:fill"><h1 style="text-align: center;padding: 12px 0px 12px 0px;">Model Feature Importance</h1>
</div>

In [None]:
fis_df = pd.concat(feature_importance_lst, axis=1)

# Rank by the mean importance over all folds rather than by one hard-coded
# fold column (which does not exist when a single fold is trained), and keep
# the 20 MOST important features. Ascending order puts the largest bar at the
# top of the horizontal bar chart.
top_features = fis_df.mean(axis=1).sort_values().index[-20:]
fis_df.loc[top_features].plot(
    kind="barh", figsize=(12, 12), title="Feature Importance Across Folds"
)
plt.show()

In [None]:
# lgb.plot_importance(model, max_num_features=40, figsize=(15, 15))
# plt.show()

<div style="background-color:rgba(128, 0, 128, 0.6);border-radius:5px;display:fill"><h1 style="text-align: center;padding: 12px 0px 12px 0px;">Save OOF Predictions</h1>
</div>
Save the dictionary that we created for all the training predictions that were made when each fold was used for validation.

In [None]:
# Name the prediction column after MODEL so it stays in sync with the file
# name when the model tag changes.
oof_col = f"pred_{MODEL}"

final_valid_predictions_df = pd.DataFrame.from_dict(
    final_valid_predictions, orient="index"
).reset_index()
final_valid_predictions_df.columns = ["id", oof_col]
# Map the numeric class codes back to the original target labels.
final_valid_predictions_df[oof_col] = final_valid_predictions_df[oof_col].map(
    target_dict_inv
)

# NOTE(review): ':' is not a valid file-name character on Windows; the names
# are left unchanged here so existing consumers of these files keep working.
final_valid_predictions_df.to_csv(
    f"results/oof_preds_cv:{cv_score:0.6f}_s:{Config.seed}_k:{Config.N_FOLDS}_{MODEL}.csv",
    index=False,
)
final_valid_predictions_df.head()

<div style="background-color:rgba(128, 0, 128, 0.6);border-radius:5px;display:fill"><h1 style="text-align: center;padding: 12px 0px 12px 0px;">Submission</h1>
</div>

In [None]:
### Non-zero means results are different
# One row per fold model, one column per test row.
stacked_test_preds = np.asarray(final_test_predictions)

mean_result = np.mean(np.column_stack(final_test_predictions), axis=1).astype("int")

# Majority vote across the fold models for every test row.
# scipy < 1.11 returns `.mode` with shape (1, n_rows), scipy >= 1.11 with shape
# (n_rows,). Indexing with [0] would pick a single scalar on the newer
# versions, so flatten instead - that is correct for both shapes.
mode_result = np.ravel(mode(stacked_test_preds, axis=0).mode)

r3 = mean_result - mode_result
r3.sum()

In [None]:
message = f"{MODEL}_cv:{cv_score:0.6f}_seed:{Config.seed}_k:{Config.N_FOLDS}_n:{Config.N_ESTIMATORS}"
print(f"\nSubmit Message: {message}\n")

submission_df["target"] = mode_result
submission_df["target"] = submission_df["target"].map(target_dict_inv)
submission_df.to_csv(
    f"results/test_preds_cv:{cv_score:0.6f}_s:{Config.seed}_k:{Config.N_FOLDS}_{MODEL}.csv",
    index=False,
)
submission_df.to_csv("submission.csv", index=False)  # Submit button compatible file

submission_df