### Inspiration from these notebooks:
- Clustering from: https://www.kaggle.com/code/nitishraj/pseudo-label-soft-voting-with-sklearn
- Clustering from: https://www.kaggle.com/competitions/tabular-playground-series-jul-2022/discussion/334808
- Ideas from: https://www.kaggle.com/code/adaubas/tps-jul22-lgbm-extratree-qda-soft-voting
- Bayesian GMM classifier from: https://www.kaggle.com/code/karlcini/bayesiangmmclassifier

In [None]:
pip install scikit-lego

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV, TimeSeriesSplit
from sklearn.preprocessing import RobustScaler, PowerTransformer
from sklearn.mixture import GaussianMixture, BayesianGaussianMixture
from sklego.mixture import BayesianGMMClassifier, GMMClassifier
from sklearn.metrics import accuracy_score

import warnings
warnings.filterwarnings("ignore")

In [None]:
# Competition inputs: the feature table to cluster and the submission template
# whose "Predicted" column is filled in at the end of the notebook.
data = pd.read_csv("../input/tabular-playground-series-jul-2022/data.csv")
ss = pd.read_csv(
    "../input/tabular-playground-series-jul-2022/sample_submission.csv"
)

In [None]:
# Clustering from: https://www.kaggle.com/code/nitishraj/pseudo-label-soft-voting-with-sklearn

# Two-stage preprocessing: PowerTransformer first (to adjust for outliers),
# then RobustScaler. The scaled array is wrapped back into a DataFrame that
# carries the original column names.
_power_transformed = PowerTransformer().fit_transform(data)
data_scaled = pd.DataFrame(
    RobustScaler().fit_transform(_power_transformed),
    columns=data.columns,
)

# Columns used for clustering, taken from:
# https://www.kaggle.com/competitions/tabular-playground-series-jul-2022/discussion/334808
# They form two contiguous runs: f_07..f_13 and f_22..f_28.
useful_cols = [f"f_{i:02d}" for i in (*range(7, 14), *range(22, 29))]

# Snapshot of the scaled clustering features for every row; this is what the
# final classifier predicts on later.
test_data = data_scaled.loc[:, useful_cols].copy()

In [None]:
%%time
# Cluster the selected features with a Bayesian Gaussian Mixture
print("Fitting Bayesian Gaussian Mixture..")
_cluster_features = data_scaled[useful_cols]
bgm = BayesianGaussianMixture(
    n_components=7,
    n_init=10,
    max_iter=300,
    random_state=2,
    verbose_interval=100,
)

# Hard component assignment per row, then the per-component membership
# probabilities from the same fitted model.
bgm_labels = bgm.fit_predict(_cluster_features)
bgm_proba = bgm.predict_proba(_cluster_features)

In [None]:
%%time
# Using idea from: https://www.kaggle.com/code/adaubas/tps-jul22-lgbm-extratree-qda-soft-voting

# Build a pseudo-labelled training set from the BGM output: keep only the rows
# whose probability for their assigned component exceeds a threshold.
n_components = 7
# Experiment with different thresholds; a higher threshold might overfit.
proba_threshold = 0.675

data_scaled["predict"] = bgm_labels
for n in range(n_components):
    data_scaled[f"bgm_proba_{n}"] = bgm_proba[:, n]

# Probability of the component each row was assigned to (row i, column
# bgm_labels[i] of the probability matrix).
data_scaled["bgm_proba"] = bgm_proba[np.arange(len(data_scaled)), bgm_labels]

# Row labels are collected per class (class 0 first, then 1, ...) and kept as
# integers so they can be passed straight to `.loc`.
selected = []
for n in range(n_components):
    cluster_proba = data_scaled.loc[data_scaled["predict"] == n, "bgm_proba"]
    median = cluster_proba.median()

    n_inx = cluster_proba.index[cluster_proba > proba_threshold]
    selected.append(n_inx.to_numpy())

    # A mixture component may receive no rows at all; avoid dividing by zero.
    share = len(n_inx) / len(cluster_proba) if len(cluster_proba) else 0.0
    print(
        f"class:{n}",
        f"median: {round(median, 4)}",
        f"Training data:{share:.0%}",
    )

train_index = np.concatenate(selected)

print(f"\nSize of Training data : {len(train_index)}")

In [None]:
# Training matrix and pseudo-labels: the selected rows only, with the
# clustering features as inputs and the BGM assignment as the target.
X = data_scaled.loc[train_index, useful_cols]
y = data_scaled.loc[train_index, "predict"]

In [None]:
%%time
# https://www.kaggle.com/code/karlcini/bayesiangmmclassifier

# Fit a Bayesian GMM classifier on the selected pseudo-labelled rows.
# NOTE: this rebinds `bgm`, which previously held the clustering model.
bgm = BayesianGMMClassifier(
    covariance_type="full",
    init_params="kmeans",  # you can use k-means++
    n_components=7,
    n_init=7,
    max_iter=500,
    random_state=42,
    # tol=1e-3,
)
bgm.fit(X, y)

In [None]:
# In-sample check: accuracy of the classifier on the same rows it was fitted
# on, measured against the pseudo-labels.
y_pred = bgm.predict(X)
accuracy_score(y_true=y, y_pred=y_pred)

In [None]:
# Predict a class for every row of the scaled feature snapshot and write the
# result into the submission template.
predictions = bgm.predict(test_data)
ss["Predicted"] = predictions
ss.to_csv("submission.csv", index=False)