# Acknowledgements

- I gratefully thank [AmbrosM](https://www.kaggle.com/ambrosm) for ["TPSFEB22-01 EDA which makes sense ⭐️⭐️⭐️⭐️⭐️"](https://www.kaggle.com/ambrosm/tpsfeb22-01-eda-which-makes-sense) and ["TPSFEB22-02 Postprocessing against the mutants 💀"](https://www.kaggle.com/ambrosm/tpsfeb22-02-postprocessing-against-the-mutants).

# Libraries

In [None]:
import pandas as pd
import numpy as np
import random
import time
import os

from tqdm.notebook import tqdm
from math import factorial
import datatable as dt

from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from scipy.spatial import ConvexHull

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import ExtraTreesClassifier

from scipy.stats import mode

import matplotlib.pyplot as plt
import seaborn as sns

# Parameters

In [None]:
# Number of StratifiedKFold splits used for cross-validation.
N_SPLITS = 10
# Seed shared by seed_everything, the fold splitter and the classifier.
SEED = 42
# Number of target classes; sizes the per-class probability columns.
N_CLASSES = 10
# Number of trees in each ExtraTreesClassifier.
N_ESTIMATORS = 1000
# Forwarded to the classifier's `verbose` argument.
VERBOSE = False

# True: duplicated training rows are collapsed into one row that carries its
# occurrence count as a sample weight. False: duplicates are simply dropped.
WEIGHT = False

In [None]:
def seed_everything(seed=42):
    """Seed the random number generators used by this notebook.

    Sets the ``PYTHONHASHSEED`` environment variable to ``str(seed)`` and
    seeds both the standard-library ``random`` module and NumPy's global
    generator with the same value.

    Args:
        seed: Integer seed applied to every generator (default 42).
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)

# Seed every generator once, up front, with the notebook-wide SEED.
seed_everything(SEED)

# Datasets

In [None]:
# Read the competition CSVs with datatable and convert them to pandas.
# train and test are indexed by row_id; the sample submission keeps its
# default index.
train = (
    dt.fread("../input/tabular-playground-series-feb-2022/train.csv")
    .to_pandas()
    .set_index('row_id')
)
test = (
    dt.fread("../input/tabular-playground-series-feb-2022/test.csv")
    .to_pandas()
    .set_index('row_id')
)
submission = (
    dt.fread("../input/tabular-playground-series-feb-2022/sample_submission.csv")
    .to_pandas()
)

# The model inputs are exactly the columns of the test frame.
features = list(test.columns)
target = 'target'
target_encoded = 'target_encoded'

In [None]:
# Collapse duplicated training rows: either keep their multiplicity as a
# sample weight (WEIGHT) or discard the repeats outright.
if WEIGHT:
    weight_col = 'sample_weight'
    # value_counts() over the whole frame yields one entry per distinct row
    # (all columns, target included) together with its number of occurrences.
    v_counts = train.value_counts()
    # Rebuild a frame from the index tuples, one row per distinct combination.
    train = pd.DataFrame([list(tup) for tup in v_counts.index.values], columns=train.columns)
    train[weight_col] = v_counts.values
    # Replace the original row_id index with the rebuilt frame's own index.
    train['row_id'] = train.index
    train = train.set_index('row_id')
else:
    train = train.drop_duplicates()
    # Renumber the surviving rows from 0 and use that as the new row_id index.
    train['row_id'] = train.reset_index(drop=True).index
    train = train.set_index('row_id')

In [None]:
def bias(w, x, y, z):
    """Return 10! / (w! * x! * y! * z! * 4**10) for the four counts.

    When ``w + x + y + z == 10`` this is the multinomial probability of
    observing those counts in 10 draws from four equally likely symbols.

    Args:
        w, x, y, z: Non-negative integer counts.

    Returns:
        The value of the expression above as a float.

    Raises:
        ValueError: If any count is negative (raised by ``factorial``).
    """
    return factorial(10) / (factorial(w) * factorial(x) * factorial(y) * factorial(z) * 4**10)


def bias_of(s):
    """Return ``bias(w, x, y, z)`` for a column name shaped like ``A<w>T<x>G<y>C<z>``.

    Args:
        s: Column name such as ``"A2T3G2C3"``; the four integers are the
            counts that follow the letters A, T, G and C.

    Returns:
        The float computed by ``bias`` for the parsed counts.

    Raises:
        ValueError: If ``s`` does not have the ``A<w>T<x>G<y>C<z>`` shape.
    """
    import re  # local import: keeps this cell self-contained

    # One anchored pattern replaces the repeated str.index() slicing and
    # rejects malformed names with a clear message.
    match = re.fullmatch(r'A(\d+)T(\d+)G(\d+)C(\d+)', s)
    if match is None:
        raise ValueError(f"cannot parse base counts from column name {s!r}")
    w, x, y, z = (int(group) for group in match.groups())
    # Delegate to bias() so the formula lives in exactly one place.
    return bias(w, x, y, z)

def gcd_of_all(df_i, columns):
    """Return the row-wise greatest common divisor across ``columns``.

    Args:
        df_i: Frame of integer values.
        columns: Sequence of column names to combine; needs at least one entry.

    Returns:
        The element-wise gcd of the named columns, one value per row.
    """
    running = df_i[columns[0]]
    remaining = columns[1:]
    # Fold the remaining columns in one at a time: gcd(a, b, c) == gcd(gcd(a, b), c).
    for name in remaining:
        running = np.gcd(running, df_i[name])
    return running

In [None]:
gcd_col = 'gcd'


def _as_integer_counts(df):
    """Add each column's bias back, scale by 1,000,000 and round to integers."""
    scaled = {}
    for col in features:
        scaled[col] = ((df[col] + bias_of(col)) * 1000000).round().astype(int)
    return pd.DataFrame(scaled)


train_int = _as_integer_counts(train)
test_int = _as_integer_counts(test)

# The row-wise gcd of the integer columns becomes an extra column on each frame.
train[gcd_col] = gcd_of_all(train_int, features)
test[gcd_col] = gcd_of_all(test_int, features)

# Show which gcd values occur, and how often, in train and in test.
np.unique(train[gcd_col], return_counts=True), np.unique(test[gcd_col], return_counts=True)

In [None]:
# Map the 10 bacteria names onto the integer codes 0 .. 9.
# `le` is kept so the codes can be turned back into names for the submission.
le = LabelEncoder().fit(train[target])
train[target_encoded] = le.transform(train[target])

In [None]:
# Heatmap of the pairwise correlations between the feature columns.
plt.figure(tight_layout=True, figsize=(16, 4))
sns.heatmap(data=train[features].corr())

In [None]:
# Row labels grouped by gcd value. Position i of each list holds the rows
# whose gcd equals the i-th smallest gcd value found in train.
# NOTE(review): the groups are driven by train's gcd values only, so test rows
# with a gcd value that never occurs in train land in no group.
train_idx = []
test_idx = []
for gcd_val in np.sort(train[gcd_col].unique()):
    # Mask the index directly instead of copying the filtered frame just to
    # read its index.
    train_idx.append(train.index[(train[gcd_col] == gcd_val).to_numpy()])
    test_idx.append(test.index[(test[gcd_col] == gcd_val).to_numpy()])

In [None]:
# Sorted distinct gcd values, then (value, shape of the matching train rows)
# for the first four of them, flattened into a single tuple for display.
gcd_val_l = np.sort(train[gcd_col].unique())
tuple(
    item
    for i in range(4)
    for item in (gcd_val_l[i], train[train[gcd_col] == gcd_val_l[i]].shape)
)

In [None]:
# Same summary for the test set: (gcd value, shape of the matching test rows)
# for the first four sorted gcd values, flattened into one tuple.
tuple(
    item
    for i in range(4)
    for item in (gcd_val_l[i], test[test[gcd_col] == gcd_val_l[i]].shape)
)

In [None]:
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in display() only adds a stray "None" to the output.
train.info()
display(train.head())

In [None]:
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in display() only adds a stray "None" to the output.
test.info()
display(test.head())

# ExtraTreesClassifier

In [None]:
# Output columns: oof_0 .. oof_{N_CLASSES-1} hold out-of-fold class
# probabilities for train and `oof` the resulting class; pred_* / `pred` are
# the fold-averaged counterparts for test.
oof_col = 'oof'
pred_col = 'pred'
oof_cols = [f'{oof_col}_{i}' for i in range(N_CLASSES)]
pred_cols = [f'{pred_col}_{i}' for i in range(N_CLASSES)]

# Probability columns start out as floats so that writing predict_proba()
# output into them never relies on implicit int -> float upcasting; the
# class-label columns stay integer.
train[oof_cols] = 0.0
train[oof_col] = 0
test[pred_cols] = 0.0
test[pred_col] = 0
gcd_val_l = np.sort(train[gcd_col].unique())

skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)


def _oof_accuracy(frame):
    """Accuracy of the argmax over `oof_cols` against the encoded target.

    Rows are weighted by `weight_col` when WEIGHT is set, unweighted otherwise.
    """
    return accuracy_score(
        frame[target_encoded],
        np.argmax(frame[oof_cols].to_numpy(), axis=1),
        sample_weight=frame[weight_col] if WEIGHT else None,
    )


# A separate cross-validated model is trained for every gcd group.
# NOTE(review): predict_proba() is assumed to return N_CLASSES columns, i.e.
# every class occurs in every training fold of every gcd group - confirm.
for n_gcd, gcd_val in enumerate(tqdm(gcd_val_l, total=gcd_val_l.shape[0])):
    pred = []
    # Work on positional (0..n-1) copies so the fold indices from skf.split()
    # can be used as row labels as well as positions.
    train_df = train.loc[train_idx[n_gcd]].reset_index(drop=True).copy()
    test_df = test.loc[test_idx[n_gcd]].reset_index(drop=True).copy()
    X_test = test_df[features]  # identical for every fold of this group

    for fold, (trn_idx, val_idx) in enumerate(skf.split(X=train_df[features], y=train_df[target_encoded])):
        X_train = train_df[features].iloc[trn_idx]
        y_train = train_df[target_encoded].iloc[trn_idx]
        X_valid = train_df[features].iloc[val_idx]
        # sample_weight=None is the estimator's default, i.e. unweighted.
        W_train = train_df[weight_col].iloc[trn_idx] if WEIGHT else None

        start = time.time()
        clf = ExtraTreesClassifier(
            n_estimators=N_ESTIMATORS,
            n_jobs=-1,
            random_state=SEED,
            verbose=VERBOSE,
        )
        clf.fit(X_train, y_train, sample_weight=W_train)

        # Out-of-fold probabilities for the validation rows, plus this fold's
        # prediction for the whole test group.
        train_df.loc[val_idx, oof_cols] = clf.predict_proba(X_valid)
        pred.append(clf.predict_proba(X_test))

        elapsed = time.time() - start
        acc = _oof_accuracy(train_df.iloc[val_idx])
        print(f"gcd{gcd_val}, fold{fold}, ACCURACY: {acc:6f}, elapsed time: {elapsed:.2f}sec")

    # Average the test probabilities over the folds, then pick the top class.
    test_df[pred_cols] = np.mean(pred, axis=0)
    train_df[oof_col] = np.argmax(train_df[oof_cols].to_numpy(), axis=1)
    test_df[pred_col] = np.argmax(test_df[pred_cols].to_numpy(), axis=1)

    # Restore the original row labels so the results align when they are
    # written back into the full frames.
    train_df.index = train_idx[n_gcd]
    train.loc[train_idx[n_gcd], oof_cols] = train_df[oof_cols]
    train.loc[train_idx[n_gcd], oof_col] = train_df[oof_col]

    test_df.index = test_idx[n_gcd]
    test.loc[test_idx[n_gcd], pred_cols] = test_df[pred_cols]
    test.loc[test_idx[n_gcd], pred_col] = test_df[pred_col]

    acc = _oof_accuracy(train_df)
    print(f"gcd{gcd_val}, ACCURACY: {acc:6f}\n")

# Overall out-of-fold accuracy across all gcd groups.
acc = _oof_accuracy(train)
print(f"ACCURACY: {acc:6f}")

In [None]:
# Persist the frames, now carrying the out-of-fold / test prediction columns.
for frame, filename in ((train, "train_oof.csv"), (test, "test_pred.csv")):
    frame.to_csv(filename)

# Confusion matrix

In [None]:
# Confusion matrix of the encoded target against the out-of-fold class,
# drawn as an annotated heatmap.
plt.figure(figsize=(16, 4))
cm = confusion_matrix(y_true=train[target_encoded], y_pred=train[oof_col])
sns.heatmap(data=cm, annot=True, cmap='Blues_r')

# Classification report

In [None]:
# Per-class precision / recall / F1 of the out-of-fold predictions. The
# target column is addressed through `target_encoded`, as in every other cell,
# rather than through a repeated string literal.
cr = classification_report(train[target_encoded], train[oof_col], digits=6)
print(cr)

# Submission

In [None]:
# Turn the predicted integer codes back into the original target labels with
# the encoder that was fitted on train.
# NOTE(review): presumably inverse_transform returns a plain array, making this
# assignment positional; that assumes sample_submission rows are in the same
# order as test - confirm.
submission[target] = le.inverse_transform(test[pred_col])
submission.to_csv("submission.csv", index=False)
# Last expression of the cell: shows the submission frame in the notebook.
submission