In [None]:
# Intel® Extension for Scikit-learn installation:
!pip install scikit-learn-intelex

import pandas as pd
import numpy as np
from math import factorial
from numpy import mean, std, asarray, vstack, hstack

import random
import time
import os
from pathlib import Path

from tqdm.notebook import tqdm

from sklearn.model_selection import StratifiedKFold, RepeatedStratifiedKFold, KFold, train_test_split, cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import ExtraTreesClassifier
from scipy.stats import mode
import lightgbm as lgb

from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression

import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.simplefilter('ignore')

## **Memory Optimization - Credit to @munumbutt**
 https://www.kaggle.com/munumbutt/extratrees-stratifiedkfold-memory-optimization

In [None]:

def reduce_mem_usage(df):
    """Downcast the columns of ``df`` in place to reduce its memory footprint.

    * Signed-integer columns are cast to the narrowest of int8/int16/int32
      whose range contains the column's min and max (bounds inclusive).
    * Float columns are cast to float16 or float32 when their min/max fit,
      otherwise to float64. Narrower floats carry less precision.
    * Object columns are converted to ``category``.
    * Every other dtype (bool, unsigned, datetime, categorical, nullable
      extension types, ...) is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to optimise; its columns are reassigned in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, so the function can be used with ``.pipe``.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == object:
            df[col] = df[col].astype('category')
            continue
        # Only plain NumPy signed-int / float columns have a meaningful
        # min/max-based downcast; anything else is skipped rather than being
        # pushed through the float branch.
        if not isinstance(col_type, np.dtype) or col_type.kind not in ('i', 'f'):
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if col_type.kind == 'i':
            # No match (e.g. values need int64, or the column is empty and
            # min/max are NaN) leaves the column as it is.
            for candidate in (np.int8, np.int16, np.int32):
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in (np.float16, np.float32):
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    # Guard the percentage against a frame that reports zero memory.
    decrease = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('Decreased by {:.1f}%'.format(decrease))

    return df

## **Removing duplicated rows and adapting the sample weights - Credit to @ambrosm**
https://www.kaggle.com/ambrosm/tpsfeb22-02-postprocessing-against-the-mutants#Deduplicating-the-training-data

In [None]:
# All competition files live in one Kaggle input directory; keep the path in
# a single place instead of repeating it for every file.
DATA_DIR = Path('../input/tabular-playground-series-feb-2022')

# reduce_mem_usage is currently not applied to these frames; chain
# `.pipe(reduce_mem_usage)` onto a read_csv call to enable it.
print(f"{'*'*10} Loading Training Data... {'*'*10}")
train_df = pd.read_csv(DATA_DIR / 'train.csv', index_col="row_id")
print(f"{'*'*10} Loading Testing Data... {'*'*10}")
test_df = pd.read_csv(DATA_DIR / 'test.csv', index_col="row_id")
submission_df = pd.read_csv(DATA_DIR / 'sample_submission.csv')

In [None]:
# Count the fully duplicated rows in the training data
f"There are {train_df.duplicated().sum()} duplicated rows"

In [None]:
# Create a new df w/o duplicates, but with an additional sample_weight column.
# value_counts() yields one entry per distinct row (indexed by all columns)
# together with how often that row occurs.
value_count = train_df.value_counts()
# reset_index turns the row-valued index back into ordinary columns and names
# the counts column, without rebuilding every row as a Python list.
dedup_train_df = value_count.reset_index(name="sample_weight")

In [None]:
# Sanity check: dimensions of the deduplicated frame, followed by its
# (truncated) rich display as the cell output.
print(dedup_train_df.shape)
dedup_train_df

# **Define Feature Columns**

### **Adding Bias feature - Credit to Ambrosm**
 don't hesitate to upvote original content https://www.kaggle.com/ambrosm/tpsfeb22-01-eda-which-makes-sense

In [None]:
# Feature columns: every column of the deduplicated frame except the row id,
# the label columns and the deduplication weight.
features = [
    col
    for col in dedup_train_df.columns
    if col not in ('row_id', 'target', 'target_num', 'sample_weight')
]

In [None]:
def bias(w, x, y, z):
    """Return ``10! / (w! * x! * y! * z! * 4**10)``.

    When ``w + x + y + z == 10`` this is the multinomial probability of
    drawing w, x, y and z copies of four equally likely symbols in 10 draws.
    """
    return factorial(10) / (factorial(w) * factorial(x) * factorial(y) * factorial(z) * 4**10)

def bias_of(s):
    """Return :func:`bias` for a column name shaped like ``A<w>T<x>G<y>C<z>``.

    The first character of ``s`` is skipped and the four integer counts are
    read from between the letters ``T``, ``G`` and ``C``.

    Raises
    ------
    ValueError
        If ``s`` does not contain the expected letters and integer counts
        (for example a non-spectrum column such as ``'gcd'``).
    """
    try:
        t_pos, g_pos, c_pos = s.index('T'), s.index('G'), s.index('C')
        counts = (
            int(s[1:t_pos]),
            int(s[t_pos + 1:g_pos]),
            int(s[g_pos + 1:c_pos]),
            int(s[c_pos + 1:]),
        )
    except ValueError:
        raise ValueError(
            f"column name {s!r} is not of the form A<w>T<x>G<y>C<z>"
        ) from None
    # Single source of truth for the formula.
    return bias(*counts)
    

In [None]:
# Integer-scaled copies of the feature columns: add each column's bias,
# multiply by one million, round and cast to int.
train_i = pd.DataFrame({
    col: dedup_train_df[col].add(bias_of(col)).mul(1000000).round().astype(int)
    for col in features
})
test_i = pd.DataFrame({
    col: test_df[col].add(bias_of(col)).mul(1000000).round().astype(int)
    for col in features
})

### Adding GCD feature
https://www.kaggle.com/ambrosm/tpsfeb22-01-eda-which-makes-sense

In [None]:
def gcd_of_all(df_i, columns=None):
    """Return the row-wise greatest common divisor across integer columns.

    Parameters
    ----------
    df_i : pandas.DataFrame
        Frame of integer-valued columns.
    columns : sequence of str, optional
        Columns to fold into the GCD. Defaults to every column of ``df_i``,
        so the result does not depend on the notebook-global ``features``
        list (which later gains a ``'gcd'`` entry that ``df_i`` lacks).

    Returns
    -------
    pandas.Series
        One GCD per row, indexed like ``df_i``.

    Raises
    ------
    ValueError
        If there is no column to combine.
    """
    columns = list(df_i.columns if columns is None else columns)
    if not columns:
        raise ValueError("gcd_of_all needs at least one column")
    gcd = df_i[columns[0]]
    for col in columns[1:]:
        gcd = np.gcd(gcd, df_i[col])
    return gcd

# Attach the row-wise GCD of the integer-scaled features as a new 'gcd'
# column. Note that this adds the column to both frames in place.
dedup_train_df['gcd'] = gcd_of_all(train_i)
test_df['gcd'] = gcd_of_all(test_i)
# Cell output: the distinct GCD values and their counts, for train and test.
np.unique(dedup_train_df['gcd'], return_counts=True), np.unique(test_df['gcd'], return_counts=True)


In [None]:
# Preview the first two rows, now including the 'gcd' column added above.
dedup_train_df.head(2)

In [None]:
# Report the dimensions of the training and test frames.
print(f"dedup_train_df shape : {dedup_train_df.shape}")
print(f"test_df shape : {test_df.shape}")

In [None]:
# Number of deduplicated training rows for every (gcd, target) combination.
dedup_train_df.groupby(["gcd","target"]).size()

#### **The distribution appears to be similar**

# **Model**

## **Encoding**

In [None]:
# The label column is the first training column that is absent from the test frame.
target = train_df.columns.difference(test_df.columns)[0]
# Register the engineered feature exactly once, so that re-running this cell
# does not append a second 'gcd' entry (which would duplicate the column in X).
if "gcd" not in features:
    features.append("gcd")
le = LabelEncoder()
X = dedup_train_df[features]
# fit_transform returns the labels encoded as a NumPy integer array.
y = le.fit_transform(
    dedup_train_df[target])
# Occurrence count of each deduplicated row (see the deduplication step).
sample_weight = dedup_train_df["sample_weight"]


## **SUPER LEARNER ENSEMBLE** - Credit to @remekkinas
https://www.kaggle.com/remekkinas/super-learner-ensemble-extree-tuned-lda-umap#STARTING-MODEL

In [None]:
def get_models():
    """Build the (unfitted) base learners of the super-learner ensemble.

    The ExtraTrees hyper-parameters are read from the notebook-level
    constants (``N_ESTIMATORS``, ``MAX_DEPTH``, ...) when the function is
    called, so the parameters cell must have been run first.

    Returns
    -------
    dict
        Mapping of display name -> estimator.
    """
    models = {}

    models['ExtraTreesClassifier'] = ExtraTreesClassifier(
        n_estimators=N_ESTIMATORS,
        n_jobs=-1,
        random_state=SEED,
        verbose=VERBOSE,
        max_depth = MAX_DEPTH,
        min_samples_split = MIN_SAMPLES_SPLIT,
        min_samples_leaf  = MIN_SAMPLES_LEAF,
        criterion = CRITERION
    )

    # Seed and parallelise the random forest like the ExtraTrees model, so
    # its results do not depend on the global NumPy RNG state (cell order).
    models['RandomForestClassifier'] = RandomForestClassifier(
        n_estimators=200,
        n_jobs=-1,
        random_state=SEED,
    )
    return models

In [None]:
def get_out_of_fold_predictions(X, y, models, sample_weight=None):
    """Collect out-of-fold class probabilities from every base model.

    The data is split with a shuffled, seeded ``KFold`` (``N_SPLITS`` folds,
    ``random_state=SEED``). For each fold every model is fitted on the
    training part and its ``predict_proba`` output on the held-out part is
    recorded; the fold accuracy of each model is printed.

    Parameters
    ----------
    X, y : array-like
        Features and labels. They are converted with ``asarray`` so both
        NumPy arrays and DataFrames/Series can be indexed positionally.
    models : dict
        Mapping name -> estimator; the estimators are (re)fitted in place.
    sample_weight : array-like, optional
        Per-row weights. When given, the training slice is passed to
        ``fit`` and the held-out slice to ``accuracy_score``.

    Returns
    -------
    (ndarray, ndarray)
        The horizontally stacked probabilities of all models for all
        held-out rows, and the matching labels in the same row order.
    """
    X = asarray(X)
    y = asarray(y)
    if sample_weight is not None:
        sample_weight = asarray(sample_weight)

    meta_X, meta_y = list(), list()
    # Explicit random_state: the folds no longer depend on global RNG state.
    kfold = KFold(n_splits = N_SPLITS, shuffle = True, random_state = SEED)
    print(" ** START OFF PREDICTIONS for base models ** ")
    for fold_id, (train_ix, test_ix) in enumerate(kfold.split(X)) :
        print(f" FOLD : {fold_id+1}")
        fold_yhats = list()
        train_X, test_X = X[train_ix], X[test_ix]
        train_y, test_y = y[train_ix], y[test_ix]
        meta_y.extend(test_y)

        fit_kwargs, score_kwargs = {}, {}
        if sample_weight is not None:
            fit_kwargs['sample_weight'] = sample_weight[train_ix]
            score_kwargs['sample_weight'] = sample_weight[test_ix]

        for name, model in models.items():
            # Model Training
            model.fit(train_X, train_y, **fit_kwargs)
            yhat = model.predict_proba(test_X)    # probabilities for each class
            fold_yhats.append(yhat)

            # Model Validation
            yhat_eval = model.predict(test_X)

            # Score
            score = accuracy_score(test_y, yhat_eval, **score_kwargs)
            print( '\t %s: %.5f' %(name, score))

        meta_X.append(hstack(fold_yhats))    # one column block per model
    return vstack(meta_X), asarray(meta_y)

def fit_base_models(X, y, models, sample_weight=None) :
    """Fit every base model on the complete ``(X, y)`` set, in place.

    Parameters
    ----------
    X, y :
        Training features and labels, handed to each model's ``fit``.
    models : dict
        Mapping name -> estimator exposing ``fit``.
    sample_weight : optional
        Per-row weights. When given they are forwarded as
        ``fit(X, y, sample_weight=...)``; when omitted ``fit(X, y)`` is
        called exactly as before.
    """
    fit_kwargs = {} if sample_weight is None else {'sample_weight': sample_weight}
    print(" ** START - Fitting base models ** ")
    for name, model in models.items():
        print(f" Fitting model -{name}")
        model.fit(X, y, **fit_kwargs)
    print(" ** FINISH - Fitting base models ** ")

def fit_meta_model(X, y):
    """Train the meta-learner on the stacked base-model outputs.

    A ``LogisticRegression`` with the ``liblinear`` solver is fitted on
    ``(X, y)`` and returned; start/finish banners are printed around the fit.
    """
    print(" ** START - Fitting meta models ** ")
    meta_learner = LogisticRegression(solver = 'liblinear')
    meta_learner.fit(X, y)
    print(" ** FINISH - Fitting meta models ** ")
    return meta_learner

def evaluate_models(X, y, models):
    """Print the accuracy of each fitted base model on ``(X, y)``.

    Every output line is labelled with the estimator's class name (not with
    its key in ``models``).
    """
    for estimator in models.values():
        accuracy = accuracy_score(y, estimator.predict(X))
        print('%s: %.5f' % (estimator.__class__.__name__, accuracy))
        
def _stack_base_probabilities(X, models):
    """Return the base models' ``predict_proba`` outputs stacked column-wise.

    Columns follow the iteration order of ``models``; this is the
    meta-feature layout the meta-model receives.
    """
    if not models:
        raise ValueError("models must contain at least one fitted base model")
    return hstack([model.predict_proba(X) for model in models.values()])

def super_learner_predictions(X, models, meta_model):
    """Predict class labels for ``X`` with the stacked ensemble.

    The base models' class probabilities are stacked column-wise and passed
    to ``meta_model.predict``.
    """
    return meta_model.predict(_stack_base_probabilities(X, models))

def super_learner_predictions_proba(X, models, meta_model):
    """Predict class probabilities for ``X`` with the stacked ensemble.

    Same meta-features as :func:`super_learner_predictions`, passed to
    ``meta_model.predict_proba``.
    """
    return meta_model.predict_proba(_stack_base_probabilities(X, models))

## **Parameters**

### Credits to @cv13j0 for Optuna Optimization, don't hesitate to upvote original work : https://www.kaggle.com/cv13j0/extra-trees-classification-optuna-optimization

In [None]:
# --- Run configuration ------------------------------------------------------
KAGGLE = True        # switches between the light and the heavy forest settings
SEED = 42
N_SPLITS = 10 # 10
N_CLASSES = 10

# Forest size/depth: small values by default, larger ones when KAGGLE is set.
N_ESTIMATORS = 1000 if KAGGLE else 100 # 2370
MAX_DEPTH = 3691 if KAGGLE else 1500

MIN_SAMPLES_SPLIT = 3
MIN_SAMPLES_LEAF = 1
CRITERION  = 'gini'
VERBOSE = 0

def seed_everything(seed=42):
    """Seed Python's ``random``, NumPy's global RNG and ``PYTHONHASHSEED``."""
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)

seed_everything(SEED)

In [None]:
# Show the label column and the first ten feature names.
print(f'target : {target}')
print(f'features : {features[:10]}..')

In [None]:
# Hold out 10% of the deduplicated rows for validating the ensemble.
# random_state is passed explicitly so the split is the same regardless of
# how much of the global NumPy RNG earlier cells have consumed.
X_train, X_val, y_train, y_val = train_test_split(X.values, y, test_size=0.10, random_state=SEED)

In [None]:
# Instantiate fresh, unfitted base learners for the validation run.
models = get_models()

In [None]:
# Out-of-fold base-model probabilities on the training split; these become
# the training set of the meta-model.
meta_X, meta_y = get_out_of_fold_predictions(X_train, y_train, models)

In [None]:
# Refit every base model on the whole training split.
fit_base_models(X_train, y_train, models)

In [None]:
# Fit the meta-model on the out-of-fold probabilities collected above.
meta_model = fit_meta_model(meta_X, meta_y)

In [None]:
# Accuracy of each individual base model on the hold-out split.
evaluate_models(X_val, y_val, models)

In [None]:
# Debugging aid: display the class of the fitted meta-model.
type(meta_model)

In [None]:
# Accuracy of the full stacked ensemble on the hold-out split.
y_hat = super_learner_predictions(X_val, models, meta_model)
score = accuracy_score(y_val, y_hat)
print(f"Super Learner : {score:.5f}")

## Training on full data for final prediction

In [None]:
# Rebuild the whole stack on all deduplicated rows (no hold-out this time).
final_models = get_models()
# 1) out-of-fold base-model probabilities -> training set of the meta-model
meta_X_full, meta_y_full = get_out_of_fold_predictions(X.values, y, final_models)
# 2) refit every base model on all rows
fit_base_models(X.values, y, final_models)
# 3) fit the meta-model on the out-of-fold probabilities
meta_model_full = fit_meta_model(meta_X_full, meta_y_full)

In [None]:
# Encoded class predictions for the test set. The final models were fitted on
# plain arrays (X.values), so the test features are passed as an array too.
preds = super_learner_predictions(test_df[features].values, final_models, meta_model_full)

In [None]:
# Class probabilities for the test set (input to the post-processing below).
# Passed as an array to match how the final models were fitted (X.values).
# Note: this recomputes the base-model probabilities a second time.
y_prob = super_learner_predictions_proba(test_df[features].values, final_models, meta_model_full)

# **Post Processing**

In [None]:
# "Optimization" code from https://www.kaggle.com/sfktrkl/tps-feb-2022
#
# Nudge per-class probability offsets (`tune`) until the predicted class
# shares on the test set are within 0.1 percentage points of the class
# shares observed in the training set.

# Training share (in %) of every class, ordered like the encoded labels:
# position k <-> le.classes_[k] <-> column k of y_prob.
target_distribution = (
    train_df['target'].value_counts().reindex(le.classes_, fill_value=0) / len(train_df) * 100
)

def get_diff(tune):
    """Return (train share - predicted test share) in % for every class.

    The result is a positional NumPy array. Comparing by position avoids the
    label-vs-encoded-integer index mismatch that turns a Series subtraction
    into all-NaN, and ``bincount`` keeps never-predicted classes as 0.
    """
    y_pred_tuned = np.argmax(y_prob + tune, axis=1)
    pred_share = np.bincount(y_pred_tuned, minlength=y_prob.shape[1]) / len(test_df) * 100
    return target_distribution.to_numpy() - pred_share

tune = np.zeros(y_prob.shape[1])
diff = get_diff(tune)
# Safety cap: the greedy search below can oscillate and would otherwise
# never terminate.
max_steps = 20000
steps = 0
while np.abs(diff).max() > 0.1 and steps < max_steps:
    # Adjust the first class that is off by more than the tolerance.
    for i in range(len(diff)):
        if diff[i] > 0.1:
            tune[i] += 0.001
            break
        if diff[i] < -0.1:
            tune[i] -= 0.001
            break
    diff = get_diff(tune)
    steps += 1
if np.abs(diff).max() > 0.1:
    print(f"Tuning stopped after {steps} steps without reaching the 0.1 tolerance")

# Credits to https://www.kaggle.com/ambrosm/tpsfeb22-02-postprocessing-against-the-mutants
# The offsets are applied to y_prob in place.
y_prob += tune
y_pred_tuned = le.inverse_transform(np.argmax(y_prob, axis=1))
# Cell output: share (in %) of each predicted label over the test rows.
pd.Series(y_pred_tuned, index=test_df.index).value_counts().sort_index() / len(test_df) * 100

# **Submission**

In [None]:
# Load the sample submission to use as the template for the output files.
sub = pd.read_csv("../input/tabular-playground-series-feb-2022/sample_submission.csv")

In [None]:
# Submission 1: predictions after the class-distribution tuning.
# Bracket assignment is used because attribute assignment (`sub.target = ...`)
# would not create the column if it were missing from the template.
sub["target"] = y_pred_tuned
sub.to_csv("sl-tuned-submission.csv", index=False)

# Submission 2: the untuned super-learner predictions, decoded to labels.
sub["target"] = le.inverse_transform(preds)
sub.to_csv("sl-base-submission.csv", index=False)
# Cell output: preview of the frame as last written (the base submission).
sub.head(10)

 --------------------------------------------------

# **Post Processing**

y_preds_list, y_proba_list, scores = [],[],[]
skf = StratifiedKFold(n_splits = N_SPLITS, shuffle = True, random_state = SEED)

# The index arrays are named *_idx so they do not overwrite the `train_i`
# frame built in the feature-engineering section.
for fold, (train_idx, val_idx) in enumerate(tqdm(skf.split(X, y), total = N_SPLITS)) :
    # X and sample_weight are pandas objects (positional .iloc); y is the
    # NumPy array returned by LabelEncoder.fit_transform, so it is indexed
    # directly.
    X_train = X.iloc[train_idx]
    y_train = y[train_idx]
    sample_weight_train = sample_weight.iloc[train_idx]

    X_valid = X.iloc[val_idx]
    y_valid = y[val_idx]
    sample_weight_valid = sample_weight.iloc[val_idx]

    start = time.time()

    # Model tuning
    clf =  ExtraTreesClassifier(
        n_estimators=N_ESTIMATORS,
        n_jobs=-1,
        random_state=SEED,
        verbose=VERBOSE,
        max_depth = MAX_DEPTH,
        min_samples_split = MIN_SAMPLES_SPLIT,
        min_samples_leaf  = MIN_SAMPLES_LEAF,
        criterion = CRITERION
    )

    # Model Training
    clf.fit(X_train, y_train, sample_weight_train)

    # Model Validation (accuracy weighted by the duplicate counts)
    valid_pred = clf.predict(X_valid)
    valid_score = accuracy_score(y_valid, valid_pred,
                                 sample_weight = sample_weight_valid)
    scores.append(valid_score)

    # Prediction for submission: select the same columns, in the same order,
    # as the training matrix X.
    y_preds_list.append(clf.predict(test_df[features]))
    y_proba_list.append(clf.predict_proba(test_df[features]))
    elapsed = time.time() - start
    print(f'Accuracy score: {valid_score:5f}, elapsed time: {elapsed:.2f}sec \n')

score = np.array(scores).mean()
print(f'Mean accuracy score: {score:6f}')

# Majority vote across the fold models. This has to run after the loop that
# fills y_preds_list. np.ravel yields a 1-D array whether SciPy returns the
# mode with or without the reduced axis kept.
y_pred = np.ravel(mode(np.asarray(y_preds_list), axis=0).mode)
y_pred = le.inverse_transform(y_pred)
        

# Compare the class balance of the training labels with the class balance of
# the majority-vote test predictions (absolute counts and shares in %).
target_distrib = pd.DataFrame({
    'count': train_df['target'].value_counts(),
    'share': train_df[target].value_counts().div(train_df.shape[0]).mul(100),
})

target_distrib['pred_count'] = pd.Series(y_pred, index=test_df.index).value_counts()
target_distrib['pred_share'] = target_distrib['pred_count'].div(len(test_df)).mul(100)
target_distrib.sort_index()

# Average the class probabilities over all fold models.
y_proba = sum(y_proba_list) / len(y_proba_list)
# Add fixed offsets to the probabilities at class positions 2 and 3 before
# taking the argmax.
# NOTE(review): 0.01 / 0.03 are hand-set magic numbers, and the length of 10
# presumably matches N_CLASSES; confirm.
y_proba += np.array([0, 0, 0.01, 0.03, 0, 0, 0, 0, 0, 0])
# Map the highest-probability class index back to its original label.
y_pred_tuned = le.inverse_transform(np.argmax(y_proba, axis=1))
# Share (in %) of each predicted label over the test rows.
pd.Series(y_pred_tuned, index=test_df.index).value_counts().sort_index() / len(test_df) * 100

# **Submission**

# Write the tuned predictions into the sample-submission frame, save it as
# CSV without the index, and display the frame as the cell output.
submission_df["target"] = y_pred_tuned
submission_df.to_csv("submission.csv", index=False)
submission_df