In [None]:
%%capture

# Intel® Extension for Scikit-learn installation:
!pip install scikit-learn-intelex -q

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import time
import warnings

from scipy.stats import mode
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.ensemble import VotingClassifier, ExtraTreesClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
from sklearnex import patch_sklearn
patch_sklearn()

# Mute warnings
warnings.filterwarnings("ignore")
%matplotlib inline

In [None]:
# Notebook-wide settings, collected in one place so they are easy to tune.
cfg = dict(
    TARGET='target',     # name of the label column
    N_FOLDS=5,           # number of cross-validation splits
    RANDOM=42,           # seed handed to the CV splitter
    SCORING='accuracy',  # metric name
)

In [None]:
# Load the competition files; train and test use their 'row_id' column as index.
train = pd.read_csv("../input/tabular-playground-series-feb-2022/train.csv", index_col='row_id')
test = pd.read_csv("../input/tabular-playground-series-feb-2022/test.csv", index_col='row_id')
sub = pd.read_csv("../input/tabular-playground-series-feb-2022/sample_submission.csv")

# Remove duplicated rows (keeping the first occurrence) and renumber the
# remaining rows 0..n-1 so index labels coincide with row positions.
train = train.drop_duplicates(keep='first').reset_index(drop=True)

# Target column as loaded from the file.
y_oryg = train['target']

# Every column except the target is used as a model input.
FEATURES = [name for name in train.columns if name != cfg['TARGET']]

In [None]:
# Encode the class labels as consecutive integers.
# The encoder is fitted on y_oryg (the labels as loaded) instead of on
# train['target'], which this very cell overwrites. That keeps the cell
# idempotent: re-running it no longer re-fits the encoder on already-encoded
# integers, which would turn lb.classes_ into ints and break the later
# comparison of decoded predictions against y_oryg.
lb = LabelEncoder()
y = lb.fit_transform(y_oryg)
train['target'] = y

# One entry in lb.classes_ per distinct label.
num_classes = len(lb.classes_)

In [None]:
# One-vs-One (OvO) ensemble: fit one binary ExtraTrees model for every unordered
# pair of classes and add each model's two class probabilities into per-class
# accumulators. The predicted label is the class with the largest accumulated sum.
counter = 0
test_preds = np.zeros((len(test), num_classes))
# One row per class pair: [first class, second class, mean CV accuracy].
scores = np.zeros((num_classes * (num_classes - 1) // 2, 3))
oof_preds = np.zeros((len(train), num_classes))


for cls in range(num_classes):
    for n_cls in range(cls + 1, num_classes):
        print(f"Model: {cls} vs {n_cls}")
        # Rows that belong to either class of the current pair.
        dfx = train.query("target == @cls or target == @n_cls")
        # shuffle=True is required together with random_state: scikit-learn
        # raises ValueError when a seed is passed while shuffle is False, and
        # unshuffled folds would just be contiguous slices of dfx.
        cv = KFold(n_splits=cfg['N_FOLDS'], shuffle=True, random_state=cfg['RANDOM'])
        fold_scores = []

        for fold, (tr_idx, val_idx) in enumerate(cv.split(dfx, dfx.target)):
            # Index labels of the validation rows. train's index was reset to
            # 0..n-1, so these labels address the matching rows of oof_preds.
            y_valid_idx = dfx.index[val_idx]
            X_train, y_train = dfx[FEATURES].iloc[tr_idx], dfx.iloc[tr_idx].target.values
            X_valid, y_valid = dfx[FEATURES].iloc[val_idx], dfx.iloc[val_idx].target.values

            # Seeded so that repeated runs give the same trees and scores.
            model = ExtraTreesClassifier(random_state=cfg['RANDOM'])
            model.fit(X_train, y_train)

            valid_preds = model.predict(X_valid)
            valid_preds_proba = model.predict_proba(X_valid)
            fold_score = accuracy_score(y_valid, valid_preds)
            fold_scores.append(fold_score)
            print(f"   * fold: {fold} acc: {fold_score}")

            # OOF step 1: rows of this pair get the prediction of the fold in
            # which they were held out. predict_proba columns follow the sorted
            # model.classes_, so column 0 is cls and column 1 is n_cls.
            oof_preds[y_valid_idx, cls] += valid_preds_proba[:, 0]
            oof_preds[y_valid_idx, n_cls] += valid_preds_proba[:, 1]

        # Fresh estimator fitted on all rows of the pair; used for every row
        # the pairwise CV above never saw (other classes and the test set).
        model = ExtraTreesClassifier(random_state=cfg['RANDOM'])
        model.fit(dfx[FEATURES].values, dfx.target.values)

        # OOF step 2: training rows of all other classes were never part of
        # this model's training data, so they are scored by the full-pair model.
        oof_train_idx = ~train.index.isin(dfx.index)
        oof_train = train[FEATURES][oof_train_idx].values

        oof_prediction = model.predict_proba(oof_train)

        oof_preds[oof_train_idx, cls] += oof_prediction[:, 0]
        oof_preds[oof_train_idx, n_cls] += oof_prediction[:, 1]

        # Test-set contribution of this pairwise model (not validated).
        preds = model.predict_proba(test[FEATURES].values)
        test_preds[:, cls] += preds[:, 0]
        test_preds[:, n_cls] += preds[:, 1]

        print(f"  - Avg accuracy for model {cls} vs {n_cls}: {np.mean(fold_scores)} \n")
        scores[counter] = [cls, n_cls, np.mean(fold_scores)]
        counter += 1

# Decode the accumulated scores: columns carry the original class names, so
# idxmax over the columns yields labels directly comparable with y_oryg.
oof_preds_df = pd.DataFrame(oof_preds, index=train.index, columns=list(lb.classes_))
oof_preds_df['oof_pred'] = oof_preds_df.idxmax(axis=1)
oof_score = accuracy_score(y_oryg, oof_preds_df.oof_pred.values)
print(f"OOF accuracy for OvO: {oof_score}")

In [None]:
# Tabulate the per-pair results: one row per pairwise model.
scores_df = pd.DataFrame(data=scores, columns=['model_1', 'model_2', 'scores'])

# Arrange the scores as a matrix (rows: first class of the pair, columns:
# second class of the pair) and draw it as a heatmap.
scm = pd.pivot_table(scores_df, values='scores', index='model_1', columns='model_2')
sns.heatmap(data=scm, cmap='rocket')

In [None]:
# Accumulated test scores with one column per original class name; the final
# prediction for each row is the name of the column holding the largest score.
test_preds_df = (
    pd.DataFrame(data=test_preds, columns=list(lb.classes_), index=test.index)
    .assign(final_pred=lambda frame: frame.idxmax(axis=1))
)

In [None]:
# Fill the submission frame with the final predictions and write it to disk.
# Bracket assignment is used on purpose: attribute-style assignment only
# updates a column when it already exists and would otherwise just set a
# plain Python attribute, leaving the written CSV without the predictions.
sub['target'] = test_preds_df.final_pred.values
sub.to_csv("ovo-submission.csv", index=False)
sub.head(10)

In [None]:
# Share of each predicted class in the submission, in percent of test rows.
predicted_labels = pd.Series(sub['target'].values, index=test.index)
predicted_labels.value_counts().sort_index().div(len(test)).mul(100)