In [None]:
%%capture

# Intel® Extension for Scikit-learn installation:
!pip install scikit-learn-intelex -q

import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import pandas as pd
import time
import warnings

import pyarrow.csv as pv
import pyarrow.parquet as pq

from scipy.stats import mode
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.ensemble import VotingClassifier, ExtraTreesClassifier
from sklearn.preprocessing import LabelEncoder

from sklearnex import patch_sklearn
patch_sklearn()

# Mute warnings
warnings.filterwarnings("ignore")

In [None]:
# Notebook-wide settings. Only 'TARGET' is read by the visible cells;
# 'N_FOLDS' and 'N_ESTIM' are not referenced in the visible part of the notebook.
cfg = {
    'TARGET': 'target',
    'N_FOLDS': 5,
    'N_ESTIM': 300,
}

# Per-model settings, addressed by the model's position in the `models` dict.
# Positions 0-2 belong to the three ExtraTrees models; positions 3-4 belong to the
# two voting ensembles, which take no tree count of their own (hence `None`).
n_estimators = [200, 300, 400] + [None] * 2
n_splits = [10] * 3 + [5] * 2
random_states = [2021, 42, 0, 42, 42]

In [None]:
# Read the raw competition files, indexed by `row_id`.
# Duplicate training rows are dropped, keeping the first occurrence of each.
train = pd.read_csv(
    "../input/tabular-playground-series-feb-2022/train.csv",
    index_col="row_id",
    low_memory=True,
).drop_duplicates(keep='first')
test = pd.read_csv(
    "../input/tabular-playground-series-feb-2022/test.csv",
    index_col="row_id",
    low_memory=True,
)

# Save the frames as parquet; this only needs to be done once.
# I learned parquet from this notebook: https://www.kaggle.com/wti200/one-vs-rest-approach
train.to_parquet('train_parquet.parquet')
test.to_parquet('test_parquet.parquet')

# Read the parquet data back; the rest of the notebook works on these copies.
train_parquet = pd.read_parquet('train_parquet.parquet')
test_parquet = pd.read_parquet('test_parquet.parquet')

# Every column except the target is used as a model feature.
FEATURES = [col for col in train_parquet.columns if col != cfg['TARGET']]

In [None]:
# Encode the class labels as integers and store them back on `train_parquet`.
#
# The encoder is fitted on the raw `train` frame, whose target column is never
# overwritten. This keeps the cell idempotent: fitting on `train_parquet` would, on a
# second run of this cell, see the already-encoded integers, and `lb.inverse_transform`
# would then no longer return the original labels for the submission files.
# `train` holds the same rows in the same order as `train_parquet` (it is the frame
# that was written to, and read back from, the parquet file).
lb = LabelEncoder()
y = lb.fit_transform(train[cfg['TARGET']])
train_parquet[cfg['TARGET']] = y

In [None]:
# Second half of the training rows, selected by position.
# The midpoint is computed from `train_parquet` itself (not from a different frame),
# and the labels are read from the very same slice as the features, so the two
# cannot drift out of sync.
train_cut = train_parquet.iloc[len(train_parquet) // 2:]
y_cut = train_cut[cfg['TARGET']]

In [None]:
# Three ExtraTrees base learners; position `pos` picks each one's tree count and seed.
models = {
    label: ExtraTreesClassifier(
        n_estimators=n_estimators[pos],
        random_state=random_states[pos],
        n_jobs=-1,
    )
    for pos, label in enumerate(('ex1', 'ex2', 'ex3'))
}

# Snapshot the (name, estimator) pairs now, before the ensembles are added to `models`,
# so that the voting classifiers are built from the three base learners only.
members = list(models.items())

models['hard_voting'] = VotingClassifier(estimators=members, voting='hard')
models['soft_voting'] = VotingClassifier(estimators=members, voting='soft')

In [None]:
def evaluate_model(model, X, y, n_splits, random_state):
    """Cross-validate `model` with shuffled, stratified K-fold and return the fold accuracies.

    Parameters
    ----------
    model : estimator passed to `cross_val_score`.
    X, y : feature matrix and labels to cross-validate on.
    n_splits : number of stratified folds.
    random_state : seed used to shuffle the rows before splitting.

    Returns
    -------
    The per-fold accuracy scores produced by `cross_val_score`.
    """
    splitter = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    return cross_val_score(model, X, y, cv=splitter, scoring='accuracy', n_jobs=-1)

In [None]:
# Cross-validate every model on the full (de-duplicated) training set.
names = list(models)
results = []
for idx, name in enumerate(names):
    # The model's position selects its fold count and CV shuffle seed.
    tick = time.time()
    fold_scores = evaluate_model(
        models[name],
        train_parquet[FEATURES].values,
        y,
        n_splits[idx],
        random_states[idx],
    )
    results.append(fold_scores)
    elapsed = time.time() - tick
    print('>%s %.5f (%.5f) run time %.2f: ' % (name, np.mean(fold_scores), np.std(fold_scores), elapsed))

# One box per model showing the spread of its fold accuracies (means marked).
plt.boxplot(results, showmeans=True, labels=names)
plt.show()

In [None]:
# Fit the hard-voting ensemble on the full training set.
hard_voter = models['hard_voting']
hard_voter.fit(train_parquet[FEATURES].values, y)

In [None]:
# Fit the soft-voting ensemble on the full training set.
soft_voter = models['soft_voting']
soft_voter.fit(train_parquet[FEATURES].values, y)

In [None]:
sub = pd.read_csv("../input/tabular-playground-series-feb-2022/sample_submission.csv")

# Predict encoded classes with the hard-voting ensemble, then decode them with the
# label encoder before writing them into the submission's target column.
hard_pred = models['hard_voting'].predict(test_parquet[FEATURES].values)
sub['target'] = lb.inverse_transform(hard_pred)

sub.to_csv("vc-hard-submission.csv", index=False)
sub.head(10)

In [None]:
# Same submission frame, now filled with the decoded soft-voting predictions.
soft_pred = models['soft_voting'].predict(test_parquet[FEATURES].values)
sub['target'] = lb.inverse_transform(soft_pred)

sub.to_csv("vc-soft-submission.csv", index=False)
sub.head(10)

In [None]:
# Cross-validate every model again, this time on the `train_cut` subset only.
names = list(models)
results = []
for idx, name in enumerate(names):
    # The model's position selects its fold count and CV shuffle seed.
    tick = time.time()
    fold_scores = evaluate_model(
        models[name],
        train_cut[FEATURES].values,
        y_cut,
        n_splits[idx],
        random_states[idx],
    )
    results.append(fold_scores)
    elapsed = time.time() - tick
    print('>%s %.5f (%.5f) run time %.2f: ' % (name, np.mean(fold_scores), np.std(fold_scores), elapsed))

# One box per model showing the spread of its fold accuracies (means marked).
plt.boxplot(results, showmeans=True, labels=names)
plt.show()

In [None]:
# Refit the hard-voting ensemble, this time on the `train_cut` subset.
hard_voter = models['hard_voting']
hard_voter.fit(train_cut[FEATURES].values, y_cut)

In [None]:
# Refit the soft-voting ensemble, this time on the `train_cut` subset.
soft_voter = models['soft_voting']
soft_voter.fit(train_cut[FEATURES].values, y_cut)

In [None]:
# Decode the current hard-voting ensemble's test predictions and write them
# to a separate "cut" submission file.
hard_pred = models['hard_voting'].predict(test_parquet[FEATURES].values)
sub['target'] = lb.inverse_transform(hard_pred)

sub.to_csv("cut_vc-hard-submission.csv", index=False)
sub.head(10)

In [None]:
# Decode the current soft-voting ensemble's test predictions and write them
# to a separate "cut" submission file.
soft_pred = models['soft_voting'].predict(test_parquet[FEATURES].values)
sub['target'] = lb.inverse_transform(soft_pred)

sub.to_csv("cut_vc-soft-submission.csv", index=False)
sub.head(10)