# Introduction
Hey, thanks for viewing my Kernel!

If you like my work, please leave an upvote: it is really appreciated and motivates me to share more content with the Kaggle community! 😊

In [None]:
import pandas as pd
import numpy as np
import warnings

# Silence every warning for the rest of the notebook.
warnings.simplefilter("ignore")

# Competition files: labelled training rows, unlabelled test rows and the
# submission template.
train = pd.read_csv('../input/tabular-playground-series-feb-2022/train.csv')
test = pd.read_csv('../input/tabular-playground-series-feb-2022/test.csv')
sub = pd.read_csv('../input/tabular-playground-series-feb-2022/sample_submission.csv')

# Preview the training frame and the submission template, in that order.
display(train.head(), sub.head())

In [None]:
# (rows, columns) of the training frame followed by the test frame.
display(train.shape, test.shape)

In [None]:
# Columns that identify a row and must never be used as model features.
ignore_cols = ['row_id']
ingore_cols = ignore_cols  # legacy (misspelt) name kept so existing references keep working

# Numeric feature columns, in frame order, without the identifier columns.
numeric_cols = [
    col
    for col in train.select_dtypes(include=np.number).columns
    if col not in ignore_cols
]

# Every non-numeric column. select_dtypes keeps the frame's column order,
# whereas a set difference would return the names in arbitrary order.
object_cols = train.select_dtypes(exclude=np.number).columns.tolist()

print('numeric cols len: ', len(numeric_cols))
print('object col: ', object_cols)
print('ignore col: ', ingore_cols)

# Data Cleaning

In [None]:
# Count duplicated rows while ignoring the `row_id` identifier: when that id is
# unique per row, comparing it as well makes every row unique and the check
# always reports 0.
train_compare_cols = [col for col in train.columns if col != 'row_id']
test_compare_cols = [col for col in test.columns if col != 'row_id']

duplicates_train = train.duplicated(subset=train_compare_cols).sum()
duplicates_test = test.duplicated(subset=test_compare_cols).sum()
display(duplicates_train)
display(duplicates_test)

In [None]:
def reduce_mem_usage(df, verbose=True):
    """Downcast the numeric columns of ``df`` to save memory.

    Signed-integer columns are cast to the smallest of int8/int16/int32/int64
    whose range contains the column's min and max (bounds inclusive). Float
    columns are cast to float32 when their finite range fits, otherwise to
    float64. Columns of any other dtype are left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.
    verbose : bool, default True
        When true, print the final memory footprint and the relative saving.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, returned for convenient reassignment.
    """
    numerics = ['int8', 'int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    start_mem = df.memory_usage().sum() / 1024**2

    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if str(col_type).startswith('int'):
            # Pick the narrowest integer type that can hold the column.
            # Bounds are inclusive so that values sitting exactly on a type
            # limit (e.g. -128 or 127) still use the narrow type.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            limits = np.finfo(np.float32)
            # NaN/inf extremes fail both comparisons and fall back to float64.
            if limits.min <= c_min and c_max <= limits.max:
                df[col] = df[col].astype(np.float32)
            else:
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2

    if verbose:
        # Guard the ratio so a zero-byte frame cannot divide by zero.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))

    return df

In [None]:
# Shrink both frames (train first, then test); each call prints its own report.
train, test = reduce_mem_usage(train), reduce_mem_usage(test)

# Pseudo Labeling

In [None]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the class label: one indicator column per class, named after
# the class. The encoder is fitted first (positional argument), so its
# `categories_` attribute is available when the column names are read.
enc = OneHotEncoder(handle_unknown='ignore', sparse=False)
targets = pd.DataFrame(
    enc.fit_transform(train[['target']]),
    columns=list(enc.categories_[0]),
)
targets.head()

In [None]:
def get_th(auc):
    """Derive a threshold from the trailing digits of ``auc``.

    The last two characters of the fractional part of ``str(auc)`` are placed
    after ``"0."``; e.g. ``0.98765 -> 0.65`` and ``0.95 -> 0.95``. When the
    fractional part has a single digit, that digit alone is used
    (``0.9 -> 0.9``, ``1.0 -> 0.0``) instead of building the invalid literal
    ``"0..9"`` and raising ``ValueError``.

    NOTE(review): a result of 0.0 for a score of 1.0 selects everything when
    used as a ">=" cut-off -- confirm this is acceptable before enabling it.

    Parameters
    ----------
    auc : float
        Score whose textual representation is inspected.

    Returns
    -------
    float
        Threshold in ``[0, 1)`` built from the trailing digits.
    """
    # Only look at what follows the decimal point so the point itself can never
    # end up inside the two-character slice.
    fractional_digits = str(auc).split('.')[-1]
    return float('0.' + fractional_digits[-2:])

In [None]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import RandomForestClassifier

# One-vs-rest pseudo-labelling. For every class a random forest is
# cross-validated on the training rows; its fold-averaged probability on the
# test rows decides which test rows are confident enough to be pseudo-labelled.
selected_indexs = []  # one list of test index labels per class, in targets.columns order
th = 0.95             # minimum averaged probability for a test row to be selected
                      # (get_th(round(auc, 5)) was tried as a per-class alternative)

# The feature matrices are the same for every class, so build them once.
X_train = train[numeric_cols]
X_test = test[numeric_cols]

for col in targets.columns:
    y_train = targets[col]  # 0/1 indicator of the current class

    oof = np.zeros(len(X_train))   # out-of-fold probabilities on train
    preds = np.zeros(len(X_test))  # fold-averaged probabilities on test

    skf = StratifiedKFold(n_splits=10, random_state=42, shuffle=True)
    for train_index, test_index in skf.split(X_train, y_train):
        # Seeded so that the selected pseudo labels are reproducible.
        clf = RandomForestClassifier(random_state=42, n_jobs=-1)
        # split() yields positions, hence .iloc (does not depend on the index labels).
        clf.fit(X_train.iloc[train_index], y_train.iloc[train_index])
        oof[test_index] = clf.predict_proba(X_train.iloc[test_index])[:, 1]
        preds += clf.predict_proba(X_test)[:, 1] / skf.n_splits

    auc = roc_auc_score(y_train, oof)
    print(col, 'RF Scores CV =', round(auc, 5), end='')

    # Index labels of the test rows whose probability reaches the threshold.
    selected = test.index[preds >= th]
    selected_indexs.append(list(selected))

    print(' - Pseudo Labels Len =', len(selected))

In [None]:
# Attach the pseudo labels to a copy of the test frame. Rows start with an
# empty label; if a row was selected for several classes, the class that comes
# last in targets.columns wins.
labeled_test = test.copy()
labeled_test['target'] = ''
for col, indexs in zip(targets.columns, selected_indexs):
    labeled_test.loc[indexs, 'target'] = col

# Keep only the rows that actually received a label.
has_label = labeled_test['target'] != ''
labeled_test = labeled_test[has_label]

print(labeled_test.shape)
display(labeled_test.head())

In [None]:
# Extend the training data with the pseudo-labelled test rows. The test rows
# carry their own index labels, which collide with the training ones, so a
# fresh index is built to keep every label unique.
new_train = pd.concat([train, labeled_test], ignore_index=True)
print(new_train.shape)
display(new_train.head())

In [None]:
# Persist the augmented training set twice: as CSV (without the index column)
# and as a pickle.
new_train.to_csv('new_train.csv', index=False)
new_train.to_pickle('new_train.pkl')

# Modeling

In [None]:
%%capture
!pip install pycaret[full]

In [None]:
from pycaret.classification import *

# Identifier columns are excluded from modelling and therefore must not also be
# declared as numeric features.
ignore_cols = ['row_id']
numeric_cols = [
    col
    for col in train.select_dtypes(include=np.number).columns
    if col not in ignore_cols
]

# Configure the PyCaret experiment on the pseudo-label-augmented training set.
clf = setup(data=new_train,
            target='target',
            #normalize = True, #normalisation helps some algorithms
            #normalize_method = 'robust', #resilient to outliers
            #transformation = True, #power-transforms the features (not the target)
            #transformation_method = 'quantile',
            # NOTE(review): without shuffling, the hold-out split is presumably
            # taken from the tail of new_train, i.e. the pseudo-labelled test
            # rows appended last -- confirm that this is intended.
            data_split_shuffle = False,
            create_clusters = True,
            remove_outliers = True,
            #feature_interaction = True,
            numeric_features = numeric_cols,
            ignore_features = ignore_cols,
            session_id = 42,
            use_gpu = False,
            silent = True,
            fold = 10,
            n_jobs = -1)

In [None]:
# Fit the two base learners, identified by 'et' and then 'rf', with PyCaret's
# default settings, and keep them both as a list (for stacking) and under
# individual names.
top = [create_model(estimator_id) for estimator_id in ('et', 'rf')]
model_et, model_rf = top

In [None]:
# Stack the base models collected in `top`, optimising for accuracy.
stack = stack_models(top, optimize='Accuracy')
# Score the stacked model; presumably on PyCaret's hold-out split since no data
# is passed -- TODO confirm. The trailing ';' hides the returned frame.
predict_model(stack);

In [None]:
# Finalize the stacked model; presumably this refits it on all data passed to
# setup(), hold-out included -- TODO confirm against the PyCaret docs.
final_stack = finalize_model(stack)

In [None]:
# Diagnostic plot of the finalized stack: PyCaret's 'error' plot.
plot_model(final_stack, plot='error')

In [None]:
# Diagnostic plot of the finalized stack: confusion matrix.
plot_model(final_stack, plot = 'confusion_matrix')

# Submission

In [None]:
import gc
# Run a garbage-collection pass before scoring the test frame.
gc.collect()
# Predict the test rows with the finalized stack; the predicted class is read
# from the `Label` column further down.
unseen_predictions = predict_model(final_stack, data=test)
unseen_predictions.head()

In [None]:
# Sanity check: exactly one prediction per test row.
assert len(test.index) == len(unseen_predictions)

# Pair the ids of the submission template with the predicted labels
# (row order is assumed to match) and write the submission file.
submission_rows = list(zip(sub['row_id'], unseen_predictions['Label']))
sub = pd.DataFrame(submission_rows, columns=['row_id', 'target'])
sub.to_csv('submission_stack.csv', index=False)
sub.head()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Compare the class distribution of the training labels with the distribution
# of the labels predicted for the test set.
# The origin column is built from explicit counts: a label slice such as
# .loc[0:len(train)] includes its end point and would tag the first test
# prediction as 'Training'.
train_test_preds = pd.DataFrame({
    'label': list(train['target']) + list(unseen_predictions['Label']),
    'train_test': ['Training'] * len(train) + ['Test preds'] * len(unseen_predictions),
})

fig, ax = plt.subplots(figsize=(16, 3))
sns.countplot(data=train_test_preds, x='label', hue='train_test', ax=ax)
ax.set_title('Class counts: training labels vs. test predictions')
plt.xticks(rotation=90)
plt.show()