# Introduction

Hey, thanks for viewing my Kernel!

If you like my work, please leave an upvote: it is really appreciated and motivates me to offer more content to the Kaggle community! 😊

EDA is in this [notebook](https://www.kaggle.com/hasanbasriakcay/tps-feb22-eda-ignore-important-cols).

In [None]:
import pandas as pd
import numpy as np
import warnings

# Suppress all warnings from this point on.
warnings.simplefilter("ignore")

# Read the three competition files in one pass: training data, test data and
# the sample submission.
train, test, sub = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/tabular-playground-series-feb-2022/train.csv',
        '../input/tabular-playground-series-feb-2022/test.csv',
        '../input/tabular-playground-series-feb-2022/sample_submission.csv',
    )
)

# Preview the first rows of the training data and of the sample submission.
for preview in (train.head(), sub.head()):
    display(preview)

In [None]:
# `row_id` is presumably unique per row (it is ignored as a feature later on),
# so comparing whole rows would never flag a duplicate. Compare every other
# column instead so repeated samples are actually removed; the first
# occurrence of each is kept.
dedup_cols = [col for col in train.columns if col != 'row_id']
train.drop_duplicates(subset=dedup_cols, keep='first', inplace=True)
train.shape

# Reduce Memory Usage

In [None]:
def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns of *df* to the smallest dtype that holds them.

    Integer columns are converted to the narrowest of int8/int16/int32/int64
    whose range contains the column's minimum and maximum (bounds inclusive).
    Float columns whose values fit in the float32 range are converted to
    float32. A column is never widened, and columns that are not plain
    signed-int/float dtypes are left untouched.

    Note that float32 keeps the range but not the full precision of float64.

    Args:
        df: DataFrame to shrink. It is modified in place.
        verbose: When true, print the resulting memory usage in Mb and the
            percentage saved.

    Returns:
        The same DataFrame object, with its columns downcast.
    """
    numerics = {'int8', 'int16', 'int32', 'int64', 'float16', 'float32', 'float64'}
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    start_mem = df.memory_usage().sum() / 1024**2

    for col in df.columns:
        col_type = df[col].dtypes
        if str(col_type) not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        # Pick the narrowest dtype whose range covers [c_min, c_max]. When the
        # bounds are NaN (empty or all-NaN column) every comparison is false
        # and no target is chosen, so the column is left alone.
        target = None
        if str(col_type).startswith('int'):
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    target = np.dtype(candidate)
                    break
        else:
            limits = np.finfo(np.float32)
            if limits.min <= c_min and c_max <= limits.max:
                target = np.dtype(np.float32)

        # Only ever shrink: skip no-op casts and anything that would widen
        # the column (e.g. float16 -> float32).
        if target is not None and target.itemsize < col_type.itemsize:
            df[col] = df[col].astype(target)

    end_mem = df.memory_usage().sum() / 1024**2

    if verbose:
        # Avoid a meaningless nan percentage when there was nothing to measure.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))

    return df

In [None]:
# Downcast the training frame first, then the test frame.
train, test = (reduce_mem_usage(frame) for frame in (train, test))

# Modeling

In [None]:
%%capture
!pip install pycaret[full]

In [None]:
from pycaret.classification import *

ignore_cols = ['row_id']

# Force every remaining numeric column to be treated as numeric. Ignored
# columns and the target are left out of the list so that no column is handed
# to setup() in two conflicting roles.
excluded = set(ignore_cols) | {'target'}
numeric_cols = [
    col for col in train.select_dtypes(include=np.number).columns
    if col not in excluded
]

clf = setup(data=train,
            target='target',
            #normalize = True, #normalisation helps some algorithms
            #normalize_method = 'robust', #resilient to outliers
            #transformation = True, #applies transformation to target column
            #transformation_method = 'quantile',
            data_split_shuffle = False, #so that we do not use "future" observations to predict "past" observations
            create_clusters = True,
            remove_outliers = True,
            #feature_interaction = True,
            numeric_features = numeric_cols,
            ignore_features = ignore_cols,
            session_id = 42,
            use_gpu = False,
            silent = True,
            fold = 10,
            n_jobs = -1)

## Blending

In [None]:
#N = 2
#include = ['nb', 'ridge', 'rf', 'et', 'dt', 'lr', 'qda', 'lda', 'lightgbm']
#include = ['rf', 'et']
#top = compare_models(sort = 'Accuracy', n_select = N, include = include)

In [None]:
#tuned_top = [tune_model(i, optimize = 'accuracy', choose_better=True, n_iter=100) for i in top]

In [None]:
#blend = blend_models(top, optimize='Accuracy')
#predict_model(blend);

In [None]:
#final_blend = finalize_model(blend)

In [None]:
#plot_model(final_blend, plot='error')

In [None]:
#plot_model(final_blend, plot = 'confusion_matrix')

## Ensembling

In [None]:
# Candidate estimators, given as PyCaret model IDs.
include = ['nb', 'ridge', 'rf', 'et', 'dt', 'lr', 'qda', 'lda', 'lightgbm']
# Evaluate only those candidates, ranked by Accuracy. `best` holds whatever
# compare_models returns (presumably the top-ranked model; confirm against
# the PyCaret docs).
best = compare_models(sort = 'Accuracy', include = include)

In [None]:
#tuned = tune_model(best, optimize = 'accuracy', choose_better=True, n_iter=100)

In [None]:
# Build a boosting ensemble around the selected model, optimizing Accuracy.
ensemble = ensemble_model(best, method='Boosting', optimize='Accuracy')
# Score the ensemble; no `data` is passed, so this presumably uses PyCaret's
# hold-out split (confirm). The trailing semicolon stops the notebook from
# echoing the returned value.
predict_model(ensemble);

In [None]:
# Finalize the ensemble before predicting on the test set (presumably a refit
# on the full dataset, hold-out included; confirm in the PyCaret docs).
final_ensemble = finalize_model(ensemble)

In [None]:
# Draw the 'error' diagnostic plot for the finalized ensemble.
plot_model(final_ensemble, plot='error')

In [None]:
# Draw the confusion matrix for the finalized ensemble.
plot_model(final_ensemble, plot = 'confusion_matrix')

## Stacking

In [None]:
#N = 2
#include = ['nb', 'ridge', 'rf', 'et', 'dt', 'lr', 'qda', 'lda', 'lightgbm']
#include = ['rf', 'et']
#top = compare_models(sort = 'Accuracy', n_select = N, include = include)

In [None]:
#tuned_top = [tune_model(i, optimize = 'accuracy', choose_better=True, n_iter=100) for i in top]

In [None]:
#stack = stack_models(top, optimize='Accuracy')
#predict_model(stack);

In [None]:
#final_stack = finalize_model(stack)

In [None]:
#plot_model(final_stack, plot='error')

In [None]:
#plot_model(final_stack, plot = 'confusion_matrix')

# Predictions

In [None]:
import gc
# Run a garbage-collection pass before predicting.
gc.collect()
# Predict on the competition test set with the finalized ensemble.
unseen_predictions = predict_model(final_ensemble, data=test)
unseen_predictions.head()

In [None]:
# The submission pairs the sample file's row ids with the predictions by
# position, so all three frames must have the same number of rows. Check this
# explicitly rather than letting a mismatch be truncated silently.
n_pred = len(unseen_predictions)
if not (len(test.index) == n_pred == len(sub.index)):
    raise ValueError(
        'Row count mismatch: test={}, predictions={}, sample submission={}'.format(
            len(test.index), n_pred, len(sub.index)))

# Use raw arrays so the columns are combined positionally, regardless of the
# frames' indexes.
sub = pd.DataFrame({
    'row_id': sub['row_id'].to_numpy(),
    'target': unseen_predictions['Label'].to_numpy(),
})
sub.to_csv('submission_stack.csv', index = False)
sub.head()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Stack the training labels and the test predictions into one frame, tagging
# each row with its origin so the two class distributions can be compared.
# The origin column is built from explicit counts: a label-based slice such as
# .loc[0:n] includes its end point and would tag the first test prediction as
# 'Training'.
n_train = len(train)
n_test_preds = len(unseen_predictions)
train_test_preds = pd.DataFrame({
    'label': list(train['target']) + list(unseen_predictions['Label']),
    'train_test': ['Training'] * n_train + ['Test preds'] * n_test_preds,
})

# Side-by-side class counts for the training labels and the test predictions.
fig, ax = plt.subplots(figsize=(16,3))
sns.countplot(data=train_test_preds, x='label', hue='train_test', ax=ax)
plt.xticks(rotation=90)
plt.show()