In [None]:
# install Pycaret
# PyCaret is an open source, 
# low-code machine learning library in Python 
# that allows you to go from preparing your data
# to deploying your model within minutes in your
# choice of notebook environment.
!pip install pycaret[full]

In [None]:
# import library
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pycaret.classification import *

In [None]:
# Load Data
# Both competition CSVs are read from the same input folder.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/tabular-playground-series-sep-2021/train.csv',
        '../input/tabular-playground-series-sep-2021/test.csv',
    )
)

# Stack the train rows above the test rows in a single frame.
all_data = pd.concat([train, test])


In [None]:
# Remove the 'id' column from the combined frame, then display the result.
all_data2 = all_data.drop(labels='id', axis='columns')
all_data2

# Handle missing values
We can fill the missing values with each column's mean.
Alternatively, we can predict the missing values from the rows that have no missing data.
In PyCaret, imputation is configured through the `numeric_imputation` argument of `setup` :)

In [None]:
# Distribution of the per-column missing-value counts

# Null count of every column except the last one (explicit positional slice).
# NOTE(review): the slice presumably excludes the 'claim' target, which would
# be empty for the test rows — confirm that 'claim' is the last column.
missing_values = all_data2.isnull().sum().iloc[:-1]

# Explicit figure/axes so the plot can carry its own title and axis labels.
fig, ax = plt.subplots(figsize=(12, 6))
sns.histplot(missing_values, color='violet', ax=ax)
ax.set(
    title='Distribution of missing values per column',
    xlabel='Missing values in a column',
    ylabel='Number of columns',
)
plt.show()

print('\n')
print('-------- Distribution of Missing values --------')
print('Min:', missing_values.min())
print('Max:', missing_values.max())
print('Mean:', missing_values.mean())
print('------------------------------------------------')


# Modeling
## Blending Ensemble
- LightGBM
- CatBoost
- XGBoost

In [None]:
# Split the combined frame back into its two parts by position: the first
# len(train) rows came from `train`, the remaining rows from `test`.
n_train = len(train)
train2 = all_data2.iloc[:n_train]

# Drop the target with a non-mutating call. An in-place drop on a slice of
# `all_data2` triggers pandas' SettingWithCopyWarning.
test2 = all_data2.iloc[n_train:].drop(columns='claim')


In [None]:
from sklearn.metrics import log_loss

def pycaret_model(train, target, test, fold):
    """Fit a LightGBM / XGBoost / CatBoost blend with PyCaret and score `test`.

    Runs `setup` on `train` (robust normalization, mean imputation of numeric
    features, GPU requested, session_id 42), registers log loss as an extra
    metric, creates the three base models, blends them, finalizes the blend
    and predicts on `test` through the preprocessing pipeline that `setup`
    stored under the 'prep_pipe' config key.

    Parameters
    ----------
    train : data passed to `setup` as `data`.
    target : name of the target column, passed to `setup` as `target`.
    test : data to score.
    fold : passed as `fold` to `create_model` and `blend_models`.

    Returns
    -------
    Whatever `predict_proba` returns for `test`. The caller in this notebook
    indexes it as `[:, 1]`.
    """
    import copy  # local import keeps this cell self-contained

    print('Setup Your Data....')
    setup(
        data=train,
        target=target,
        normalize=True,
        normalize_method='robust',
        silent=True,
        use_gpu=True,
        session_id=42,
        numeric_imputation='mean',
    )

    # Log loss: lower is better, evaluated on predicted probabilities.
    add_metric('logloss', 'LogLoss', log_loss, greater_is_better=False, target='pred_proba')

    print('Creating Models....')
    model_list = [
        create_model(estimator=estimator_id, fold=fold)
        for estimator_id in ('lightgbm', 'xgboost', 'catboost')
    ]

    print('Blending Models....')
    blended = blend_models(estimator_list=model_list, fold=fold)
    # The result is not used further; the call is kept for whatever output it
    # displays in the notebook.
    predict_model(blended)

    print('Finallizing Models....')
    final_model = finalize_model(blended)

    print('Done...!!!')

    # Predict through a shallow copy that owns its step list, so the pipeline
    # object handed back by get_config is left exactly as setup built it
    # (appending to it directly would alter it in case it is the live object).
    prep_pipe = get_config('prep_pipe')
    inference_pipe = copy.copy(prep_pipe)
    inference_pipe.steps = [*prep_pipe.steps, ('trained_model', final_model)]

    return inference_pipe.predict_proba(test)

In [None]:
# Train the blended model on the train part with 5 folds and score the test
# part, then display the returned predictions.
result = pycaret_model(train=train2, target='claim', test=test2, fold=5)
result


In [None]:
# Fill the sample submission's 'claim' column with column 1 of the model
# output and write it out without the index column.
sub = pd.read_csv('../input/tabular-playground-series-sep-2021/sample_solution.csv')
claim_scores = result[:, 1]
sub['claim'] = claim_scores
sub.to_csv('sub.csv', index=False)