In [None]:
# =======================================================
# TPS October 2021 - CatBoost 
# =======================================================
# Name: Bárbara Sulpis
# Date: 10-oct-2021
# Description: Example of CatBoost algorithm
#  This example uses KFold, which splits the dataset into folds, in order to reduce overfitting
# 
# Based on another Kaggle notebook: https://www.kaggle.com/ranjeetshrivastav/tps-oct-21-eda-catboost
# Python 3 - kaggle/python Docker image: https://github.com/kaggle/docker-python

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os

from sklearn.model_selection import train_test_split

#Lgbm
import lightgbm as lgb

# roc
import sklearn.metrics as metrics   # Para la curva ROC
import matplotlib.pyplot as plt     # Para la curva ROC

# Catboost Classifier
from sklearn.model_selection import KFold,train_test_split
from sklearn.metrics import roc_auc_score
from catboost import CatBoostClassifier
import optuna

# ---------------------------
# Input data:
# Go to File -> "Add or upload data" -> "Competition" tab and select the competition whose CSV data you want to add.
# Files are available in the read-only "../input/" directory.
# ---------------------------
# Print the working directory so the relative "../input/" paths used below can be checked.
# The value is stored in `cwd` rather than `list`, so the built-in `list` type
# is not shadowed for every cell that runs afterwards.
cwd = os.getcwd()
print(cwd)  # should be the "kaggle" working directory

# I left this commented if you want to check that the files are there
# i = 0
# for subdir, dirs, files in os.walk('./'):
#     for file in files:
#         print(file)
#         i+= 1
#         if i>20: 
#             break


# Read the training frame and the frame to predict on, in that order,
# from the competition's input directory.
data, subm = map(
    pd.read_csv,
    (
        "../input/tabular-playground-series-oct-2021/train.csv",
        "../input/tabular-playground-series-oct-2021/test.csv",
    ),
)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
def Draw_ROC(y_test, preds, base_name):
    """Plot the ROC curve for one set of predictions and display it.

    Args:
        y_test: True labels, forwarded to ``metrics.roc_curve``.
        preds: Predicted scores for the same samples.
        base_name: Text shown in parentheses at the end of the plot title.
    """
    # roc_curve also returns the decision thresholds; the plot does not use them.
    false_pos_rate, true_pos_rate, _ = metrics.roc_curve(y_test, preds)
    area = metrics.auc(false_pos_rate, true_pos_rate)

    auc_label = 'AUC = %0.6f' % area
    chart_title = 'Receiver Operating Characteristic (' + base_name + ')'

    plt.title(chart_title)
    plt.plot(false_pos_rate, true_pos_rate, 'b', label=auc_label)
    plt.legend(loc='lower right')
    plt.plot([0, 1], [0, 1], 'r--')  # dashed red diagonal drawn for reference
    plt.xlim([0, 1])
    plt.ylim([0, 1])
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')
    plt.show()

In [None]:
def _smallest_int_dtype(lo, hi):
    """Return the narrowest NumPy integer dtype that holds every value in [lo, hi].

    Unsigned types are tried when ``lo`` is non-negative, signed types otherwise.
    Returns None when no candidate can represent the range.
    """
    if lo >= 0:
        candidates = (np.uint8, np.uint16, np.uint32, np.uint64)
    else:
        candidates = (np.int8, np.int16, np.int32, np.int64)
    for candidate in candidates:
        bounds = np.iinfo(candidate)
        # Inclusive comparison: the limits themselves (e.g. 255 for uint8) are representable.
        if bounds.min <= lo and hi <= bounds.max:
            return candidate
    return None


def _downcast_target(series):
    """Pick the dtype a column can be narrowed to without corrupting it, or None to leave it alone."""
    dtype = series.dtype
    # Only plain NumPy bool/int/uint/float columns are handled; strings, datetimes,
    # categoricals and pandas extension dtypes are left untouched.
    if not isinstance(dtype, np.dtype) or dtype.kind not in "biuf" or series.empty:
        return None

    if dtype.kind != "f":
        return _smallest_int_dtype(int(series.min()), int(series.max()))

    values = series.to_numpy()
    finite = values[np.isfinite(values)]
    # A float column becomes an integer column only if EVERY value is finite and whole.
    # (Summing the fractional parts instead would let small or cancelling fractions slip through.)
    if finite.size == values.size and (values == np.trunc(values)).all():
        int_dtype = _smallest_int_dtype(int(values.min()), int(values.max()))
        if int_dtype is not None:
            return int_dtype

    # float32 cannot hold magnitudes above its maximum; casting would silently produce inf.
    if finite.size and np.abs(finite).max() > np.finfo(np.float32).max:
        return None
    return np.float32


def reduce_mem_usage(props):
    """Shrink a DataFrame's memory footprint by narrowing its numeric column dtypes.

    Whole-valued columns are cast to the narrowest integer dtype that fits their
    min/max; other float columns are cast to float32 (which reduces precision).
    Columns containing NaN/inf are never cast to an integer dtype, and a column
    is only changed when the new dtype is strictly smaller than the current one.

    Args:
        props: DataFrame to compact. It is modified in place.

    Returns:
        The same DataFrame object that was passed in.
    """
    start_mem_usg = props.memory_usage().sum() / 1024**2
    print("Memory usage of properties dataframe is :",start_mem_usg," MB")

    for col in props.columns:
        series = props[col]
        target = _downcast_target(series)
        # Never widen or merely relabel a column: only apply real size reductions.
        if target is not None and np.dtype(target).itemsize < series.dtype.itemsize:
            props[col] = series.astype(target)

    # Print final result
    print("___MEMORY USAGE AFTER COMPLETION:___")
    mem_usg = props.memory_usage().sum() / 1024**2
    print("Memory usage is: ",mem_usg," MB")
    print("This is ",100*mem_usg/start_mem_usg,"% of the initial size")
    return props

In [None]:
# Narrow the numeric column dtypes of the training frame. reduce_mem_usage
# modifies the frame it receives and returns that same object.
data = reduce_mem_usage (data)

In [None]:
# Apply the same dtype narrowing to the frame that will be predicted for the submission.
subm = reduce_mem_usage(subm)

In [None]:
# --------------------------------------------------------------------
#     TARGET
# --------------------------------------------------------------------
# Features are every column except the label and the row identifier.
X = data.drop(columns=['target', 'id'])
y = data['target']

In [None]:
# -----------------------------------------
# Catboost Classification
# -----------------------------------------
# Based on example: https://www.kaggle.com/ranjeetshrivastav/tps-oct-21-eda-catboost
# That notebook runs the hyper-parameter search; only its best parameters are reused here.
cat_params = dict(
    iterations=2866,
    od_wait=3385,
    learning_rate=0.04280810491488757,
    reg_lambda=0.32139709692279206,
    subsample=0.8442605943226449,
    random_strength=22.468752639603235,
    depth=4,
    min_data_in_leaf=31,
    leaf_estimation_iterations=15,
    task_type="GPU",  # training is configured to run on a GPU
    bootstrap_type='Poisson',
)

In [None]:
# 5-fold cross-validation: each fold trains a fresh model, scores it on the
# held-out rows, and adds an equal share to the averaged submission prediction.
folds = KFold(n_splits=5, shuffle=True, random_state=2021)

X_subm = subm.drop(columns=['id'])
predictions = np.zeros(X_subm.shape[0])

for fold, (train_rows, valid_rows) in enumerate(folds.split(X)):
    print(f"Fold: {fold}")
    X_train, y_train = X.iloc[train_rows], y.iloc[train_rows]
    X_test, y_test = X.iloc[valid_rows], y.iloc[valid_rows]

    model = CatBoostClassifier(**cat_params)
    # The held-out rows double as the evaluation set used for early stopping.
    model.fit(
        X_train,
        y_train,
        eval_set=[(X_test, y_test)],
        early_stopping_rounds=400,
        verbose=False,
    )

    # Second column of predict_proba (the class at index 1), scored on the held-out rows.
    pred = model.predict_proba(X_test)[:, 1]
    roc = roc_auc_score(y_test, pred)
    print(f" roc_auc_score: {roc}")
    print("-"*50)

    # Accumulate this fold's 1/n_splits share of the submission prediction.
    predictions += model.predict_proba(X_subm)[:, 1] / folds.n_splits
    
# NOTE: cat_params sets task_type to "GPU". If training throws an error, enable a GPU:
#    Side bar > Settings > Accelerator: select GPU 

In [None]:
# ---------------------------------------
#  SUBMISSION FILE
# ---------------------------------------
# Load the sample submission, set its 'target' column to the fold-averaged
# predictions, and write the result to the working directory.
submit = pd.read_csv(
    "../input/tabular-playground-series-oct-2021/sample_submission.csv"
).assign(target=predictions)
submit.to_csv("CatBoost_baseline.csv", index=False)