# **IMPORTS AND INSTALLATIONS**

In [None]:
%%capture 

# Copy the shared import script into the current directory as `myimports.py` so that
# it can be imported as a regular module below.
!cp /kaggle/usr/lib/regularimports/playgrounds4e08_regularimports.py myimports.py
# NOTE(review): the star import hides where names come from. Names used later in this
# notebook without a visible import (pd, np, os, PrintColor, Fore, SKF, LGBMC, tqdm,
# collect, display, ...) presumably originate from this script - confirm against it.
from myimports import *

clear_output();

# **FOREWORD**

This is my private work for the competition. <br>
The original data is extracted separately from the competition data and appended in full to the training portion of every fold.

# **CONFIGURATION**

In [None]:
%%time 

# --- Target column and run mode -------------------------------------------------
target   = "class"
test_req = False          # True -> the data-load cell subsamples the data for a quick run

# --- Model identity: combined into prediction column names and output file names --
model_label, version_nb, model_group = "LGBM", 2, 1
device = "cpu"

# --- Output / input locations ---------------------------------------------------
op_path = "/kaggle/working"
ip_path = "/kaggle/input/playgrounds4e08-datastore"

# --- Original-data usage --------------------------------------------------------
orig_req = True           # append the rows marked Source == 'Original' to each training fold
nsamples = 1.0            # int -> rows sampled per class, 1.0 -> full original data

# --- Cross-validation and scoring -----------------------------------------------
n_splits, state = 5, 42   # number of folds and the random seed
ftre_imp_req    = True    # accumulate and display feature importances
cutoff          = 0.50    # probability threshold used to turn predictions into labels

# **DATA LOADS**

In [None]:
%%time 

# Load the train / test / sample-submission tables, split the train table into the
# competition rows (X) and the rows marked Source == 'Original' (original), optionally
# subsample everything for a quick syntax check, and finally separate the target (y).
PrintColor(f"---> Loading datasets")
X        = pd.read_parquet(os.path.join(ip_path, "train.parquet"))
test     = pd.read_parquet(os.path.join(ip_path, "test.parquet"))
sub_fl   = pd.read_parquet(os.path.join(ip_path, "sample_submission.parquet"))

cat_cols = \
['capshape', 'capsurface', 'capcolor', 'doesbruiseorbleed',
 'gillattachment', 'gillspacing', 'gillcolor', 'stemroot', 'stemsurface',
 'stemcolor', 'veiltype', 'veilcolor', 'hasring', 'ringtype',
 'sporeprintcolor', 'habitat', 'season'
 ]

X[cat_cols]     = X[cat_cols].astype("category")
test[cat_cols]  = test[cat_cols].astype("category")

PrintColor(f"---> Shapes = {X.shape} | {test.shape}", color = Fore.CYAN)

PrintColor(f"---> Separating original data")
orig_pool = X.loc[X.Source == 'Original']

# Always define `original` (empty, same columns) so that the shape prints and the
# syntax-check sampling below do not raise NameError when orig_req is False.
original = orig_pool.iloc[0:0]

if orig_req:
    PrintColor(f"\n---> We need the original data for model training")

    if isinstance(nsamples, int):
        # Fixed number of rows per class; seeded so that reruns pick the same rows.
        PrintColor(f"---> Partial original data is used = {nsamples:,.0f}", color = Fore.CYAN)
        original = orig_pool.groupby(target).sample(n = nsamples, random_state = state)

    elif nsamples == 1.0:
        PrintColor(f"---> Full original data is used", color = Fore.CYAN)
        original = orig_pool

    elif isinstance(nsamples, float) and 0.0 < nsamples < 1.0:
        # Fraction of rows per class; previously this case fell through silently and
        # left `original` undefined and the original rows inside X.
        PrintColor(f"---> Partial original data is used = {nsamples:.2%}", color = Fore.CYAN)
        original = orig_pool.groupby(target).sample(frac = nsamples, random_state = state)

    else:
        raise ValueError(
            f"nsamples must be an int (rows per class) or a float in (0, 1]; got {nsamples!r}"
        )

    original = original.reset_index(drop = True)

# Keep only the competition rows in X and give them a clean 0..n-1 index in every
# branch; the out-of-fold assignment in the training cell relies on this index.
X = X.loc[X.Source == 'Competition'].reset_index(drop = True)

if not orig_req:
    PrintColor(f"---> Shapes = {X.shape} | {test.shape} | without original data",
               color = Fore.RED
              )

# Sampling for testing purposes
if test_req:

    X        = X.groupby(target).head(1000).reset_index(drop = True)
    test     = test.iloc[0:100]
    sub_fl   = sub_fl.iloc[0:100]
    original = original.groupby(target).head(1000).reset_index(drop = True)

    PrintColor(f"---> Shapes = {X.shape} | {test.shape} | {original.shape} | Syntax check",
               color = Fore.RED
              )
else:
    PrintColor(f"---> Syntax check is not needed", color = Fore.RED)

y = X[target].astype(np.uint8)
X = X.drop(target, axis=1)

PrintColor(f"---> Shapes = {X.shape} | {y.shape} | {test.shape} | {original.shape}")

print();
collect();

# **MODEL TRAINING**

In [None]:
%%time 

# Stratified K-fold training of the LightGBM classifier.
# For every fold: train on the fold's training rows (plus the original data when
# orig_req is set), early-stop and score on the fold's dev rows, and accumulate
# fold-averaged predictions for the test set and the original data.
cv         = SKF(n_splits= n_splits, shuffle= True, random_state = state)
scores1    = []
scores2    = []
drop_cols  = ["Source", "id", target]
ftre_imp   = 0
sel_cols   = X.drop(columns = drop_cols, errors = "ignore").columns
mdl_col    = f"{model_label}V{version_nb}_{model_group}"

test_preds = 0
orig_preds = 0

# Out-of-fold container indexed like the competition rows and initialised to NaN.
# (Passing the index positionally made it the *data* of the frame, so the column
# started out holding row labels on a default RangeIndex.)
OOF_Preds = pd.DataFrame(index   = X.loc[X.Source == 'Competition'].index,
                         columns = [mdl_col],
                         dtype   = np.float32,
                        )

PrintColor(f"\n-------- {model_label} MODEL TRAINING --------\n")
for fold_nb, (train_idx, dev_idx) in tqdm(enumerate(cv.split(X, y)), total = n_splits):

    PrintColor(f" {'-' * 15} Fold {fold_nb} {'-' * 15} ",
               color = Fore.RED
              )

    Xtr  = X.iloc[train_idx][sel_cols]
    ytr  = y.iloc[train_idx]
    Xdev = X.iloc[dev_idx].query("Source == 'Competition'")[sel_cols]
    ydev = y.loc[Xdev.index]

    print(f"---> {Xtr.shape} {ytr.shape} | without original")

    if orig_req:
        # The original data is appended to the training part only; its target gets
        # the same dtype cast that was applied to y so the labels stay consistent.
        Xtr = pd.concat([Xtr, original[sel_cols]], axis=0, ignore_index = True)
        ytr = pd.concat([ytr, original[target].astype(y.dtype)], axis=0, ignore_index = True)
        print(f"---> {Xtr.shape} {ytr.shape} | with original")

    model = LGBMC(objective     = "binary",
                  device        = device,
                  n_estimators  = 3000,
                  max_depth     = 9,
                  learning_rate = 0.06,
                  random_state  = state,
                  max_bin       = 1024,
                  colsample_bytree = 0.7,
                  reg_lambda    = 80,
                  verbosity     = -1,
                 )

    # eval_metric is an argument of fit() in the LightGBM sklearn API; as a
    # constructor keyword it is not a recognised parameter.
    model.fit(Xtr, ytr,
              eval_set    = [(Xdev, ydev)],
              eval_names  = ["Dev"],
              eval_metric = "logloss",
              callbacks   = [log_evaluation(0), early_stopping(100)],
              )

    if ftre_imp_req:
        # Importances are summed over folds (not averaged).
        ftre_imp  = ftre_imp + model.feature_importances_

    score1    = model.best_score_['Dev']['binary_logloss']
    dev_preds = model.predict_proba(Xdev)[:,1]
    score2    = matthews_corrcoef(ydev, np.where(dev_preds >= cutoff, 1, 0))

    print(f"---> OOF score [Logloss | MCC] = {score1:.6f} | {score2 :.6f}")
    scores1.append(score1)
    scores2.append(score2)

    test_preds = test_preds + (model.predict_proba(test[sel_cols])[:, 1] / n_splits)
    OOF_Preds.loc[Xdev.index, mdl_col] = dev_preds

    if orig_req:
        orig_preds = orig_preds + (model.predict_proba(original[sel_cols])[:,1]/ n_splits)


    del Xtr, Xdev, ytr, ydev, score1, score2, model;
    collect();
    print(f"\n{'=' * 50}\n")

PrintColor(f'\n\n---> {np.mean(scores1) :.6f} +- {np.std(scores1) :.6f} | OOF model eval metric score',
          color = Fore.CYAN
          )
PrintColor(f'---> {np.mean(scores2) :.6f} +- {np.std(scores2) :.6f} | OOF assignment metric score',
          color = Fore.CYAN
          )

collect();
print();

if ftre_imp_req:
    display(pd.DataFrame(ftre_imp, index = sel_cols, columns = ["FtreImp"]).\
            sort_values(["FtreImp"], ascending = False).\
            transpose().\
            style.format(formatter = "{:,.2f}").\
            set_caption(f"Feature Importances").\
            set_properties(**{"text-align": "center"}).\
            background_gradient(subset = sel_cols,
                                cmap = "rocket",
                                axis=1
                               )
            )


# **CLOSURE**

In [None]:
def PostProcessPreds(sub_fl: pd.DataFrame, target: str = target,
                     edible_ids = (3640058,),
                     poisonous_ids = (3600675, 4057201, 4729429, 4929268, 4985595),
                    ):
    """
    Overrides the predicted label of selected submission rows.

    Parameters
    ----------
    sub_fl : pd.DataFrame
        Submission table. If it has an "id" column, that column becomes the index
        of the returned frame; otherwise the existing index is used as the id and
        the frame is modified in place.
    target : str
        Name of the label column to overwrite.
    edible_ids : iterable of int
        Ids whose label is forced to "e".
    poisonous_ids : iterable of int
        Ids whose label is forced to "p".

    Returns
    -------
    pd.DataFrame
        The submission table indexed by id with the overrides applied. Ids that are
        not present in the table are ignored.
    """

    # Explicit column check instead of a bare `except:` that hid every error.
    if "id" in sub_fl.columns:
        sub_fl = sub_fl.set_index("id")
    else:
        print(f"---> Submission file index is intact")

    # Mask-based assignment: `.loc[<missing label>, col] = value` would silently
    # append a new row when the id is absent (e.g. on a subsampled submission file).
    sub_fl.loc[sub_fl.index.isin(list(edible_ids)), target] = "e"
    sub_fl.loc[sub_fl.index.isin(list(poisonous_ids)), target] = "p"
    return sub_fl;

In [None]:
%%time 

# Build the submission labels, show a preview and persist every artefact of the run
# (out-of-fold predictions, test predictions, submission file and, when requested,
# the predictions on the original data) into op_path.
mdl_tag = f"{model_label}V{version_nb}_{model_group}"


def _op_file(prefix: str, ext: str) -> str:
    "Returns the output path <op_path>/<prefix>_<mdl_tag>.<ext>"
    return os.path.join(op_path, f"{prefix}_{mdl_tag}.{ext}")


print("\n\n")
sub_fl[target] = np.where(test_preds >= cutoff, "p", "e")
sub_fl = PostProcessPreds(sub_fl)

# Averaged test probabilities as a single-column frame named after the model
test_preds = pd.DataFrame(test_preds,
                          index   = range(len(test)),
                          columns = [mdl_tag],
                          dtype   = np.float32,
                         )

for preview, caption in [(test_preds, f"Submission file predictions"),
                         (sub_fl, f"Submission file labels"),
                        ]:
    print("\n\n")
    display(preview.head(10).style.set_caption(caption))
print("\n\n")

# Out-of-fold predictions are stored with their row label exposed as an "id" column
OOF_Preds.index = OOF_Preds.index.rename("id")
oof_out = OOF_Preds.sort_index().reset_index()
oof_out.to_parquet(_op_file("OOF_Preds", "parquet"))

test_preds.to_parquet(_op_file("Mdl_Preds", "parquet"))
sub_fl.to_csv(_op_file("Submission", "csv"), index = True)

if orig_req:
    orig_out = pd.DataFrame(orig_preds,
                            columns = [mdl_tag],
                            index   = range(len(original)),
                            dtype   = np.float32,
                           )
    orig_out.to_parquet(_op_file("Orig_Preds", "parquet"))

print()
!ls
%reset -f

# **CHECKS**

In [None]:
%%time

# The namespace was cleared by `%reset -f` in the previous cell, hence pandas is
# imported again here and the file name is written out literally.
import pandas as pd

orig_preds_chk = pd.read_parquet("Orig_Preds_LGBMV2_1.parquet")
orig_preds_top = orig_preds_chk.head(10)

display(
    orig_preds_top.style.format(precision = 5).set_caption("Original data predictions")
)