# **IMPORTS AND INSTALLATIONS**

In [1]:
%%capture 

!cp /kaggle/usr/lib/regularimports/playgrounds4e08_regularimports.py myimports.py
from myimports import *

Collecting lightgbm==4.5.0
  Downloading lightgbm-4.5.0-py3-none-manylinux_2_28_x86_64.whl.metadata (17 kB)
Downloading lightgbm-4.5.0-py3-none-manylinux_2_28_x86_64.whl (3.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.6/3.6 MB[0m [31m30.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: lightgbm
  Attempting uninstall: lightgbm
    Found existing installation: lightgbm 4.2.0
    Uninstalling lightgbm-4.2.0:
      Successfully uninstalled lightgbm-4.2.0
Successfully installed lightgbm-4.5.0
Collecting polars==1.2.1
  Downloading polars-1.2.1-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (14 kB)
Downloading polars-1.2.1-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (30.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.9/30.9 MB[0m [31m37.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: polars
  Attempting uninstall: polars
    Found existing installation: polars 1.1.0
  

# **FOREWORD**

This kernel starts with the datasets created in the [Data-Store](https://www.kaggle.com/code/ravi20076/playgrounds4e08-datastore) kernel. I do this to prevent multiple data creation endeavors across my experiments as I move along my model pipeline. <br>

In this kernel, we start off with a simple LightGBM baseline model and assess the efficacy of adding the tertiary original data to the model data. <br>
Note that the tertiary data is created from the associated GitHub repository, but it does not have the GAN-noise component that is present in the competition dataset. We need to factor this into our model endeavors in this assignment. 

# **CONFIGURATION**

In [2]:
%%time 

target      = "class"
test_req    = False

model_label = "LGBM"
version_nb  = 1
model_group = 1

op_path    = f"/kaggle/working"
ip_path    = f"/kaggle/input/playgrounds4e08-datastore"

orig_req   = False
nsamples   = 100000

n_splits     = 5
state        = 42
ftre_imp_req = True
cutoff       = 0.50

CPU times: user 11 µs, sys: 0 ns, total: 11 µs
Wall time: 15.3 µs


# **DATA LOADS**

In [3]:
%%time 

# Load the pre-built train / test / sample-submission tables from the data store.
X      = pd.read_parquet(os.path.join(ip_path, "train.parquet"))
test   = pd.read_parquet(os.path.join(ip_path, "test.parquet"))
sub_fl = pd.read_parquet(os.path.join(ip_path, "sample_submission.parquet"))

# Columns cast to pandas `category` dtype in both train and test.
cat_cols = [
    'capshape', 'capsurface', 'capcolor', 'doesbruiseorbleed',
    'gillattachment', 'gillspacing', 'gillcolor', 'stemroot', 'stemsurface',
    'stemcolor', 'veiltype', 'veilcolor', 'hasring', 'ringtype',
    'sporeprintcolor', 'habitat', 'season',
]

X[cat_cols]     = X[cat_cols].astype("category")
test[cat_cols]  = test[cat_cols].astype("category")

PrintColor(f"---> Shapes = {X.shape} | {test.shape}")

if orig_req:
    PrintColor(f"\n---> We need the original data for model training")

    if isinstance(nsamples, int):
        # Draw `nsamples` original rows per target class. The draw is seeded with
        # `state` so that the training set (and hence every fold) is reproducible
        # across runs, like the CV splitter and the model.
        original = (
            X.loc[X.Source == 'Original']
            .groupby(target)
            .sample(n = nsamples, random_state = state)
        )
        X = X.loc[X.Source == 'Competition']
        # ignore_index=True already yields a fresh 0..n-1 index with the
        # competition rows first, so no further re-indexing is needed.
        X = pd.concat([X, original], axis=0, ignore_index = True)
        del original
    else:
        PrintColor(f"---> Full original data is used")
else:
    # Competition rows only; the row labels read from the parquet file are kept.
    X = X.loc[X.Source == 'Competition']
    PrintColor(f"---> Shapes = {X.shape} | {test.shape} | without original data",
               color = Fore.RED
              )

# Sampling for testing purposes
if test_req:
    # First 1000 rows of every (target, Source) group, then a fresh 0..n-1 index.
    X       = X.groupby([target, "Source"]).head(1000)
    X.index = range(len(X))
    test    = test.iloc[0:100]
    sub_fl  = sub_fl.iloc[0:100]

    PrintColor(f"---> Shapes = {X.shape} | {test.shape}")
else:
    PrintColor(f"---> Syntax check is not needed", color = Fore.RED)

# Split the label off; `Source` stays in X because the training cell filters on it.
y = X[target]
X = X.drop(target, axis=1)

PrintColor(f"---> Shapes = {X.shape} | {y.shape} | {test.shape}")

print();
collect();

[1m[34m---> Shapes = (4201981, 22) | (2077964, 21)[0m
[1m[31m---> Shapes = (3116945, 22) | (2077964, 21) | without original data[0m
[1m[31m---> Syntax check is not needed[0m
[1m[34m---> Shapes = (3116945, 21) | (3116945,) | (2077964, 21)[0m

CPU times: user 19.3 s, sys: 3.33 s, total: 22.6 s
Wall time: 19.3 s


# **MODEL TRAINING**

|Version Label| Kernel version | Model| Description| OOF CV score| LB score|
|:-:| :-: | :-:| --------| :-:| :-:|
1  | 2    | LGBM | * Excluded original data completely | | | 
2  | 3    | LGBM | * Included 100_000 original data samples | | |
3  | 4    | LGBM | * Included complete original data samples | | |

In [4]:
%%time 

# Stratified K-fold training of the LightGBM baseline.
# Produces: per-fold MCC scores, out-of-fold (OOF) probabilities for the
# competition rows, fold-averaged test probabilities and summed feature importances.
cv         = SKF(n_splits= n_splits, shuffle= True, random_state = state)
test_preds = 0
scores     = []
drop_cols  = ["Source", "id", target]
ftre_imp   = 0
sel_cols   = X.drop(columns = drop_cols, errors = "ignore").columns
oof_col    = f"{model_label}V{version_nb}_{model_group}"

# OOF container keyed by the row labels of the competition rows.
# The labels must be passed as `index=`: passed positionally they become the
# frame's *data* under a default RangeIndex, and the label-based assignment in
# the loop would then only line up when the labels happen to be 0..n-1.
OOF_Preds = pd.DataFrame(index   = X.loc[X.Source == 'Competition'].index,
                         columns = [oof_col],
                         dtype   = np.float32,
                        )

PrintColor(f"\n-------- {model_label} MODEL TRAINING --------\n")
for fold_nb, (train_idx, dev_idx) in tqdm(enumerate(cv.split(X, y)), total = n_splits):

    # Train on every row of the fold; validate on competition rows only.
    Xtr  = X.iloc[train_idx][sel_cols]
    ytr  = y.iloc[train_idx]
    Xdev = X.iloc[dev_idx].query("Source == 'Competition'")[sel_cols]
    ydev = y.loc[Xdev.index]

    # NOTE(review): `eval_metric` is given to the constructor here, whereas
    # LightGBM documents it as a `fit()` argument — confirm it is honoured.
    # The score read below uses the 'binary_logloss' key.
    model = LGBMC(objective     = "binary",
                  eval_metric   = "logloss",
                  n_estimators  = 3000,
                  max_depth     = 9,
                  learning_rate = 0.06,
                  random_state  = state,
                  max_bin       = 1024,
                  colsample_bytree = 0.7,
                  reg_lambda    = 80,
                  verbosity     = -1,
                 )

    # Silent logging; stop when the dev metric has not improved for 100 rounds.
    model.fit(Xtr, ytr,
              eval_set  = [(Xdev, ydev)],
              eval_names = [("Dev")],
              callbacks = [log_evaluation(0), early_stopping(100)],
              )

    if ftre_imp_req:
        ftre_imp  = ftre_imp + model.feature_importances_

    score1    = model.best_score_['Dev']['binary_logloss']
    dev_preds = model.predict_proba(Xdev)[:,1]
    score2    = matthews_corrcoef(ydev, np.where(dev_preds >= cutoff, 1, 0))

    print(f"---> OOF score [Logloss | MCC] = {score1:.6f} | {score2 :.6f} | Fold{fold_nb}")
    scores.append(score2)

    # Test prediction is the plain average of the fold models' probabilities.
    test_preds = test_preds + (model.predict_proba(test[sel_cols])[:, 1]/n_splits)
    OOF_Preds.loc[Xdev.index, oof_col] = dev_preds;
    del Xtr, Xdev, ytr, ydev, score1, score2, model;
    collect();

PrintColor(f'\n\n---> OOF MCC score: {np.mean(scores) :.6f} +- {np.std(scores) :.6f} \n',
          color = Fore.CYAN
          )
collect();
print();

if ftre_imp_req:
    # One-row table of importances summed over folds, sorted descending.
    display(
        pd.DataFrame(ftre_imp, index = sel_cols, columns = ["FtreImp"])
        .sort_values(["FtreImp"], ascending = False)
        .transpose()
        .style.format(formatter = "{:,.2f}")
        .set_caption(f"Feature Importances")
        .set_properties(**{"text-align": "center"})
        .background_gradient(subset = sel_cols,
                             cmap = "rocket",
                             axis=1
                            )
    )


[1m[34m
-------- LGBM MODEL TRAINING --------
[0m


0it [00:00, ?it/s]

---> OOF score [Logloss | MCC] = 0.035990 | 0.984684 | Fold0
---> OOF score [Logloss | MCC] = 0.035990 | 0.984646 | Fold1
---> OOF score [Logloss | MCC] = 0.035973 | 0.984626 | Fold2
---> OOF score [Logloss | MCC] = 0.036266 | 0.984643 | Fold3
---> OOF score [Logloss | MCC] = 0.036083 | 0.984700 | Fold4
[1m[36m

---> OOF MCC score: 0.984660 +- 0.000028 
[0m



Unnamed: 0,stemwidth,stemheight,capdiameter,capsurface,gillcolor,capcolor,gillattachment,stemcolor,capshape,stemsurface,season,gillspacing,habitat,ringtype,stemroot,doesbruiseorbleed,sporeprintcolor,veilcolor,hasring,veiltype
FtreImp,72793.0,71603.0,66923.0,35591.0,28925.0,27422.0,20525.0,18976.0,17190.0,12922.0,12630.0,10953.0,10620.0,9700.0,5448.0,4208.0,3254.0,2792.0,1958.0,770.0


CPU times: user 3h 17min 7s, sys: 11.7 s, total: 3h 17min 19s
Wall time: 3h 5min 59s


# **CLOSURE**

In [5]:
def PostProcessPreds(sub_fl: pd.DataFrame, target: str = target) -> pd.DataFrame:
    """Override the predicted label for a fixed set of submission ids.

    The frame is indexed by ``id`` (the ``id`` column is moved into the index
    when present), after which id 3640058 is forced to ``"e"`` and ids
    3600675, 4057201, 4729429, 4929268 and 4985595 are forced to ``"p"``.
    Presumably these ids were identified offline from saved predictions and
    targets — verify against the source of the list.

    Parameters
    ----------
    sub_fl : pd.DataFrame
        Submission frame, either with an ``id`` column or already indexed by id.
    target : str
        Name of the label column to overwrite.

    Returns
    -------
    pd.DataFrame
        The id-indexed submission frame. When ``id`` was already the index the
        input object itself is modified and returned.
    """

    try:
        sub_fl = sub_fl.set_index("id")
    except KeyError:
        # No "id" column: the frame is taken to be indexed by id already.
        print(f"---> Submission file index is intact")

    # Boolean masks only touch ids that are present. A scalar-label assignment
    # such as `.loc[3640058, target] = ...` would append a new row whenever the
    # id is missing (for example on a truncated submission frame).
    sub_fl.loc[sub_fl.index.isin([3640058]), target] = "e"
    sub_fl.loc[sub_fl.index.isin([3600675, 4057201, 4729429, 4929268, 4985595]), target] = "p"
    return sub_fl;

In [6]:
%%time 

# Final step: turn the averaged test probabilities into labels, apply the
# id-level overrides, preview both tables and write the three output files.
print("\n\n")

# Shared label used for the prediction column and for every output file name.
pred_col = f"{model_label}V{version_nb}_{model_group}"

# Probabilities at or above `cutoff` are labelled "p", the rest "e".
sub_fl[target] = np.where(test_preds >= cutoff, "p", "e")
sub_fl = PostProcessPreds(sub_fl)

# Re-wrap the raw probability array as a one-column float32 frame.
test_preds = pd.DataFrame(
    test_preds,
    index   = range(len(test)),
    columns = [pred_col],
    dtype   = np.float32,
)

# Preview the first ten rows of each table, separated by blank lines.
for caption, frame in (
    ("Submission file predictions", test_preds),
    ("Submission file labels", sub_fl),
):
    print("\n\n")
    display(frame.head(10).style.set_caption(caption))
print("\n\n")

# OOF predictions are written with their index exposed as an "id" column.
OOF_Preds.index.name = "id"
oof_out = OOF_Preds.sort_index().reset_index()
oof_out.to_parquet(os.path.join(op_path, f"OOF_Preds_{pred_col}.parquet"))

test_preds.to_parquet(os.path.join(op_path, f"Mdl_Preds_{pred_col}.parquet"))

sub_fl.to_parquet(
    os.path.join(op_path, f"Submission_{pred_col}.parquet"),
    index = True,
)

%reset -f




---> Submission file index is intact





Unnamed: 0,LGBMV1_1
0,0.001918
1,0.998411
2,0.993121
3,0.99495
4,0.001912
5,0.008565
6,0.002108
7,0.999325
8,0.996429
9,0.001113







Unnamed: 0_level_0,class
id,Unnamed: 1_level_1
3116945,e
3116946,p
3116947,p
3116948,p
3116949,e
3116950,e
3116951,e
3116952,p
3116953,p
3116954,e





CPU times: user 1.58 s, sys: 259 ms, total: 1.84 s
Wall time: 1.83 s
