# **Biomass Train Data Visualize Importance**

### Strategy for this competition

The test tabular data is missing some items compared to the training tabular data.

**First, in the training phase, we verify that we can predict targets using only the tabular data (#1, this notebook).**

We also verify that image data can predict items that are present in the train tabular data but not in the test tabular data (#2).

In the testing phase, we first predict the missing items in the test tabular data from the images. Finally, we use the model already trained on the training data to predict targets using the complete test tabular data (#3).

1. https://www.kaggle.com/code/stpeteishii/biomass-train-data-visualize-importance<br>
2. https://www.kaggle.com/code/stpeteishii/pre-gshh-ndvi-pytorch-lightning-cnn-regressor<br>
https://www.kaggle.com/code/stpeteishii/height-ave-cm-pytorch-lightning-cnn-regressor<br>
https://www.kaggle.com/code/stpeteishii/species-pytorch-lightning-cnn-classifier<br>
3. https://www.kaggle.com/code/stpeteishii/biomass-test-inference<br>

In [None]:
import os
import numpy as np
import pandas as pd
import random
import matplotlib.pyplot as plt
import seaborn as sns
from contextlib import contextmanager
from time import time
from tqdm import tqdm
import lightgbm as lgbm
import category_encoders as ce
from tensorflow.keras.utils import to_categorical
from sklearn.metrics import classification_report, log_loss, accuracy_score
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold

# Data preparation

In [None]:
# Load the training table and preview its first three rows (transposed so
# each column becomes a row of the preview).
data0 = pd.read_csv("/kaggle/input/csiro-biomass/train.csv")
display(data0.head(3).T)
print(list(data0.columns))

# Same preview for the test table.
test0 = pd.read_csv('/kaggle/input/csiro-biomass/test.csv')
display(test0.head(3).T)
print(list(test0.columns))

# Columns removed from the training table before any modelling.
delete_cols = ['sample_id', 'image_path', 'Sampling_Date', 'State']
data0 = data0.drop(columns=delete_cols)
display(data0.head(3).T)
print(list(data0.columns))

# Remaining training columns that the test table does not contain.
print(set(data0.columns) - set(test0.columns))

# In the test data, 'Species', 'Pre_GSHH_NDVI', and 'Height_Ave_cm' will be predicted
# from the test image data.

In [None]:
# Give every distinct target name an integer code, numbered in sorted order,
# and replace the 'target_name' column with those codes.
target_names = sorted(data0['target_name'].unique().tolist())
target_name_mapping = {name: code for code, name in enumerate(target_names)}
data0['target_name'] = data0['target_name'].map(target_name_mapping)

In [None]:
from sklearn.preprocessing import LabelEncoder

def labelencoder(df):
    """Label-encode every object-dtype column of ``df`` and return ``df``.

    The frame is modified in place: in each object-dtype column, missing
    values are first replaced by the string ``'N'`` and the column is then
    overwritten with the integer codes produced by a ``LabelEncoder`` fitted
    on that column alone. Columns of any other dtype are left untouched.
    """
    object_columns = [name for name in df.columns if df[name].dtype == 'object']
    for name in object_columns:
        df[name] = df[name].fillna('N')
        raw_values = df[name].values
        encoder = LabelEncoder()
        encoder.fit(list(raw_values))
        df[name] = encoder.transform(raw_values)
    return df

In [None]:
# labelencoder modifies its argument in place and returns it, so data1 and
# data0 refer to the same, now label-encoded, DataFrame.
data1=labelencoder(data0)

# Target setting

In [None]:
# Name of the column to predict; every other column of data1 is a feature.
target = 'target'
dataY = data1[target]
dataX = data1.drop(columns=[target])

In [None]:
# Feature column names; create_numeric_feature reads this module-level list
# to decide which columns to keep.
df_columns = list(dataX.columns)
print(df_columns)

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 10% of the rows with a fixed seed.
# NOTE(review): testX / testY are not referenced again in the visible part of
# this notebook - confirm whether a hold-out evaluation was intended.
trainX, testX, trainY, testY = train_test_split(dataX, dataY, test_size=0.1, random_state=42)

In [None]:
# Alias (not a copy) of the training features; this is the input to to_feature.
train_df=trainX

In [None]:
def create_numeric_feature(input_df):
    """Return a copy of ``input_df`` restricted to the columns in ``df_columns``.

    ``df_columns`` is the module-level list of feature column names; the
    result is a new DataFrame, so later edits do not touch ``input_df``.
    """
    selected = input_df[df_columns]
    return selected.copy()

In [None]:
from contextlib import contextmanager
from time import time

class Timer:
    """Context manager that reports the wall-clock time spent in a ``with`` block.

    On exit the elapsed seconds are formatted with ``format_str`` and sent to
    ``logger.info`` when a logger was given, otherwise printed.

    Args:
        logger: Optional object with an ``info(str)`` method.
        format_str: ``str.format`` template that receives the duration in seconds.
        prefix: Optional text placed before the formatted duration.
        suffix: Optional text placed after the formatted duration.
        sep: Separator inserted between prefix/suffix and the duration.
    """

    def __init__(self, logger=None, format_str='{:.3f}[s]', prefix=None, suffix=None, sep=' '):
        if prefix:
            format_str = str(prefix) + sep + format_str
        if suffix:
            format_str = format_str + sep + str(suffix)
        self.format_str = format_str
        self.logger = logger
        self.start = None
        self.end = None

    @property
    def duration(self):
        """Elapsed seconds of the last completed block, or 0 if none finished yet."""
        if self.start is None or self.end is None:
            return 0
        return self.end - self.start

    def __enter__(self):
        # Clear the previous end time so a reused timer does not report a
        # stale (possibly negative) duration while the new block is running.
        self.end = None
        self.start = time()
        # Return the instance so ``with Timer() as t:`` binds the timer itself.
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.end = time()
        out_str = self.format_str.format(self.duration)
        if self.logger:
            self.logger.info(out_str)
        else:
            print(out_str)
        # Implicitly returns None: exceptions raised in the block propagate.

In [None]:
from tqdm import tqdm

def to_feature(input_df):
    """Run every feature processor on ``input_df`` and join the results column-wise.

    Each processor is timed with ``Timer`` and must return a frame with the
    same number of rows as ``input_df``; otherwise an ``AssertionError``
    carrying the processor's name is raised.
    """
    processors = [create_numeric_feature]

    out_df = pd.DataFrame()
    for processor in tqdm(processors, total=len(processors)):
        timer_label = 'create' + processor.__name__ + ' '
        with Timer(prefix=timer_label):
            block_df = processor(input_df)

        # Every processor has to be row-aligned with the input.
        assert len(block_df) == len(input_df), processor.__name__
        out_df = pd.concat([out_df, block_df], axis=1)

    return out_df

In [None]:
# Feature matrix for training; its .values are passed to fit_lgbm and its
# column names label the importance plot.
train_feat_df = to_feature(train_df)
#test_feat_df = to_feature(test_df)

# Model

In [None]:
import lightgbm as lgbm
from sklearn.metrics import mean_squared_error

def fit_lgbm(X, y, cv,
             params: dict=None,
             verbose: int=50):
    """Train one LightGBM regressor per CV fold and collect out-of-fold predictions.

    Args:
        X: Feature matrix that supports positional row selection via
            ``X[indices]`` (for example a NumPy array).
        y: Target values. Converted to a NumPy array so the fold indices are
            always applied positionally.
        cv: Iterable of ``(train_indices, valid_indices)`` pairs.
        params: Keyword arguments forwarded to ``lgbm.LGBMRegressor``;
            ``None`` means library defaults.
        verbose: Currently unused; kept so existing callers keep working.

    Returns:
        Tuple ``(oof_pred, models)``: the out-of-fold prediction for every row
        that appeared in a validation fold (rows never validated stay 0.0) and
        the list of fitted models, one per fold.
    """
    if params is None:
        params = {}

    # A pandas Series would be indexed by label in ``y[idx]``; after a shuffled
    # split that silently picks the wrong rows, so force positional indexing.
    y = np.asarray(y)

    models = []
    oof_pred = np.zeros_like(y, dtype=float)

    for i, (idx_train, idx_valid) in enumerate(cv):
        x_train, y_train = X[idx_train], y[idx_train]
        x_valid, y_valid = X[idx_valid], y[idx_valid]

        clf = lgbm.LGBMRegressor(**params)

        with Timer(prefix='fit fold={} '.format(i)):
            clf.fit(x_train, y_train,
                    eval_set=[(x_valid, y_valid)])

        pred_i = clf.predict(x_valid)
        oof_pred[idx_valid] = pred_i
        models.append(clf)
        # Square root of the MSE on the raw values is RMSE (no log is applied
        # here), so label it as such.
        fold_rmse = mean_squared_error(y_valid, pred_i) ** .5
        print(f'Fold {i} RMSE: {fold_rmse:.4f}')
        print()

    score = mean_squared_error(y, oof_pred) ** .5
    print('-' * 50)
    print('FINISHED | Whole RMSE: {:.4f}'.format(score))
    return oof_pred, models

In [None]:
# Keyword arguments passed to lgbm.LGBMRegressor via fit_lgbm.
params = dict(
    objective='rmse',
    learning_rate=0.1,
    reg_lambda=1.0,
    reg_alpha=0.1,
    max_depth=5,
    n_estimators=1000,
    colsample_bytree=0.5,
    min_child_samples=10,
    subsample_freq=3,
    subsample=0.9,
    importance_type='gain',
    random_state=71,
    num_leaves=62,
)

In [None]:
# Wrap the training target Series in a one-column DataFrame so the training
# loop can select targets by column position (ydf.iloc[:, i]).
y = trainY
ydf=pd.DataFrame(y)
display(ydf)

In [None]:
import joblib
from sklearn.model_selection import KFold
import os

# Directory (relative to the working directory) that receives the saved models.
os.makedirs('models', exist_ok=True)

# MODELS[i] holds the list of fold models trained for the i-th column of ydf.
MODELS = []
# ydf has a single column (the 'target' Series), hence range(1).
for i in range(1):
    # 5-fold shuffled split with a fixed seed.
    fold = KFold(n_splits=5, shuffle=True, random_state=71)
    ydfi = ydf.iloc[:, i]
    # Plain array so fit_lgbm indexes the target by position.
    y = np.array(ydfi)
    cv = list(fold.split(train_feat_df, y))
    oof, models = fit_lgbm(train_feat_df.values, y, cv, params=params)
    MODELS += [models]

    # Save each fold model on its own ...
    for fold_idx, model in enumerate(models):
        filename = f'models/model_target{i}_fold{fold_idx}.joblib'
        joblib.dump(model, filename)

    # ... and the whole list of fold models in a single file as well.
    joblib.dump(models, f'models/all_models_target{i}.joblib')
    
    # Scatter of true target values against the out-of-fold predictions.
    fig, ax = plt.subplots(figsize=(6,6))
    ax.set_title(target, fontsize=20)
    ax.set_xlabel('true', fontsize=12)
    ax.set_ylabel('pred', fontsize=12)
    ax.scatter(y, oof, alpha=0.3)

# Visualize Importance

In [None]:
def visualize_importance(models, feat_train_df):
    """Draw a horizontal boxen plot of per-fold feature importances.

    Args:
        models: Non-empty sequence of fitted estimators exposing
            ``feature_importances_`` (one entry per fold).
        feat_train_df: DataFrame whose column names label the importances;
            its columns must line up with each model's ``feature_importances_``.

    Returns:
        Tuple ``(fig, ax)`` of the created matplotlib figure and axes.

    Raises:
        ValueError: If ``models`` is empty.
    """
    # One small frame per fold, concatenated once: avoids re-copying the
    # accumulated frame on every iteration and concatenating onto an empty
    # DataFrame.
    fold_frames = [
        pd.DataFrame({
            'feature_importance': model.feature_importances_,
            'column': feat_train_df.columns,
            'fold': fold_no,
        })
        for fold_no, model in enumerate(models, start=1)
    ]
    if not fold_frames:
        raise ValueError('visualize_importance requires at least one fitted model')
    feature_importance_df = pd.concat(fold_frames, axis=0, ignore_index=True)

    # Show at most the 50 features with the largest importance summed over folds.
    order = (
        feature_importance_df
        .groupby('column')['feature_importance']
        .sum()
        .sort_values(ascending=False)
        .index[:50]
    )

    # Figure height grows with the number of plotted features.
    fig, ax = plt.subplots(figsize=(8, max(6, len(order) * .25)))
    sns.boxenplot(data=feature_importance_df,
                  x='feature_importance',
                  y='column',
                  order=order,
                  ax=ax,
                  palette='viridis',
                  orient='h')

    ax.tick_params(axis='x', rotation=0)
    ax.grid()
    fig.tight_layout()

    return fig, ax

#fig, ax = visualize_importance(models, train_feat_df)

In [None]:
for i in range(1):
    # Plot the fold models already stored in MODELS by the training cell.
    # Re-running fit_lgbm here with the same data, split seed and params
    # would repeat the whole 5-fold training and then overwrite the models
    # just fetched from MODELS.
    models = MODELS[i]

    fig, ax = visualize_importance(models, train_feat_df)
    ax.set_title(target + ' Importance', fontsize=20)

**The results show that the items missing from the test tabular data, 'Species', 'Pre_GSHH_NDVI' and 'Height_Ave_cm', are important for predicting the target.**

    # Confirming Model Structure
    # Reload the list of fold models that the training cell wrote with joblib.dump.
    loaded_models = joblib.load('models/all_models_target0.joblib')
    
    print(type(loaded_models))  # <class 'list'>
    print(len(loaded_models))   # 5 (because n_splits=5)
    
    # Accessing each fold's model
    for fold_idx, model in enumerate(loaded_models):
        print(f"Fold {fold_idx}: {type(model)}")
        # For inference, predict individually with each model
        # NOTE(review): X_test is not defined in the visible part of this
        # notebook; it must be supplied before this snippet can run.
        # pred_fold is overwritten on every iteration, so only the last
        # fold's predictions remain after the loop.
        pred_fold = model.predict(X_test)