In [None]:
import pandas as pd
import numpy as np
import h5py
from sklearn.base import BaseEstimator, TransformerMixin
from tqdm import tqdm
from sklearn.metrics import roc_auc_score
from catboost import CatBoostClassifier
from sklearn.model_selection import StratifiedKFold
import matplotlib.pyplot as plt

In [None]:
# Load the ID lookup table and order it by the `ID` column.
# The index is rebuilt after sorting: pandas aligns on index labels when a
# Series is assigned into another frame, so keeping the pre-sort index would
# silently undo this ordering in any later index-aligned assignment.
id_map = (
    pd.read_csv('id_map.csv')
    .sort_values(by=['ID'], ascending=True)
    .reset_index(drop=True)
)
# Preview only the first rows instead of rendering the whole table.
id_map.head()

In [None]:
# Sample submission file; its `id` column is merged against the predictions later on.
sub = pd.read_csv(filepath_or_buffer='SampleSubmission.csv')
sub.head(n=5)

In [None]:
# Read the image arrays (and the training labels) from the HDF5 files into memory.
train_h5 = "train_data.h5"
test_h5 = "test_data.h5"

# Training file: the 'labels' and 'images' datasets are both materialised as
# NumPy arrays before the file handle is closed.
with h5py.File(train_h5, 'r') as hdf:
    train_label = np.array(hdf['labels'])
    train_data = np.array(hdf['images'])

# Test file: only the 'images' dataset is read.
with h5py.File(test_h5, 'r') as hdf:
    test_data = np.array(hdf['images'])

print(f"Shape of train_data {train_data.shape}")

#### Feature Engineering Version 1 - Summary statistics from the raw images

In [None]:
# Transformer that summarises a central patch of every image channel with simple statistics
class Version_1_FE(BaseEstimator, TransformerMixin):
    """Per-channel summary statistics computed over a square image patch.

    For every sample and every channel, the window
    ``[patch_center - patch_radius, patch_center + patch_radius]`` (inclusive)
    is cut out along both spatial axes and every function in ``features`` is
    applied to it. With the defaults the window is rows/columns 4..12, i.e. a
    9 x 9 patch.

    Parameters
    ----------
    features : dict or None, default=None
        Mapping ``name -> callable``; each callable receives the 2-D patch and
        must return a scalar. ``None`` selects mean, median, std, min and max.
    patch_center : int, default=8
        Row/column index on which the patch is centred.
    patch_radius : int, default=4
        Number of pixels taken on each side of ``patch_center``.

    Attributes
    ----------
    features : dict
        The mapping actually in use (the default set when ``None`` was given).
    """

    def __init__(self, features=None, patch_center=8, patch_radius=4):
        if features is None:
            self.features = {
                'mean': np.mean,
                'median': np.median,
                'std': np.std,
                'min': np.min,
                'max': np.max,
            }
        else:
            self.features = features
        self.patch_center = patch_center
        self.patch_radius = patch_radius

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns ``self`` unchanged."""
        return self

    def transform(self, X):
        """Compute the patch statistics for every sample.

        Parameters
        ----------
        X : array-like of shape (num_samples, height, width, num_channels)

        Returns
        -------
        numpy.ndarray of shape (num_samples, num_channels * len(features))
            Values are laid out channel by channel; within a channel the
            statistics follow the iteration order of ``features``.

        Raises
        ------
        ValueError
            If ``X`` is not 4-dimensional, or if the patch would start at a
            negative index (which NumPy would interpret as counting from the
            end of the axis).
        """
        X = np.asarray(X)
        if X.ndim != 4:
            raise ValueError(
                f"Expected a 4-D array (samples, height, width, channels), got shape {X.shape}"
            )
        num_samples, height, width, num_channels = X.shape

        # The window is identical for every sample and channel, so build it once.
        start = self.patch_center - self.patch_radius
        stop = self.patch_center + self.patch_radius + 1
        if start < 0:
            raise ValueError(
                f"patch_radius={self.patch_radius} exceeds patch_center={self.patch_center}"
            )
        # NumPy clips a slice that runs past the end of an axis, so a window
        # overhanging the image is truncated rather than rejected.
        window = slice(start, stop)

        feature_funcs = list(self.features.values())
        output = np.zeros((num_samples, num_channels * len(feature_funcs)))

        for i, sample in tqdm(enumerate(X), total=num_samples, desc="Processing samples"):
            output[i] = [
                feature_func(sample[window, window, channel])
                for channel in range(num_channels)
                for feature_func in feature_funcs
            ]

        return output

In [None]:
# Apply the version-1 feature engineering to the training images
# (default statistics; `transform` is called directly because `fit` learns nothing).
scf = Version_1_FE(features=None)
train_version_1_data = scf.transform(train_data)

In [None]:
# Apply the version-1 feature engineering to the test images
# (a fresh transformer with the same default statistics as used for train).
scf = Version_1_FE(features=None)
test_version_1_data = scf.transform(test_data)

In [None]:
# Channel index -> band name; the position in the list is the channel index.
bands_dict = dict(enumerate(['Blue', 'Green', 'Red', 'NIR', 'SWIR1', 'SWIR2']))

In [None]:
# TRAIN
# Column names mirror the transformer's output layout: one group per band
# (outer loop), one column per statistic inside the group (inner loop).
col = [
    f"{bands_dict[band]}_{stat}"
    for band in range(train_data.shape[3])
    for stat in scf.features.keys()
]
# Feature matrix plus the label column and a synthetic 'id_<row number>' identifier.
train_dataframe = pd.DataFrame(data=train_version_1_data, columns=col).assign(
    **{
        "class": train_label,
        "id": ['id_' + str(row) for row in range(train_data.shape[0])],
    }
)

In [None]:
# TEST
# create a column for each of the features created for test
col = [f"{bands_dict[i]}_{feature}" for i in range(test_data.shape[3]) for feature in scf.features.keys()]
test_dataframe = pd.DataFrame(test_version_1_data, columns=col)
# Assign the ids positionally. Assigning the Series itself would align on
# index labels, and `id_map` was reordered with `sort_values`, so label
# alignment would pair rows using the pre-sort order instead of the sorted one.
# A length mismatch raises here instead of silently producing NaN ids.
test_dataframe['id'] = id_map['id'].to_numpy()

In [None]:
# Bookkeeping columns that must never be fed to the model.
not_needed_cols = ['id', 'class', 'fold']
# Everything else, in the frame's column order, is a model feature.
train_columns = [
    name for name in train_dataframe.columns
    if name not in not_needed_cols
]
print(train_columns)

#### Modelling

In [None]:
# Tag every training row with the number of the fold in which it is held out.
sgkf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
fold_splits = sgkf.split(
    train_dataframe["id"].values,
    train_dataframe["class"].values,
)
for i, (train_index, test_index) in enumerate(fold_splits):
    # Only the held-out indices matter here; the training part is implied.
    train_dataframe.loc[test_index, "fold"] = i

In [None]:
from sklearn.utils.class_weight import compute_sample_weight
import lightgbm as lgb

# 5-fold cross-validated LightGBM: each fold trains on four parts, validates
# (with early stopping) on the held-out part, and predicts the test set.

# Hyper-parameters are the same for every fold, so they are defined once.
# The boosting-round budget is NOT part of this dict: `lgb.train` lets an
# `n_estimators` entry in params override its `num_boost_round` argument (and
# warns about it), so the previous `n_estimators=2500` was the effective limit
# rather than the `num_round = 5000` passed alongside it. The single value
# below keeps that effective limit.
param = {
    'objective': 'binary',
    'learning_rate': 0.03,
    'num_leaves': 15,
    'reg_alpha': 1,
    'reg_lambda': 7,
    'max_depth': 9,
    'random_state': 42,
    'verbose': -1,
    'num_class': 1,
}
num_round = 2500
early_stopping_rounds = 100

auc_score = 0
fold = 5
test_predictions = []

for i in range(fold):
    train = train_dataframe[train_dataframe['fold'] != i]
    val = train_dataframe[train_dataframe['fold'] == i]

    X_train, y_train = train[train_columns], train['class']
    X_val, y_val = val[train_columns], val['class']

    # Balanced sample weights are computed separately for each split.
    train_sample_weights = compute_sample_weight(class_weight="balanced", y=y_train)
    validation_sample_weights = compute_sample_weight(class_weight="balanced", y=y_val)

    # Named `lgb_train` / `lgb_valid` so the `train_data` image array loaded
    # earlier in the notebook is not overwritten by a LightGBM Dataset.
    lgb_train = lgb.Dataset(X_train, label=y_train, weight=train_sample_weights)
    lgb_valid = lgb.Dataset(X_val, label=y_val, weight=validation_sample_weights)

    model = lgb.train(
        param,
        lgb_train,
        num_round,
        valid_sets=[lgb_valid],
        callbacks=[lgb.early_stopping(stopping_rounds=early_stopping_rounds)],
    )

    pred_prob = model.predict(X_val)

    # evaluation of model (computed once, reused for the running total and the log line)
    fold_auc = roc_auc_score(y_val, pred_prob)
    auc_score += fold_auc
    print(" ")
    print(f'AUC for fold {i+1}: {fold_auc}')
    print(" ")

    # test prediction
    test_preds = model.predict(test_dataframe[train_columns])
    test_predictions.append(test_preds)


# The accumulated metric is ROC AUC, so it is reported as such.
print(f'CV AUC: {auc_score/fold}')


#### Submission

In [49]:
# Average the per-fold test predictions element-wise (axis 0 runs over folds)
# and store the result as the predicted class score.
mean_test_preds = np.asarray(test_predictions).mean(axis=0)
test_dataframe['class'] = mean_test_preds

In [50]:
# Join the predictions onto the ids listed in the sample submission
# (default inner join on 'id', left side limited to the 'id' column).
merged_submmission = pd.merge(sub[['id']], test_dataframe, on='id')

In [51]:
# Write only the 'id' and 'class' columns, without the index, to the submission file.
merged_submmission.to_csv('second_sub.csv', columns=['id', 'class'], index=False)

#### Feature Importance

In [None]:
# Gain-based feature importance of `model`, i.e. the booster trained in the
# last iteration of the cross-validation loop.
lgb.plot_importance(booster=model, figsize=(20, 10), importance_type='gain')
