In [1]:
import pandas as pd # type: ignore
import numpy as np # type: ignore
from sklearn.metrics import f1_score # type: ignore
from sklearn.utils.class_weight import compute_class_weight # type: ignore
from sklearn.model_selection import StratifiedKFold # type: ignore
import lightgbm as lgb # type: ignore
import warnings
import random

seed = 42
n_splits = 5  # Number of folds
random.seed(42)
np.random.seed(42)
warnings.filterwarnings('ignore')


In [2]:
# Load files
train = pd.read_csv('Train.csv',parse_dates=['time'])
test = pd.read_csv('Test.csv',parse_dates=['time'])

In [3]:
train=train.dropna(subset=['Target'])

In [4]:
# Aggregate the train dataset by 'ID' using mean, std, min, and max for each column
train_aggregated_df = train.groupby('ID').agg(['mean', 'std', 'min', 'max']).reset_index()

# Flatten the MultiIndex columns
train_aggregated_df.columns = ['_'.join(col).strip('_') for col in train_aggregated_df.columns]

# Display the first few rows
train_aggregated_df.head()

Unnamed: 0,ID,time_mean,time_std,time_min,time_max,Green_mean,Green_std,Green_min,Green_max,Blue_mean,...,NDWI_min,NDWI_max,CI_mean,CI_std,CI_min,CI_max,Target_mean,Target_std,Target_min,Target_max
0,1D_0000,2022-11-30 22:21:56.129032192,404 days 17:47:06.499532872,2021-01-04,2024-10-30,0.265979,0.216543,0.064,0.8526,0.286144,...,-0.562095,0.007948,0.051841,0.07248,-0.056774,0.284581,0.0,0.0,0.0,0.0
1,1D_0001,2022-11-30 22:21:56.129032192,404 days 17:47:06.499532872,2021-01-04,2024-10-30,0.270399,0.208321,0.0638,0.8756,0.289762,...,-0.617116,0.026412,0.048843,0.071093,-0.045155,0.236865,0.0,0.0,0.0,0.0
2,1D_0002,2022-11-30 22:21:56.129032192,404 days 17:47:06.499532872,2021-01-04,2024-10-30,0.266084,0.21604,0.0627,0.861,0.286621,...,-0.582704,-0.001007,0.049452,0.071069,-0.080468,0.300748,0.0,0.0,0.0,0.0
3,1D_0003,2022-11-30 22:21:56.129032192,404 days 17:47:06.499532872,2021-01-04,2024-10-30,0.266266,0.216637,0.0642,0.8625,0.286324,...,-0.549183,-0.00266,0.048114,0.066958,-0.040092,0.24704,0.0,0.0,0.0,0.0
4,1D_0004,2022-11-30 22:21:56.129032192,404 days 17:47:06.499532872,2021-01-04,2024-10-30,0.267871,0.215886,0.0632,0.8804,0.287685,...,-0.57524,-0.006099,0.050417,0.072362,-0.045661,0.261166,0.0,0.0,0.0,0.0


In [5]:
train_aggregated_df.drop(['Target_std','Target_min','Target_max'],axis=1,inplace=True)

In [6]:
# Aggregate the test dataset by 'ID' using mean, std, min, and max for each column
test_aggregated_df = test.groupby('ID').agg(['mean', 'std', 'min', 'max']).reset_index()

# Flatten the MultiIndex columns
test_aggregated_df.columns = ['_'.join(col).strip('_') for col in test_aggregated_df.columns]

# Display the first few rows
test_aggregated_df.head()


Unnamed: 0,ID,time_mean,time_std,time_min,time_max,Green_mean,Green_std,Green_min,Green_max,Blue_mean,...,NDMI_min,NDMI_max,NDWI_mean,NDWI_std,NDWI_min,NDWI_max,CI_mean,CI_std,CI_min,CI_max
0,1D_0005,2022-11-30 22:21:56.129032192,404 days 17:47:06.499532872,2021-01-04,2024-10-30,0.269574,0.20895,0.0659,0.9038,0.289363,...,-0.087035,0.610293,-0.222745,0.156857,-0.552954,0.007155,0.050226,0.069529,-0.04409,0.270099
1,1D_000A,2022-11-30 22:21:56.129032192,404 days 17:47:06.499532872,2021-01-04,2024-10-30,0.271481,0.216468,0.0643,0.9324,0.290375,...,-0.069135,0.611658,-0.275098,0.199968,-0.658319,-0.004489,0.061182,0.083192,-0.051211,0.314374
2,1D_000D,2022-11-30 22:21:56.129032192,404 days 17:47:06.499532872,2021-01-04,2024-10-30,0.271598,0.214323,0.0642,0.898,0.292014,...,-0.060208,0.612928,-0.238293,0.170185,-0.600428,-0.005743,0.046739,0.066896,-0.040166,0.233684
3,1D_000E,2022-11-30 22:21:56.129032192,404 days 17:47:06.499532872,2021-01-04,2024-10-30,0.276286,0.215715,0.0665,0.9605,0.293892,...,-0.011673,0.612466,-0.249908,0.180061,-0.613776,0.001104,0.050191,0.071297,-0.039916,0.270006
4,1D_0015,2022-11-30 22:21:56.129032192,404 days 17:47:06.499532872,2021-01-04,2024-10-30,0.277222,0.214505,0.0686,0.828,0.294147,...,-0.044343,0.610018,-0.271707,0.200442,-0.658626,0.002996,0.064108,0.086831,-0.060134,0.318731


In [7]:
to_drop=['time_mean','time_std','time_min','time_max']

In [8]:
train_aggregated_df.drop(columns=to_drop, inplace=True)
test_aggregated_df.drop(columns=to_drop,inplace=True)

In [9]:
train_aggregated_df.shape

(26197, 70)

In [10]:
test_aggregated_df.shape

(16960, 69)

In [11]:
ID=test_aggregated_df['ID']
y=train_aggregated_df['Target_mean']

X=train_aggregated_df.drop(['ID','Target_mean'],axis=1)
test=test_aggregated_df.drop(['ID'],axis=1)

In [12]:
X.shape

(26197, 68)

In [13]:
y.value_counts()

Target_mean
0.0    11913
2.0     9160
1.0     5124
Name: count, dtype: int64

In [14]:
# Compute class weights dynamically
class_weights_array = compute_class_weight(class_weight="balanced", classes=np.unique(y), y=y)

# Convert to dictionary
class_weights = {cls: weight for cls, weight in zip(np.unique(y), class_weights_array)}

print("Computed class weights:", class_weights)


Computed class weights: {0.0: 0.7330087579395058, 1.0: 1.7042024460057248, 2.0: 0.9533114992721979}


In [16]:
# LightGBM parameters
params = {
    "objective": "multiclass",
    "num_class": 3,
    "metric": "multi_logloss",
    "boosting_type": "gbdt",
    "num_leaves": 107,
    "learning_rate": 0.2,
    "lambda_l1": 0.08622519951897417,
    "lambda_l2": 0.33687324883916836,
    "min_data_in_leaf": 207,
    "feature_fraction": 0.7506850717273278,
    "bagging_fraction": 0.8093949846609051,
    "bagging_freq": 4,
}

# Class weights
class_weights = {0: 0.733, 1: 1.704, 2: 0.953}
sample_weights = y.map(class_weights).values

# Stratified K-Fold
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)

# Store predictions
test_preds_total = np.zeros((test.shape[0], 3))  # To store test predictions
val_f1_scores = []

for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
    print(f"\nTraining Fold {fold+1}/{n_splits}...")

    # Split the data
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
    sw_train, sw_val = sample_weights[train_idx], sample_weights[val_idx]

    # Create LightGBM datasets
    train_data = lgb.Dataset(X_train, label=y_train, weight=sw_train)
    val_data = lgb.Dataset(X_val, label=y_val, reference=train_data)

    # Train the model
    model = lgb.train(
        params,
        train_data,
        valid_sets=[val_data],
        num_boost_round=5000,
        callbacks=[
            lgb.early_stopping(stopping_rounds=100),
            lgb.log_evaluation(period=100),
        ],
    )

    # Validation predictions
    val_preds = model.predict(X_val).argmax(axis=1)
    f1 = f1_score(y_val, val_preds, average="macro")
    val_f1_scores.append(f1)
    print(f"Fold {fold+1} F1 Score: {f1:.4f}")

    # Test predictions (soft voting by averaging probabilities)
    test_preds_total += model.predict(test) / n_splits

# Convert test predictions to class labels
test_preds_final = test_preds_total.argmax(axis=1)

# Final F1 score
print(f"\nAverage Validation F1 Score: {np.mean(val_f1_scores):.4f}")

# Create submission
submission = pd.DataFrame({
    'ID': ID,  # Replace with the actual test ID column
    'Target': test_preds_final
})

# Save to CSV
submission.to_csv('submission.csv', index=False)
print("Submission file saved as 'submission.csv'.")



Training Fold 1/5...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005357 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 17083
[LightGBM] [Info] Number of data points in the train set: 20957, number of used features: 68
[LightGBM] [Info] Start training from score -1.098483
[LightGBM] [Info] Start training from score -1.098597
[LightGBM] [Info] Start training from score -1.098756
Training until validation scores don't improve for 100 rounds
[100]	valid_0's multi_logloss: 0.000443194
[200]	valid_0's multi_logloss: 0.00040671
Early stopping, best iteration is:
[163]	valid_0's multi_logloss: 0.000406126
Fold 1 F1 Score: 0.9998

Training Fold 2/5...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004648 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 17081
[LightGBM] [Info] Number of data points in the train set: 