In [3]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import xgboost as xgb
import os

# Input CSVs are looked up relative to the current working directory.
train_file_path = "hacktrain.csv"
test_file_path = "hacktest.csv"

# Load both datasets. Any failure is reported with a short hint and then
# re-raised, so the cell stops with a real traceback. The previous handlers
# called exit(), which is the interactive-shell helper: in a notebook it asks
# the kernel to shut down, discarding every variable along with the error
# context.
try:
    if not os.path.exists(train_file_path):
        raise FileNotFoundError(f"Training data not found at: {train_file_path}")
    train_df = pd.read_csv(train_file_path)
    print(f"Successfully loaded training data from {train_file_path}. Shape: {train_df.shape}")

    if not os.path.exists(test_file_path):
        raise FileNotFoundError(f"Test data not found at: {test_file_path}")
    test_df = pd.read_csv(test_file_path)
    print(f"Successfully loaded test data from {test_file_path}. Shape: {test_df.shape}")

except FileNotFoundError as e:
    print(f"Error: {e}")
    print("Please ensure 'hacktrain.csv' and 'hacktest.csv' are in the same directory.")
    raise
except Exception as e:
    # Parsing / permission / encoding problems end up here; keep the original
    # exception (and its traceback) instead of hiding it behind a message.
    print(f"An unexpected error occurred during file loading: {e}")
    raise

# Remove the "Unnamed: 0" column from each frame when it is present
# (presumably a row index that was written out with the CSV — it is dropped
# before any feature is derived).
if train_df.columns.isin(["Unnamed: 0"]).any():
    train_df = train_df.drop("Unnamed: 0", axis=1)
    print("Dropped 'Unnamed: 0' column from training data.")
if test_df.columns.isin(["Unnamed: 0"]).any():
    test_df = test_df.drop("Unnamed: 0", axis=1)
    print("Dropped 'Unnamed: 0' column from test data.")

# Quick look at the cleaned training frame.
print("\nFirst 5 rows of training data:")
print(train_df.head(5))

# Every column whose name ends in "_N" is treated as an NDVI measurement;
# the list is taken from the training frame and reused for the test frame.
ndvi_cols = list(filter(lambda name: name.endswith('_N'), train_df.columns))
print(f"\nIdentified {len(ndvi_cols)} NDVI columns.")

def add_statistical_features(df, ndvi_columns=None):
    """Return a copy of ``df`` with row-wise NDVI summary columns appended.

    The input frame is left untouched; the new columns are added with
    ``DataFrame.assign`` so re-running the cell (or passing a slice of another
    frame) never modifies data the caller still holds.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the NDVI columns.
    ndvi_columns : sequence of str, optional
        Names of the NDVI columns to summarise. Defaults to the module-level
        ``ndvi_cols`` list, which keeps existing one-argument calls working.
        Characters 4-5 of each name are read as the two-digit month
        (assumes names shaped like ``YYYYMMDD_N`` — confirm for new data).

    Returns
    -------
    pandas.DataFrame
        ``df`` plus ``ndvi_mean``, ``ndvi_std``, ``ndvi_min``, ``ndvi_max``,
        ``ndvi_range``, ``ndvi_spring_mean`` (months 03-05),
        ``ndvi_summer_mean`` (06-08) and ``ndvi_autumn_mean`` (09-11).
        Missing values are skipped by the pandas reductions, so a statistic is
        NaN only when a row has no usable values for it.
    """
    columns = list(ndvi_cols if ndvi_columns is None else ndvi_columns)
    print(f"  Adding statistical features to a DataFrame with {df.shape[0]} rows...")

    ndvi = df[columns]

    def _season_mean(months):
        # Row-wise mean over the columns whose month code is in `months`.
        picked = [name for name in columns if name[4:6] in months]
        return ndvi[picked].mean(axis=1)

    row_min = ndvi.min(axis=1)
    row_max = ndvi.max(axis=1)
    enriched = df.assign(
        ndvi_mean=ndvi.mean(axis=1),
        ndvi_std=ndvi.std(axis=1),
        ndvi_min=row_min,
        ndvi_max=row_max,
        ndvi_range=row_max - row_min,
        ndvi_spring_mean=_season_mean(('03', '04', '05')),
        ndvi_summer_mean=_season_mean(('06', '07', '08')),
        ndvi_autumn_mean=_season_mean(('09', '10', '11')),
    )
    print("  Statistical features added successfully.")
    return enriched

# --- Feature engineering -----------------------------------------------------
print("\nPerforming feature engineering on training data...")
train_df = add_statistical_features(train_df)
print("Performing feature engineering on test data...")
test_df = add_statistical_features(test_df)

# Row-wise summary columns produced by add_statistical_features().
additional_features = ['ndvi_mean','ndvi_std','ndvi_min','ndvi_max','ndvi_range','ndvi_spring_mean','ndvi_summer_mean','ndvi_autumn_mean']

# --- Target encoding ---------------------------------------------------------
print("Encoding target variable 'class'...")
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(train_df["class"])
print(f"Original classes: {label_encoder.classes_}")
print("Target variable encoded.")

# --- Train / validation split ------------------------------------------------
# The split is decided BEFORE any preprocessing is fitted. Previously the
# imputer medians and scaler statistics were learned from all training rows,
# so the validation rows leaked into preprocessing and the reported
# validation accuracy was optimistic. Row positions are split (instead of the
# feature matrix); the partition depends only on the labels and random_state.
print("\nSplitting data into training and validation sets (80/20 split)...")
train_rows, val_rows, y_train_subset, y_val_subset = train_test_split(
    np.arange(len(train_df)), y_encoded, test_size=0.2, stratify=y_encoded, random_state=42
)

# --- Imputation --------------------------------------------------------------
# Medians come from the training rows only and are then applied everywhere.
print("\nImputing missing values using median strategy...")
imputer = SimpleImputer(strategy="median")
imputer.fit(train_df[ndvi_cols].iloc[train_rows])
X_train_ndvi_imputed = imputer.transform(train_df[ndvi_cols])
X_test_ndvi_imputed = imputer.transform(test_df[ndvi_cols])
print("Missing values in NDVI columns imputed.")

# Imputed NDVI values followed by the (un-imputed) summary statistics.
X_train_combined_features = np.hstack([X_train_ndvi_imputed, train_df[additional_features].values])
X_test_combined_features = np.hstack([X_test_ndvi_imputed, test_df[additional_features].values])
print(f"Combined features for training: {X_train_combined_features.shape}")
print(f"Combined features for testing: {X_test_combined_features.shape}")

# --- Scaling -----------------------------------------------------------------
# Same rule as the imputer: fit on training rows, transform all rows.
print("\nScaling features using StandardScaler...")
scaler = StandardScaler()
scaler.fit(X_train_combined_features[train_rows])
X_train_scaled = scaler.transform(X_train_combined_features)
X_test_scaled = scaler.transform(X_test_combined_features)
print("Features scaled successfully.")

X_train_subset = X_train_scaled[train_rows]
X_val_subset = X_train_scaled[val_rows]
print(f"Training subset shape: {X_train_subset.shape}, Validation subset shape: {X_val_subset.shape}")

# --- Model -------------------------------------------------------------------
print("Initializing and training XGBoost model...")
xgb_model = xgb.XGBClassifier(n_estimators=100, max_depth=6, learning_rate=0.1, random_state=42, eval_metric='mlogloss')
xgb_model.fit(X_train_subset, y_train_subset)
print("Model training complete.")

# Hold-out accuracy on rows that influenced neither the model nor the
# preprocessing statistics.
val_preds_encoded = xgb_model.predict(X_val_subset)
val_accuracy = accuracy_score(y_val_subset, val_preds_encoded)
print(f"\nValidation Accuracy: {val_accuracy:.4f}")

# Where the submission CSV is written (relative to the working directory).
submission_output_path = "submission.csv"

# Predict encoded class ids for the test rows, then map them back to the
# original class labels with the fitted encoder.
print("\nMaking predictions on the test dataset...")
test_preds_encoded = xgb_model.predict(X_test_scaled)
test_labels_decoded = label_encoder.inverse_transform(test_preds_encoded)
print("Predictions generated and decoded.")

# Two-column submission: the test IDs next to their predicted labels.
print("Preparing submission file...")
submission_df = test_df[["ID"]].assign(**{"class": test_labels_decoded})
submission_df.to_csv(submission_output_path, index=False)
print(f"\nSubmission file '{submission_output_path}' has been saved and is ready for submission!")
print("Script finished successfully.")


Successfully loaded training data from hacktrain.csv. Shape: (8000, 30)
Successfully loaded test data from hacktest.csv. Shape: (2845, 29)
Dropped 'Unnamed: 0' column from training data.
Dropped 'Unnamed: 0' column from test data.

First 5 rows of training data:
   ID  class  20150720_N  20150602_N  20150517_N  20150501_N  20150415_N  \
0   1  water    637.5950     658.668   -1882.030    -1924.36     997.904   
1   2  water    634.2400     593.705   -1625.790    -1672.32     914.198   
2   4  water     58.0174   -1599.160         NaN    -1052.63         NaN   
3   5  water     72.5180         NaN     380.436    -1256.93     515.805   
4   8  water   1136.4400         NaN         NaN     1647.83    1935.800   

   20150330_N  20150314_N  20150226_N  ...  20140610_N  20140525_N  \
0   -1739.990     630.087         NaN  ...         NaN   -1043.160   
1    -692.386     707.626   -1670.590  ...         NaN    -933.934   
2   -1564.630         NaN     729.790  ...    -1025.88     368.622   
