In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import RobustScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, GridSearchCV, train_test_split
from sklearn.feature_selection import RFECV
from sklearn.metrics import classification_report
from imblearn.combine import SMOTETomek
from imblearn.pipeline import make_pipeline as make_imb_pipeline

# Load data
train = pd.read_csv('/content/hacktrain.csv')
test = pd.read_csv('/content/hacktest.csv')

print("Train shape:", train.shape)
print("Test shape:", test.shape)
print("\nClass distribution:")
print(train['class'].value_counts())

# Identify NDVI columns
ndvi_cols = [col for col in train.columns if col.endswith('_N')]

# Fill missing values with median
train[ndvi_cols] = train[ndvi_cols].fillna(train[ndvi_cols].median())
test[ndvi_cols] = test[ndvi_cols].fillna(test[ndvi_cols].median())

# Verify no missing values remain
print("\nRemaining missing values in train:", train[ndvi_cols].isnull().sum().sum())
print("Remaining missing values in test:", test[ndvi_cols].isnull().sum().sum())


Train shape: (8000, 30)
Test shape: (2845, 29)

Class distribution:
class
forest        6159
farm           841
impervious     669
grass          196
water          105
orchard         30
Name: count, dtype: int64

Remaining missing values in train: 0
Remaining missing values in test: 0


In [3]:
def enhanced_features(df, ndvi_cols):
    features = pd.DataFrame()
    features['ID'] = df['ID']

    # Basic statistics
    stats = ['mean', 'std', 'min', 'max', 'median', 'skew', 'kurtosis']
    for stat in stats:
        features[f'ndvi_{stat}'] = getattr(df[ndvi_cols], stat)(axis=1)

    features['ndvi_range'] = features['ndvi_max'] - features['ndvi_min']
    features['ndvi_iqr'] = df[ndvi_cols].quantile(0.75, axis=1) - df[ndvi_cols].quantile(0.25, axis=1)

    # Polynomial fit coefficients
    features['ndvi_trend'] = df[ndvi_cols].apply(lambda x: np.polyfit(range(len(x)), x, 1)[0], axis=1)
    features['ndvi_curve'] = df[ndvi_cols].apply(lambda x: np.polyfit(range(len(x)), x, 2)[0], axis=1)

    # Seasonal differences and ratios
    for i in range(len(ndvi_cols) - 1):
        features[f'ndvi_diff_{i}'] = df[ndvi_cols[i+1]] - df[ndvi_cols[i]]
        features[f'ndvi_ratio_{i}'] = df[ndvi_cols[i+1]] / (df[ndvi_cols[i]] + 1e-6)

    # Rolling statistics (3-month windows)
    window_size = 3
    for i in range(len(ndvi_cols) - window_size + 1):
        window_cols = ndvi_cols[i:i+window_size]
        features[f'ndvi_rolling_mean_{i}'] = df[window_cols].mean(axis=1)
        features[f'ndvi_rolling_std_{i}'] = df[window_cols].std(axis=1)

    # Fourier transforms (first 3 frequency components)
    fft_values = np.abs(np.fft.rfft(df[ndvi_cols], axis=1))
    for i in range(1, min(4, fft_values.shape[1])):
        features[f'ndvi_fft_{i}'] = fft_values[:, i]

    # Percentiles
    for q in [0.1, 0.25, 0.75, 0.9]:
        features[f'ndvi_q{int(q*100)}'] = df[ndvi_cols].quantile(q=q, axis=1)

    # Replace inf/nan if any
    features.replace([np.inf, -np.inf], np.nan, inplace=True)
    features.fillna(0, inplace=True)

    return features

# Generate features
X_train = enhanced_features(train, ndvi_cols)
X_test = enhanced_features(test, ndvi_cols)
y_train = train['class']

print("\nGenerated features:", list(X_train.columns))
print("Total features (excluding ID):", len(X_train.columns) - 1)


  features[f'ndvi_rolling_mean_{i}'] = df[window_cols].mean(axis=1)
  features[f'ndvi_rolling_std_{i}'] = df[window_cols].std(axis=1)
  features[f'ndvi_rolling_mean_{i}'] = df[window_cols].mean(axis=1)
  features[f'ndvi_rolling_std_{i}'] = df[window_cols].std(axis=1)
  features[f'ndvi_rolling_mean_{i}'] = df[window_cols].mean(axis=1)
  features[f'ndvi_rolling_std_{i}'] = df[window_cols].std(axis=1)
  features[f'ndvi_rolling_mean_{i}'] = df[window_cols].mean(axis=1)
  features[f'ndvi_rolling_std_{i}'] = df[window_cols].std(axis=1)
  features[f'ndvi_rolling_mean_{i}'] = df[window_cols].mean(axis=1)
  features[f'ndvi_rolling_std_{i}'] = df[window_cols].std(axis=1)
  features[f'ndvi_rolling_mean_{i}'] = df[window_cols].mean(axis=1)
  features[f'ndvi_rolling_std_{i}'] = df[window_cols].std(axis=1)
  features[f'ndvi_rolling_mean_{i}'] = df[window_cols].mean(axis=1)
  features[f'ndvi_rolling_std_{i}'] = df[window_cols].std(axis=1)
  features[f'ndvi_fft_{i}'] = fft_values[:, i]
  features[f'nd


Generated features: ['ID', 'ndvi_mean', 'ndvi_std', 'ndvi_min', 'ndvi_max', 'ndvi_median', 'ndvi_skew', 'ndvi_kurtosis', 'ndvi_range', 'ndvi_iqr', 'ndvi_trend', 'ndvi_curve', 'ndvi_diff_0', 'ndvi_ratio_0', 'ndvi_diff_1', 'ndvi_ratio_1', 'ndvi_diff_2', 'ndvi_ratio_2', 'ndvi_diff_3', 'ndvi_ratio_3', 'ndvi_diff_4', 'ndvi_ratio_4', 'ndvi_diff_5', 'ndvi_ratio_5', 'ndvi_diff_6', 'ndvi_ratio_6', 'ndvi_diff_7', 'ndvi_ratio_7', 'ndvi_diff_8', 'ndvi_ratio_8', 'ndvi_diff_9', 'ndvi_ratio_9', 'ndvi_diff_10', 'ndvi_ratio_10', 'ndvi_diff_11', 'ndvi_ratio_11', 'ndvi_diff_12', 'ndvi_ratio_12', 'ndvi_diff_13', 'ndvi_ratio_13', 'ndvi_diff_14', 'ndvi_ratio_14', 'ndvi_diff_15', 'ndvi_ratio_15', 'ndvi_diff_16', 'ndvi_ratio_16', 'ndvi_diff_17', 'ndvi_ratio_17', 'ndvi_diff_18', 'ndvi_ratio_18', 'ndvi_diff_19', 'ndvi_ratio_19', 'ndvi_diff_20', 'ndvi_ratio_20', 'ndvi_diff_21', 'ndvi_ratio_21', 'ndvi_diff_22', 'ndvi_ratio_22', 'ndvi_diff_23', 'ndvi_ratio_23', 'ndvi_diff_24', 'ndvi_ratio_24', 'ndvi_diff_25', 'nd

  features[f'ndvi_rolling_mean_{i}'] = df[window_cols].mean(axis=1)
  features[f'ndvi_rolling_std_{i}'] = df[window_cols].std(axis=1)
  features[f'ndvi_rolling_mean_{i}'] = df[window_cols].mean(axis=1)
  features[f'ndvi_rolling_std_{i}'] = df[window_cols].std(axis=1)
  features[f'ndvi_rolling_mean_{i}'] = df[window_cols].mean(axis=1)
  features[f'ndvi_rolling_std_{i}'] = df[window_cols].std(axis=1)
  features[f'ndvi_rolling_mean_{i}'] = df[window_cols].mean(axis=1)
  features[f'ndvi_rolling_std_{i}'] = df[window_cols].std(axis=1)
  features[f'ndvi_rolling_mean_{i}'] = df[window_cols].mean(axis=1)
  features[f'ndvi_rolling_std_{i}'] = df[window_cols].std(axis=1)
  features[f'ndvi_rolling_mean_{i}'] = df[window_cols].mean(axis=1)
  features[f'ndvi_rolling_std_{i}'] = df[window_cols].std(axis=1)
  features[f'ndvi_rolling_mean_{i}'] = df[window_cols].mean(axis=1)
  features[f'ndvi_rolling_std_{i}'] = df[window_cols].std(axis=1)
  features[f'ndvi_fft_{i}'] = fft_values[:, i]
  features[f'nd

In [4]:
# Drop ID before training
X_train_model = X_train.drop(columns=['ID'])
X_test_model = X_test.drop(columns=['ID'])

# Train-test split
X_tr, X_val, y_tr, y_val = train_test_split(X_train_model, y_train, test_size=0.2, stratify=y_train, random_state=42)

# Apply robust scaling
scaler = RobustScaler()
X_tr_scaled = scaler.fit_transform(X_tr)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test_model)


In [5]:
# Handle Class Imbalance using SMOTETomek
smt = SMOTETomek(random_state=42)
X_resampled, y_resampled = smt.fit_resample(X_tr_scaled, y_tr)
print("After resampling class distribution:\n", pd.Series(y_resampled).value_counts())

After resampling class distribution:
 class
forest        4927
farm          4927
water         4927
impervious    4927
grass         4927
orchard       4927
Name: count, dtype: int64


In [6]:
# Train a Logistic Regression Model and Evaluate
clf = LogisticRegression(max_iter=1000, class_weight='balanced', random_state=42)
clf.fit(X_resampled, y_resampled)
y_pred = clf.predict(X_val_scaled)

print("\nValidation Classification Report:")
print(classification_report(y_val, y_pred))



Validation Classification Report:
              precision    recall  f1-score   support

        farm       0.64      0.79      0.71       168
      forest       0.98      0.88      0.92      1232
       grass       0.37      0.67      0.48        39
  impervious       0.77      0.81      0.79       134
     orchard       0.11      0.83      0.19         6
       water       0.41      0.57      0.48        21

    accuracy                           0.85      1600
   macro avg       0.55      0.76      0.60      1600
weighted avg       0.90      0.85      0.87      1600



ABNORMAL: .

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [7]:
# Cross-validation and Feature Selection with RFECV
selector = RFECV(clf, step=5, cv=3, scoring='f1_macro', n_jobs=-1)
selector = selector.fit(X_resampled, y_resampled)

print("\nOptimal number of features:", selector.n_features_)
print("Selected features mask:", selector.support_)


ABNORMAL: .

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(



Optimal number of features: 115
Selected features mask: [ True  True  True  True  True  True  True  True  True  True  True  True
  True  True False  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True False  True  True  True  True  True  True  True  True  True
  True  True False  True  True  True  True  True  True  True  True  True
 False  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True False  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True]


In [10]:
# Transform the resampled training data with the selector
X_resampled_selected = selector.transform(X_resampled)

# Retrain the classifier on selected features only
clf_selected = LogisticRegression(max_iter=1000, class_weight='balanced', random_state=42)
clf_selected.fit(X_resampled_selected, y_resampled)

# Transform the test data with the selector
X_test_selected = selector.transform(X_test_scaled)

# Predict using the retrained classifier
final_predictions = clf_selected.predict(X_test_selected)

# Save to submission format
submission = pd.DataFrame({'ID': X_test['ID'], 'class': final_predictions})
submission.to_csv('submission.csv', index=False)
print("Submission file saved as 'submission.csv'")


Submission file saved as 'submission.csv'
