In [2]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, RobustScaler
from sklearn.metrics import accuracy_score

# Load CSVs
train_df = pd.read_csv('hacktrain.csv')
test_df = pd.read_csv('hacktest.csv')

# Extract target and features
y_raw = train_df['class']
X_train_raw = train_df.drop(['ID', 'class'], axis=1)
X_test_raw = test_df.drop(['ID'], axis=1)
test_ids = test_df['ID']


In [3]:
def clean_ndvi_row(row):
    row = row.ffill().bfill()
    q1, q3 = row.quantile(0.25), row.quantile(0.75)
    iqr = q3 - q1
    lower, upper = q1 - 1.5 * iqr, q3 + 1.5 * iqr
    median = row.median()
    return row.apply(lambda x: median if x < lower or x > upper else x)

# Train: aggressive denoising
X_train_cleaned = X_train_raw.apply(clean_ndvi_row, axis=1)

# Test: only fill NaNs
X_test_cleaned = X_test_raw.apply(lambda row: row.ffill().bfill(), axis=1)


In [4]:
def extract_features(df):
    feat = pd.DataFrame(index=df.index)

    feat['mean'] = df.mean(axis=1)
    feat['median'] = df.median(axis=1)
    feat['std'] = df.std(axis=1)
    feat['min'] = df.min(axis=1)
    feat['max'] = df.max(axis=1)
    feat['range'] = feat['max'] - feat['min']
    feat['iqr'] = df.quantile(0.75, axis=1) - df.quantile(0.25, axis=1)

    # Trend across full season
    feat['trend'] = df.apply(lambda row: np.polyfit(range(len(row)), row, 1)[0], axis=1)

    # Early vs late season change
    feat['start'] = df.iloc[:, :5].mean(axis=1)
    feat['end'] = df.iloc[:, -5:].mean(axis=1)
    feat['season_diff'] = feat['end'] - feat['start']

    # Slope shift: early half vs late half
    def slope(row, start, end):
        return np.polyfit(range(start, end), row[start:end], 1)[0]

    feat['early_slope'] = df.apply(lambda row: slope(row, 0, len(row)//2), axis=1)
    feat['late_slope'] = df.apply(lambda row: slope(row, len(row)//2, len(row)), axis=1)
    feat['slope_diff'] = feat['late_slope'] - feat['early_slope']

    # Peaks (abrupt changes — indicates vegetation cycles or clouds)
    feat['n_peaks'] = df.apply(lambda row: np.sum(np.abs(np.diff(row)) > 0.15), axis=1)

    return feat

# Extract features
X_train_feat = extract_features(X_train_cleaned)
X_test_feat = extract_features(X_test_cleaned)


In [6]:
# Encode target classes
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y_raw)

# Scale to protect against outliers
scaler = RobustScaler()
X_train_scaled = scaler.fit_transform(X_train_feat)
X_test_scaled = scaler.transform(X_test_feat)

# Stratified split for validation accuracy check
X_tr, X_val, y_tr, y_val = train_test_split(
    X_train_scaled, y_encoded,
    test_size=0.2, stratify=y_encoded, random_state=42
)


In [7]:
# Best tuned LR for generalization
model = LogisticRegression(
    multi_class='multinomial',
    solver='lbfgs',
    C=10,                        # less regularization → better fit
    max_iter=2000,              # allow convergence
    class_weight='balanced',    # handles noisy or imbalanced labels
    random_state=42
)

model.fit(X_tr, y_tr)

# Validate
val_preds = model.predict(X_val)
val_acc = accuracy_score(y_val, val_preds)
print(f"✅ Validation Accuracy: {val_acc:.4f}")




✅ Validation Accuracy: 0.7987


In [8]:
# Final test predictions
test_preds = model.predict(X_test_scaled)
test_labels = label_encoder.inverse_transform(test_preds)

# Save submission
submission = pd.DataFrame({
    'ID': test_ids,
    'class': test_labels
})
submission.to_csv('submission3.csv', index=False)
print("📁 Submission saved as 'submission.csv'")


📁 Submission saved as 'submission.csv'
