In [11]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.metrics import accuracy_score, classification_report
from sklearn.impute import SimpleImputer
import warnings
warnings.filterwarnings('ignore')

class NDVILandCoverClassifier:
    def __init__(self):
        self.scaler = StandardScaler()
        self.imputer = SimpleImputer(strategy='median')
        self.model = LogisticRegression(
            max_iter=1000,
            random_state=42,
            class_weight='balanced'
        )
        self.feature_names = []
        
    def create_features(self, df):
        """Create comprehensive features from NDVI time series"""
        features = pd.DataFrame()
        
        # Get NDVI columns (exclude ID and class if present)
        ndvi_cols = [col for col in df.columns if col.endswith('_N')]
        ndvi_data = df[ndvi_cols].copy()
        
        # 1. Basic statistical features
        features['mean_ndvi'] = ndvi_data.mean(axis=1)
        features['std_ndvi'] = ndvi_data.std(axis=1)
        features['median_ndvi'] = ndvi_data.median(axis=1)
        features['min_ndvi'] = ndvi_data.min(axis=1)
        features['max_ndvi'] = ndvi_data.max(axis=1)
        features['range_ndvi'] = features['max_ndvi'] - features['min_ndvi']
        features['cv_ndvi'] = features['std_ndvi'] / (features['mean_ndvi'] + 1e-8)
        
        # 2. Percentile features
        for p in [25, 75, 90]:
            features[f'p{p}_ndvi'] = ndvi_data.quantile(p/100, axis=1)
        
        # 3. Temporal trend features
        time_points = np.arange(len(ndvi_cols))
        trends = []
        for idx in range(len(df)):
            values = ndvi_data.iloc[idx].values
            valid_mask = ~np.isnan(values)
            if np.sum(valid_mask) > 2:
                slope = np.polyfit(time_points[valid_mask], values[valid_mask], 1)[0]
            else:
                slope = 0
            trends.append(slope)
        features['trend_slope'] = trends
        
        # 4. Seasonal features (assuming roughly monthly data)
        # Split into seasons (approximate)
        season_size = len(ndvi_cols) // 4
        for i in range(4):
            start_idx = i * season_size
            end_idx = (i + 1) * season_size if i < 3 else len(ndvi_cols)
            season_cols = ndvi_cols[start_idx:end_idx]
            features[f'season_{i}_mean'] = ndvi_data[season_cols].mean(axis=1)
            features[f'season_{i}_std'] = ndvi_data[season_cols].std(axis=1)
        
        # 5. Difference features (rate of change)
        ndvi_diff = ndvi_data.diff(axis=1)
        features['mean_diff'] = ndvi_diff.mean(axis=1)
        features['std_diff'] = ndvi_diff.std(axis=1)
        features['max_positive_change'] = ndvi_diff.max(axis=1)
        features['max_negative_change'] = ndvi_diff.min(axis=1)
        
        # 6. Spectral features (FFT-based)
        fft_features = []
        for idx in range(len(df)):
            values = ndvi_data.iloc[idx].values
            valid_mask = ~np.isnan(values)
            if np.sum(valid_mask) > 4:
                # Interpolate missing values for FFT
                interp_values = np.interp(
                    np.arange(len(values)),
                    np.where(valid_mask)[0],
                    values[valid_mask]
                )
                fft = np.fft.fft(interp_values)
                # Take magnitude of first few frequency components
                fft_mag = np.abs(fft[:5])
                fft_features.append(fft_mag)
            else:
                fft_features.append(np.zeros(5))
        
        fft_array = np.array(fft_features)
        for i in range(5):
            features[f'fft_mag_{i}'] = fft_array[:, i]
        
        # 7. Vegetation index patterns
        features['vegetation_vigor'] = features['max_ndvi'] - features['min_ndvi']
        features['growing_season_length'] = (ndvi_data > features['mean_ndvi'].values.reshape(-1, 1)).sum(axis=1)
        
        # 8. Missing data patterns (important for noisy data)
        features['missing_count'] = ndvi_data.isnull().sum(axis=1)
        features['missing_ratio'] = features['missing_count'] / len(ndvi_cols)
        
        # 9. Stability features
        rolling_std = ndvi_data.rolling(window=3, axis=1).std()
        features['stability_mean'] = rolling_std.mean(axis=1)
        features['stability_max'] = rolling_std.max(axis=1)
        
        # 10. Peak detection features
        peak_counts = []
        for idx in range(len(df)):
            values = ndvi_data.iloc[idx].values
            valid_mask = ~np.isnan(values)
            if np.sum(valid_mask) > 3:
                valid_values = values[valid_mask]
                # Simple peak detection
                peaks = 0
                for i in range(1, len(valid_values) - 1):
                    if valid_values[i] > valid_values[i-1] and valid_values[i] > valid_values[i+1]:
                        peaks += 1
                peak_counts.append(peaks)
            else:
                peak_counts.append(0)
        features['peak_count'] = peak_counts
        
        return features
    
    def preprocess_data(self, X, fit=True):
        """Handle missing values and scaling"""
        if fit:
            X_imputed = self.imputer.fit_transform(X)
            X_scaled = self.scaler.fit_transform(X_imputed)
        else:
            X_imputed = self.imputer.transform(X)
            X_scaled = self.scaler.transform(X_imputed)
        
        return X_scaled
    
    def fit(self, df, target_col='class'):
        """Train the model"""
        # Create features
        X = self.create_features(df)
        y = df[target_col]
        
        # Store feature names
        self.feature_names = X.columns.tolist()
        
        # Preprocess
        X_processed = self.preprocess_data(X, fit=True)
        
        # Train model
        self.model.fit(X_processed, y)
        
        return self
    
    def predict(self, df):
        """Make predictions"""
        X = self.create_features(df)
        X_processed = self.preprocess_data(X, fit=False)
        return self.model.predict(X_processed)
    
    def predict_proba(self, df):
        """Get prediction probabilities"""
        X = self.create_features(df)
        X_processed = self.preprocess_data(X, fit=False)
        return self.model.predict_proba(X_processed)

# Load and prepare your test data
def load_data(file_path):
    """Load the hackathon data"""
    df = pd.read_csv(file_path)
    return df

# Example usage with your test data
def main():
    # Load your test data
    test_df = pd.read_csv('hacktest.csv')  # Your uploaded file
    
    print("Dataset shape:", test_df.shape)
    print("Columns:", test_df.columns.tolist())
    print("\nFirst few rows:")
    print(test_df.head())
    
    # Since this is test data without labels, let's create a demo
    # In the actual competition, you'd have training data with labels
    
    # For demonstration, let's assume you have training data
    # You would replace this with your actual training data loading
    
    # Create classifier
    classifier = NDVILandCoverClassifier()
    
    # Create features for analysis
    features = classifier.create_features(test_df)
    print(f"\nCreated {len(features.columns)} features:")
    print(features.columns.tolist())
    print(f"\nFeature statistics:")
    print(features.describe())
    
    # If you had training data, you would do:
    # classifier.fit(train_df, 'class')
    # predictions = classifier.predict(test_df)
    
    return classifier, features

# Cross-validation function for when you have training data
def evaluate_model(train_df, n_folds=5):
    """Evaluate model performance using cross-validation"""
    classifier = NDVILandCoverClassifier()
    
    # Create features
    X = classifier.create_features(train_df)
    y = train_df['class']
    
    # Preprocess
    X_processed = classifier.preprocess_data(X, fit=True)
    
    # Cross-validation
    cv = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=42)
    scores = cross_val_score(classifier.model, X_processed, y, cv=cv, scoring='accuracy')
    
    print(f"Cross-validation scores: {scores}")
    print(f"Mean CV accuracy: {scores.mean():.4f} (+/- {scores.std() * 2:.4f})")
    
    return scores

# Feature importance analysis
def analyze_feature_importance(classifier, feature_names):
    """Analyze which features are most important"""
    if hasattr(classifier.model, 'coef_'):
        # For logistic regression, we can look at coefficient magnitudes
        coef_abs = np.abs(classifier.model.coef_).mean(axis=0)
        feature_importance = pd.DataFrame({
            'feature': feature_names,
            'importance': coef_abs
        }).sort_values('importance', ascending=False)
        
        print("Top 10 most important features:")
        print(feature_importance.head(10))
        
        return feature_importance
    
    return None

if __name__ == "__main__":
    classifier, features = main()


Dataset shape: (2845, 29)
Columns: ['Unnamed: 0', 'ID', '20150720_N', '20150602_N', '20150517_N', '20150501_N', '20150415_N', '20150330_N', '20150314_N', '20150226_N', '20150210_N', '20150125_N', '20150109_N', '20141117_N', '20141101_N', '20141016_N', '20140930_N', '20140813_N', '20140626_N', '20140610_N', '20140525_N', '20140509_N', '20140423_N', '20140407_N', '20140322_N', '20140218_N', '20140202_N', '20140117_N', '20140101_N']

First few rows:
   Unnamed: 0  ID  20150720_N  20150602_N  20150517_N  20150501_N  20150415_N  \
0           0   1     7466.42     413.162     5761.00     5625.45     489.403   
1           1   2     7235.26    6037.350     1027.56     6085.14    1618.050   
2           2   3     7425.08    6969.980     1177.94     7408.93     861.061   
3           3   4     7119.12    1731.620     6311.93     6441.61     465.979   
4           4   5     7519.55    8130.260     1482.54     7879.53    1001.210   

   20150330_N  20150314_N  20150226_N  ...  20140610_N  201405

## Training

In [12]:
train_df = pd.read_csv("hacktrain.csv")
test_df = pd.read_csv("hacktest.csv")
classifier = NDVILandCoverClassifier()
classifier.fit(train_df, 'class')
predictions = classifier.predict(test_df)


## Cross-Validation

In [13]:
scores = evaluate_model(train_df)


Cross-validation scores: [0.726875 0.72     0.75125  0.74375  0.7125  ]
Mean CV accuracy: 0.7309 (+/- 0.0290)


## Feature-Analysis

In [14]:
importance = analyze_feature_importance(classifier, classifier.feature_names)


Top 10 most important features:
          feature  importance
1        std_ndvi    2.805326
0       mean_ndvi    1.669619
8        p75_ndvi    1.503018
11  season_0_mean    1.046259
16   season_2_std    0.982182
17  season_3_mean    0.978139
20       std_diff    0.859199
10    trend_slope    0.852722
13  season_1_mean    0.771040
9        p90_ndvi    0.670704


In [15]:
# Final step: Save predictions to CSV
submission = pd.DataFrame({
    'ID': test_df['ID'],
    'class': predictions
})
submission.to_csv('submission.csv', index=False)
print("Submission file created: submission.csv")


Submission file created: submission.csv
