In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and echo the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/summer-analytics-mid-hackathon/hacktest.csv
/kaggle/input/summer-analytics-mid-hackathon/hacktrain.csv


In [2]:
# ===================================================================
#  IMPORTS
# ===================================================================
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score
from scipy.signal import savgol_filter
import warnings
warnings.filterwarnings("ignore")

In [3]:


# ===================================================================
#  LOAD TRAINING DATA
# ===================================================================
# Read the training CSV; discard the stray 'Unnamed: 0' column when present
# (errors='ignore' makes the drop a no-op if the column is absent).
train_df = (
    pd.read_csv("/kaggle/input/summer-analytics-mid-hackathon/hacktrain.csv")
    .drop(columns=['Unnamed: 0'], errors='ignore')
)

# Every column whose name ends in '_N' is treated as one NDVI reading.
ndvi_columns = [name for name in train_df.columns if name.endswith('_N')]

In [4]:
# ===================================================================
#  SMOOTHING FUNCTION (SHARED)
# ===================================================================
def smooth(row, window_length=5, polyorder=2):
    """Smooth one row of readings with a Savitzky-Golay filter.

    Parameters
    ----------
    row : pandas.Series
        One row of values, as handed over by ``DataFrame.apply(..., axis=1)``.
    window_length : int, default 5
        Size of the filter window passed to ``savgol_filter``.
    polyorder : int, default 2
        Order of the fitted polynomial passed to ``savgol_filter``.

    Returns
    -------
    numpy.ndarray or pandas.Series
        The filtered values (as returned by ``savgol_filter``) when the row is
        complete and at least ``window_length`` long; otherwise ``row`` itself,
        untouched.
    """
    valid = row.count()
    # The filter is a local polynomial fit, so a single NaN contaminates every
    # output sample whose window touches it. Only filter rows that have enough
    # points AND no missing values; anything else is passed through as-is.
    if valid >= window_length and valid == len(row):
        return savgol_filter(row, window_length=window_length, polyorder=polyorder)
    return row

# ===================================================================
#  TRAINING PREPROCESSING
# ===================================================================
def preprocess_train(df):
    """Return a cleaned copy of ``df`` with its NDVI columns repaired and smoothed.

    Steps applied to the columns listed in ``ndvi_columns``:
      1. coerce to numeric (unparseable entries become NaN),
      2. linearly interpolate gaps along each row, in both directions,
      3. fill whatever is still missing with that column's mean,
      4. run ``smooth`` over every row.

    The input frame is not modified.
    """
    cleaned = df.copy()

    ndvi = (
        cleaned[ndvi_columns]
        .apply(pd.to_numeric, errors='coerce')
        .interpolate(method='linear', axis=1, limit_direction='both')
    )
    # Column means are taken after interpolation, from the frame being cleaned.
    ndvi = ndvi.fillna(ndvi.mean())

    # result_type='broadcast' keeps the original index and column labels.
    cleaned[ndvi_columns] = ndvi.apply(smooth, axis=1, result_type='broadcast')
    return cleaned

# Cleaned copy of the training frame; train_df itself is left unmodified
# because preprocess_train works on a copy.
train_clean = preprocess_train(train_df)


In [5]:
# ===================================================================
#  FEATURE ENGINEERING
# ===================================================================
def extract_features(df):
    """Summarise each row's NDVI readings into a table of descriptive features.

    Reads the columns named in ``ndvi_columns`` and returns a new DataFrame,
    indexed like ``df``, with one column per statistic: location and spread
    (mean, std, max, min, range, median, quartiles, IQR), shape (skew, kurt),
    position of the maximum (argmax, as a column offset), first-half versus
    second-half means and their difference, plus two ratio features.
    """
    series = df[ndvi_columns]
    # Columns are split at the midpoint; with an odd count the second half
    # gets the extra column.
    midpoint = series.shape[1] // 2

    row_mean = series.mean(axis=1)
    row_max = series.max(axis=1)
    row_min = series.min(axis=1)
    upper_q = series.quantile(0.75, axis=1)
    lower_q = series.quantile(0.25, axis=1)
    early_mean = series.iloc[:, :midpoint].mean(axis=1)
    late_mean = series.iloc[:, midpoint:].mean(axis=1)

    return pd.DataFrame({
        "mean": row_mean,
        "std": series.std(axis=1),
        "max": row_max,
        "min": row_min,
        "range": row_max - row_min,
        "median": series.median(axis=1),
        "skew": series.skew(axis=1),
        "kurt": series.kurtosis(axis=1),
        "q75": upper_q,
        "q25": lower_q,
        "iqr": upper_q - lower_q,
        "argmax": series.values.argmax(axis=1),
        "first_half_mean": early_mean,
        "second_half_mean": late_mean,
        "growth": late_mean - early_mean,
        # The 1e-6 terms keep the ratios finite when the denominator is zero.
        "peak_to_mean": row_max / (row_mean + 1e-6),
        "norm_amplitude": (row_max - row_min) / (row_max + row_min + 1e-6),
    })

In [6]:
# Target: class labels as strings, mapped to integer codes by a LabelEncoder.
y_train = train_clean['class'].astype(str)
le = LabelEncoder()
y_train_enc = le.fit_transform(y_train)

# Feature matrix: per-row summary statistics of the cleaned training frame.
X_train = extract_features(train_clean)

# Feature expansion and scaling
from sklearn.preprocessing import PolynomialFeatures
from sklearn.feature_selection import SelectKBest, f_classif

# Expand the base features with degree-2 polynomial terms;
# include_bias=False leaves out the constant column.
poly = PolynomialFeatures(degree=2, include_bias=False)
X_train_poly = poly.fit_transform(X_train)

# Feature selection
# NOTE: with k='all' every column is kept, so this step scores the features
# with f_classif but does not remove any of them.
selector = SelectKBest(score_func=f_classif, k='all')
X_train_selected = selector.fit_transform(X_train_poly, y_train_enc)

# Standardise the selected columns; the scaler is fitted on training data only.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_selected)

In [7]:

# ===================================================================
#  MODEL TRAINING
# ===================================================================
# Logistic regression on the scaled features.
# `multi_class` is deliberately not passed: the argument is deprecated in
# recent scikit-learn releases (the warning is hidden by the global
# warnings filter), and with the lbfgs solver a target with three or more
# classes is already fitted as a multinomial model by default.
model = LogisticRegression(solver='lbfgs', max_iter=1000)
model.fit(X_train_scaled, y_train_enc)

# Accuracy on the same rows the model was fitted on, so this is an
# optimistic figure rather than a validation score.
train_preds = model.predict(X_train_scaled)
train_acc = accuracy_score(y_train_enc, train_preds)

print(f" Training Accuracy: {train_acc * 100:.2f}%")
print(" Data loaded")

 Training Accuracy: 86.30%
 Data loaded


In [8]:
# ===================================================================
#  LOAD & PREDICT ON TEST DATA
# ===================================================================
# Read the test CSV; discard the stray 'Unnamed: 0' column when present
# (errors='ignore' makes the drop a no-op if the column is absent).
test_df = (
    pd.read_csv("/kaggle/input/summer-analytics-mid-hackathon/hacktest.csv")
    .drop(columns=['Unnamed: 0'], errors='ignore')
)

In [9]:

# Apply same preprocessing to test
def preprocess_test(df):
    """Clean a test frame with exactly the steps used for the training frame.

    This is a thin wrapper around ``preprocess_train`` so the two code paths
    cannot drift apart; it returns a cleaned copy and leaves ``df`` untouched.

    NOTE(review): the mean-fill step inside ``preprocess_train`` uses the
    column means of the frame it is given, so values still missing here are
    filled from test-set means rather than training means. Confirm that this
    is intended.
    """
    return preprocess_train(df)

# Push the test set through the pipeline. poly, selector, scaler and model
# are only applied here (transform / predict); none of them is re-fitted.
test_clean = preprocess_test(test_df)
X_test = extract_features(test_clean)
X_test_poly = poly.transform(X_test)
X_test_selected = selector.transform(X_test_poly)
X_test_scaled = scaler.transform(X_test_selected)
test_preds = model.predict(X_test_scaled)
# Map the integer predictions back to the original class-name strings.
test_labels = le.inverse_transform(test_preds)

In [10]:
# ===================================================================
#  EVALUATE ON TEST DATA (WITH LABELS)
# ===================================================================
# Score the predictions only when the test file carries a 'class' column.
if 'class' in test_df.columns:
    # Encode the true labels with the encoder fitted on the training labels
    # so they can be compared with test_preds.
    y_test = test_df['class'].astype(str)
    y_test_enc = le.transform(y_test)
    test_acc = accuracy_score(y_test_enc, test_preds)
    print(f" Test Accuracy: {test_acc * 100:.2f}%")

    # Preview at most the first 10 rows in "ID,class" form.
    print(" Sample Prediction Output (ID,class):")
    for sample_id, label in zip(test_df['ID'].head(10).to_numpy(), test_labels):
        print(f"{sample_id},{label}")

# ===================================================================
#  EXPORT SUBMISSION
# ===================================================================
# Two-column table (ID, predicted class), written to the current working
# directory without the index column.
submission = test_df[['ID']].copy()
submission['class'] = test_labels
submission.to_csv("submission.csv", index=False)
print(submission.head())


   ID   class
0   1  forest
1   2  forest
2   3  forest
3   4  forest
4   5  forest
