# =============================================================
# SUMMER ANALYTICS 2025 - NDVI LAND COVER CLASSIFICATION 
# =============================================================

# **To build a Logistic Regression model that accurately predicts land cover classes despite noisy NDVI signals**

# Importing Libraries

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# Loading dataset

In [6]:
hacktrain=pd.read_csv('hacktrain.csv')
hacktest=pd.read_csv('hacktest.csv')

In [7]:
print("Train shape:", hacktrain.shape)
print("Test shape:", hacktest.shape)

Train shape: (8000, 30)
Test shape: (2845, 29)


In [8]:
hacktrain.head()

Unnamed: 0.1,Unnamed: 0,ID,class,20150720_N,20150602_N,20150517_N,20150501_N,20150415_N,20150330_N,20150314_N,...,20140610_N,20140525_N,20140509_N,20140423_N,20140407_N,20140322_N,20140218_N,20140202_N,20140117_N,20140101_N
0,0,1,water,637.595,658.668,-1882.03,-1924.36,997.904,-1739.99,630.087,...,,-1043.16,-1942.49,267.138,,,211.328,-2203.02,-1180.19,433.906
1,1,2,water,634.24,593.705,-1625.79,-1672.32,914.198,-692.386,707.626,...,,-933.934,-625.385,120.059,364.858,476.972,220.878,-2250.0,-1360.56,524.075
2,3,4,water,58.0174,-1599.16,,-1052.63,,-1564.63,,...,-1025.88,368.622,,-1227.8,304.621,,369.214,-2202.12,,-1343.55
3,4,5,water,72.518,,380.436,-1256.93,515.805,-1413.18,-802.942,...,-1813.95,155.624,,-924.073,432.15,282.833,298.32,-2197.36,,-826.727
4,7,8,water,1136.44,,,1647.83,1935.8,,2158.98,...,1535.0,1959.43,-279.317,-384.915,-113.406,1020.72,1660.65,-116.801,-568.05,-1357.14


In [9]:
hacktest.head()

Unnamed: 0.1,Unnamed: 0,ID,20150720_N,20150602_N,20150517_N,20150501_N,20150415_N,20150330_N,20150314_N,20150226_N,...,20140610_N,20140525_N,20140509_N,20140423_N,20140407_N,20140322_N,20140218_N,20140202_N,20140117_N,20140101_N
0,0,1,7466.42,413.162,5761.0,5625.45,489.403,3923.84,3097.11,6766.42,...,801.184,927.115,4704.14,6378.42,340.949,2695.57,527.268,4736.75,601.843,6639.76
1,1,2,7235.26,6037.35,1027.56,6085.14,1618.05,6668.54,2513.99,1051.69,...,5533.47,5103.04,5216.12,4885.27,4366.79,1234.14,3298.11,6942.68,1070.44,842.101
2,2,3,7425.08,6969.98,1177.94,7408.93,861.061,7644.43,814.458,1504.29,...,1981.39,6204.54,7021.69,5704.41,4897.45,1789.99,2206.1,6928.93,1036.56,831.441
3,3,4,7119.12,1731.62,6311.93,6441.61,465.979,7128.42,1649.12,6935.22,...,959.344,5794.15,1045.57,5572.9,586.287,685.906,1287.0,6734.72,824.584,6883.61
4,4,5,7519.55,8130.26,1482.54,7879.53,1001.21,7937.6,4122.53,1094.51,...,7636.07,6996.76,7413.43,4596.13,4511.7,1413.52,3283.94,7937.68,1857.8,1336.92


In [10]:
print(hacktrain.columns,"\n" )
print(hacktest.columns)

Index(['Unnamed: 0', 'ID', 'class', '20150720_N', '20150602_N', '20150517_N',
       '20150501_N', '20150415_N', '20150330_N', '20150314_N', '20150226_N',
       '20150210_N', '20150125_N', '20150109_N', '20141117_N', '20141101_N',
       '20141016_N', '20140930_N', '20140813_N', '20140626_N', '20140610_N',
       '20140525_N', '20140509_N', '20140423_N', '20140407_N', '20140322_N',
       '20140218_N', '20140202_N', '20140117_N', '20140101_N'],
      dtype='object') 

Index(['Unnamed: 0', 'ID', '20150720_N', '20150602_N', '20150517_N',
       '20150501_N', '20150415_N', '20150330_N', '20150314_N', '20150226_N',
       '20150210_N', '20150125_N', '20150109_N', '20141117_N', '20141101_N',
       '20141016_N', '20140930_N', '20140813_N', '20140626_N', '20140610_N',
       '20140525_N', '20140509_N', '20140423_N', '20140407_N', '20140322_N',
       '20140218_N', '20140202_N', '20140117_N', '20140101_N'],
      dtype='object')


In [11]:
print(hacktrain.isnull().sum(),"\n ----------------------------------------------------------------")
print(hacktest.isnull().sum())

Unnamed: 0       0
ID               0
class            0
20150720_N     560
20150602_N    1200
20150517_N     800
20150501_N     960
20150415_N     480
20150330_N    1120
20150314_N     720
20150226_N    1360
20150210_N     640
20150125_N    1040
20150109_N     880
20141117_N    1280
20141101_N     400
20141016_N    1440
20140930_N     800
20140813_N     560
20140626_N    1600
20140610_N     480
20140525_N     720
20140509_N     880
20140423_N    1760
20140407_N     640
20140322_N    1120
20140218_N    1440
20140202_N     560
20140117_N    1200
20140101_N     400
dtype: int64 
 ----------------------------------------------------------------
Unnamed: 0    0
ID            0
20150720_N    0
20150602_N    0
20150517_N    0
20150501_N    0
20150415_N    0
20150330_N    0
20150314_N    0
20150226_N    0
20150210_N    0
20150125_N    0
20150109_N    0
20141117_N    0
20141101_N    0
20141016_N    0
20140930_N    0
20140813_N    0
20140626_N    0
20140610_N    0
20140525_N    0
20140509_N    

In [12]:
print(hacktrain.info(),"\n **********************************************************************************")
print(hacktest.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8000 entries, 0 to 7999
Data columns (total 30 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Unnamed: 0  8000 non-null   int64  
 1   ID          8000 non-null   int64  
 2   class       8000 non-null   object 
 3   20150720_N  7440 non-null   float64
 4   20150602_N  6800 non-null   float64
 5   20150517_N  7200 non-null   float64
 6   20150501_N  7040 non-null   float64
 7   20150415_N  7520 non-null   float64
 8   20150330_N  6880 non-null   float64
 9   20150314_N  7280 non-null   float64
 10  20150226_N  6640 non-null   float64
 11  20150210_N  7360 non-null   float64
 12  20150125_N  6960 non-null   float64
 13  20150109_N  7120 non-null   float64
 14  20141117_N  6720 non-null   float64
 15  20141101_N  7600 non-null   float64
 16  20141016_N  6560 non-null   float64
 17  20140930_N  7200 non-null   float64
 18  20140813_N  7440 non-null   float64
 19  20140626_N  6400 non-null  

In [13]:
#Identify NDVI columns
ndvi_cols = [col for col in hacktrain.columns if '_N' in col]
print("NDVI Columns:", ndvi_cols)

NDVI Columns: ['20150720_N', '20150602_N', '20150517_N', '20150501_N', '20150415_N', '20150330_N', '20150314_N', '20150226_N', '20150210_N', '20150125_N', '20150109_N', '20141117_N', '20141101_N', '20141016_N', '20140930_N', '20140813_N', '20140626_N', '20140610_N', '20140525_N', '20140509_N', '20140423_N', '20140407_N', '20140322_N', '20140218_N', '20140202_N', '20140117_N', '20140101_N']


In [14]:
# Handle Missing Values (Imputation)
hacktrain[ndvi_cols] = hacktrain[ndvi_cols].fillna(hacktrain[ndvi_cols].median())
hacktest[ndvi_cols] = hacktest[ndvi_cols].fillna(hacktest[ndvi_cols].median())

In [15]:
# Feature Engineering
for df in [hacktrain, hacktest]:
    df['ndvi_mean'] = df[ndvi_cols].mean(axis=1)
    df['ndvi_std'] = df[ndvi_cols].std(axis=1)
    df['ndvi_min'] = df[ndvi_cols].min(axis=1)
    df['ndvi_max'] = df[ndvi_cols].max(axis=1)
    df['ndvi_range'] = df['ndvi_max'] - df['ndvi_min']

In [16]:
# Encode Target Variable
le = LabelEncoder()
hacktrain['class_label'] = le.fit_transform(hacktrain['class'])

In [17]:
# Prepare Train and Test data
features = ['ndvi_mean', 'ndvi_std', 'ndvi_min', 'ndvi_max', 'ndvi_range']

X = hacktrain[features]
y = hacktrain['class_label']
X_test = hacktest[features]


In [18]:
# Optional Scaling
scaler = StandardScaler()
X = scaler.fit_transform(X)
X_test = scaler.transform(X_test)

In [19]:
#  Train-Test Split for Validation
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)


In [20]:
# Build Logistic Regression Model
model = LogisticRegression(max_iter=3000, multi_class='multinomial')
model.fit(X_train, y_train)



In [21]:
# Evaluate Model
y_pred = model.predict(X_val)
acc = accuracy_score(y_val, y_pred)
print("Validation Accuracy:", acc)
print(classification_report(y_val, y_pred, target_names=le.classes_))

Validation Accuracy: 0.854375
              precision    recall  f1-score   support

        farm       0.43      0.28      0.34       161
      forest       0.89      0.98      0.93      1231
       grass       0.59      0.30      0.40        43
  impervious       0.85      0.66      0.74       141
     orchard       0.00      0.00      0.00         6
       water       0.88      0.83      0.86        18

    accuracy                           0.85      1600
   macro avg       0.61      0.51      0.55      1600
weighted avg       0.83      0.85      0.84      1600



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [22]:
#  Predict on Test Data
test_preds = model.predict(X_test)
hacktest['class'] = le.inverse_transform(test_preds)

In [23]:
# Submission File
submission = hacktest[['ID', 'class']]
submission.to_csv('submission1.csv', index=False)
print("Submission File Created Successfully!")

Submission File Created Successfully!


# =============================================================
# SUMMER ANALYTICS 2025 - NDVI LAND COVER CLASSIFICATION BOOSTED
# =============================================================

In [25]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler, PolynomialFeatures
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from scipy.stats import linregress

In [26]:
# Load Data
train = pd.read_csv('hacktrain.csv')
test = pd.read_csv('hacktest.csv')

In [27]:
ndvi_cols = [col for col in train.columns if '_N' in col]

In [28]:
# Handle missing values
train[ndvi_cols] = train[ndvi_cols].fillna(train[ndvi_cols].median())
test[ndvi_cols] = test[ndvi_cols].fillna(test[ndvi_cols].median())

In [29]:
#  Apply smoothing (reduce noise)
window_size = 3

def smooth_series(row):
    return pd.Series(row).rolling(window=window_size, min_periods=1).mean().values

for df in [train, test]:
    smoothed = df[ndvi_cols].apply(smooth_series, axis=1, result_type='expand')
    smoothed.columns = ndvi_cols  # reassign original column names
    df[ndvi_cols] = smoothed
# 🚀 CRITICAL FIX: Re-impute after smoothing to remove remaining NaNs
for df in [train, test]:
    df[ndvi_cols] = df[ndvi_cols].fillna(df[ndvi_cols].median())

In [30]:
# Feature Engineering: Statistical Features
for df in [train, test]:
    df['ndvi_mean'] = df[ndvi_cols].mean(axis=1)
    df['ndvi_std'] = df[ndvi_cols].std(axis=1)
    df['ndvi_min'] = df[ndvi_cols].min(axis=1)
    df['ndvi_max'] = df[ndvi_cols].max(axis=1)
    df['ndvi_range'] = df['ndvi_max'] - df['ndvi_min']
    df['ndvi_median'] = df[ndvi_cols].median(axis=1)
    df['ndvi_skew'] = df[ndvi_cols].skew(axis=1)
    df['ndvi_kurt'] = df[ndvi_cols].kurt(axis=1)
    df['ndvi_q25'] = df[ndvi_cols].quantile(0.25, axis=1)
    df['ndvi_q75'] = df[ndvi_cols].quantile(0.75, axis=1)

In [31]:
# Feature Engineering: Slope (trend)
def compute_slope(row):
    y = row[ndvi_cols].values
    x = np.arange(len(y))
    slope, intercept, r_value, p_value, std_err = linregress(x, y)
    return slope

for df in [train, test]:
    df['ndvi_slope'] = df[ndvi_cols].apply(compute_slope, axis=1)

In [32]:
# Feature Engineering: Seasonal features
summer_cols = [col for col in ndvi_cols if col[4:6] in ['05', '06', '07', '08']]
winter_cols = [col for col in ndvi_cols if col[4:6] in ['12', '01', '02']]
for df in [train, test]:
    df['summer_mean'] = df[summer_cols].mean(axis=1)
    df['winter_mean'] = df[winter_cols].mean(axis=1)
    df['summer_winter_diff'] = df['summer_mean'] - df['winter_mean']

In [33]:
# Encode target variable
le = LabelEncoder()
train['class_label'] = le.fit_transform(train['class'])

In [34]:
#  Final feature set
feature_cols = ['ndvi_mean', 'ndvi_std', 'ndvi_min', 'ndvi_max', 'ndvi_range',
                 'ndvi_median', 'ndvi_skew', 'ndvi_kurt', 'ndvi_q25', 'ndvi_q75',
                 'ndvi_slope', 'summer_mean', 'winter_mean', 'summer_winter_diff']

X = train[feature_cols]
y = train['class_label']
X_test = test[feature_cols]


In [35]:
# Standardize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_test_scaled = scaler.transform(X_test)

In [36]:
#  Polynomial features (add non-linearity)
poly = PolynomialFeatures(degree=2, include_bias=False)
X_poly = poly.fit_transform(X_scaled)
X_test_poly = poly.transform(X_test_scaled)

In [37]:
#  Train-test split for internal validation
X_train, X_val, y_train, y_val = train_test_split(X_poly, y, test_size=0.2, random_state=42)

In [38]:
# Train logistic regression
model = LogisticRegression(max_iter=2000, multi_class='multinomial', solver='lbfgs')
model.fit(X_train, y_train)



In [39]:
# Evaluate
y_pred = model.predict(X_val)
acc = accuracy_score(y_val, y_pred)
print("Validation Accuracy:", acc)
print(classification_report(y_val, y_pred, target_names=le.classes_))

Validation Accuracy: 0.82875
              precision    recall  f1-score   support

        farm       0.41      0.34      0.37       161
      forest       0.87      0.97      0.92      1231
       grass       0.54      0.16      0.25        43
  impervious       0.83      0.45      0.58       141
     orchard       0.00      0.00      0.00         6
       water       0.79      0.61      0.69        18

    accuracy                           0.83      1600
   macro avg       0.57      0.42      0.47      1600
weighted avg       0.81      0.83      0.81      1600



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [40]:
# Predict on test data
test_preds = model.predict(X_test_poly)
test['class'] = le.inverse_transform(test_preds)

# Submission file
submission = test[['ID', 'class']]
submission.to_csv('submission2.csv', index=False)
print("Submission File Created Successfully!")


Submission File Created Successfully!


# =============================================================
# SUMMER ANALYTICS 2025 - NDVI LAND COVER CLASSIFICATION
# ACCURACY IMPROVED TO 90% 🚀🔥
# =============================================================

In [42]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler, PolynomialFeatures
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from scipy.stats import linregress

In [43]:
# 1. Load data
train = pd.read_csv('hacktrain.csv')
test = pd.read_csv('hacktest.csv')

In [44]:
ndvi_cols = [col for col in train.columns if '_N' in col]

In [45]:
# 2. Handle initial missing values
train[ndvi_cols] = train[ndvi_cols].fillna(train[ndvi_cols].median())
test[ndvi_cols] = test[ndvi_cols].fillna(test[ndvi_cols].median())

In [46]:
# 3. Smooth NDVI to reduce noise (rolling window smoothing)
window_size = 3
def smooth_series(row):
    return pd.Series(row).rolling(window=window_size, min_periods=1).mean().values

for df in [train, test]:
    smoothed = df[ndvi_cols].apply(smooth_series, axis=1, result_type='expand')
    smoothed.columns = ndvi_cols
    df[ndvi_cols] = smoothed

In [47]:
# 4. Re-impute after smoothing
for df in [train, test]:
    df[ndvi_cols] = df[ndvi_cols].fillna(df[ndvi_cols].median())

In [48]:
# 5. Apply PCA to NDVI columns
pca = PCA(n_components=5, random_state=42)

pca_train = pca.fit_transform(train[ndvi_cols])
pca_test = pca.transform(test[ndvi_cols])

pca_cols = [f'pca_{i+1}' for i in range(pca.n_components_)]
pca_train_df = pd.DataFrame(pca_train, columns=pca_cols, index=train.index)
pca_test_df = pd.DataFrame(pca_test, columns=pca_cols, index=test.index)

train = pd.concat([train, pca_train_df], axis=1)
test = pd.concat([test, pca_test_df], axis=1)

In [49]:
# 6. Additional statistical feature engineering
def create_features(df):
    df['ndvi_mean'] = df[ndvi_cols].mean(axis=1)
    df['ndvi_std'] = df[ndvi_cols].std(axis=1)
    df['ndvi_min'] = df[ndvi_cols].min(axis=1)
    df['ndvi_max'] = df[ndvi_cols].max(axis=1)
    df['ndvi_range'] = df['ndvi_max'] - df['ndvi_min']
    df['ndvi_median'] = df[ndvi_cols].median(axis=1)
    df['ndvi_skew'] = df[ndvi_cols].skew(axis=1)
    df['ndvi_kurt'] = df[ndvi_cols].kurt(axis=1)
    df['ndvi_q25'] = df[ndvi_cols].quantile(0.25, axis=1)
    df['ndvi_q75'] = df[ndvi_cols].quantile(0.75, axis=1)

    # NDVI slope trend feature
    df['ndvi_slope'] = df[ndvi_cols].apply(
        lambda row: linregress(np.arange(len(row)), row.values)[0], axis=1)

    # Seasonal features
    summer_cols = [col for col in ndvi_cols if col[4:6] in ['05','06','07','08']]
    winter_cols = [col for col in ndvi_cols if col[4:6] in ['12','01','02']]
    df['summer_mean'] = df[summer_cols].mean(axis=1)
    df['winter_mean'] = df[winter_cols].mean(axis=1)
    df['summer_winter_diff'] = df['summer_mean'] - df['winter_mean']

    return df

train = create_features(train)
test = create_features(test)

In [50]:
# 7. Label encoding
le = LabelEncoder()
train['class_label'] = le.fit_transform(train['class'])

In [51]:
# 8. Final feature list
feature_cols = [
    'ndvi_mean', 'ndvi_std', 'ndvi_min', 'ndvi_max', 'ndvi_range',
    'ndvi_median', 'ndvi_skew', 'ndvi_kurt', 'ndvi_q25', 'ndvi_q75',
    'ndvi_slope', 'summer_mean', 'winter_mean', 'summer_winter_diff',
    'pca_1', 'pca_2', 'pca_3', 'pca_4', 'pca_5'
]

X = train[feature_cols].values
y = train['class_label'].values
X_test = test[feature_cols].values


In [52]:
# 9. Scaling
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_test_scaled = scaler.transform(X_test)

In [53]:
# 10. Polynomial Features (add non-linearity while staying in logistic regression)
poly = PolynomialFeatures(degree=2, include_bias=False)
X_poly = poly.fit_transform(X_scaled)
X_test_poly = poly.transform(X_test_scaled)

In [54]:
# 11. Stratified K-Fold cross-validation
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
test_preds = np.zeros((X_test_poly.shape[0], len(np.unique(y))))

for fold, (train_idx, val_idx) in enumerate(skf.split(X_poly, y)):
    X_train_fold, y_train_fold = X_poly[train_idx], y[train_idx]
    X_val_fold, y_val_fold = X_poly[val_idx], y[val_idx]
    
    model = LogisticRegression(max_iter=3000, multi_class='multinomial', solver='lbfgs')
    model.fit(X_train_fold, y_train_fold)
    
    val_pred = model.predict(X_val_fold)
    val_acc = accuracy_score(y_val_fold, val_pred)
    print(f"Fold {fold+1} Validation Accuracy: {val_acc:.4f}")
    
    test_fold_pred = model.predict_proba(X_test_poly)
    test_preds += test_fold_pred



Fold 1 Validation Accuracy: 0.8931




Fold 2 Validation Accuracy: 0.9000




Fold 3 Validation Accuracy: 0.8969




Fold 4 Validation Accuracy: 0.8875




Fold 5 Validation Accuracy: 0.8950


In [55]:
# 12. Average predictions across folds
test_preds_final = np.argmax(test_preds, axis=1)
test['class'] = le.inverse_transform(test_preds_final)




In [56]:
# 13. Submission file
submission = test[['ID', 'class']]
submission.to_csv('submission.csv', index=False)
print("🚀 Submission file created successfully!")

🚀 Submission file created successfully!
