<a href="https://colab.research.google.com/github/Piyush-code-lab/IITG_Analytics/blob/main/starter_for_hackathon.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# Imports
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline

In [4]:
 # Read both CSVs and, in the same expression, drop the 'Unnamed: 0' column
 # when a file has one (errors='ignore' makes the drop a no-op otherwise).
 train_df = pd.read_csv('/content/hacktrain.csv').drop(columns=['Unnamed: 0'], errors='ignore')
 test_df = pd.read_csv('/content/hacktest.csv').drop(columns=['Unnamed: 0'], errors='ignore')
 # Report (rows, columns) of each frame
 print("Train shape:", train_df.shape)
 print("Test shape:", test_df.shape)

Train shape: (8000, 29)
Test shape: (2845, 28)


In [5]:
 # Median imputation (the median was chosen because NDVI values contain outliers).
 # The medians are computed once, on the training frame only, and reused for the
 # test frame so both splits are filled with identical values; filling the test
 # frame with its own medians would preprocess it differently from the data the
 # model is trained on.
 train_medians = train_df.median(numeric_only=True)
 # fillna with a Series fills each column with the value stored under that column's
 # name; columns without an entry (e.g. non-numeric ones) are left untouched.
 # Reassigning instead of using inplace=True keeps the cell free of in-place mutation.
 train_df = train_df.fillna(train_medians)
 test_df = test_df.fillna(train_medians)

In [6]:
 # NDVI columns: every training column whose name contains '_N'
 ndvi_columns = [name for name in train_df.columns if '_N' in name]
 # Add the same three per-row summary features to the train and test frames
 for frame in (train_df, test_df):
     # Snapshot of the NDVI columns, taken before the new columns are attached
     ndvi_values = frame[ndvi_columns]
     # Row-wise mean NDVI
     frame['NDVI_mean'] = ndvi_values.mean(axis=1)
     # Row-wise standard deviation of NDVI
     frame['NDVI_std'] = ndvi_values.std(axis=1)
     # Amplitude: row-wise max NDVI minus row-wise min NDVI
     frame['NDVI_amp'] = ndvi_values.max(axis=1) - ndvi_values.min(axis=1)
 # Further features (seasonality, trends) can be added here later

In [7]:
 # Encode target class as integer codes, overwriting the 'class' column.
 # Guard: only encode while the column still holds the raw (non-numeric) labels.
 # Without it, re-running this cell would fit a fresh encoder on the integer codes,
 # so label_encoder.classes_ would no longer hold the class names and
 # inverse_transform would return integers instead of the original labels.
 # NOTE(review): this assumes the raw labels are strings, as the class names in the
 # validation report suggest; numeric raw labels would skip the encoding entirely.
 if not pd.api.types.is_numeric_dtype(train_df['class']):
     label_encoder = LabelEncoder()
     train_df['class'] = label_encoder.fit_transform(train_df['class'])

In [8]:
 # Target vector, and feature matrix made of every column except 'ID' and 'class'
 y = train_df['class']
 X = train_df.drop(columns=['ID', 'class'])
 # Hold out 20% for validation; stratify on y and fix the seed for repeatability
 split_parts = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
 X_train, X_valid, y_train, y_valid = split_parts
 # Row counts of each partition
 print("Train samples:", len(X_train))
 print("Validation samples:", len(X_valid))

Train samples: 6400
Validation samples: 1600


In [9]:
 # Pipeline: standard scaling followed by logistic regression.
 # multi_class is intentionally not passed: the parameter is deprecated since
 # scikit-learn 1.5, and with the lbfgs solver on a target with more than two
 # classes the default already fits a multinomial model.
 pipeline = Pipeline([
     ('scaler', StandardScaler()),
     ('clf', LogisticRegression(
         solver='lbfgs',
         max_iter=1000,
         class_weight='balanced',  # very important: class supports are heavily skewed
         random_state=42
     ))
 ])
 # Train on the training partition only; the validation partition stays unseen
 pipeline.fit(X_train, y_train)



In [10]:
 # Score the held-out validation partition
 y_valid_pred = pipeline.predict(X_valid)
 # Per-class report: rows are the integer codes 0..K-1 in encoder order,
 # displayed under the names stored in label_encoder.classes_
 class_names = label_encoder.classes_
 class_ids = list(range(len(class_names)))
 report_text = classification_report(
     y_valid, y_valid_pred, labels=class_ids, target_names=class_names
 )
 print(report_text)

              precision    recall  f1-score   support

        farm       0.55      0.73      0.63       168
      forest       0.98      0.82      0.90      1232
       grass       0.33      0.67      0.44        39
  impervious       0.76      0.80      0.78       134
     orchard       0.08      0.83      0.14         6
       water       0.26      0.81      0.40        21

    accuracy                           0.81      1600
   macro avg       0.49      0.78      0.55      1600
weighted avg       0.89      0.81      0.84      1600



In [11]:
 # Optional stability check: 5-fold cross-validated accuracy of the pipeline
 # over the full feature matrix X and target y
 cv_scores = cross_val_score(pipeline, X, y, scoring='accuracy', cv=5)
 print("CV accuracy scores:", cv_scores)
 print("Mean CV accuracy:", cv_scores.mean())



CV accuracy scores: [0.71125  0.71375  0.77875  0.78875  0.728125]
Mean CV accuracy: 0.744125


In [12]:
 # Test feature matrix: every test column except 'ID'
 X_test_final = test_df.drop(columns=['ID'])
 # Predict integer class codes, then map them back to the original labels
 y_test_pred = pipeline.predict(X_test_final)
 y_test_decoded = label_encoder.inverse_transform(y_test_pred)
 # Submission frame: the test IDs plus the decoded class for each row
 submission_df = test_df[['ID']].copy()
 submission_df['class'] = y_test_decoded
 # Write the submission without the index column
 submission_df.to_csv('submission.csv', index=False)
 print("Submission file saved!")

Submission file saved!
