In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


In [8]:
train = pd.read_csv('hacktrain.csv')
print(train.head())
print(train.info())

   Unnamed: 0  ID  class  20150720_N  20150602_N  20150517_N  20150501_N  \
0           0   1  water    637.5950     658.668   -1882.030    -1924.36   
1           1   2  water    634.2400     593.705   -1625.790    -1672.32   
2           3   4  water     58.0174   -1599.160         NaN    -1052.63   
3           4   5  water     72.5180         NaN     380.436    -1256.93   
4           7   8  water   1136.4400         NaN         NaN     1647.83   

   20150415_N  20150330_N  20150314_N  ...  20140610_N  20140525_N  \
0     997.904   -1739.990     630.087  ...         NaN   -1043.160   
1     914.198    -692.386     707.626  ...         NaN    -933.934   
2         NaN   -1564.630         NaN  ...    -1025.88     368.622   
3     515.805   -1413.180    -802.942  ...    -1813.95     155.624   
4    1935.800         NaN    2158.980  ...     1535.00    1959.430   

   20140509_N  20140423_N  20140407_N  20140322_N  20140218_N  20140202_N  \
0   -1942.490     267.138         NaN        

In [9]:
if 'Unnamed: 0' in train.columns:
    train = train.drop(columns=['Unnamed: 0'])

In [10]:
print(train.isnull().sum())
ndvi_cols = [col for col in train.columns if '_N' in col]
train[ndvi_cols] = train[ndvi_cols].apply(lambda x: x.fillna(x.mean()), axis=0)

ID               0
class            0
20150720_N     560
20150602_N    1200
20150517_N     800
20150501_N     960
20150415_N     480
20150330_N    1120
20150314_N     720
20150226_N    1360
20150210_N     640
20150125_N    1040
20150109_N     880
20141117_N    1280
20141101_N     400
20141016_N    1440
20140930_N     800
20140813_N     560
20140626_N    1600
20140610_N     480
20140525_N     720
20140509_N     880
20140423_N    1760
20140407_N     640
20140322_N    1120
20140218_N    1440
20140202_N     560
20140117_N    1200
20140101_N     400
dtype: int64


In [11]:
print(train['class'].value_counts())

class
forest        6159
farm           841
impervious     669
grass          196
water          105
orchard         30
Name: count, dtype: int64


In [12]:
train['ndvi_mean'] = train[ndvi_cols].mean(axis=1)
train['ndvi_std'] = train[ndvi_cols].std(axis=1)

In [13]:
X = train[ndvi_cols + ['ndvi_mean', 'ndvi_std']]
y = train['class']
le = LabelEncoder()
y_encoded = le.fit_transform(y)

In [15]:
X_train, X_val, y_train, y_val = train_test_split(
    X, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
)


In [16]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

In [17]:
model = LogisticRegression(max_iter=2000)
model.fit(X_train_scaled, y_train)

In [18]:
y_pred = model.predict(X_val_scaled)
print("Validation Accuracy:", accuracy_score(y_val, y_pred))
print(classification_report(y_val, y_pred, target_names=le.classes_))
print(confusion_matrix(y_val, y_pred))

Validation Accuracy: 0.904375
              precision    recall  f1-score   support

        farm       0.73      0.59      0.65       168
      forest       0.94      0.97      0.96      1232
       grass       0.88      0.56      0.69        39
  impervious       0.80      0.84      0.82       134
     orchard       1.00      0.17      0.29         6
       water       0.75      0.57      0.65        21

    accuracy                           0.90      1600
   macro avg       0.85      0.62      0.67      1600
weighted avg       0.90      0.90      0.90      1600

[[  99   59    0    9    0    1]
 [  25 1200    1    5    0    1]
 [   1    7   22    9    0    0]
 [   8    9    2  113    0    2]
 [   2    3    0    0    1    0]
 [   1    2    0    6    0   12]]


In [21]:
print(train.head())

   ID  class  20150720_N   20150602_N   20150517_N  20150501_N   20150415_N  \
0   1  water    637.5950   658.668000 -1882.030000    -1924.36   997.904000   
1   2  water    634.2400   593.705000 -1625.790000    -1672.32   914.198000   
2   4  water     58.0174 -1599.160000  4469.837748    -1052.63  2927.586705   
3   5  water     72.5180  4868.091395   380.436000    -1256.93   515.805000   
4   8  water   1136.4400  4868.091395  4469.837748     1647.83  1935.800000   

    20150330_N   20150314_N   20150226_N  ...   20140509_N  20140423_N  \
0 -1739.990000   630.087000  4987.520158  ... -1942.490000     267.138   
1  -692.386000   707.626000 -1670.590000  ...  -625.385000     120.059   
2 -1564.630000  3313.881373   729.790000  ...  3012.178524   -1227.800   
3 -1413.180000  -802.942000   683.254000  ...  3012.178524    -924.073   
4  4924.493648  2158.980000  4987.520158  ...  -279.317000    -384.915   

    20140407_N   20140322_N  20140218_N  20140202_N   20140117_N  20140101_N  \


In [23]:
test = pd.read_csv('hacktest.csv')
if 'Unnamed: 0' in test.columns:
    test = test.drop(columns=['Unnamed: 0'])
test[ndvi_cols] = test[ndvi_cols].apply(lambda x: x.fillna(x.mean()), axis=0)
test['ndvi_mean'] = test[ndvi_cols].mean(axis=1)
test['ndvi_std'] = test[ndvi_cols].std(axis=1)
X_test = test[ndvi_cols + ['ndvi_mean', 'ndvi_std']]
X_test_scaled = scaler.transform(X_test)

In [24]:
test_preds = model.predict(X_test_scaled)
test_preds_labels = le.inverse_transform(test_preds)

In [25]:
submission = pd.DataFrame({'ID': test['ID'], 'class': test_preds_labels})
import csv
submission.to_csv('submission9.csv', index=False, quoting=csv.QUOTE_NONE, escapechar='\\')
# avoiding quotes completely from my csv fiel as it is merging both columns into one by considering each row as a string..

In [26]:
pd.read_csv('submission9.csv').head()

Unnamed: 0,ID,class
0,1,forest
1,2,forest
2,3,orchard
3,4,forest
4,5,forest


In [27]:
submission = pd.read_csv('submission9.csv')
print(submission['class'].value_counts())

class
forest        1673
farm           474
impervious     416
grass          166
water          106
orchard         10
Name: count, dtype: int64


In [28]:
print(len(submission), len(test))

2845 2845


In [29]:
from IPython.display import FileLink
FileLink('submission9.csv')