In [1]:
#Import relevant libraries
import os
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, make_scorer, f1_score

#List all the csv files under the input directory
for dirname, _, filenames in os.walk("C:\\Users\\Lenovo\\Documents\\Summer Analytics 2025\\Hackathon 1"):
    for filename in filenames:
        if filename.lower().endswith('.csv'):
            print(os.path.join(dirname, filename))

C:\Users\Lenovo\Documents\Summer Analytics 2025\Hackathon 1\hacktest.csv
C:\Users\Lenovo\Documents\Summer Analytics 2025\Hackathon 1\hacktrain.csv


In [2]:
#Read in training data
data = pd.read_csv("hacktrain.csv")
data

Unnamed: 0.1,Unnamed: 0,ID,class,20150720_N,20150602_N,20150517_N,20150501_N,20150415_N,20150330_N,20150314_N,...,20140610_N,20140525_N,20140509_N,20140423_N,20140407_N,20140322_N,20140218_N,20140202_N,20140117_N,20140101_N
0,0,1,water,637.5950,658.668,-1882.030,-1924.36,997.904,-1739.990,630.087,...,,-1043.160,-1942.490,267.138,,,211.328,-2203.020,-1180.19,433.906
1,1,2,water,634.2400,593.705,-1625.790,-1672.32,914.198,-692.386,707.626,...,,-933.934,-625.385,120.059,364.858,476.972,220.878,-2250.000,-1360.56,524.075
2,3,4,water,58.0174,-1599.160,,-1052.63,,-1564.630,,...,-1025.880,368.622,,-1227.800,304.621,,369.214,-2202.120,,-1343.550
3,4,5,water,72.5180,,380.436,-1256.93,515.805,-1413.180,-802.942,...,-1813.950,155.624,,-924.073,432.150,282.833,298.320,-2197.360,,-826.727
4,7,8,water,1136.4400,,,1647.83,1935.800,,2158.980,...,1535.000,1959.430,-279.317,-384.915,-113.406,1020.720,1660.650,-116.801,-568.05,-1357.140
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7995,10537,10538,impervious,1207.7000,984.620,,1166.25,937.478,1072.700,823.896,...,1117.740,1176.600,1044.110,,369.082,465.843,362.882,979.795,,433.659
7996,10538,10539,impervious,2170.3500,1419.720,1361.000,1478.71,983.911,1262.110,1422.860,...,984.634,2128.970,1379.660,,762.633,485.204,446.724,771.747,1589.06,506.936
7997,10541,10542,impervious,1895.6800,1454.740,,1033.56,1930.380,1057.150,1471.600,...,888.408,2093.020,1232.110,1190.830,1441.460,1170.880,1095.000,1818.650,2501.72,1247.770
7998,10542,10543,impervious,3465.7400,1283.320,413.412,4391.05,1146.820,4473.050,1614.750,...,5833.760,4047.320,4515.800,433.177,277.296,744.143,,3759.710,,388.346


In [3]:
#Null data
data.isnull().sum()

Unnamed: 0       0
ID               0
class            0
20150720_N     560
20150602_N    1200
20150517_N     800
20150501_N     960
20150415_N     480
20150330_N    1120
20150314_N     720
20150226_N    1360
20150210_N     640
20150125_N    1040
20150109_N     880
20141117_N    1280
20141101_N     400
20141016_N    1440
20140930_N     800
20140813_N     560
20140626_N    1600
20140610_N     480
20140525_N     720
20140509_N     880
20140423_N    1760
20140407_N     640
20140322_N    1120
20140218_N    1440
20140202_N     560
20140117_N    1200
20140101_N     400
dtype: int64

In [4]:
#Subset the NDVI columns
ndvi_cols = [c for c in data.columns if c.endswith('_N')]
#print(f"NDVI columns ({len(ndvi_cols)}): {ndvi_cols[:5]} ...")

X = data[ndvi_cols]
#print(X)

#Interpolate along the time axis (axis=1)
X_interp = X.interpolate(method='linear', axis=1, limit_direction='both')

#Compute column medians on the training set only
col_medians = X_interp.median(axis=0)

#Fill any remaining NaNs with these medians, and assign back into data
data[ndvi_cols] = X_interp.fillna(col_medians)

# Check if any columns have missing data after fill
print("Num columns with missing data after fill:", data.isnull().sum())

Num columns with missing data after fill: Unnamed: 0    0
ID            0
class         0
20150720_N    0
20150602_N    0
20150517_N    0
20150501_N    0
20150415_N    0
20150330_N    0
20150314_N    0
20150226_N    0
20150210_N    0
20150125_N    0
20150109_N    0
20141117_N    0
20141101_N    0
20141016_N    0
20140930_N    0
20140813_N    0
20140626_N    0
20140610_N    0
20140525_N    0
20140509_N    0
20140423_N    0
20140407_N    0
20140322_N    0
20140218_N    0
20140202_N    0
20140117_N    0
20140101_N    0
dtype: int64


In [5]:
#Drop ID column
data.drop(columns=["ID"], inplace=True)
data

Unnamed: 0.1,Unnamed: 0,class,20150720_N,20150602_N,20150517_N,20150501_N,20150415_N,20150330_N,20150314_N,20150226_N,...,20140610_N,20140525_N,20140509_N,20140423_N,20140407_N,20140322_N,20140218_N,20140202_N,20140117_N,20140101_N
0,0,water,637.5950,658.668000,-1882.030000,-1924.360,997.904,-1739.990,630.087,-347.7765,...,-193.6950,-1043.160,-1942.4900,267.1380,248.534667,229.931333,211.3280,-2203.020,-1180.1900,433.906
1,1,water,634.2400,593.705000,-1625.790000,-1672.320,914.198,-692.386,707.626,-1670.5900,...,-133.6655,-933.934,-625.3850,120.0590,364.858000,476.972000,220.8780,-2250.000,-1360.5600,524.075
2,3,water,58.0174,-1599.160000,-1325.895000,-1052.630,-1308.630,-1564.630,-417.420,729.7900,...,-1025.8800,368.622,-429.5890,-1227.8000,304.621000,336.917500,369.2140,-2202.120,-1772.8350,-1343.550
3,4,water,72.5180,226.477000,380.436000,-1256.930,515.805,-1413.180,-802.942,683.2540,...,-1813.9500,155.624,-384.2245,-924.0730,432.150000,282.833000,298.3200,-2197.360,-1512.0435,-826.727
4,7,water,1136.4400,1306.903333,1477.366667,1647.830,1935.800,2047.390,2158.980,1700.9250,...,1535.0000,1959.430,-279.3170,-384.9150,-113.406000,1020.720000,1660.6500,-116.801,-568.0500,-1357.140
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7995,10537,impervious,1207.7000,984.620000,1075.435000,1166.250,937.478,1072.700,823.896,943.6980,...,1117.7400,1176.600,1044.1100,706.5960,369.082000,465.843000,362.8820,979.795,706.7270,433.659
7996,10538,impervious,2170.3500,1419.720000,1361.000000,1478.710,983.911,1262.110,1422.860,1401.9700,...,984.6340,2128.970,1379.6600,1071.1465,762.633000,485.204000,446.7240,771.747,1589.0600,506.936
7997,10541,impervious,1895.6800,1454.740000,1244.150000,1033.560,1930.380,1057.150,1471.600,2082.6150,...,888.4080,2093.020,1232.1100,1190.8300,1441.460000,1170.880000,1095.0000,1818.650,2501.7200,1247.770
7998,10542,impervious,3465.7400,1283.320000,413.412000,4391.050,1146.820,4473.050,1614.750,770.4690,...,5833.7600,4047.320,4515.8000,433.1770,277.296000,744.143000,2251.9265,3759.710,2074.0280,388.346


In [6]:
# Remove duplicate rows
before = len(data)
data.drop_duplicates(inplace=True)
after = len(data)
print(f"Dropped {before - after} exact duplicate rows; remaining: {after}")

Dropped 0 exact duplicate rows; remaining: 8000


In [7]:
# Handle outliers by capping them to IQR boundaries
# Compute Q1, Q3, and IQR for each column
Q1 = data[ndvi_cols].quantile(0.25)
Q3 = data[ndvi_cols].quantile(0.75)
IQR = Q3 - Q1

# Define multiplier
multiplier = 1.5

# Boolean DataFrame of outliers
outlier_mask = ((data[ndvi_cols] < (Q1 - multiplier * IQR)) | (data[ndvi_cols] > (Q3 + multiplier * IQR)))

total_outliers = outlier_mask.sum().sum()
print(f"Detected {total_outliers} outlier values via IQR method (multiplier={multiplier}).")

#Cap outliers to the nearest boundary
for c in ndvi_cols:
    lower = Q1[c] - multiplier * IQR[c]
    upper = Q3[c] + multiplier * IQR[c]
    data[c] = data[c].clip(lower=lower, upper=upper)
print("Capped outliers to IQR boundaries.")

Detected 1322 outlier values via IQR method (multiplier=1.5).
Capped outliers to IQR boundaries.


In [8]:

# Encode class column (if it's categorical)
label_encoder = LabelEncoder()
data['class'] = label_encoder.fit_transform(data['class'])

# Split into features and target
X = data.drop(columns=['class'])
y = data['class']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)

# Fit multinomial logistic regression
model = LogisticRegression(
    multi_class='multinomial',
    solver='lbfgs',
    max_iter=10
)
model.fit(X_train, y_train)

# Predict on test set
y_pred = model.predict(X_test)

# Classification report with all original class labels
print(classification_report(
    y_test,
    y_pred,
    labels=list(range(len(label_encoder.classes_))),
    target_names=label_encoder.classes_
))


              precision    recall  f1-score   support

        farm       0.71      0.67      0.69       168
      forest       0.95      0.98      0.97      1232
       grass       0.00      0.00      0.00        39
  impervious       0.66      0.84      0.74       134
     orchard       0.00      0.00      0.00         6
       water       0.00      0.00      0.00        21

    accuracy                           0.90      1600
   macro avg       0.39      0.41      0.40      1600
weighted avg       0.86      0.90      0.88      1600



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [10]:
test_data = pd.read_csv("hacktest.csv")
test_data.shape

(2845, 29)

In [11]:
ID=test_data['ID']
test_data.drop(['ID'],axis=1,inplace=True)

In [12]:
y_test = model.predict(test_data)

In [13]:
y_test

array([1, 1, 1, ..., 3, 3, 3])

In [14]:
y_decoded = label_encoder.inverse_transform(y_test)
y_decoded

array(['forest', 'forest', 'forest', ..., 'impervious', 'impervious',
       'impervious'], dtype=object)

In [15]:
result = pd.DataFrame({
    'ID': ID,
    'class': y_decoded
})

In [16]:
result

Unnamed: 0,ID,class
0,1,forest
1,2,forest
2,3,forest
3,4,forest
4,5,forest
...,...,...
2840,2841,impervious
2841,2842,impervious
2842,2843,impervious
2843,2844,impervious


In [17]:
result.to_csv("predictions.csv", index=False) 