In [1]:
import numpy as np
import pandas as pd
from sklearn.impute import KNNImputer
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import accuracy_score, classification_report
from sklearn.linear_model import LinearRegression

In [2]:
# Read hacktrain.csv into `df`. The path is relative, so it is resolved
# against the kernel's current working directory.
df = pd.read_csv("hacktrain.csv")
# Bare last expression: the notebook renders the frame as this cell's output.
df  # any warnings shown alongside this output can be ignored

Unnamed: 0.1,Unnamed: 0,ID,class,20150720_N,20150602_N,20150517_N,20150501_N,20150415_N,20150330_N,20150314_N,...,20140610_N,20140525_N,20140509_N,20140423_N,20140407_N,20140322_N,20140218_N,20140202_N,20140117_N,20140101_N
0,0,1,water,637.5950,658.668,-1882.030,-1924.36,997.904,-1739.990,630.087,...,,-1043.160,-1942.490,267.138,,,211.328,-2203.020,-1180.19,433.906
1,1,2,water,634.2400,593.705,-1625.790,-1672.32,914.198,-692.386,707.626,...,,-933.934,-625.385,120.059,364.858,476.972,220.878,-2250.000,-1360.56,524.075
2,3,4,water,58.0174,-1599.160,,-1052.63,,-1564.630,,...,-1025.880,368.622,,-1227.800,304.621,,369.214,-2202.120,,-1343.550
3,4,5,water,72.5180,,380.436,-1256.93,515.805,-1413.180,-802.942,...,-1813.950,155.624,,-924.073,432.150,282.833,298.320,-2197.360,,-826.727
4,7,8,water,1136.4400,,,1647.83,1935.800,,2158.980,...,1535.000,1959.430,-279.317,-384.915,-113.406,1020.720,1660.650,-116.801,-568.05,-1357.140
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7995,10537,10538,impervious,1207.7000,984.620,,1166.25,937.478,1072.700,823.896,...,1117.740,1176.600,1044.110,,369.082,465.843,362.882,979.795,,433.659
7996,10538,10539,impervious,2170.3500,1419.720,1361.000,1478.71,983.911,1262.110,1422.860,...,984.634,2128.970,1379.660,,762.633,485.204,446.724,771.747,1589.06,506.936
7997,10541,10542,impervious,1895.6800,1454.740,,1033.56,1930.380,1057.150,1471.600,...,888.408,2093.020,1232.110,1190.830,1441.460,1170.880,1095.000,1818.650,2501.72,1247.770
7998,10542,10543,impervious,3465.7400,1283.320,413.412,4391.05,1146.820,4473.050,1614.750,...,5833.760,4047.320,4515.800,433.177,277.296,744.143,,3759.710,,388.346


In [3]:
# Tally how many rows carry each label in the 'class' column
# (value_counts orders the result from most to least frequent).
class_counts = df.loc[:, 'class'].value_counts()

# Heading and table go out through a single print call, one per line.
print("Number of samples per class:", class_counts, sep="\n")


Number of samples per class:
class
forest        6159
farm           841
impervious     669
grass          196
water          105
orchard         30
Name: count, dtype: int64


In [4]:
# Per-column count of missing entries in the raw training frame.
df.isna().sum()

Unnamed: 0       0
ID               0
class            0
20150720_N     560
20150602_N    1200
20150517_N     800
20150501_N     960
20150415_N     480
20150330_N    1120
20150314_N     720
20150226_N    1360
20150210_N     640
20150125_N    1040
20150109_N     880
20141117_N    1280
20141101_N     400
20141016_N    1440
20140930_N     800
20140813_N     560
20140626_N    1600
20140610_N     480
20140525_N     720
20140509_N     880
20140423_N    1760
20140407_N     640
20140322_N    1120
20140218_N    1440
20140202_N     560
20140117_N    1200
20140101_N     400
dtype: int64

In [10]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Global seaborn styling for any figures drawn later in the notebook.
sns.set(style="whitegrid", palette="muted", font_scale=0.85)

# Feature columns: every column of `df` whose name ends in '_N'.
ndvi_columns = [name for name in df.columns if name.endswith('_N')]


def _iqr_mask_then_median_fill(series):
    """Blank out IQR outliers in one column, then fill every gap with the median.

    Values below Q1 - 1.5*IQR or above Q3 + 1.5*IQR are replaced by NaN.
    The median is taken after that masking (NaNs are skipped) and is used to
    fill both the masked outliers and the entries that were already missing.
    """
    q_low = series.quantile(0.25)
    q_high = series.quantile(0.75)
    spread = q_high - q_low
    # `between` is inclusive on both ends and False for NaN, so `where`
    # keeps exactly the values lying inside the fences.
    inside_fences = series.between(q_low - 1.5 * spread, q_high + 1.5 * spread)
    kept = series.where(inside_fences)
    return kept.fillna(kept.median())


# Label -> independent copy of that label's rows, in order of first appearance.
class_subgroups = {
    label: df.loc[df['class'] == label].copy()
    for label in df['class'].unique()
}

# Same keys; each frame has its NDVI columns cleaned with statistics computed
# from that class alone.
# NOTE(review): this cleaning depends on the 'class' label, so it cannot be
# repeated on rows whose label is unknown.
median_cleaned_subgroups = {
    label: rows.assign(
        **{name: _iqr_mask_then_median_fill(rows[name]) for name in ndvi_columns}
    )
    for label, rows in class_subgroups.items()
}

# Stack the per-class frames into one table. Rows end up grouped by class and
# the index is renumbered from 0.
complete_df = pd.concat(list(median_cleaned_subgroups.values()), ignore_index=True)

# Optional: quick look at the result
complete_df.head()


Unnamed: 0.1,Unnamed: 0,ID,class,20150720_N,20150602_N,20150517_N,20150501_N,20150415_N,20150330_N,20150314_N,...,20140610_N,20140525_N,20140509_N,20140423_N,20140407_N,20140322_N,20140218_N,20140202_N,20140117_N,20140101_N
0,0,1,water,637.595,658.668,-1882.03,-1924.36,997.904,493.028,630.087,...,1537.14,-1043.16,-1942.49,267.138,719.0115,696.4885,211.328,-2203.02,-1180.19,433.906
1,1,2,water,634.24,593.705,-1625.79,-1672.32,914.198,-692.386,707.626,...,1537.14,-933.934,-625.385,120.059,364.858,476.972,220.878,-2250.0,-1360.56,524.075
2,3,4,water,58.0174,-1599.16,-154.48,-1052.63,444.3525,493.028,676.99,...,-1025.88,368.622,804.184,-1227.8,304.621,696.4885,369.214,-2202.12,791.657,-1343.55
3,4,5,water,72.518,665.1335,380.436,-1256.93,515.805,493.028,-802.942,...,-1813.95,155.624,804.184,-924.073,432.15,282.833,298.32,-2197.36,791.657,-826.727
4,7,8,water,1136.44,665.1335,-154.48,1647.83,444.3525,493.028,2158.98,...,1535.0,1959.43,-279.317,-384.915,-113.406,1020.72,519.5155,-116.801,-568.05,-1357.14


In [15]:
# Count the remaining gaps per NDVI column once and reuse it for both printouts.
null_counts = complete_df[ndvi_columns].isna().sum()

print("Null values per column:")
print(null_counts)

# Grand total across all NDVI columns.
print("\nTotal missing values:", null_counts.sum())


Null values per column:
20150720_N    0
20150602_N    0
20150517_N    0
20150501_N    0
20150415_N    0
20150330_N    0
20150314_N    0
20150226_N    0
20150210_N    0
20150125_N    0
20150109_N    0
20141117_N    0
20141101_N    0
20141016_N    0
20140930_N    0
20140813_N    0
20140626_N    0
20140610_N    0
20140525_N    0
20140509_N    0
20140423_N    0
20140407_N    0
20140322_N    0
20140218_N    0
20140202_N    0
20140117_N    0
20140101_N    0
dtype: int64

Total missing values: 0


In [20]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Step 1: Features and target
ndvi_columns = [col for col in complete_df.columns if col.endswith('_N')]
X = complete_df[ndvi_columns]
y = complete_df['class']

# Step 2: Encode target labels (class-name strings -> integer codes)
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# Step 3: Feature scaling for the final model (statistics from every row)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Step 4: Train the final Logistic Regression on every labelled row
model = LogisticRegression(
    solver='lbfgs',  # or 'saga' for large datasets
    max_iter=1000,
    random_state=42
)
model.fit(X_scaled, y_encoded)

# Step 5: Evaluate on a stratified hold-out split.
# A throwaway scaler and model are fitted on 80% of the rows and scored on the
# other 20%, so the validation rows influence neither. The final `scaler` and
# `model` above stay untouched. Stratifying keeps the class proportions in both
# parts; train_test_split raises ValueError if a class has fewer than 2 rows.
# NOTE(review): `complete_df` was imputed per class (using the labels) before
# this split, so these scores are likely optimistic.
X_fit, X_val, y_fit, y_val = train_test_split(
    X, y_encoded, test_size=0.2, stratify=y_encoded, random_state=42
)
val_scaler = StandardScaler().fit(X_fit)
val_model = LogisticRegression(solver='lbfgs', max_iter=1000, random_state=42)
val_model.fit(val_scaler.transform(X_fit), y_fit)
y_val_pred = val_model.predict(val_scaler.transform(X_val))

print(f"Hold-out accuracy: {accuracy_score(y_val, y_val_pred):.4f}")
print(
    classification_report(
        y_val,
        y_val_pred,
        labels=np.arange(len(le.classes_)),  # report every class, even if absent
        target_names=le.classes_,
        zero_division=0,
    )
)

In [23]:
import pandas as pd

# Step 1: Load test data
hacktest = pd.read_csv("hacktest.csv")

# Step 2: Select the feature columns by the names and order of the training
# matrix `X` (what `scaler` was fitted on) rather than re-deriving them from
# the test file. A column missing from the test file fails here with a
# KeyError, and extra or re-ordered '_N' columns can no longer reach the scaler.
ndvi_columns = list(X.columns)
X_test = hacktest[ndvi_columns].copy()

# Step 3: Fill missing values with the median of each test column
# NOTE(review): the training frame was cleaned per class, while the test gaps
# are filled from the test file's own column medians; confirm this asymmetry
# is intended.
X_test = X_test.fillna(X_test.median())

# Step 4: Scale features using the same scaler used on training data
X_test_scaled = scaler.transform(X_test)

# Step 5: Predict encoded labels
y_pred_encoded = model.predict(X_test_scaled)

# Step 6: Decode predictions to original class labels
y_pred_labels = le.inverse_transform(y_pred_encoded)

# Step 7: Create submission DataFrame with ID and predicted class
submission_df = pd.DataFrame({
    'ID': hacktest['ID'],
    'predicted_class': y_pred_labels
})

# Step 8: Save to CSV. One variable feeds both the write and the confirmation
# message so the reported filename always matches the file written.
submission_path = "submission10.csv"
submission_df.to_csv(submission_path, index=False)

# Final preview
print(f"✅ Predictions saved to {submission_path}")
print(submission_df.head())


✅ Predictions saved to submission.csv
   ID predicted_class
0   1            farm
1   2          forest
2   3         orchard
3   4          forest
4   5          forest
