# Training Data

In [1]:
import numpy as np
import pandas as pd

In [2]:
train_df = pd.read_csv("/content/drive/MyDrive/Summer analytics '25/Hackathon/hacktrain.csv")
train_df.head()

Unnamed: 0.1,Unnamed: 0,ID,class,20150720_N,20150602_N,20150517_N,20150501_N,20150415_N,20150330_N,20150314_N,...,20140610_N,20140525_N,20140509_N,20140423_N,20140407_N,20140322_N,20140218_N,20140202_N,20140117_N,20140101_N
0,0,1,water,637.595,658.668,-1882.03,-1924.36,997.904,-1739.99,630.087,...,,-1043.16,-1942.49,267.138,,,211.328,-2203.02,-1180.19,433.906
1,1,2,water,634.24,593.705,-1625.79,-1672.32,914.198,-692.386,707.626,...,,-933.934,-625.385,120.059,364.858,476.972,220.878,-2250.0,-1360.56,524.075
2,3,4,water,58.0174,-1599.16,,-1052.63,,-1564.63,,...,-1025.88,368.622,,-1227.8,304.621,,369.214,-2202.12,,-1343.55
3,4,5,water,72.518,,380.436,-1256.93,515.805,-1413.18,-802.942,...,-1813.95,155.624,,-924.073,432.15,282.833,298.32,-2197.36,,-826.727
4,7,8,water,1136.44,,,1647.83,1935.8,,2158.98,...,1535.0,1959.43,-279.317,-384.915,-113.406,1020.72,1660.65,-116.801,-568.05,-1357.14


In [3]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8000 entries, 0 to 7999
Data columns (total 30 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Unnamed: 0  8000 non-null   int64  
 1   ID          8000 non-null   int64  
 2   class       8000 non-null   object 
 3   20150720_N  7440 non-null   float64
 4   20150602_N  6800 non-null   float64
 5   20150517_N  7200 non-null   float64
 6   20150501_N  7040 non-null   float64
 7   20150415_N  7520 non-null   float64
 8   20150330_N  6880 non-null   float64
 9   20150314_N  7280 non-null   float64
 10  20150226_N  6640 non-null   float64
 11  20150210_N  7360 non-null   float64
 12  20150125_N  6960 non-null   float64
 13  20150109_N  7120 non-null   float64
 14  20141117_N  6720 non-null   float64
 15  20141101_N  7600 non-null   float64
 16  20141016_N  6560 non-null   float64
 17  20140930_N  7200 non-null   float64
 18  20140813_N  7440 non-null   float64
 19  20140626_N  6400 non-null  

In [4]:
train_df.isnull().sum()

Unnamed: 0,0
Unnamed: 0,0
ID,0
class,0
20150720_N,560
20150602_N,1200
20150517_N,800
20150501_N,960
20150415_N,480
20150330_N,1120
20150314_N,720


In [5]:
numerical_cols = train_df.select_dtypes(include=np.number).columns
for col in numerical_cols:
  if train_df[col].isnull().any():
    mean_val = train_df[col].mean()
    train_df[col].fillna(mean_val, inplace=True)

categorical_cols = train_df.select_dtypes(include='object').columns
for col in categorical_cols:
  if train_df[col].isnull().any():
    mode_val = train_df[col].mode()[0]
    train_df[col].fillna(mode_val, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_df[col].fillna(mean_val, inplace=True)


In [6]:
print("\nAfter filling NA values:")
train_df.info()
print(train_df.isnull().sum())


After filling NA values:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8000 entries, 0 to 7999
Data columns (total 30 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Unnamed: 0  8000 non-null   int64  
 1   ID          8000 non-null   int64  
 2   class       8000 non-null   object 
 3   20150720_N  8000 non-null   float64
 4   20150602_N  8000 non-null   float64
 5   20150517_N  8000 non-null   float64
 6   20150501_N  8000 non-null   float64
 7   20150415_N  8000 non-null   float64
 8   20150330_N  8000 non-null   float64
 9   20150314_N  8000 non-null   float64
 10  20150226_N  8000 non-null   float64
 11  20150210_N  8000 non-null   float64
 12  20150125_N  8000 non-null   float64
 13  20150109_N  8000 non-null   float64
 14  20141117_N  8000 non-null   float64
 15  20141101_N  8000 non-null   float64
 16  20141016_N  8000 non-null   float64
 17  20140930_N  8000 non-null   float64
 18  20140813_N  8000 non-null   float64
 19  2

In [7]:
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

In [8]:
X = train_df.drop(columns=['ID', 'class'])
y = train_df['class']

# Feature Engineering
def extract_features(df):
    stats_df = pd.DataFrame()
    stats_df['mean'] = df.mean(axis=1)
    stats_df['std'] = df.std(axis=1)
    stats_df['min'] = df.min(axis=1)
    stats_df['max'] = df.max(axis=1)
    stats_df['median'] = df.median(axis=1)
    stats_df['q25'] = df.quantile(0.25, axis=1)
    stats_df['q75'] = df.quantile(0.75, axis=1)
    stats_df['range'] = stats_df['max'] - stats_df['min']
    stats_df['iqr'] = stats_df['q75'] - stats_df['q25']
    x_vals = np.arange(df.shape[1])
    stats_df['slope'] = df.apply(lambda row: np.polyfit(x_vals, row.fillna(method='ffill').fillna(method='bfill'), 1)[0], axis=1)
    return stats_df

In [9]:
X_combined = pd.concat([X, extract_features(X)], axis=1)

label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_combined)

  stats_df['slope'] = df.apply(lambda row: np.polyfit(x_vals, row.fillna(method='ffill').fillna(method='bfill'), 1)[0], axis=1)
  stats_df['slope'] = df.apply(lambda row: np.polyfit(x_vals, row.fillna(method='ffill').fillna(method='bfill'), 1)[0], axis=1)
  stats_df['slope'] = df.apply(lambda row: np.polyfit(x_vals, row.fillna(method='ffill').fillna(method='bfill'), 1)[0], axis=1)


In [10]:
# Splitting data for training and evaluation
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded)

In [11]:
# Logistic Regression
print("Logistic Regression")
lr = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=1000, random_state=42)
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)

print("Classification Report:")
print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))
print("Accuracy:", accuracy_score(y_test, y_pred))

Logistic Regression




Classification Report:
              precision    recall  f1-score   support

        farm       0.89      0.88      0.88       168
      forest       0.99      1.00      1.00      1232
       grass       0.85      0.72      0.78        39
  impervious       0.86      0.92      0.89       134
     orchard       1.00      0.33      0.50         6
       water       0.81      0.81      0.81        21

    accuracy                           0.97      1600
   macro avg       0.90      0.78      0.81      1600
weighted avg       0.97      0.97      0.96      1600

Accuracy: 0.965625


#Test Data

In [12]:
test_df = pd.read_csv("/content/drive/MyDrive/Summer analytics '25/Hackathon/hacktest.csv")
test_df.head()

Unnamed: 0.1,Unnamed: 0,ID,20150720_N,20150602_N,20150517_N,20150501_N,20150415_N,20150330_N,20150314_N,20150226_N,...,20140610_N,20140525_N,20140509_N,20140423_N,20140407_N,20140322_N,20140218_N,20140202_N,20140117_N,20140101_N
0,0,1,7466.42,413.162,5761.0,5625.45,489.403,3923.84,3097.11,6766.42,...,801.184,927.115,4704.14,6378.42,340.949,2695.57,527.268,4736.75,601.843,6639.76
1,1,2,7235.26,6037.35,1027.56,6085.14,1618.05,6668.54,2513.99,1051.69,...,5533.47,5103.04,5216.12,4885.27,4366.79,1234.14,3298.11,6942.68,1070.44,842.101
2,2,3,7425.08,6969.98,1177.94,7408.93,861.061,7644.43,814.458,1504.29,...,1981.39,6204.54,7021.69,5704.41,4897.45,1789.99,2206.1,6928.93,1036.56,831.441
3,3,4,7119.12,1731.62,6311.93,6441.61,465.979,7128.42,1649.12,6935.22,...,959.344,5794.15,1045.57,5572.9,586.287,685.906,1287.0,6734.72,824.584,6883.61
4,4,5,7519.55,8130.26,1482.54,7879.53,1001.21,7937.6,4122.53,1094.51,...,7636.07,6996.76,7413.43,4596.13,4511.7,1413.52,3283.94,7937.68,1857.8,1336.92


In [13]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2845 entries, 0 to 2844
Data columns (total 29 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Unnamed: 0  2845 non-null   int64  
 1   ID          2845 non-null   int64  
 2   20150720_N  2845 non-null   float64
 3   20150602_N  2845 non-null   float64
 4   20150517_N  2845 non-null   float64
 5   20150501_N  2845 non-null   float64
 6   20150415_N  2845 non-null   float64
 7   20150330_N  2845 non-null   float64
 8   20150314_N  2845 non-null   float64
 9   20150226_N  2845 non-null   float64
 10  20150210_N  2845 non-null   float64
 11  20150125_N  2845 non-null   float64
 12  20150109_N  2845 non-null   float64
 13  20141117_N  2845 non-null   float64
 14  20141101_N  2845 non-null   float64
 15  20141016_N  2845 non-null   float64
 16  20140930_N  2845 non-null   float64
 17  20140813_N  2845 non-null   float64
 18  20140626_N  2845 non-null   float64
 19  20140610_N  2845 non-null  

In [14]:
test_df.isnull().sum()

Unnamed: 0,0
Unnamed: 0,0
ID,0
20150720_N,0
20150602_N,0
20150517_N,0
20150501_N,0
20150415_N,0
20150330_N,0
20150314_N,0
20150226_N,0


In [15]:
test_ids = test_df['ID']

X_test_submission = test_df.drop(columns=['ID'])

# Feature Engineering
X_test_submission_combined = pd.concat([X_test_submission, extract_features(X_test_submission)], axis=1)

X_test_submission_scaled = scaler.transform(X_test_submission_combined)

# Predicting on test data
test_predictions_encoded = lr.predict(X_test_submission_scaled)

# Decoding predictions back to original class labels
test_predictions = label_encoder.inverse_transform(test_predictions_encoded)

# Metrics
print("\nMetrics for the model:")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred, target_names=label_encoder.classes_))

  stats_df['slope'] = df.apply(lambda row: np.polyfit(x_vals, row.fillna(method='ffill').fillna(method='bfill'), 1)[0], axis=1)
  stats_df['slope'] = df.apply(lambda row: np.polyfit(x_vals, row.fillna(method='ffill').fillna(method='bfill'), 1)[0], axis=1)



Metrics for the model:
Accuracy: 0.965625
Classification Report:
               precision    recall  f1-score   support

        farm       0.89      0.88      0.88       168
      forest       0.99      1.00      1.00      1232
       grass       0.85      0.72      0.78        39
  impervious       0.86      0.92      0.89       134
     orchard       1.00      0.33      0.50         6
       water       0.81      0.81      0.81        21

    accuracy                           0.97      1600
   macro avg       0.90      0.78      0.81      1600
weighted avg       0.97      0.97      0.96      1600



In [16]:
# Submission
submission = pd.DataFrame({
    'ID': test_ids,
    'class': test_predictions
})
submission.to_csv("/content/drive/MyDrive/Summer analytics '25/Hackathon/submission.csv", index=False)
print("Submission file 'submission.csv' created successfully in the specified directory.")

Submission file 'submission.csv' created successfully in the specified directory.
