In [2]:
#Loading python libraries

In [6]:
import pandas as pd
import numpy as np
import scipy
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, classification_report
from sklearn.preprocessing import StandardScaler, LabelEncoder

In [14]:
# Load the training and test datasets.
# Both files are in CSV (comma-separated values) format and are read into pandas
# DataFrames so they can be preprocessed, used for training, and scored.
training_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ("train_land_cover_assignment.csv", "test_land_cover_assignment.csv")
)

In [20]:
# Show the column names of the training and test datasets, one Index per line,
# to get a feel for their structure before any preprocessing.
print(training_df.columns, test_df.columns, sep="\n")

# Tip: training_df.info() additionally lists dtypes and non-null counts.

Index(['subid', 'lat', 'lon', 'building', 'cropland', 'wcover', 'bcount', 'x',
       'y', 'bd20', 'bio1', 'bio12', 'bio7', 'bio15', 'cec20', 'dipa', 'dni',
       'dnlt', 'dnpa', 'dor1', 'dor2', 'fpara', 'fpars', 'lcc10', 'lcc11',
       'lcc12', 'lcc13', 'lcc14', 'lcc21', 'lcc8', 'lcc9', 'lstd', 'lstn',
       'mb1', 'mb2', 'mb3', 'mb7', 'mdem', 'mlat', 'mlon', 'nppm', 'npps',
       'ph20', 'sirm', 'sirs', 'slope', 'snd20', 'soc20', 'tim'],
      dtype='object')
Index(['subid', 'lat', 'lon', 'bcount', 'x', 'y', 'bd20', 'bio1', 'bio12',
       'bio7', 'bio15', 'cec20', 'dipa', 'dni', 'dnlt', 'dnpa', 'dor1', 'dor2',
       'fpara', 'fpars', 'lcc10', 'lcc11', 'lcc12', 'lcc13', 'lcc14', 'lcc21',
       'lcc8', 'lcc9', 'lstd', 'lstn', 'mb1', 'mb2', 'mb3', 'mb7', 'mdem',
       'mlat', 'mlon', 'nppm', 'npps', 'ph20', 'sirm', 'sirs', 'slope',
       'snd20', 'soc20', 'tim'],
      dtype='object')


In [22]:
# Count the missing (null) entries in every column of the training dataset so
# that columns needing imputation can be identified.
missing_per_column = training_df.isna().sum()
print(missing_per_column)

subid        0
lat          0
lon          0
building     0
cropland     0
wcover       0
bcount       0
x            0
y            0
bd20        45
bio1         0
bio12        0
bio7         0
bio15        0
cec20       45
dipa         0
dni          0
dnlt         0
dnpa         0
dor1         0
dor2         0
fpara        0
fpars        0
lcc10        0
lcc11        0
lcc12        0
lcc13        0
lcc14        0
lcc21        0
lcc8         0
lcc9         0
lstd        19
lstn        19
mb1          1
mb2          1
mb3          1
mb7          1
mdem         0
mlat         0
mlon         0
nppm         0
npps         0
ph20        45
sirm         0
sirs         0
slope        0
snd20       45
soc20       45
tim          1
dtype: int64


In [24]:
# Separate the training data into target labels (outputs) and features (inputs).
# The three label columns are selected first; the features are everything that
# remains once the identifier ('subid') and the label columns are dropped.
# Note: errors='ignore' only makes drop() tolerant of absent columns; the label
# selection itself still raises KeyError if a label column is missing.
label_columns = ['building', 'cropland', 'wcover']
training_labels = training_df[label_columns]
training_elements = training_df.drop(columns=['subid'] + label_columns, errors='ignore')
print(training_labels)

      building cropland wcover
0           No       No   >60%
1           No      Yes   <30%
2           No      Yes   <30%
3           No       No   <30%
4           No       No   <30%
...        ...      ...    ...
15851       No       No   <30%
15852       No       No   >60%
15853       No       No   <30%
15854       No       No   >60%
15855       No      Yes   >60%

[15856 rows x 3 columns]


In [26]:
# Fill missing values with per-column medians.
# The medians are computed once, from the training features only, and reused for
# the test set so both datasets go through identical preprocessing (the test set
# must not contribute its own statistics). Because the median Series is indexed
# by feature name, fillna() only touches matching columns: the 'subid' identifier
# in test_df is left as-is and never enters a median computation.
# Re-assignment is used instead of inplace=True so the cell is safe to re-run.
feature_medians = training_elements.median()
training_elements = training_elements.fillna(feature_medians)
test_df = test_df.fillna(feature_medians)

In [30]:
# Standardize the numerical input features so they share a common scale.
# The scaler is fitted on the training features only; the very same fitted
# scaler is then applied to the test features (identifier column removed).
scaler = StandardScaler()
training_elements_scaled = scaler.fit(training_elements).transform(training_elements)

test_feature_frame = test_df.drop(columns=['subid'], errors='ignore')
test_features_scaled = scaler.transform(test_feature_frame)

In [32]:
# Convert the categorical labels into integer codes the models can consume.
# One LabelEncoder is fitted per label column and kept in `label_encoders`
# (keyed by column name); the encoded columns are stacked side by side into a
# 2-D array whose column order follows training_labels.columns.
label_encoders = {}
encoded_columns = []
for col in training_labels.columns:
    encoder = LabelEncoder()
    encoded_columns.append(encoder.fit_transform(training_labels[col]))
    label_encoders[col] = encoder
train_labels_encoded = np.column_stack(encoded_columns)

In [36]:
# Hold out 20% of the samples as a validation subset; the remaining 80% are
# used for training. A fixed random_state keeps the split reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    training_elements_scaled,
    train_labels_encoded,
    test_size=0.2,
    random_state=42,
)

In [38]:
# Train one independent Random Forest classifier per land cover target.
# Column `col_idx` of y_train holds the encoded labels for target `col`;
# the fitted models are stored in `rf_models`, keyed by target name.
rf_models = {}
for col_idx, col in enumerate(training_labels.columns):
    forest = RandomForestClassifier(n_estimators=100, random_state=42)
    rf_models[col] = forest.fit(X_train, y_train[:, col_idx])

In [39]:
# Validate the models and evaluate their performance.
# Each model predicts its own target on the validation features; predictions
# are stacked column-wise so column k lines up with column k of y_val.
per_target_predictions = []
for col in training_labels.columns:
    per_target_predictions.append(rf_models[col].predict(X_val))
y_val_pred = np.column_stack(per_target_predictions)

# Print precision / recall / F1 for every target (classes shown as encoded ints).
for col_idx, col in enumerate(training_labels.columns):
    print(f"Classification Report for {col}:")
    print(classification_report(y_val[:, col_idx], y_val_pred[:, col_idx]))

Classification Report for building:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      2912
           1       1.00      0.99      1.00       260

    accuracy                           1.00      3172
   macro avg       1.00      1.00      1.00      3172
weighted avg       1.00      1.00      1.00      3172

Classification Report for cropland:
              precision    recall  f1-score   support

           0       0.81      0.90      0.85      2132
           1       0.74      0.57      0.64      1040

    accuracy                           0.79      3172
   macro avg       0.78      0.74      0.75      3172
weighted avg       0.79      0.79      0.79      3172

Classification Report for wcover:
              precision    recall  f1-score   support

           0       0.58      0.62      0.60       839
           1       0.41      0.19      0.26       915
           2       0.63      0.82      0.71      1418

    accuracy          

In [42]:
# Create predictions for the test set: for every target, the predicted
# probability that a test sample belongs to that target's "positive" class.
#
# predict_proba() returns one column per class, ordered like model.classes_.
# Hard-coding column 1 is only right for the two-class targets; 'wcover' has
# three classes, so column 1 would be whichever label happens to sort second.
# The positive class is therefore looked up explicitly by its original label.
# NOTE(review): '>60%' is assumed to be the wcover class the submission expects
# -- confirm against the competition's sample submission / scoring rules.
POSITIVE_LABELS = {'building': 'Yes', 'cropland': 'Yes', 'wcover': '>60%'}

test_predictions = {}
for col in training_labels.columns:
    model = rf_models[col]
    # Map the human-readable label to the integer code the model was trained on.
    # Raises ValueError if the label was never seen, instead of silently
    # exporting the wrong column.
    positive_code = label_encoders[col].transform([POSITIVE_LABELS[col]])[0]
    # Locate that code among the classes the forest actually learned.
    positive_column = list(model.classes_).index(positive_code)
    test_predictions[col] = model.predict_proba(test_features_scaled)[:, positive_column]

In [48]:
# Build the submission table: the test identifiers followed by one column of
# predicted probabilities per target, in the order of training_labels.columns.
submission = pd.DataFrame({'subid': test_df['subid']}).assign(
    **{col: test_predictions[col] for col in training_labels.columns}
)
submission.to_csv("submission.csv", index=False)  # write without the row index

print("Submission file generated: submission.csv")  # confirm completion

Submission file generated: submission.csv
