In [None]:
"""

Author: Neeraj Jaiswal
Team Name: The Revengers
Team Members: Neeraj Jaiswal, Karanbir Singh, Komal Dadwal, Jatin Mahey, Sonu Choubey
Leaderboard Rank: 27

"""

# Training notebook: fits the soil-type classifier, evaluates it on the validation and test splits, and saves the trained pipeline.

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
import os

# Read the soil attribute table; the path is relative to the notebook's working directory.
soil_csv_path = "../data/soil_attributes_output.csv"
df = pd.read_csv(soil_csv_path)

# Parse RGB values from 'color' string
def parse_rgb(color_str):
    """Parse a colour string such as ``"rgb(12, 34, 56)"`` into ``[R, G, B]``.

    The ``rgb`` prefix is matched case-insensitively and surrounding
    whitespace is ignored. A bare ``"12, 34, 56"`` (no prefix, with or without
    parentheses) is accepted as well.

    Args:
        color_str: Text holding three comma-separated integer channels.

    Returns:
        A list of three ints in R, G, B order.

    Raises:
        ValueError: If the prefix is something other than ``rgb``, the number
            of channels is not three, or a channel is not an integer.
    """
    text = color_str.strip()

    # Split at the first "(" instead of using str.strip("rgb()"): strip treats
    # its argument as a character *set*, so it rejected "RGB(...)" and padded
    # input while happily accepting scrambled prefixes such as "bgr(".
    prefix, paren, inner = text.partition("(")
    if paren:
        if prefix.strip().lower() not in ("", "rgb"):
            raise ValueError(f"Unsupported colour format: {color_str!r}")
        text = inner

    # Drop the closing parenthesis (and any padding before it) if present.
    text = text.strip().rstrip(")")

    channels = [part.strip() for part in text.split(",")]
    if len(channels) != 3:
        raise ValueError(
            f"Expected 3 colour channels, got {len(channels)}: {color_str!r}"
        )
    return [int(part) for part in channels]

# Expand the colour strings into integer R/G/B columns. Each row is parsed into
# a plain list and a single DataFrame is built from all of them, which avoids
# constructing one pd.Series per row and also works when df has no rows.
rgb_rows = df['color'].map(parse_rgb).tolist()
df[['R', 'G', 'B']] = pd.DataFrame(rgb_rows, index=df.index, columns=['R', 'G', 'B'])

# Column roles: the colour channels are used as-is, texture is categorical.
numeric_features = ['R', 'G', 'B']
categorical_features = ['texture']

# Model inputs (channels first, then texture) and the label column.
feature_cols = numeric_features + categorical_features
target_col = 'soil_type'

X = df[feature_cols]
y = df[target_col]

# One-hot encode the categorical columns; categories not seen during fit are
# ignored (encoded as all zeros) instead of raising at predict time.
texture_encoder = OneHotEncoder(handle_unknown='ignore')

# Every column not listed in a transformer is forwarded unchanged.
preprocessor = ColumnTransformer(
    transformers=[('cat', texture_encoder, categorical_features)],
    remainder='passthrough',
)

# Chain preprocessing and the classifier so they are fitted and applied as one unit.
forest = RandomForestClassifier(n_estimators=100, random_state=42)
pipeline_steps = [
    ('preprocess', preprocessor),
    ('clf', forest),
]
clf_pipeline = Pipeline(steps=pipeline_steps)

# Split dataset: 70% train, 15% val, 15% test (stratified on the label).
# First hold out 15% of all rows as the test set.
X_temp, X_test, y_temp, y_test = train_test_split(
    X, y, test_size=0.15, stratify=y, random_state=42
)

# Then carve the validation set out of the remaining rows. An absolute row
# count equal to the test set is used instead of the rounded fraction 0.1765
# (~0.15 / 0.85): a fractional test_size is rounded up, so the approximation
# can make the validation set a row larger than the test set.
X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp, test_size=len(X_test), stratify=y_temp, random_state=42
)

# Train the preprocessing + classifier pipeline on the training split only.
clf_pipeline.fit(X_train, y_train)

# Validation split: predict, then print the per-class metrics.
y_val_pred = clf_pipeline.predict(X_val)
val_report = classification_report(y_val, y_val_pred)
print("📊 Validation Classification Report:\n", val_report, sep="\n")

# Held-out test split: same procedure as for validation.
y_test_pred = clf_pipeline.predict(X_test)
test_report = classification_report(y_test, y_test_pred)
print("\n📊 Test Set Classification Report:\n", test_report, sep="\n")


In [None]:
# Show the per-class sample counts of each split.
class_count_views = (
    ("Train classes:\n", y_train),
    ("Val classes:\n", y_val),
    ("Test classes:\n", y_test),
)
for split_heading, split_targets in class_count_views:
    print(split_heading, split_targets.value_counts())


In [None]:
# Report the shape of the prediction array for each split.
for shape_heading, split_preds in (
    ("Validation predictions shape:", y_val_pred),
    ("Test predictions shape:", y_test_pred),
):
    print(shape_heading, split_preds.shape)

# Peek at the first five predicted labels of each split.
for preview_heading, split_preds in (
    ("Validation first 5 preds:", y_val_pred),
    ("Test first 5 preds:", y_test_pred),
):
    print(preview_heading, split_preds[:5])


In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# Class order used for the rows/columns of both confusion matrices.
# NOTE(review): these must match the spellings in the soil_type column exactly
# (the capitalisation is mixed: 'soil' vs 'Soil') - confirm against the dataset.
labels = ['Alluvial soil', 'Black Soil', 'Clay soil', 'Red Soil']


def _plot_confusion(y_true, y_pred, class_labels, title, cmap):
    """Compute and draw one confusion matrix on its own 8x6 figure.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned with ``y_true``.
        class_labels: Label order for the matrix rows/columns and tick labels.
        title: Title placed above the matrix.
        cmap: Matplotlib colormap used for the cells.

    Returns:
        Tuple ``(cm, disp)`` of the confusion-matrix array and its
        ``ConfusionMatrixDisplay``.
    """
    cm = confusion_matrix(y_true, y_pred, labels=class_labels)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_labels)

    # Create the axes explicitly and pass them in. ConfusionMatrixDisplay.plot()
    # opens its own figure when ax is None, so a preceding plt.figure(figsize=...)
    # only leaves an empty figure behind and its size is never applied.
    fig, ax = plt.subplots(figsize=(8, 6))
    disp.plot(cmap=cmap, ax=ax)
    ax.set_title(title)
    plt.show()
    return cm, disp


# Validation set confusion matrix
cm_val, disp_val = _plot_confusion(
    y_val, y_val_pred, labels, "Validation Set Confusion Matrix", plt.cm.Blues
)

# Test set confusion matrix
cm_test, disp_test = _plot_confusion(
    y_test, y_test_pred, labels, "Test Set Confusion Matrix", plt.cm.Greens
)


In [None]:
import joblib

# Persist the trained pipeline (preprocessing + classifier) next to the dataset.
pipeline_path = "../data/soil_classifier_pipeline.joblib"
joblib.dump(clf_pipeline, pipeline_path)