In [None]:
# Cell 1 — imports and paths (run this cell first)
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import joblib

# File locations used by this notebook. Change the names here if your files differ.
# Relative names are resolved against the kernel's current working directory.

# Raw input dataset read by the load cell.
DATA_CSV = "pest_dataset.csv"

# NOTE(review): no cell visible in this notebook writes this file — confirm whether
# a "save cleaned data" step is still planned.
CLEANED_CSV = "pest_dataset_cleaned.csv"

# NOTE(review): the save cell writes to "model/pest_rf_model.pkl" rather than this
# path — confirm which location is intended.
MODEL_OUT = "pest_model.pkl"

# Print the absolute location so a wrong working directory is obvious immediately.
data_csv_abs = os.path.abspath(DATA_CSV)
print("Notebook ready. Data file expected at:", data_csv_abs)


In [None]:
# Cell 2 — read the raw CSV and take a first look (requires Cell 1 for pd / DATA_CSV)
df = pd.read_csv(DATA_CSV)

# Dimensions, then the first ten rows as a rich table
print("Shape:", df.shape)
display(df.head(10))

# Column inventory and per-column null counts
print("\nColumns:", df.columns.tolist())
print("\nMissing values per column:")
print(df.isna().sum())

# Guess the label column: the first column whose lower-cased name contains any of
# these keywords. Falls back to None when no column matches.
label_keywords = ("pest", "attack", "status")
label_col = next(
    (col for col in df.columns if any(word in col.lower() for word in label_keywords)),
    None,
)
print("\nDetected label column:", label_col)
if label_col:
    # Class distribution of the detected label column
    print("\nLabel value counts:")
    print(df[label_col].value_counts())

# df stays in the kernel namespace for the following cells; show its shape as the cell output
df.shape


In [None]:
# Cell 3 — cleaning + encoding
#
# Produces, in `df`:
#   * short column names (mq135, temp, hum, soil, crop, label)
#   * only rows that have a value in every column the model uses
#   * integer-encoded `crop_encoded` and `label_encoded` columns
# and leaves the fitted `crop_encoder` / `label_encoder` in the namespace.

# Rename the raw CSV headers to short names. rename() silently ignores keys that
# are not present, so re-running this cell is harmless.
df = df.rename(columns={
    "MQ135_Air_Quality(ppm)": "mq135",
    "Temperature(C)": "temp",
    "Humidity(%)": "hum",
    "Soil_Moisture(%)": "soil",
    "Crop": "crop",
    "Pest_Attack_Status": "label"
})

# Fail early, with a readable message, if the CSV headers did not match the
# rename map above (otherwise the first symptom is a bare KeyError further down).
required_cols = ["mq135", "temp", "hum", "soil", "crop", "label"]
missing_cols = [col for col in required_cols if col not in df.columns]
if missing_cols:
    raise KeyError(
        f"Expected columns missing after rename: {missing_cols}. "
        f"Columns found: {df.columns.tolist()}"
    )

# Drop rows with a missing value in any model column, so that NaN neither becomes
# a spurious crop/label class nor reaches model training. .copy() gives an
# independent frame so the column assignments below do not warn.
rows_before = len(df)
df = df.dropna(subset=required_cols).copy()
rows_dropped = rows_before - len(df)
if rows_dropped:
    print(f"Dropped {rows_dropped} row(s) with missing values in {required_cols}")

# Encode crop names as integers
crop_encoder = LabelEncoder()
df["crop_encoded"] = crop_encoder.fit_transform(df["crop"])

# Encode the pest labels as integers
label_encoder = LabelEncoder()
df["label_encoded"] = label_encoder.fit_transform(df["label"])

# Show the class -> integer mapping learned by each encoder
print("Crop encoding:", dict(zip(crop_encoder.classes_, crop_encoder.transform(crop_encoder.classes_))))
print("\nLabel encoding:", dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_))))

# Show cleaned dataset head
df.head()


In [None]:
# Cell 4 — feature selection + train-test split

from sklearn.model_selection import train_test_split

# Model inputs: the four renamed measurement columns plus the integer-encoded crop
FEATURES = ["mq135", "temp", "hum", "soil", "crop_encoded"]

# Feature matrix and integer-encoded target
X, y = df[FEATURES], df["label_encoded"]

print("Feature matrix shape:", X.shape)
print("Label vector shape:", y.shape)

# Hold out 20% of the rows for testing. The split is stratified on y and uses a
# fixed seed so it is repeatable across runs.
split_options = dict(test_size=0.20, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

print("Train size:", X_train.shape, " Test size:", X_test.shape)


In [None]:
# Cell 5 — Train Random Forest model

from sklearn.ensemble import RandomForestClassifier

# Random-forest settings, gathered in one place so they are easy to tune
rf_params = {
    "n_estimators": 200,         # number of trees in the forest
    "max_depth": None,           # no depth limit on individual trees
    "random_state": 42,          # fixed seed for repeatable training
    "class_weight": "balanced",  # reweight classes to offset uneven label counts
}

# Build the classifier and fit it on the training split (fit returns the estimator)
rf = RandomForestClassifier(**rf_params).fit(X_train, y_train)

print("Model trained successfully!")


In [None]:
# Cell 6 — Evaluate model on test set (accuracy, report, confusion matrix + plot)

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

# Predict on the held-out test split
y_pred = rf.predict(X_test)

# Overall accuracy
acc = accuracy_score(y_test, y_pred)
print(f"Test Accuracy: {acc:.4f}\n")

# Human-readable labels.
# class_ids lists every encoded class the label encoder knows about; passing it as
# `labels=` below keeps the report rows and the confusion-matrix axes aligned with
# label_names even when a class is absent from both y_test and y_pred.
# Names are coerced to str because classification_report formats them as text.
class_ids = np.arange(len(label_encoder.classes_))
label_names = [str(name) for name in label_encoder.classes_]

print("Classification Report:\n")
print(
    classification_report(
        y_test,
        y_pred,
        labels=class_ids,
        target_names=label_names,
        zero_division=0,  # report 0 (without a warning) for classes with no samples/predictions
    )
)

# Confusion matrix (rows = true class, columns = predicted class)
cm = confusion_matrix(y_test, y_pred, labels=class_ids)
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    xticklabels=label_names,
    yticklabels=label_names,
    cmap="Blues",
    ax=ax,
)
ax.set_ylabel("True label")
ax.set_xlabel("Predicted label")
ax.set_title("Confusion Matrix")
plt.setp(ax.get_xticklabels(), rotation=45, ha="right")
plt.setp(ax.get_yticklabels(), rotation=0)
fig.tight_layout()
plt.show()

# Optional: show feature importances, largest first
feat_importances = pd.Series(rf.feature_importances_, index=FEATURES).sort_values(ascending=False)
print("\nFeature importances:")
print(feat_importances)


In [None]:
# Cell 7 — Save trained model to local folder

import joblib
import os

# Create model directory if it doesn't exist
os.makedirs("model", exist_ok=True)

MODEL_PATH = "model/pest_rf_model.pkl"
ENCODER_PATH = "model/label_encoder.pkl"
CROP_ENCODER_PATH = "model/crop_encoder.pkl"

# Save everything needed to use the model later:
#   * rf            — the trained classifier
#   * label_encoder — maps predicted integers back to the original label values
#   * crop_encoder  — maps crop names to the integers used for the "crop_encoded"
#                     feature; without it new inputs cannot be encoded the same way
joblib.dump(rf, MODEL_PATH)
joblib.dump(label_encoder, ENCODER_PATH)
joblib.dump(crop_encoder, CROP_ENCODER_PATH)

print("Model saved successfully!")
print("Model file:", MODEL_PATH)
print("Label encoder file:", ENCODER_PATH)
print("Crop encoder file:", CROP_ENCODER_PATH)
