In [35]:
# evaluation_results.py
import os
import joblib
import logging
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, confusion_matrix, roc_curve, auc
)
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC


In [36]:
# -----------------------------
# Paths (works in script & Jupyter)
# -----------------------------
try:
    # Case: running as a .py script
    BASE_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
except NameError:
    # Case: running inside Jupyter Notebook
    BASE_DIR = os.path.abspath(os.path.join(os.getcwd(), ".."))

DATA_PATH = os.path.join(BASE_DIR, "data", "processed", "Crop_recommendation_clean.csv")
RAW_PATH = os.path.join(BASE_DIR, "data", "raw", "Crop_recommendation.csv")
MODEL_DIR = os.path.join(BASE_DIR, "outputs", "models")
BEST_MODEL_PATH = os.path.join(MODEL_DIR, "best_model.pkl")
PLOTS_DIR = os.path.join(BASE_DIR, "outputs", "reports")

os.makedirs(MODEL_DIR, exist_ok=True)
os.makedirs(PLOTS_DIR, exist_ok=True)


In [37]:
# -----------------------------
# Logging setup
# -----------------------------
logging.basicConfig(
    filename=os.path.join(BASE_DIR, "evaluation.log"),
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
)
logger = logging.getLogger(__name__)

In [43]:
# -----------------------------
# Load and Preprocess Data
# -----------------------------
from sklearn.preprocessing import LabelEncoder

from sklearn.impute import SimpleImputer

def load_data():
    """Load processed dataset or generate from raw if missing, with preprocessing."""
    if not os.path.exists(DATA_PATH):
        if not os.path.exists(RAW_PATH):
            raise FileNotFoundError("❌ Neither processed nor raw data found!")
        logger.warning("⚠️ Processed data not found. Using raw data instead.")
        df = pd.read_csv(RAW_PATH)
        df.to_csv(DATA_PATH, index=False)
    else:
        df = pd.read_csv(DATA_PATH)

    # Separate features and target
    if "label" not in df.columns:
        raise KeyError("❌ Target column 'label' not found in dataset.")
    X = df.drop("label", axis=1)
    y = df["label"]

    # Handle categorical features
    for col in X.columns:
        if X[col].dtype == "object":  # categorical column
            le = LabelEncoder()
            X[col] = le.fit_transform(X[col].astype(str))
            logger.info(f"Encoded column: {col}")

    # Impute missing numeric values
    imputer = SimpleImputer(strategy="mean")  # can also use median
    X = pd.DataFrame(imputer.fit_transform(X), columns=X.columns)
    logger.info("✅ Missing values imputed with mean")

    # Encode target if categorical
    if y.dtype == "object":
        le_y = LabelEncoder()
        y = le_y.fit_transform(y.astype(str))
        logger.info("Encoded target column 'label'")

    return train_test_split(X, y, test_size=0.2, random_state=42)



In [39]:
def train_and_select_model(X_train, y_train, X_test, y_test):
    """Train multiple models and save the best one."""
    candidates = {
        "RandomForest": RandomForestClassifier(n_estimators=200, random_state=42),
        "SVM": SVC(kernel="rbf", probability=True, random_state=42)
    }

    best_model, best_score, best_name = None, 0, None

    for name, model in candidates.items():
        model.fit(X_train, y_train)
        preds = model.predict(X_test)
        acc = accuracy_score(y_test, preds)
        logger.info(f"{name} accuracy: {acc:.4f}")

        if acc > best_score:
            best_score, best_model, best_name = acc, model, name

    joblib.dump(best_model, BEST_MODEL_PATH)
    logger.info(f"✅ Saved best model: {best_name} with accuracy {best_score:.4f}")
    return best_model

In [40]:
def load_or_train_model(X_train, y_train, X_test, y_test):
    """Load model if exists, else train a new one."""
    if os.path.exists(BEST_MODEL_PATH):
        logger.info("📂 Loading existing best model...")
        return joblib.load(BEST_MODEL_PATH)
    else:
        logger.info("⚡ No pre-trained model found. Training new one...")
        return train_and_select_model(X_train, y_train, X_test, y_test)

In [41]:
def evaluate_model(model, X_test, y_test):
    """Evaluate model performance with metrics and plots."""
    y_pred = model.predict(X_test)
    y_prob = model.predict_proba(X_test) if hasattr(model, "predict_proba") else None

    metrics = {
        "accuracy": accuracy_score(y_test, y_pred),
        "precision": precision_score(y_test, y_pred, average="weighted"),
        "recall": recall_score(y_test, y_pred, average="weighted"),
        "f1": f1_score(y_test, y_pred, average="weighted")
    }

    # Save confusion matrix
    cm = confusion_matrix(y_test, y_pred)
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues")
    plt.title("Confusion Matrix")
    plt.xlabel("Predicted")
    plt.ylabel("True")
    cm_path = os.path.join(PLOTS_DIR, "confusion_matrix.png")
    plt.savefig(cm_path)
    plt.close()

    # Save ROC curve (only if binary classification)
    if y_prob is not None and len(set(y_test)) == 2:
        fpr, tpr, _ = roc_curve(y_test, y_prob[:, 1])
        roc_auc = auc(fpr, tpr)

        plt.figure(figsize=(8, 6))
        plt.plot(fpr, tpr, label=f"AUC = {roc_auc:.2f}")
        plt.plot([0, 1], [0, 1], "r--")
        plt.xlabel("False Positive Rate")
        plt.ylabel("True Positive Rate")
        plt.title("ROC Curve")
        plt.legend(loc="lower right")
        roc_path = os.path.join(PLOTS_DIR, "roc_curve.png")
        plt.savefig(roc_path)
        plt.close()

    return metrics

In [44]:
# -----------------------------
# Main Execution
# -----------------------------
if __name__ == "__main__":
    print("🚀 Starting evaluation pipeline...")

    # Load data
    X_train, X_test, y_train, y_test = load_data()

    # Load or train model
    model = load_or_train_model(X_train, y_train, X_test, y_test)

    # Evaluate
    metrics = evaluate_model(model, X_test, y_test)

    print("\n📊 Evaluation Metrics:")
    for k, v in metrics.items():
        print(f"{k.capitalize()}: {v:.4f}")

    print("\n✅ Evaluation completed. Check 'outputs/reports/' and 'evaluation.log'.")

🚀 Starting evaluation pipeline...


  df = pd.read_csv(DATA_PATH)



📊 Evaluation Metrics:
Accuracy: 0.9999
Precision: 0.9999
Recall: 0.9999
F1: 0.9999

✅ Evaluation completed. Check 'outputs/reports/' and 'evaluation.log'.


In [None]:
# ===========================
# Sustainable Crop Recommendation - Robust Notebook
# ===========================

# 1️⃣ Imports
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import joblib
import warnings
warnings.filterwarnings("ignore")

# 2️⃣ Paths
RAW_PATH = r"C:\Users\tusha\Downloads\Sustainable-Crop-Recommendation\data\raw"
OUTPUT_PATH = r"C:\Users\tusha\Downloads\Sustainable-Crop-Recommendation\outputs\models"
os.makedirs(OUTPUT_PATH, exist_ok=True)

# 3️⃣ Load all CSV files safely
files = [f for f in os.listdir(RAW_PATH) if f.endswith(".csv")]
if not files:
    raise FileNotFoundError("No CSV files found in raw folder!")

dfs = []
for f in files:
    try:
        df = pd.read_csv(os.path.join(RAW_PATH, f))
        dfs.append(df)
        print(f"✅ Loaded {f} with shape {df.shape}")
    except Exception as e:
        print(f"⚠️ Could not load {f}: {e}")

# Merge datasets safely
try:
    combined_df = pd.concat(dfs, axis=1)
    combined_df = combined_df.loc[:, ~combined_df.columns.duplicated()]
    print(f"✅ Combined dataframe shape: {combined_df.shape}")
except Exception as e:
    raise RuntimeError(f"Error combining datasets: {e}")

# 4️⃣ Preprocessing with error handling
try:
    # Handle missing values
    for col in combined_df.columns:
        if combined_df[col].dtype in ['int64', 'float64']:
            combined_df[col].fillna(combined_df[col].median(), inplace=True)
        else:
            combined_df[col].fillna(combined_df[col].mode()[0], inplace=True)

    # Detect target column automatically (last column assumed)
    target_col = combined_df.columns[-1]
    if combined_df[target_col].isnull().all():
        raise ValueError("Target column is completely empty!")

    y = combined_df[target_col]
    if y.dtype == 'object':
        y = LabelEncoder().fit_transform(y)

    # Features
    X = combined_df.drop(columns=[target_col])

    # Encode categorical features
    cat_cols = X.select_dtypes(include='object').columns.tolist()
    for col in cat_cols:
        try:
            X[col] = LabelEncoder().fit_transform(X[col])
        except Exception as e:
            print(f"⚠️ Could not encode {col}: {e}")

    # Scale features
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    print("✅ Preprocessing complete")
except Exception as e:
    raise RuntimeError(f"Preprocessing error: {e}")

# 5️⃣ Train-Test Split
try:
    X_train, X_test, y_train, y_test = train_test_split(
        X_scaled, y, test_size=0.2, random_state=42, stratify=y
    )
except Exception as e:
    raise RuntimeError(f"Error in train-test split: {e}")

# 6️⃣ Model Training & Evaluation
models = {
    "RandomForest": RandomForestClassifier(n_estimators=200, random_state=42),
    "SVM": SVC(kernel='rbf', probability=True, random_state=42)
}

best_acc = 0
best_model_name = None
best_model = None

for name, model in models.items():
    try:
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        acc = accuracy_score(y_test, y_pred)
        print(f"--- {name} ---")
        print("Accuracy:", acc)
        print("Classification Report:\n", classification_report(y_test, y_pred))
        print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
        print("\n")

        if acc > best_acc:
            best_acc = acc
            best_model_name = name
            best_model = model
    except Exception as e:
        print(f"⚠️ Error training {name}: {e}")

if best_model is None:
    raise RuntimeError("No model trained successfully!")
print(f"✅ Best Model: {best_model_name} with accuracy {best_acc:.4f}")

# Save model and scaler safely
try:
    joblib.dump(best_model, os.path.join(OUTPUT_PATH, "best_model.pkl"))
    joblib.dump(scaler, os.path.join(OUTPUT_PATH, "scaler.pkl"))
    print("✅ Model and scaler saved")
except Exception as e:
    print(f"⚠️ Could not save model/scaler: {e}")

# 7️⃣ Production Prediction Function with error handling
def predict_new_data(new_data_df):
    try:
        # Fill missing values same as training
        for col in new_data_df.columns:
            if new_data_df[col].dtype in ['int64', 'float64']:
                if col in combined_df.columns:
                    new_data_df[col].fillna(combined_df[col].median(), inplace=True)
                else:
                    new_data_df[col].fillna(new_data_df[col].median(), inplace=True)
            else:
                if col in combined_df.columns:
                    new_data_df[col].fillna(combined_df[col].mode()[0], inplace=True)
                else:
                    new_data_df[col].fillna(new_data_df[col].mode()[0], inplace=True)

        # Encode categorical
        for col in new_data_df.select_dtypes(include='object').columns:
            if col in cat_cols:
                le = LabelEncoder()
                le.fit(combined_df[col])
                new_data_df[col] = le.transform(new_data_df[col])
            else:
                new_data_df[col] = new_data_df[col].astype('category').cat.codes

        # Scale
        new_scaled = scaler.transform(new_data_df)

        # Predict
        preds = best_model.predict(new_scaled)
        return preds
    except Exception as e:
        print(f"⚠️ Prediction error: {e}")
        return None

# 8️⃣ Optional Feature Importance for Random Forest
try:
    if best_model_name == "RandomForest":
        import matplotlib.pyplot as plt
        import seaborn as sns

        feat_imp = pd.Series(best_model.feature_importances_, index=X.columns).sort_values(ascending=False)
        plt.figure(figsize=(10,6))
        sns.barplot(x=feat_imp.values, y=feat_imp.index)
        plt.title("Feature Importance - Random Forest")
        plt.show()
except Exception as e:
    print(f"⚠️ Feature importance error: {e}")


✅ Loaded Crop_recommendation.csv with shape (2200, 8)
✅ Loaded data_core.csv with shape (8000, 9)
✅ Loaded weatherHistory.csv with shape (96453, 12)
✅ Combined dataframe shape: (96453, 25)
✅ Preprocessing complete
