In [1]:
# 02_model_training.py

import os
import sys
import pickle
import warnings

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score


In [2]:
# Make the project root importable so that the `src` package resolves.
# The location can be overridden through the CROP_PROJECT_ROOT environment
# variable; when it is unset, the original hard-coded path is used.
PROJECT_ROOT = os.environ.get(
    "CROP_PROJECT_ROOT",
    r"C:\Users\tusha\Downloads\Sustainable-Crop-Recommendation",
)

# Re-running this cell in the same kernel must not pile up duplicate entries.
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

from src import data_preprocessing, feature_engineering

# NOTE(review): this silences every warning category for the rest of the
# session, which also hides deprecation notices from the libraries in use.
warnings.filterwarnings("ignore")

In [3]:

# ======================
# Load & preprocess data
# ======================

def load_and_prepare_data(path: str):
    """Read a CSV and return a train/test split built with the project helpers.

    The frame is passed, in order, through
    ``data_preprocessing.handle_missing_values``,
    ``data_preprocessing.scale_features``,
    ``feature_engineering.add_soil_fertility_index`` and
    ``feature_engineering.add_drought_score``. The ``"label"`` column is then
    separated from the remaining columns and an 80/20 split is made with
    ``random_state=42``.

    NOTE: a later cell in this notebook defines another function with the
    same name; once that cell has run, the name refers to the later version.

    Args:
        path: Location of the CSV file handed to ``pd.read_csv``.

    Returns:
        The result of ``train_test_split(features, target, ...)``, i.e. the
        train/test features followed by the train/test labels.
    """
    raw_frame = pd.read_csv(path)

    # Cleaning stage.
    imputed = data_preprocessing.handle_missing_values(raw_frame)
    scaled = data_preprocessing.scale_features(imputed)

    # Derived-feature stage.
    with_fertility = feature_engineering.add_soil_fertility_index(scaled)
    enriched = feature_engineering.add_drought_score(with_fertility)

    # Everything except the target column is treated as a feature.
    features = enriched.drop(columns=["label"])
    target = enriched["label"]

    split = train_test_split(features, target, test_size=0.2, random_state=42)
    return split

In [4]:
# ======================
# Model training
# ======================
def train_models(X_train, y_train):
    """Fit three candidate classifiers and cross-validate each one.

    The candidates are a 100-tree random forest, a logistic regression
    (``max_iter=500``) and an RBF-kernel SVM with probability estimates
    enabled. Each one is fitted on the full training data and then scored
    with ``cross_val_score`` using ``cv=5`` and the estimator's default
    scoring.

    Args:
        X_train: Training features passed to ``fit`` and ``cross_val_score``.
        y_train: Training labels aligned with ``X_train``.

    Returns:
        dict: ``{name: {"model", "cv_score_mean", "cv_score_std"}}`` where
        ``"model"`` is the estimator instance fitted on all of ``X_train``.
    """
    candidates = {
        "RandomForest": RandomForestClassifier(n_estimators=100, random_state=42),
        "LogisticRegression": LogisticRegression(max_iter=500),
        "SVM": SVC(kernel="rbf", probability=True, random_state=42),
    }

    results = {}
    for label, estimator in candidates.items():
        print(f"\nTraining {label}...")

        # Fit first so the stored estimator is trained on the whole training set.
        estimator.fit(X_train, y_train)
        fold_scores = cross_val_score(estimator, X_train, y_train, cv=5)

        mean_score = np.mean(fold_scores)
        score_spread = np.std(fold_scores)
        results[label] = {
            "model": estimator,
            "cv_score_mean": mean_score,
            "cv_score_std": score_spread,
        }
        print(f"{label} CV Accuracy: {mean_score:.4f} ± {score_spread:.4f}")

    return results

In [5]:
# ======================
# Save best model
# ======================
def save_best_model(results, save_path="best_model.pkl"):
    """Pickle the model with the highest mean cross-validation score.

    Parameters
    ----------
    results : dict
        Maps a model name to a dict holding at least ``"model"`` (the object
        to persist) and ``"cv_score_mean"`` (the value used for ranking).
    save_path : str, default "best_model.pkl"
        Destination file. Missing parent directories are created.

    Returns
    -------
    str
        The key of ``results`` whose model was written to ``save_path``.

    Raises
    ------
    ValueError
        If ``results`` is empty, so there is nothing to rank or save.
    """
    # Fail with a clear message instead of the opaque "max() arg is an empty
    # sequence" error that an empty mapping would otherwise produce.
    if not results:
        raise ValueError("results is empty: no trained models to choose from")

    best_model_name = max(results, key=lambda name: results[name]["cv_score_mean"])
    best_model = results[best_model_name]["model"]

    # A bare filename has no directory component; only create one when given.
    parent_dir = os.path.dirname(save_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(save_path, "wb") as f:
        pickle.dump(best_model, f)

    print(f"\n✅ Best model: {best_model_name} saved at {save_path}")
    return best_model_name

In [6]:
def load_and_prepare_data(path: str):
    """Load the dataset, clean and encode it, and return a scaled train/test split.

    Processing steps, in order:

    1. Column names are stripped and lower-cased.
    2. The required columns are checked.
    3. Missing values are filled: median for numeric columns, most frequent
       value for object columns.
    4. Object-typed feature columns and the ``label`` target are
       integer-encoded with ``LabelEncoder``.
    5. The data is split 80/20 (``random_state=42``), stratified on the
       target when every class has at least two samples.
    6. Numeric features are standardised with a ``StandardScaler`` fitted on
       the training split only, so that no test-set statistics leak into
       training.

    Parameters
    ----------
    path : str
        Location of the CSV file handed to ``pd.read_csv``.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``. The feature parts are
        DataFrames; the label parts are the integer-encoded targets.

    Raises
    ------
    KeyError
        If any of the required columns is absent after name normalisation.

    Notes
    -----
    The fitted encoders and scaler live only inside this call and are not
    returned.
    """
    df = pd.read_csv(path)

    # ✅ Normalize column names (lowercase -> consistent)
    df.columns = [col.strip().lower() for col in df.columns]

    # Expected columns in dataset
    expected = ["n", "p", "k", "temperature", "humidity", "ph", "rainfall", "label"]
    missing = [col for col in expected if col not in df.columns]
    if missing:
        raise KeyError(f"❌ Missing columns in dataset: {missing}")

    # Kept function-local, as in the original cell, so the definition does not
    # depend on which notebook cells have been executed.
    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import LabelEncoder, StandardScaler

    # Handle missing values. Assign the result back rather than calling
    # fillna(inplace=True) on df[col]: that chained in-place form operates on
    # a temporary object and does not update df under pandas Copy-on-Write.
    for col in df.select_dtypes(include="number").columns:
        df[col] = df[col].fillna(df[col].median())
    for col in df.select_dtypes(include="object").columns:
        df[col] = df[col].fillna(df[col].mode()[0])

    # Split features + labels
    X = df.drop("label", axis=1)
    y = df["label"]

    # Encode categorical features if any
    for col in X.select_dtypes(include=["object"]).columns:
        X[col] = LabelEncoder().fit_transform(X[col])

    # Encode target
    y = LabelEncoder().fit_transform(y)

    # Train/test split; stratification needs at least two samples per class.
    min_class_count = pd.Series(y).value_counts().min()
    stratify = y if min_class_count >= 2 else None
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=stratify
    )

    # Scale numeric features AFTER splitting: the scaler learns its mean and
    # variance from the training rows only and is then applied to both parts.
    # Work on copies so the column assignment never targets a view of X.
    X_train = X_train.copy()
    X_test = X_test.copy()
    numeric_cols = X_train.select_dtypes(include="number").columns
    scaler = StandardScaler()
    X_train[numeric_cols] = scaler.fit_transform(X_train[numeric_cols])
    X_test[numeric_cols] = scaler.transform(X_test[numeric_cols])

    return X_train, X_test, y_train, y_test


In [7]:
# Diagnostic cell: list the names exposed by the imported data_preprocessing
# module, then show the file it was loaded from.
import src.data_preprocessing as dp

for detail in (dir(dp), dp.__file__):
    print(detail)


['BASE_DIR', 'PROCESSED_DIR', 'RAW_DIR', 'StandardScaler', '__builtins__', '__cached__', '__doc__', '__file__', '__loader__', '__name__', '__package__', '__spec__', 'handle_missing_values', 'load_and_prepare_data', 'load_data', 'os', 'pd', 'preprocess_and_merge', 'scale_features']
C:\Users\tusha\Downloads\Sustainable-Crop-Recommendation\src\data_preprocessing.py
