# src/data_preprocessing

In [17]:
# src/data_preprocessing

import os
import pandas as pd
from sklearn.preprocessing import StandardScaler


In [18]:


# Anchor every data path to the directory the notebook kernel was started in.
BASE_DIR = os.getcwd()
RAW_DIR, PROCESSED_DIR = (
    os.path.join(BASE_DIR, subdir) for subdir in ("data/raw", "data/processed")
)

# Only the output folder is created here; RAW_DIR is not.
os.makedirs(PROCESSED_DIR, exist_ok=True)

print(
    f"Base directory: {BASE_DIR}",
    f"Raw data dir: {RAW_DIR}",
    f"Processed data dir: {PROCESSED_DIR}",
    sep="\n",
)


Base directory: C:\Users\tusha
Raw data dir: C:\Users\tusha\data/raw
Processed data dir: C:\Users\tusha\data/processed


In [3]:
# -----------------------
# Basic preprocessing utils
# -----------------------
def handle_missing_values(df: pd.DataFrame) -> pd.DataFrame:
    """Impute missing values in place and return the same frame.

    Numeric columns are filled with their median and ``object`` columns with
    their most frequent value. A column with no observed values has nothing
    to impute from and is left untouched.

    Args:
        df: Frame to clean. It is modified in place.

    Returns:
        The same ``df`` object, to allow call chaining.
    """
    for col in df.select_dtypes(include="number").columns:
        # Assign the result back instead of ``df[col].fillna(..., inplace=True)``:
        # the chained in-place form works on a temporary under pandas
        # Copy-on-Write and would leave ``df`` unchanged.
        df[col] = df[col].fillna(df[col].median())

    for col in df.select_dtypes(include="object").columns:
        modes = df[col].mode()
        if modes.empty:
            # Every value is missing, so ``mode()[0]`` would raise.
            continue
        df[col] = df[col].fillna(modes.iloc[0])

    return df


def scale_features(df: pd.DataFrame) -> pd.DataFrame:
    """Standardise the numeric columns of ``df`` in place with StandardScaler.

    Non-numeric columns are left as they are. When the frame has no rows or
    no numeric columns there is nothing to scale and it is returned unchanged.

    Args:
        df: Frame whose numeric columns are overwritten with scaled values.

    Returns:
        The same ``df`` object.
    """
    numeric_cols = df.select_dtypes(include="number").columns
    if df.empty or numeric_cols.empty:
        # StandardScaler rejects input with zero samples or zero features.
        return df

    scaler = StandardScaler()
    df[numeric_cols] = scaler.fit_transform(df[numeric_cols])
    return df


# -----------------------
# Main workflow
# -----------------------
def load_data():
    """Read the crop, soil and weather CSV files from ``RAW_DIR``.

    Returns:
        tuple: ``(crop_df, soil_df, weather_df)`` read from
        ``Crop_recommendation.csv``, ``data_core.csv`` and
        ``weatherHistory.csv`` respectively.

    Raises:
        FileNotFoundError: If any of the three files is missing; the message
            embeds the underlying error text.
    """
    raw_files = ("Crop_recommendation.csv", "data_core.csv", "weatherHistory.csv")
    try:
        # The generator is consumed left to right, so files are read in order
        # and the first missing one aborts the load.
        crop_df, soil_df, weather_df = (
            pd.read_csv(os.path.join(RAW_DIR, filename)) for filename in raw_files
        )
    except FileNotFoundError as e:
        raise FileNotFoundError(f"❌ Missing file! Check RAW folder. Details: {e}")

    print("✅ Datasets loaded successfully!")
    return crop_df, soil_df, weather_df


In [4]:

def _shared_columns(left: pd.DataFrame, right: pd.DataFrame) -> list:
    """Return the column names present in both frames, in ``left``'s column order."""
    right_names = set(right.columns)
    return [name for name in dict.fromkeys(left.columns) if name in right_names]


def preprocess_and_merge():
    """Merge crop, soil, and weather data into a final dataset and save it.

    Steps:
        1. Load the three raw frames with ``load_data()``.
        2. Inner-join crop and soil on every column name they share; when
           they share none, place them side by side with ``pd.concat``.
        3. Left-join weather on the shared column names, or skip weather
           when there are none.
        4. Forward-fill, then back-fill, the remaining gaps.
        5. Write the result to ``final_dataset.csv`` inside ``PROCESSED_DIR``.

    Returns:
        pd.DataFrame: The merged frame that was written to disk.
    """
    crop_df, soil_df, weather_df = load_data()

    # Merge Crop + Soil
    # Keys are taken in the left frame's column order. A set intersection of
    # strings has no stable order, which made the key list (and the log line
    # below) differ from run to run.
    common_cols = _shared_columns(crop_df, soil_df)
    if common_cols:
        crop_soil_df = pd.merge(crop_df, soil_df, on=common_cols, how="inner")
        print(f"🔗 Crop + Soil merged on columns: {common_cols}")
    else:
        # NOTE(review): concat(axis=1) aligns on the row index, so unrelated
        # rows end up side by side and the shorter frame is padded with NaN.
        # The ffill/bfill below then copies neighbouring values into that
        # padding. Confirm this is the intended fallback.
        crop_soil_df = pd.concat([crop_df, soil_df], axis=1)
        print("⚠️ No common keys between Crop and Soil. Using concatenation.")

    # Merge with Weather
    common_weather_cols = _shared_columns(crop_soil_df, weather_df)
    if common_weather_cols:
        final_df = pd.merge(crop_soil_df, weather_df, on=common_weather_cols, how="left")
        print(f"🌦️ Crop+Soil + Weather merged on: {common_weather_cols}")
    else:
        final_df = crop_soil_df
        print("⚠️ No common keys with Weather dataset. Skipping merge.")

    # Handle missing values: propagate the previous row's value forward, then
    # fill any leading gaps from the next row.
    final_df = final_df.ffill().bfill().infer_objects(copy=False)

    # Save
    processed_file = os.path.join(PROCESSED_DIR, "final_dataset.csv")
    final_df.to_csv(processed_file, index=False)
    print(f"✅ Final dataset saved at: {processed_file}")

    return final_df



In [22]:
import pandas as pd
import os

def handle_missing_values(df):
    """Fill missing values with median (numeric) or mode (categorical).

    Works in place and returns the same frame. Any numeric dtype (not only
    ``float64``/``int64``) is imputed with the median. Columns without
    missing values are skipped, and a column with no observed values is left
    untouched because it has no mode to impute from.

    Args:
        df (pd.DataFrame): Frame to clean; modified in place.

    Returns:
        pd.DataFrame: The same ``df`` object.
    """
    for col in df.columns:
        column = df[col]
        if not column.isna().any():
            continue

        if pd.api.types.is_numeric_dtype(column):
            fill_value = column.median()
        else:
            modes = column.mode()
            if modes.empty:
                # Every value is missing, so ``mode()[0]`` would raise.
                continue
            fill_value = modes.iloc[0]

        df[col] = column.fillna(fill_value)
    return df

def scale_features(df):
    """Scale numeric features between 0 and 1.

    Every numeric column (any integer or float width) is rescaled in place
    with ``MinMaxScaler``; other columns are left as they are. A frame with
    no rows or no numeric columns is returned unchanged.

    Args:
        df (pd.DataFrame): Frame whose numeric columns are overwritten.

    Returns:
        pd.DataFrame: The same ``df`` object.
    """
    numeric_cols = df.select_dtypes(include="number").columns
    if df.empty or numeric_cols.empty:
        # MinMaxScaler rejects input with zero samples or zero features.
        return df

    # Imported inside the function, as the original cell did, so this cell
    # does not depend on an import made elsewhere in the notebook.
    from sklearn.preprocessing import MinMaxScaler

    df[numeric_cols] = MinMaxScaler().fit_transform(df[numeric_cols])
    return df

def load_and_prepare_data(path: str):
    """Load final dataset, clean it, scale features, and split X/y.

    Args:
        path: Location of the CSV file to read.

    Returns:
        tuple: ``(X, y)`` where ``X`` holds every column except ``label``
        (missing values imputed, numeric columns scaled) and ``y`` is the
        imputed but unscaled ``label`` column.

    Raises:
        KeyError: If the CSV has no ``label`` column.
    """
    df = pd.read_csv(path, low_memory=False)

    # Validate before doing any work so a missing target fails fast.
    if "label" not in df.columns:
        raise KeyError(" 'label' column not found in dataset! Make sure preprocessing includes target labels.")

    df = handle_missing_values(df)

    y = df["label"]
    # Scale the features only: a numeric (already encoded) label must not be
    # rescaled together with them.
    X = scale_features(df.drop("label", axis=1))

    return X, y


# -----------------------
# Debug Run
# -----------------------
# Smoke test for load_and_prepare_data: load one CSV and print a preview.
if __name__ == "__main__":
    # NOTE(review): absolute, machine-specific path. The file name also differs
    # from the "final_dataset.csv" written by preprocess_and_merge(); confirm
    # which file is meant to be loaded here.
    processed_path = r"C:\Users\tusha\Downloads\Sustainable-Crop-Recommendation\data\processed\Crop_recommendation_clean.csv"
    
    # X and y become notebook-level names once this cell has run.
    X, y = load_and_prepare_data(processed_path)

    print("\n Preview of processed dataset:")
    print(X.head())
    print(f"\n Features shape: {X.shape}, Labels shape: {y.shape}")



 Preview of processed dataset:
          N         P         K  temperature  humidity        ph  rainfall  \
0  0.642857  0.289655  0.209756     0.345886  0.790267  0.466264  0.656458   
1  0.607143  0.400000  0.200000     0.371445  0.770633  0.549480  0.741675   
2  0.428571  0.379310  0.214634     0.406854  0.793977  0.674219  0.875710   
3  0.528571  0.241379  0.195122     0.506901  0.768751  0.540508  0.799905   
4  0.557143  0.289655  0.204878     0.324378  0.785626  0.641291  0.871231   

   Temparature  Humidity  Moisture  ...        Summary Precip Type  \
0        0.512  0.010125     0.445  ...  Partly Cloudy        rain   
1        0.512  0.010125     0.445  ...  Partly Cloudy        rain   
2        0.512  0.010125     0.445  ...  Partly Cloudy        rain   
3        0.512  0.010125     0.445  ...  Partly Cloudy        rain   
4        0.512  0.010125     0.445  ...  Partly Cloudy        rain   

  Temperature (C) Apparent Temperature (C) Wind Speed (km/h)  \
0        0.547

# src/model_training

In [6]:
import os
import pandas as pd
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report


In [7]:
# -----------------------------
# Train a Single Model
# -----------------------------
def train_model(X_train, y_train, model_type="RandomForest", random_state=42):
    """
    Build the requested classifier and fit it on the training data.

    Args:
        X_train (pd.DataFrame or np.ndarray): Training features
        y_train (pd.Series or np.ndarray): Training labels
        model_type (str): One of "RandomForest", "SVM" or "DecisionTree"
        random_state (int): Seed passed to the estimator

    Returns:
        sklearn model: The fitted estimator

    Raises:
        ValueError: If ``model_type`` is not one of the supported names
    """
    # Each entry pairs a supported name with a zero-argument factory, so only
    # the estimator that was asked for is ever constructed.
    builders = (
        (
            "RandomForest",
            lambda: RandomForestClassifier(n_estimators=100, random_state=random_state),
        ),
        (
            "SVM",
            # Use moderate C to avoid long training on large datasets
            lambda: SVC(kernel="rbf", C=10, probability=True, random_state=random_state),
        ),
        (
            "DecisionTree",
            lambda: DecisionTreeClassifier(random_state=random_state),
        ),
    )

    estimator = None
    for supported_name, build in builders:
        if model_type == supported_name:
            estimator = build()
            break

    if estimator is None:
        raise ValueError(f" Unsupported model_type: {model_type}")

    estimator.fit(X_train, y_train)
    return estimator

In [8]:

# -----------------------------
# Evaluate Model
# -----------------------------
def evaluate_model(model, X_test, y_test):
    """
    Score a fitted model against held-out data.

    Args:
        model: Fitted estimator exposing ``predict``
        X_test (pd.DataFrame): Test features
        y_test (pd.Series): True labels

    Returns:
        dict: ``{"accuracy": float, "report": str}`` where ``report`` is the
        text produced by ``classification_report``
    """
    predictions = model.predict(X_test)
    return {
        "accuracy": accuracy_score(y_test, predictions),
        "report": classification_report(y_test, predictions),
    }

In [None]:
# -----------------------------
# Full Pipeline Training (in Jupyter Notebook)
# -----------------------------
import os
import pandas as pd
import joblib
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# ✅ Use absolute path for Jupyter
# This is the dataset passed to train_models(csv_path) at the end of this cell.
# NOTE(review): the path is specific to one Windows user profile; change it
# before running the notebook on another machine.
csv_path = r"C:\Users\tusha\Downloads\Sustainable-Crop-Recommendation\data\processed\Crop_recommendation_clean.csv"

# -----------------------------
# Helper: Train Model
# -----------------------------
def train_model(X_train, y_train, model_name, random_state=42):
    """Build the classifier named ``model_name`` and fit it.

    Args:
        X_train: Training features.
        y_train: Training labels.
        model_name (str): "RandomForest", "SVM" or "DecisionTree".
        random_state (int): Seed passed to the estimator. Defaults to 42, the
            value that was previously hard-coded, so existing calls behave
            the same.

    Returns:
        The fitted estimator.

    Raises:
        ValueError: If ``model_name`` is not one of the supported names.
    """
    if model_name == "RandomForest":
        model = RandomForestClassifier(n_estimators=100, random_state=random_state)
    elif model_name == "SVM":
        # probability=True so the fitted SVC also exposes predict_proba.
        model = SVC(kernel="rbf", probability=True, random_state=random_state)
    elif model_name == "DecisionTree":
        model = DecisionTreeClassifier(random_state=random_state)
    else:
        raise ValueError(f"❌ Unknown model: {model_name}")

    model.fit(X_train, y_train)
    return model

# -----------------------------
# Helper: Evaluate Model
# -----------------------------
def evaluate_model(model, X_test, y_test, label_encoder, model_name):
    """Print and return accuracy, classification report and confusion matrix.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_test: Test features.
        y_test: True labels. Assumed to be the integer codes produced by
            ``label_encoder`` (as ``train_models`` passes them) - TODO confirm
            for any other caller.
        label_encoder: Fitted encoder whose ``classes_`` supply the class
            names shown in the report.
        model_name (str): Name used in the printed header.

    Returns:
        dict: ``{"accuracy": float, "report": str, "confusion_matrix": array}``
    """
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)

    class_names = label_encoder.classes_
    # Passing the full list of class codes keeps the report and the matrix
    # aligned with ``class_names`` even when a class is absent from the test
    # split; without it scikit-learn rejects the mismatched ``target_names``.
    class_ids = list(range(len(class_names)))
    report = classification_report(
        y_test,
        y_pred,
        labels=class_ids,
        target_names=class_names,
        zero_division=0,
    )
    cm = confusion_matrix(y_test, y_pred, labels=class_ids)

    print(f"\n📊 Evaluation for {model_name}:")
    print(f"✅ Accuracy: {acc:.4f}")
    print("Classification Report:\n", report)
    print("Confusion Matrix:\n", cm)

    return {
        "accuracy": acc,
        "report": report,
        "confusion_matrix": cm
    }

# -----------------------------
# Training Pipeline
# -----------------------------
def train_models(csv_path):
    """Train, evaluate and persist RandomForest, SVM and DecisionTree models.

    Pipeline:
        1. Read ``csv_path``; on any load error print a message and return ``{}``.
        2. Drop ``Unnamed: N`` columns, impute missing values, split X / y.
        3. Label-encode categorical features and the target, scale all
           numeric features with ``StandardScaler``.
        4. Save the encoders (``encoders.pkl``) and the scaler (``scaler.pkl``)
           under ``outputs/models`` in the current working directory.
        5. Make an 80/20 split (stratified when every class has >= 2 rows) and
           save it to ``data/splits/train.csv`` and ``test.csv``.
        6. Fit, evaluate and dump each model, then write
           ``outputs/reports/accuracy_report.txt``.

    Relies on ``train_model`` and on an ``evaluate_model`` that returns a dict
    with ``"accuracy"`` and ``"report"`` keys.

    Args:
        csv_path (str): Path of the dataset; must contain a ``label`` column.

    Returns:
        dict: Model name -> metrics dict returned by ``evaluate_model``;
        empty when the file could not be loaded.

    Raises:
        KeyError: If the dataset has no ``label`` column.
    """
    try:
        df = pd.read_csv(csv_path, low_memory=False)
        print("✅ File loaded successfully!")
        print("Shape of dataset:", df.shape)
        display(df.head())
    except FileNotFoundError:
        print("❌ File not found at:", csv_path)
        return {}
    except Exception as e:
        print("⚠️ Error while loading file:", e)
        return {}

    # pandas names header-less CSV columns "Unnamed: N"; these are row indexes
    # written by an earlier ``to_csv`` and must not be used as features.
    index_artifacts = [c for c in df.columns if str(c).startswith("Unnamed:")]
    if index_artifacts:
        df = df.drop(columns=index_artifacts)
        print(f"🧹 Dropped index column(s): {index_artifacts}")

    # -----------------------------
    # Handle missing values
    # -----------------------------
    for col in df.select_dtypes(include="number").columns:
        df[col] = df[col].fillna(df[col].median())
    for col in df.select_dtypes(include="object").columns:
        modes = df[col].mode()
        if modes.empty:
            # Column has no observed value, so there is no mode to fill with.
            continue
        df[col] = df[col].fillna(modes.iloc[0])

    # Separate features and target
    if "label" not in df.columns:
        raise KeyError("❌ CSV must contain a 'label' column as target.")
    X = df.drop("label", axis=1)
    y = df["label"]

    # NOTE(review): the encoders and the scaler below are fitted on the full
    # dataset before the train/test split, so test-set statistics influence
    # the transforms. Consider fitting them on the training part only.

    # Encode categorical features
    encoders = {}
    for col in X.select_dtypes(include=["object"]).columns:
        le = LabelEncoder()
        X[col] = le.fit_transform(X[col])
        encoders[col] = le

    # Encode target
    label_encoder = LabelEncoder()
    y = label_encoder.fit_transform(y)
    encoders["label"] = label_encoder

    # Scale numeric features
    numeric_cols = X.select_dtypes(include="number").columns
    scaler = StandardScaler()
    X[numeric_cols] = scaler.fit_transform(X[numeric_cols])

    # Save encoders, plus the fitted scaler: without it the scaling applied
    # here cannot be reproduced on new inputs at prediction time.
    models_dir = os.path.join(os.getcwd(), "outputs/models")
    os.makedirs(models_dir, exist_ok=True)
    joblib.dump(encoders, os.path.join(models_dir, "encoders.pkl"))
    joblib.dump(scaler, os.path.join(models_dir, "scaler.pkl"))
    print("💾 Encoders saved.")

    # Stratified split
    min_class_count = pd.Series(y).value_counts().min()
    stratify = y if min_class_count >= 2 else None
    if stratify is None:
        print("⚠️ Some classes <2 samples. Disabling stratification.")

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=stratify
    )

    # Save splits
    # ``assign`` attaches the labels by position. Concatenating a fresh
    # ``pd.Series(y_train)`` instead would align its 0..n-1 index against the
    # shuffled row index of ``X_train`` and pair rows with the wrong labels.
    splits_dir = os.path.join(os.getcwd(), "data/splits")
    os.makedirs(splits_dir, exist_ok=True)
    X_train.assign(label=y_train).to_csv(
        os.path.join(splits_dir, "train.csv"), index=False
    )
    X_test.assign(label=y_test).to_csv(
        os.path.join(splits_dir, "test.csv"), index=False
    )
    print(f"💾 Train/Test splits saved in {splits_dir}")

    # -----------------------------
    # Train & Evaluate Multiple Models
    # -----------------------------
    # (model name, file name of the dumped model, prefix of the summary line)
    model_specs = (
        ("RandomForest", "random_forest.pkl", "🌳 Random Forest"),
        ("SVM", "svm_model.pkl", "📈 SVM"),
        ("DecisionTree", "decision_tree.pkl", "🌿 Decision Tree"),
    )

    results = {}
    for model_name, filename, banner in model_specs:
        fitted = train_model(X_train, y_train, model_name)
        metrics = evaluate_model(fitted, X_test, y_test, label_encoder, model_name)
        results[model_name] = metrics
        joblib.dump(fitted, os.path.join(models_dir, filename))
        print(f"{banner} Accuracy: {metrics['accuracy']:.4f}")

    # Save evaluation report
    reports_dir = os.path.join(os.getcwd(), "outputs/reports")
    os.makedirs(reports_dir, exist_ok=True)
    report_path = os.path.join(reports_dir, "accuracy_report.txt")
    # Explicit encoding so class names outside the platform default code page
    # cannot make the write fail.
    with open(report_path, "w", encoding="utf-8") as f:
        for name, metrics in results.items():
            f.write(f"{name}: {metrics['accuracy']:.4f}\n")
        f.write("\nClassification Report (Random Forest):\n")
        f.write(results["RandomForest"]["report"])
    print(f"📊 Accuracy report saved at {report_path}")

    return results

# -----------------------------
# Run in Notebook
# -----------------------------
# Run the whole pipeline on csv_path; `results` maps model name -> metrics dict.
results = train_models(csv_path)
print("\n✅ Training complete. Results:")
# train_models returns {} when the CSV could not be loaded, in which case
# this loop prints nothing.
for model, metrics in results.items():
    print(f"{model}: {metrics['accuracy']:.4f}")



✅ File loaded successfully!
Shape of dataset: (106653, 25)


Unnamed: 0,N,P,K,temperature,humidity,ph,rainfall,label,Temparature,Humidity,...,Summary,Precip Type,Temperature (C),Apparent Temperature (C),Wind Speed (km/h),Wind Bearing (degrees),Visibility (km),Loud Cover,Pressure (millibars),Daily Summary
0,90.0,42.0,43.0,20.879744,82.002744,6.502985,202.935536,rice,,,...,,,,,,,,,,
1,85.0,58.0,41.0,21.770462,80.319644,7.038096,226.655537,rice,,,...,,,,,,,,,,
2,60.0,55.0,44.0,23.004459,82.320763,7.840207,263.964248,rice,,,...,,,,,,,,,,
3,74.0,35.0,40.0,26.491096,80.158363,6.980401,242.864034,rice,,,...,,,,,,,,,,
4,78.0,42.0,42.0,20.130175,81.604873,7.628473,262.71734,rice,,,...,,,,,,,,,,


💾 Encoders saved.
💾 Train/Test splits saved in C:\Users\tusha\data/splits

📊 Evaluation for RandomForest:
✅ Accuracy: 0.9999
Classification Report:
               precision    recall  f1-score   support

       apple       1.00      1.00      1.00     20911
      banana       1.00      1.00      1.00        20
   blackgram       1.00      1.00      1.00        20
    chickpea       1.00      1.00      1.00        20
     coconut       1.00      1.00      1.00        20
      coffee       1.00      1.00      1.00        20
      cotton       1.00      1.00      1.00        20
      grapes       1.00      1.00      1.00        20
        jute       1.00      0.95      0.97        20
 kidneybeans       1.00      1.00      1.00        20
      lentil       1.00      0.95      0.97        20
       maize       1.00      1.00      1.00        20
       mango       1.00      1.00      1.00        20
   mothbeans       0.95      1.00      0.98        20
    mungbean       1.00      1.00      1

## crop_predictor

In [None]:


import os
import pandas as pd
import joblib

# Module-level slot intended to hold the loaded model bundle.
# NOTE(review): nothing in the visible code assigns or reads MODELS;
# predict_crop receives the bundle as an argument instead.
MODELS = None

def load_models(models_path: str = r"C:\Users\tusha\Downloads\Sustainable-Crop-Recommendation\outputs\models"):
    """
    Read the persisted estimators and encoders from ``models_path``.

    Returns:
        dict: ``{"random_forest": ..., "svm": ..., "encoders": ...}`` loaded
        from ``random_forest.pkl``, ``svm_model.pkl`` and ``encoders.pkl``.
    """
    artifact_files = (
        ("random_forest", "random_forest.pkl"),
        ("svm", "svm_model.pkl"),
        ("encoders", "encoders.pkl"),
    )
    # joblib.load unpickles the file, which can execute arbitrary code: only
    # point models_path at directories you trust.
    return {
        key: joblib.load(os.path.join(models_path, filename))
        for key, filename in artifact_files
    }

def preprocess_input(user_input: dict, encoders: dict) -> pd.DataFrame:
    """
    Convert user input dict to a one-row DataFrame and encode categorical features.

    Args:
        user_input: Feature name -> value. Keys are lower-cased.
        encoders: Column name -> fitted encoder exposing ``transform``. The
            ``"label"`` entry (target encoder) is ignored here.

    Returns:
        pd.DataFrame: One row with lower-case column names; columns that have
        a matching encoder are replaced by their encoded values.
    """
    # NOTE(review): the training output in this notebook shows mixed-case
    # columns (e.g. "N", "Precip Type"), while this frame uses lower-case
    # names - confirm the fitted models accept these column names.
    user_input = {k.lower(): v for k, v in user_input.items()}
    df = pd.DataFrame([user_input])

    # Encode categorical columns using saved encoders
    for col, le in encoders.items():
        if col == "label":
            continue
        # The input keys were lower-cased above, so the encoder's column name
        # has to be compared in lower case too; otherwise encoders saved under
        # mixed-case names are silently skipped.
        key = str(col).lower()
        if key in df.columns:
            df[key] = le.transform(df[key])
    return df

def predict_crop(user_input: dict, models: dict) -> dict:
    """
    Ask every estimator in ``models`` for a crop recommendation.

    Args:
        user_input (dict): Farm features, e.g. N, P, K, temperature, humidity, ph, rainfall
        models (dict): Estimators keyed by name, plus an ``"encoders"`` entry
            whose ``"label"`` encoder maps predicted codes back to crop names

    Returns:
        dict: Estimator name -> predicted crop name,
        e.g. { "random_forest": "rice", "svm": "wheat" }
    """
    features = preprocess_input(user_input, models["encoders"])

    predictions = {}
    # Every key except "encoders" is treated as a fitted estimator.
    for name in (key for key in models if key != "encoders"):
        class_index = models[name].predict(features)[0]
        decoder = models["encoders"]["label"]
        predictions[name] = decoder.inverse_transform([class_index])[0]

    return predictions


# Demo: load the saved models and print each one's recommendation for a sample.
if __name__ == "__main__":
    # Load models once (uses load_models' default directory)
    models = load_models()

    # Example input; keys are already lower case, as preprocess_input produces.
    sample_input = {
        "n": 90,
        "p": 42,
        "k": 43,
        "temperature": 20.8,
        "humidity": 82,
        "ph": 6.5,
        "rainfall": 200,
    }

    # NOTE(review): this rebinds the notebook-level name `results`, which the
    # training cell also uses for its metrics dict.
    results = predict_crop(sample_input, models)
    for model_name, crop in results.items():
        print(f"🌱 Recommended Crop ({model_name}): {crop}")


## feature_engineering

In [None]:
"""
Feature engineering utilities.
Adds extra sustainability-focused features like soil fertility index or drought score.
"""

import pandas as pd


def add_soil_fertility_index(df: pd.DataFrame) -> pd.DataFrame:
    """
    Append a ``soil_fertility_index`` column to ``df`` and return ``df``.

    The index is the weighted sum ``0.4 * n + 0.3 * p + 0.3 * k`` of the
    ``n``, ``p`` and ``k`` columns. The frame is modified in place.
    """
    nitrogen_part = df["n"].mul(0.4)
    phosphorus_part = df["p"].mul(0.3)
    potassium_part = df["k"].mul(0.3)

    df["soil_fertility_index"] = nitrogen_part + phosphorus_part + potassium_part
    return df


def _ratio_to_max(series: pd.Series) -> pd.Series:
    """Divide ``series`` by its maximum; yield zeros when the maximum is 0 or undefined."""
    peak = series.max()
    if pd.isna(peak) or peak == 0:
        # Dividing by zero would fill the column with NaN/inf. Multiplying by
        # 0.0 keeps the index and dtype and still leaves missing entries as NaN.
        return series * 0.0
    return series / peak


def add_drought_score(df: pd.DataFrame) -> pd.DataFrame:
    """
    Add drought score based on rainfall & temperature.
    Higher temperature & lower rainfall → higher drought score.

    The score is ``0.6 * temperature / max(temperature)
    + 0.4 * (1 - rainfall / max(rainfall))`` and is written to a new
    ``drought_score`` column; ``df`` is modified in place and returned.
    When a column's maximum is 0 its ratio is taken as 0 instead of dividing
    by zero, so a dataset with no rainfall gets the full 0.4 rainfall
    contribution.

    NOTE(review): ratios are relative to the column maximum, so negative
    temperatures produce negative contributions - confirm that is acceptable
    for the data in use.
    """
    df["drought_score"] = (
        _ratio_to_max(df["temperature"]) * 0.6
        + (1 - _ratio_to_max(df["rainfall"])) * 0.4
    )
    return df


def apply_feature_engineering(df: pd.DataFrame) -> pd.DataFrame:
    """
    Run every feature-engineering step on ``df`` and return the result.

    The soil fertility index is added first, then the drought score.
    """
    return add_drought_score(add_soil_fertility_index(df))


## model_evaluation

In [None]:
"""
Model evaluation utilities.
Generate reports, confusion matrices, classification reports, and save them.
"""

import os
import joblib
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    accuracy_score,
)


def evaluate_model(model, X_test, y_test, label_encoder, model_name: str):
    """
    Evaluate model performance and save confusion matrix + report.

    Writes ``<model_name>_report.txt`` and ``<model_name>_confusion_matrix.png``
    into ``outputs/reports`` (relative to the current working directory).

    Args:
        model: Fitted estimator exposing ``predict``.
        X_test: Test features.
        y_test: True labels. Assumed to be the integer codes produced by
            ``label_encoder`` - TODO confirm against callers.
        label_encoder: Fitted encoder whose ``classes_`` provide class names.
        model_name: Used in the output file names and the plot title.

    Returns:
        float: Accuracy on the test data.
    """
    outputs_dir = os.path.join("outputs", "reports")
    os.makedirs(outputs_dir, exist_ok=True)

    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)

    class_names = label_encoder.classes_
    # Fixing the label set keeps the report rows and the matrix axes aligned
    # with ``class_names`` even when a class is missing from the test data.
    class_ids = list(range(len(class_names)))

    # Save classification report
    report_path = os.path.join(outputs_dir, f"{model_name}_report.txt")
    with open(report_path, "w", encoding="utf-8") as f:
        f.write(f"Accuracy: {acc:.4f}\n\n")
        f.write("Classification Report:\n")
        f.write(
            classification_report(
                y_test,
                y_pred,
                labels=class_ids,
                target_names=class_names,
                zero_division=0,
            )
        )

    # Save confusion matrix
    cm = confusion_matrix(y_test, y_pred, labels=class_ids)
    cm_path = os.path.join(outputs_dir, f"{model_name}_confusion_matrix.png")
    fig, ax = plt.subplots(figsize=(8, 6))
    try:
        sns.heatmap(
            cm,
            annot=True,
            fmt="d",
            cmap="Blues",
            xticklabels=class_names,
            yticklabels=class_names,
            ax=ax,
        )
        ax.set_title(f"Confusion Matrix - {model_name}")
        ax.set_xlabel("Predicted")
        ax.set_ylabel("True")
        fig.savefig(cm_path)
    finally:
        # Release the figure even if drawing or saving fails.
        plt.close(fig)

    print(f" {model_name} evaluated. Accuracy: {acc:.4f}")
    print(f" Reports saved to {outputs_dir}")

    return acc


## visualization 

In [None]:
"""
Visualization utilities.
Generate correlation heatmaps and feature importance plots.
"""

import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import joblib


def plot_correlation_heatmap(df: pd.DataFrame, save_path="outputs/reports/correlation_heatmap.png"):
    """Save an annotated heatmap of the pairwise correlations in ``df``.

    Only numeric columns take part in the correlation; text columns are
    ignored instead of making ``DataFrame.corr`` raise.

    Args:
        df: Data whose numeric columns are correlated.
        save_path: Destination image file. Its parent directory is created
            when the path has one.
    """
    parent_dir = os.path.dirname(save_path)
    if parent_dir:
        # A bare file name has no parent, and os.makedirs("") would fail.
        os.makedirs(parent_dir, exist_ok=True)

    fig, ax = plt.subplots(figsize=(10, 8))
    try:
        sns.heatmap(
            df.corr(numeric_only=True),
            annot=True,
            cmap="coolwarm",
            fmt=".2f",
            ax=ax,
        )
        ax.set_title("Correlation Heatmap of Features")
        fig.savefig(save_path)
    finally:
        # Release the figure even if drawing or saving fails.
        plt.close(fig)
    print(f"📊 Correlation heatmap saved at {save_path}")


def plot_feature_importance(model, feature_names, save_path="outputs/reports/feature_importance.png"):
    """Save a horizontal bar chart of ``model.feature_importances_``.

    Bars are sorted by importance, with the largest at the top.

    Args:
        model: Fitted estimator exposing ``feature_importances_``.
        feature_names: Names indexed by feature position, used as bar labels.
        save_path: Destination image file. Its parent directory is created
            when the path has one.
    """
    parent_dir = os.path.dirname(save_path)
    if parent_dir:
        # A bare file name has no parent, and os.makedirs("") would fail.
        os.makedirs(parent_dir, exist_ok=True)

    importances = model.feature_importances_
    sorted_idx = importances.argsort()
    positions = range(len(sorted_idx))

    fig, ax = plt.subplots(figsize=(10, 6))
    try:
        ax.barh(positions, importances[sorted_idx], align="center")
        ax.set_yticks(positions)
        ax.set_yticklabels([feature_names[i] for i in sorted_idx])
        ax.set_xlabel("Importance")
        ax.set_title("Feature Importance")
        fig.savefig(save_path)
    finally:
        # Release the figure even if drawing or saving fails.
        plt.close(fig)
    print(f"🌟 Feature importance plot saved at {save_path}")
