In [1]:
# import pandas as pd
# import numpy as np

In [2]:
# df=pd.read_csv("fertilizer_dataset_cleaned.csv")
# df.head()


In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Read the cleaned fertilizer dataset into memory.
data = pd.read_csv("fertilizer_dataset_cleaned.csv")

# Report the size of the full dataset before splitting.
print("Total rows in dataset:", data.shape[0])

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
train_data, test_data = train_test_split(data, random_state=42, test_size=0.2)

# Persist both partitions (without the index column) for the later cells.
for _frame, _csv_name in ((train_data, "train.csv"), (test_data, "test.csv")):
    _frame.to_csv(_csv_name, index=False)

print("Train and Test split done successfully!")
for _label, _frame in (("Train rows:", train_data), ("Test rows:", test_data)):
    print(_label, len(_frame))

Total rows in dataset: 300
Train and Test split done successfully!
Train rows: 240
Test rows: 60


In [None]:
# train_model.py

import os
import joblib
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# ---------- Config ----------
# Full (unsplit) dataset; only read by load_data() when train.csv / test.csv are not both present.
# NOTE(review): the earlier split cell reads "fertilizer_dataset_cleaned.csv" (no "_final") —
# confirm which file is the intended source.
FULL_DATA_PATH = "fertilizer_dataset_cleaned_final.csv"
# Pre-split files: reused by load_data() when both exist, otherwise written by it.
TRAIN_PATH = "train.csv"
TEST_PATH = "test.csv"
# Directory and file name of the saved model artifact.
MODEL_DIR = "models"
MODEL_PATH = os.path.join(MODEL_DIR, "fertilizer_rf.joblib")

# Seed shared by the train/test split and the RandomForest; TEST_SIZE is the held-out fraction.
RANDOM_STATE = 42
TEST_SIZE = 0.2


# Create the model directory up front so the artifact can be written later.
os.makedirs(MODEL_DIR, exist_ok=True)

def load_data():
    """Return ``(train_df, test_df)``, creating the split files when they do not exist.

    If both TRAIN_PATH and TEST_PATH exist they are loaded as-is. Otherwise the
    full dataset at FULL_DATA_PATH is read, split with TEST_SIZE / RANDOM_STATE,
    and the two parts are written to TRAIN_PATH and TEST_PATH.

    The split is stratified on the fertilizer-name column when that column can
    be found (header matched after the same strip/lower/underscore normalization
    the preprocessing step applies) and every class has at least two rows;
    otherwise a plain shuffled split is used.

    Raises:
        FileNotFoundError: if neither the pre-split files nor the full dataset exist.
    """
    # prefer pre-split files if they exist
    if os.path.exists(TRAIN_PATH) and os.path.exists(TEST_PATH):
        print("Loading existing train.csv and test.csv ...")
        train_df = pd.read_csv(TRAIN_PATH)
        test_df = pd.read_csv(TEST_PATH)
    else:
        if not os.path.exists(FULL_DATA_PATH):
            raise FileNotFoundError(f"No input file found. Place your dataset as '{FULL_DATA_PATH}' or provide train.csv & test.csv.")
        print(f"Loading full dataset from {FULL_DATA_PATH} and splitting into train/test ...")
        df = pd.read_csv(FULL_DATA_PATH)

        # Match the target column on its normalized name so headers such as
        # "Fertilizer Name" still enable stratification.
        target_candidates = [
            raw_name for raw_name in df.columns
            if str(raw_name).strip().lower().replace(' ', '_') == 'fertilizer_name'
        ]

        stratify_labels = None
        if target_candidates:
            labels = df[target_candidates[0]]
            # A stratified split raises when a class has fewer than two rows,
            # so fall back to a plain split in that case.
            if labels.value_counts(dropna=False).min() >= 2:
                stratify_labels = labels
            else:
                print("Some classes have fewer than 2 rows; splitting without stratification.")

        train_df, test_df = train_test_split(
            df,
            test_size=TEST_SIZE,
            random_state=RANDOM_STATE,
            shuffle=True,
            stratify=stratify_labels,
        )
        train_df.to_csv(TRAIN_PATH, index=False)
        test_df.to_csv(TEST_PATH, index=False)
        print(f"Saved split files as {TRAIN_PATH} and {TEST_PATH}")

    print("Train shape:", train_df.shape)
    print("Test shape:", test_df.shape)
    return train_df, test_df

def preprocess_train_test(train_df, test_df, target_col='fertilizer_name'):
    """Turn raw train/test frames into aligned, model-ready feature matrices.

    Steps:
      1. Normalize column names (strip, lowercase, spaces -> underscores) on
         copies, so the caller's DataFrames are left untouched.
      2. Split off ``target_col`` as string labels.
      3. One-hot encode ``soil_type`` / ``crop_type`` when present.
      4. Align train and test to the same (outer-joined) column set, filling
         columns that one side lacks with 0.
      5. Coerce every remaining feature column to numeric and fill missing /
         unparseable values with the *training* median of that column.

    Args:
        train_df: Training rows including the target column.
        test_df: Test rows including the target column.
        target_col: Normalized name of the label column.

    Returns:
        ``(X_train_enc, X_test_enc, y_train, y_test, feature_columns)`` where
        ``feature_columns`` is the ordered list of encoded feature names.

    Raises:
        ValueError: if ``target_col`` is missing from either frame.
    """

    def _with_normalized_columns(frame):
        # Work on a copy so the caller's frame keeps its original headers.
        renamed = frame.copy()
        renamed.columns = frame.columns.str.strip().str.lower().str.replace(' ', '_')
        return renamed

    # Standardize column names
    train_df = _with_normalized_columns(train_df)
    test_df = _with_normalized_columns(test_df)

    # The target must exist in both splits; fail early with a clear message.
    for split_name, frame in (("training", train_df), ("test", test_df)):
        if target_col not in frame.columns:
            raise ValueError(f"Target column '{target_col}' not found in {split_name} data.")

    X_train = train_df.drop(columns=[target_col])
    y_train = train_df[target_col].astype(str)

    X_test = test_df.drop(columns=[target_col])
    y_test = test_df[target_col].astype(str)

    # List categorical columns to one-hot (common in our dataset)
    categorical_cols = [c for c in ['soil_type', 'crop_type'] if c in X_train.columns]
    numeric_cols = [c for c in X_train.columns if c not in categorical_cols]

    # One-hot encode categorical columns using pandas.get_dummies
    X_train_enc = pd.get_dummies(X_train, columns=categorical_cols, drop_first=False)
    X_test_enc = pd.get_dummies(X_test, columns=categorical_cols, drop_first=False)

    # Align columns (add any missing columns to test/train with zeros)
    X_train_enc, X_test_enc = X_train_enc.align(X_test_enc, join='outer', axis=1, fill_value=0)

    # Coerce first, then take the median of the coerced values: computing the
    # median on the raw column fails as soon as it contains a non-numeric string.
    for col in numeric_cols:
        train_numeric = pd.to_numeric(X_train_enc[col], errors='coerce')
        train_median = train_numeric.median()
        X_train_enc[col] = train_numeric.fillna(train_median)
        # The test split is filled with the training median to avoid leaking test statistics.
        X_test_enc[col] = pd.to_numeric(X_test_enc[col], errors='coerce').fillna(train_median)

    feature_columns = list(X_train_enc.columns)
    return X_train_enc, X_test_enc, y_train, y_test, feature_columns

def train_and_evaluate(X_train, X_test, y_train, y_test):
    """Fit a RandomForestClassifier and print its test-set metrics.

    Args:
        X_train, y_train: Encoded training features and labels.
        X_test, y_test: Encoded test features and labels.

    Returns:
        The fitted RandomForestClassifier.
    """
    # Train RandomForest
    print("Training RandomForestClassifier ...")
    clf = RandomForestClassifier(n_estimators=200, random_state=RANDOM_STATE, n_jobs=-1)
    clf.fit(X_train, y_train)

    # Predict
    y_pred = clf.predict(X_test)

    # Metrics
    acc = accuracy_score(y_test, y_pred)
    print(f"\nTest Accuracy: {acc:.4f}\n")
    print("Classification Report:")
    # Classes that are never predicted have undefined precision; report them as 0
    # explicitly instead of letting scikit-learn emit a warning per metric.
    print(classification_report(y_test, y_pred, zero_division=0))
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))

    return clf

def save_artifacts(model, feature_columns, model_path=MODEL_PATH):
    """Write the model and its metadata to ``model_path`` as one joblib file.

    The saved object is a dict with the keys ``model``, ``feature_columns``
    and ``random_state``.
    """
    bundle = dict(
        model=model,
        feature_columns=feature_columns,
        random_state=RANDOM_STATE,
    )
    joblib.dump(bundle, model_path)
    print(f"Saved model + metadata to {model_path}")

def main():
    """Run the training pipeline: load, preprocess, train/evaluate, save, then demo one prediction."""
    splits = load_data()
    X_train, X_test, y_train, y_test, feature_columns = preprocess_train_test(*splits)
    fitted_model = train_and_evaluate(X_train, X_test, y_train, y_test)
    save_artifacts(fitted_model, feature_columns)

    # Demo: predict the first test row; double brackets keep it a one-row DataFrame.
    first_test_row = X_test.iloc[[0]]
    demo_label = fitted_model.predict(first_test_row)[0]
    print("\nExample prediction on a test row ->", demo_label)

if __name__ == "__main__":
    main()


Loading existing train.csv and test.csv ...
Train shape: (240, 9)
Test shape: (60, 9)
Training RandomForestClassifier ...

Test Accuracy: 0.9333

Classification Report:
              precision    recall  f1-score   support

    10-26-26       0.00      0.00      0.00         1
    14-35-14       0.71      1.00      0.83         5
    17-17-17       1.00      0.86      0.92         7
     28-28-0       1.00      1.00      1.00         4
         DAP       1.00      1.00      1.00        16
         MOP       1.00      0.75      0.86         8
        Urea       0.90      1.00      0.95        19

    accuracy                           0.93        60
   macro avg       0.80      0.80      0.79        60
weighted avg       0.93      0.93      0.93        60

Confusion Matrix:
[[ 0  1  0  0  0  0  0]
 [ 0  5  0  0  0  0  0]
 [ 0  0  6  0  0  0  1]
 [ 0  0  0  4  0  0  0]
 [ 0  0  0  0 16  0  0]
 [ 0  1  0  0  0  6  1]
 [ 0  0  0  0  0  0 19]]
Saved model + metadata to models\fertilizer_rf.

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [5]:
# predict_single_clean.py
import joblib
import pandas as pd
import os

MODEL_PATH = "models/fertilizer_rf.joblib"  # path where train script saved artifact

def load_artifact(path=MODEL_PATH):
    """Load and return the object stored at ``path`` with joblib.

    Raises:
        FileNotFoundError: if nothing exists at ``path``.
    """
    if os.path.exists(path):
        # NOTE(review): joblib files are pickle-based; only load artifacts from a trusted source.
        return joblib.load(path)
    raise FileNotFoundError(f"Model artifact not found at {path}. Run training first.")

def prepare_input_df(df_raw, feature_columns):
    """Encode raw input rows into the exact feature layout the model was trained on.

    Column names are normalized (strip, lowercase, spaces -> underscores), the
    six numeric inputs are coerced to numbers (unparseable values become 0),
    the two categorical inputs are stripped and one-hot encoded, and the result
    is reordered to ``feature_columns`` with absent columns filled with 0.

    A ``UserWarning`` is emitted when the input contains a soil or crop value
    that has no matching column in ``feature_columns``: such a value is dropped,
    so the model would otherwise silently predict from an all-zero one-hot group.

    Args:
        df_raw: DataFrame with the columns n, p, k, temperature, humidity,
            moisture, soil_type, crop_type (any casing/spacing). Not modified.
        feature_columns: Ordered feature names saved with the model.

    Returns:
        DataFrame whose columns are exactly ``feature_columns``.

    Raises:
        ValueError: if a required input column is missing.
    """
    import warnings  # local import keeps this cell self-contained

    numeric_cols = ['n','p','k','temperature','humidity','moisture']
    categorical_cols = ['soil_type','crop_type']

    df = df_raw.copy()
    df.columns = df.columns.str.strip().str.lower().str.replace(' ', '_')

    expected = numeric_cols + categorical_cols
    missing = [c for c in expected if c not in df.columns]
    if missing:
        raise ValueError(f"Input is missing required columns: {missing}")

    for col in numeric_cols:
        df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0)

    for col in categorical_cols:
        df[col] = df[col].astype(str).str.strip()

    df_enc = pd.get_dummies(df, columns=categorical_cols, drop_first=False)

    # One-hot columns produced from the input that the model has no feature for.
    known = set(feature_columns)
    dummy_prefixes = tuple(f"{col}_" for col in categorical_cols)
    unseen = [
        c for c in df_enc.columns
        if str(c).startswith(dummy_prefixes) and c not in known
    ]
    if unseen:
        warnings.warn(
            f"Input contains categories not seen during training (ignored): {unseen}",
            UserWarning,
            stacklevel=2,
        )

    # Add absent training columns as 0, drop extras, and enforce the training order.
    return df_enc.reindex(columns=feature_columns, fill_value=0)

def interactive_input():
    """Prompt for one sample on stdin and return it as a one-row DataFrame.

    Numeric prompts accept a blank reply (default 0) and re-ask when the reply
    is not a valid number instead of crashing. Blank soil / crop replies fall
    back to "Sandy" and "Wheat".

    Returns:
        DataFrame with the columns n, p, k, temperature, humidity, moisture,
        soil_type, crop_type.
    """
    print("Enter the values (press Enter for defaults: 0 for numbers, 'Sandy' soil, 'Wheat' crop):")

    def rnum(prompt, default=0):
        # Keep asking until the reply is blank (use the default) or parses as a float.
        while True:
            v = input(prompt).strip()
            if v == "":
                return default
            try:
                return float(v)
            except ValueError:
                print(f"'{v}' is not a number; please try again.")

    N = rnum("Nitrogen (N): ")
    P = rnum("Phosphorus (P): ")
    K = rnum("Potassium (K): ")
    # Degree sign written as an escape so the prompt survives file re-encoding.
    temp = rnum("Temperature (\u00b0C): ")
    humidity = rnum("Humidity (%): ")
    moisture = rnum("Soil moisture (%): ")
    soil = input("Soil type (Sandy/Loamy/Clayey): ").strip() or "Sandy"
    crop = input("Crop type (Wheat/Rice/Maize/...): ").strip() or "Wheat"

    return pd.DataFrame([{
        'n': N, 'p': P, 'k': K,
        'temperature': temp, 'humidity': humidity, 'moisture': moisture,
        'soil_type': soil, 'crop_type': crop
    }])

def predict_from_df(df_input, artifact):
    """Predict labels for ``df_input`` using the model stored in ``artifact``.

    Returns:
        ``(preds, probs)``: ``preds`` holds one prediction per input row.
        ``probs`` maps class -> probability (rounded to 3 decimals, entries at
        or below 0.001 omitted) for the FIRST row only, or is ``None`` when the
        model has no ``predict_proba``.
    """
    model = artifact['model']
    feature_columns = artifact['feature_columns']

    X = prepare_input_df(df_input, feature_columns)
    preds = model.predict(X)

    if not hasattr(model, 'predict_proba'):
        return preds, None

    # Only the first row's distribution is reported.
    first_row_probs = model.predict_proba(X)[0]
    probs = {}
    for cls, p in zip(model.classes_, first_row_probs):
        if p > 0.001:
            probs[cls] = round(float(p), 3)
    return preds, probs

def main():
    """Load the saved artifact, read one sample interactively, and print the prediction."""
    artifact = load_artifact(MODEL_PATH)
    user_row = interactive_input()
    preds, probs = predict_from_df(user_row, artifact)

    print("\n=== Prediction Result ===")
    entered_values = user_row.to_dict(orient='records')[0]
    print("Input:", entered_values)
    print("Predicted fertilizer:", preds[0])
    # probs is None or empty when no probabilities are available.
    if probs:
        print("Predicted probabilities:", probs)

if __name__ == "__main__":
    main()


Enter the values (press Enter for default 0 or 'Unknown'):

=== Prediction Result ===
Input: {'n': 25.0, 'p': 32.0, 'k': 65.0, 'temperature': 23.0, 'humidity': 22.0, 'moisture': 36.0, 'soil_type': '32', 'crop_type': 'ruce'}
Predicted fertilizer: Urea
Predicted probabilities: {'14-35-14': 0.005, '17-17-17': 0.01, 'DAP': 0.065, 'Urea': 0.92}
