In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Load the merged crop/soil/rainfall dataset for a quick visual check.
# The modelling cell further down reads the same CSV again into `df`.
data = pd.read_csv("/content/merged_ready3.csv")

# Last expression of the cell: renders the first five rows as rich output.
data.head()


Unnamed: 0,Crop Type,District,Agro_Climactic_Zone,Soil_Type,Year,AREA(ha),Rainfall_Sowing_Kharif(mm),Rainfall_Peak_Kharif(mm),Rainfall_Flowering_Kharif(mm),Total_Rainfall(mm),...,Water_Deficit_Peak_Kharif(m³/ha),Water_Deficit_Flowering_Kharif(m³/ha),YIELD(kg/ha),N Ratio(kg/ha),P Ratio(kg/ha),K Ratio(kg/ha),Soil_PH,Soil N (kg/ha),Soill P (kg/ha),Soil K (kg/ha)
0,CASTOR,Balasore,North Eastern Coastal Plain,Alluvial Soil,1993,1.49,379.0,453.2,467.4,1299.6,...,2940.837153,1053.902766,597.32,19,19,19,5.8,120,11,190
1,CHICKPEA,Balasore,North Eastern Coastal Plain,Alluvial Soil,1993,0.74,379.0,453.2,467.4,1299.6,...,2940.837153,1053.902766,378.38,19,19,19,5.8,120,11,190
2,GROUNDNUT,Balasore,North Eastern Coastal Plain,Alluvial Soil,1993,9.72,379.0,453.2,467.4,1299.6,...,2940.837153,1053.902766,1157.41,19,19,19,5.8,120,11,190
3,MAIZE,Balasore,North Eastern Coastal Plain,Alluvial Soil,1993,1.65,379.0,453.2,467.4,1299.6,...,2940.837153,1053.902766,1248.48,19,19,19,5.8,120,11,190
4,MINOR PULSES,Balasore,North Eastern Coastal Plain,Alluvial Soil,1993,99.39,379.0,453.2,467.4,1299.6,...,2940.837153,1053.902766,467.35,19,19,19,5.8,120,11,190


In [None]:
!pip install xgboost lightgbm catboost --quiet

import time, warnings
import numpy as np, pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
import xgboost as xgb
import lightgbm as lgb
import catboost as cb

# Suppress library warnings for the rest of the session.
warnings.filterwarnings("ignore")

# ---- Run configuration -------------------------------------------------
CSV_PATH = "/content/merged_ready3.csv"
# Candidate spellings of the target column, tried in this order.
POSSIBLE_TARGETS = ["YIELD", "YIELD(kg/ha)", "YIELD(kg/ha) ", "YIELD (kg/ha)"]
TEST_SIZE = 0.20
RANDOM_STATE = 42
# None keeps every feature; an integer keeps only the K most important ones.
TOP_K_FEATURES = None

# ---- Load ----------------------------------------------------------------
df = pd.read_csv(CSV_PATH)
print("Loaded shape:", df.shape)
print("Columns:", df.columns.tolist())

# ---- Resolve the target column -------------------------------------------
# First try the exact candidate names, then fall back to the first column
# whose name contains "yield" (case-insensitive).
target_col = next((name for name in POSSIBLE_TARGETS if name in df.columns), None)
if target_col is None:
    target_col = next((name for name in df.columns if 'yield' in name.lower()), None)
if target_col is None:
    raise ValueError("Could not find target column. Update POSSIBLE_TARGETS or rename your YIELD column.")

print("Using target column:", target_col)

# Separate the target vector from the feature frame (copy so `df` stays raw).
y = df[target_col].values
X = df.drop(columns=[target_col]).copy()

num_cols = X.select_dtypes(include=[np.number]).columns.tolist()
cat_cols = X.select_dtypes(include=['object']).columns.tolist()
print("Numeric cols:", len(num_cols), "Categorical cols:", len(cat_cols))

# Impute missing values: median for numeric columns, the literal "missing"
# for categorical ones. The result is assigned back to the column instead of
# calling fillna(inplace=True) on `X[c]`: that form is chained assignment on
# a temporary, which pandas deprecates and which silently leaves `X`
# unchanged once Copy-on-Write is active.
for c in num_cols:
    if X[c].isna().any():
        X[c] = X[c].fillna(X[c].median())
for c in cat_cols:
    if X[c].isna().any():
        X[c] = X[c].fillna("missing")

# Label-encode every categorical column and keep the fitted encoder per
# column so later cells can encode user input the same way.
le_dict = {}
for c in cat_cols:
    le = LabelEncoder()
    X[c] = le.fit_transform(X[c].astype(str))
    le_dict[c] = le

# Hold out TEST_SIZE of the rows for evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE
)
print("Train shape:", X_train.shape, "Test shape:", X_test.shape)

# Rank features with a quick ExtraTrees fit on the training split.
base_selector = ExtraTreesRegressor(n_estimators=100, n_jobs=-1, random_state=RANDOM_STATE)
base_selector.fit(X_train, y_train)
importances = pd.Series(
    base_selector.feature_importances_,
    index=X.columns,
).sort_values(ascending=False)
print("\nTop 15 features by importance:\n", importances.head(15).to_string())

# Optionally restrict both splits to the K most important features.
if TOP_K_FEATURES is None or TOP_K_FEATURES >= X.shape[1]:
    top_features = X.columns.tolist()
else:
    top_features = importances.head(TOP_K_FEATURES).index.tolist()
    X_train = X_train[top_features].copy()
    X_test = X_test[top_features].copy()
    print(f"\nUsing TOP {TOP_K_FEATURES} features.")

# Candidate regressors, all trained on the same split and compared below.
models = {
    "RandomForest": RandomForestRegressor(n_estimators=150, max_depth=12, n_jobs=-1, random_state=RANDOM_STATE),
    "ExtraTrees": ExtraTreesRegressor(n_estimators=150, max_depth=12, n_jobs=-1, random_state=RANDOM_STATE),
    "XGBoost": xgb.XGBRegressor(n_estimators=200, max_depth=8, n_jobs=-1, verbosity=0, tree_method="hist", random_state=RANDOM_STATE),
    "LightGBM": lgb.LGBMRegressor(n_estimators=200, max_depth=10, n_jobs=-1, random_state=RANDOM_STATE),
    "CatBoost": cb.CatBoostRegressor(iterations=200, depth=8, verbose=0, thread_count=-1, random_state=RANDOM_STATE)
}

results = []
# The loop variable is deliberately not named `model`: at notebook top level
# it would stay bound to the last estimator (CatBoost) after the loop, and
# the later "if 'model' not in globals()" fallback to ExtraTrees could then
# never run.
for name, estimator in models.items():
    print(f"\n{name} — training ...")
    t0 = time.time()
    estimator.fit(X_train, y_train)
    train_time = time.time() - t0

    # Score on the held-out split.
    pred = estimator.predict(X_test)
    r2 = r2_score(y_test, pred)
    mae = mean_absolute_error(y_test, pred)
    rmse = mean_squared_error(y_test, pred) ** 0.5
    results.append((name, r2, mae, rmse, train_time))
    print(f"{name} done in {train_time:.1f}s  — R2: {r2:.3f}, MAE: {mae:.2f}, RMSE: {rmse:.2f}")

results_df = pd.DataFrame(results, columns=["model","R2","MAE","RMSE","train_time_s"]).sort_values("R2", ascending=False)
print("\n=== Summary (sorted by R2) ===")
print(results_df.to_string(index=False))

# Show which features the RandomForest relied on most; failures here are
# reported but never abort the cell.
if "RandomForest" in models:
    try:
        forest = models['RandomForest']
        rf_imp = pd.Series(forest.feature_importances_, index=top_features)
        rf_imp = rf_imp.sort_values(ascending=False)
        print("\nTop features from RandomForest:\n", rf_imp.head(12).to_string())
    except Exception as e:
        print("Could not print RF importances:", e)


Loaded shape: (3472, 23)
Columns: ['Crop Type', 'District', 'Agro_Climactic_Zone', 'Soil_Type', 'Year', 'AREA(ha)', 'Rainfall_Sowing_Kharif(mm)', 'Rainfall_Peak_Kharif(mm)', 'Rainfall_Flowering_Kharif(mm)', 'Total_Rainfall(mm)', 'Avg_Water_Deficit(m³/ha)', 'Rainfall_Water_Ratio', 'Water_Deficit_Sowing_Kharif(m³/ha)', 'Water_Deficit_Peak_Kharif(m³/ha)', 'Water_Deficit_Flowering_Kharif(m³/ha)', 'YIELD(kg/ha)', 'N Ratio(kg/ha)', 'P Ratio(kg/ha)', 'K Ratio(kg/ha)', 'Soil_PH', 'Soil N (kg/ha)', 'Soill P (kg/ha)', 'Soil K (kg/ha)']
Using target column: YIELD(kg/ha)
Numeric cols: 17 Categorical cols: 5
Train shape: (2777, 22) Test shape: (695, 22)

Top 15 features by importance:
 Crop Type                                0.523001
AREA(ha)                                 0.181158
Year                                     0.124261
District                                 0.017504
Agro_Climactic_Zone                      0.017078
Soil_Type                                0.016098
Rainfall_Flowering

In [None]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid explored for the ExtraTrees regressor (3 x 4 x 3 = 36 candidates).
param_grid = dict(
    n_estimators=[150, 300, 500],
    max_depth=[10, 12, 15, None],
    min_samples_split=[2, 5, 10],
)

# 3-fold cross-validated search on the training split, scored by R2.
search = GridSearchCV(
    estimator=ExtraTreesRegressor(n_jobs=-1, random_state=42),
    param_grid=param_grid,
    scoring="r2",
    cv=3,
    verbose=1,
)
search.fit(X_train, y_train)

print("Best params:", search.best_params_)
print("Best CV R2:", search.best_score_)


Fitting 3 folds for each of 36 candidates, totalling 108 fits
Best params: {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 150}
Best CV R2: 0.8709053146539599


In [None]:
import joblib
# Persist the best estimator found by the grid search so later cells can reload it.
joblib.dump(search.best_estimator_, "crop_yield_model.pkl")


['crop_yield_model.pkl']

In [None]:
# Reload the persisted estimator; the recommendation cells below call `model.predict`.
# NOTE(review): joblib.load unpickles arbitrary objects — only load files this notebook wrote.
model = joblib.load("crop_yield_model.pkl")


In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import joblib

FERT_CSV = "/content/Final_Fertilizer.csv"
MASTER_CSV = "/content/merged_ready3.csv"

fert_df = pd.read_csv(FERT_CSV)

# Normalise headers and the two lookup columns (trim stray whitespace).
fert_df.columns = fert_df.columns.str.strip()
fert_df['District'] = fert_df['District'].astype(str).str.strip()
fert_df['Major Crops'] = fert_df['Major Crops'].astype(str).str.strip()

# If the product column is missing under its expected name, fall back to the
# VALUES of the last column. The previous `fert_df.get(name, fert_df.columns[-1])`
# returned the last column's *name*, so every row was filled with that header
# string instead of product data.
if 'Typical NPK Ratio (kg/ha) of Products' not in fert_df.columns:
    fert_df['Typical NPK Ratio (kg/ha) of Products'] = fert_df.iloc[:, -1]


# Decide where feature names and label encoders come from: prefer the objects
# left in the session by the training cell, otherwise rebuild them from the CSV.
if 'X' not in globals() or 'le_dict' not in globals():
    master = pd.read_csv(MASTER_CSV)
    # First column whose name mentions "yield" is taken as the target.
    target_col = next((col for col in master.columns if 'yield' in col.lower()), None)
    if target_col is None:
        raise RuntimeError("Cannot auto-detect target column in master CSV; set up 'X' and 'le_dict' in the session.")
    feature_cols = [col for col in master.columns if col != target_col]
    encoders = {}
    for col in master.select_dtypes(include=['object']).columns:
        master[col] = master[col].fillna("missing").astype(str)
        encoders[col] = LabelEncoder().fit(master[col])
    print("Built encoders from master CSV (fallback).")
else:
    feature_cols = list(X.columns)
    encoders = le_dict
    print("Reusing feature columns and encoders from session.")

# Make sure a trained estimator named `model` is available for prediction.
if 'model' not in globals():
    # try to choose best from models dict if present
    if 'models' not in globals() or 'ExtraTrees' not in models:
        raise RuntimeError("No trained model found in session. Ensure `model` or `models['ExtraTrees']` exists.")
    model = models['ExtraTrees']
    print("Using ExtraTrees model from session.")

def _encode_label(encoder, column, value):
    """Return the integer code of ``value`` for one fitted label encoder.

    Matching is attempted exactly first, then ignoring case and surrounding
    whitespace. If the value is still unknown, the encoder's first class is
    used (the original best-effort behaviour) and a warning is printed so the
    substitution is no longer silent.
    """
    classes = list(encoder.classes_)
    labels = [str(cls) for cls in classes]
    if value in labels:
        position = labels.index(value)
    else:
        folded = {}
        for idx, label in enumerate(labels):
            folded.setdefault(label.strip().lower(), idx)
        position = folded.get(value.strip().lower())
        if position is None:
            position = 0
            print(f"Warning: unseen value {value!r} for '{column}'; using {labels[0]!r} instead.")
    return encoder.transform([classes[position]])[0]


def build_input_row(base_conditions: dict, feature_columns=None, label_encoders=None, reference=None):
    """
    Build a single-row, fully numeric feature frame for prediction.

    base_conditions: dict of column->value provided by user (strings for
        categorical, numbers for numeric). Keys are matched to feature names
        exactly, or ignoring case and spaces; unmatched keys are ignored.
    feature_columns: optional list of feature names; defaults to the
        notebook-level ``feature_cols``.
    label_encoders: optional dict column->fitted encoder; defaults to the
        notebook-level ``encoders``.
    reference: optional DataFrame used to derive default values; defaults to
        the session's ``X`` when present, otherwise the master CSV is read.
    returns: DataFrame with 1 row, numeric encoded, columns = feature columns.

    Columns the caller does not supply default to the reference median
    (numeric) or mode (categorical). A categorical column that is already
    label-encoded in the reference is decoded back to its label first;
    taking the median of the integer codes, as before, produced strings such
    as "12.0" that no encoder knew and that silently became the first class.
    """
    cols = list(feature_cols if feature_columns is None else feature_columns)
    encs = encoders if label_encoders is None else label_encoders
    if reference is not None:
        X_ref = reference
    elif 'X' in globals():
        X_ref = X
    else:
        master = pd.read_csv(MASTER_CSV)
        X_ref = master.drop(columns=[c for c in master.columns if 'yield' in c.lower()])  # best-effort

    # 1) Defaults derived from the reference data.
    row = {}
    for col in cols:
        if col not in X_ref.columns:
            row[col] = 0.0
            continue
        series = X_ref[col]
        is_numeric = series.dtype.kind in 'biufc'
        if is_numeric and col not in encs:
            row[col] = float(series.median())
            continue
        modes = series.mode()
        if modes.empty:
            row[col] = "missing"
        elif is_numeric:
            # Reference holds integer codes: turn the most frequent code back into its label.
            row[col] = str(encs[col].inverse_transform([int(modes.iloc[0])])[0])
        else:
            row[col] = str(modes.iloc[0])

    # 2) Caller overrides (exact key first, then case/space-insensitive).
    def _fold(name):
        return str(name).lower().replace(" ", "")

    folded_cols = {}
    for col in cols:
        folded_cols.setdefault(_fold(col), col)
    for key, value in base_conditions.items():
        if key in row:
            row[key] = value
        else:
            target = folded_cols.get(_fold(key))
            if target is not None:
                row[target] = value

    # 3) Encode categorical columns with the fitted encoders.
    df_row = pd.DataFrame([row], columns=cols)
    for col, le in encs.items():
        if col in df_row.columns:
            val = df_row.at[0, col]
            val = "missing" if pd.isna(val) else str(val).strip()
            df_row[col] = _encode_label(le, col, val)

    # 4) Best-effort numeric coercion of anything still non-numeric.
    for col in df_row.columns:
        if not pd.api.types.is_numeric_dtype(df_row[col]):
            try:
                df_row[col] = pd.to_numeric(df_row[col])
            except (ValueError, TypeError):
                pass  # leave the value as-is; the model call will surface it
    return df_row

def recommend_fertilizer_safe(district, crop, base_conditions, N_range=None, P_range=None, K_range=None, top_k=5):
    """
    Rank N/P/K combinations by the yield the trained model predicts for them.

    district, crop: strings. ``district`` selects the row of ``fert_df`` whose
        product/company text is attached to the result. Both are also passed
        to the model as the 'District' and 'Crop Type' features unless
        ``base_conditions`` already supplies them; previously neither reached
        the model, so predictions were made for a default crop and district.
    base_conditions: dictionary with keys matching feature column names
        (e.g. 'AREA(ha)', 'Soil_Type', 'Rainfall_Sowing_Kharif(mm)', ...)
    N_range/P_range/K_range: iterables (kg/ha) to test; if None the default
        10, 12, ..., 30 is used for each
    top_k: number of top combinations to return

    Returns a DataFrame with columns N, P, K, Predicted_Yield (best first),
    plus Typical_Products and Companies when the district is found in
    ``fert_df``.

    NOTE(review): the diagnostic output in this notebook shows only two
    distinct training values (18 and 19) for each NPK feature, so predictions
    barely move across the grid; treat the ranking as indicative only.
    """
    import itertools

    # Materialise the ranges so one-shot iterators are not exhausted by the product.
    N_range = list(N_range) if N_range is not None else list(range(10, 31, 2))
    P_range = list(P_range) if P_range is not None else list(range(10, 31, 2))
    K_range = list(K_range) if K_range is not None else list(range(10, 31, 2))

    fert_match = fert_df[fert_df['District'].str.lower() == str(district).strip().lower()]
    fert_info = fert_match.iloc[0] if not fert_match.empty else None

    # Feed district and crop to the model without overriding explicit caller values.
    base_row = dict(base_conditions)
    supplied = {str(key).lower().replace(" ", "") for key in base_row}
    for feature, value in (("District", district), ("Crop Type", crop)):
        if value is not None and feature.lower().replace(" ", "") not in supplied:
            base_row[feature] = str(value).strip()
    sample_row = build_input_row(base_row)

    grid = pd.DataFrame(list(itertools.product(N_range, P_range, K_range)), columns=['N', 'P', 'K'])
    if grid.empty:
        res_df = pd.DataFrame(columns=['N', 'P', 'K', 'Predicted_Yield'])
    else:
        # One batched predict call instead of one call per combination.
        batch = pd.concat([sample_row] * len(grid), ignore_index=True)
        for nutrient in ('N', 'P', 'K'):
            for column in (f'{nutrient} Ratio(kg/ha)', f'{nutrient} Ratio'):
                if column in batch.columns:
                    batch[column] = grid[nutrient].to_numpy()
                    break
        res_df = grid.assign(Predicted_Yield=model.predict(batch))
    res_df = res_df.sort_values('Predicted_Yield', ascending=False)

    top = res_df.head(top_k).copy()
    if fert_info is not None:
        top['Typical_Products'] = str(fert_info.get('Typical NPK Ratio (kg/ha) of Products', ''))
        top['Companies'] = str(fert_info.get('Recommended Fertilizer Companies', ''))
    return top

# Example field conditions; any feature not listed here is filled with a
# default by build_input_row.
example_base = {
    "AREA(ha)": 2.5,
    "Soil_Type": "Alluvial Soil",
    "Rainfall_Sowing_Kharif(mm)": 350,
    "Rainfall_Peak_Kharif(mm)": 450,
    "Rainfall_Flowering_Kharif(mm)": 470,
    "Soil_PH": 5.8,
}

# Rank a small NPK grid for the example field and show the six best rows.
top_recs = recommend_fertilizer_safe(
    "Balasore",
    "Rice",
    example_base,
    N_range=[10, 15, 19, 20, 22],
    P_range=[10, 15, 19, 20],
    K_range=[10, 15, 19, 20, 22],
    top_k=6,
)
print(top_recs.to_string(index=False))


Reusing feature columns and encoders from session.
 N  P  K  Predicted_Yield                              Typical_Products                 Companies
19 20 22       593.213733 20-40-20 (PPL), 18-46-0 (DAP), 19-19-19 (NPK) Paradeep Phosphates, IFCO
19 19 22       593.213733 20-40-20 (PPL), 18-46-0 (DAP), 19-19-19 (NPK) Paradeep Phosphates, IFCO
19 20 20       593.213733 20-40-20 (PPL), 18-46-0 (DAP), 19-19-19 (NPK) Paradeep Phosphates, IFCO
19 20 19       593.213733 20-40-20 (PPL), 18-46-0 (DAP), 19-19-19 (NPK) Paradeep Phosphates, IFCO
20 19 20       593.213733 20-40-20 (PPL), 18-46-0 (DAP), 19-19-19 (NPK) Paradeep Phosphates, IFCO
20 19 22       593.213733 20-40-20 (PPL), 18-46-0 (DAP), 19-19-19 (NPK) Paradeep Phosphates, IFCO


In [None]:
# How many distinct NPK values did the model actually see during training?
for nutrient in ("N", "P", "K"):
    observed = X[f"{nutrient} Ratio(kg/ha)"]
    print(f"{nutrient} unique values:", np.sort(observed.unique())[:20], "count:", observed.nunique())

# Take one held-out row and sweep the three NPK features over a small grid
# to see how much the prediction responds.
sample = X_test.sample(1, random_state=7).copy()
base = sample.iloc[[0]].copy()
vals = []
for n in [10,15,19,20,25,30]:
    for p in [10,15,19,20,25]:
        for k in [10,15,19,20,25]:
            r = base.assign(**{'N Ratio(kg/ha)': n, 'P Ratio(kg/ha)': p, 'K Ratio(kg/ha)': k})
            vals.append(((n,p,k), float(model.predict(r)[0])))

vals_df = pd.DataFrame(
    [{'N': combo[0], 'P': combo[1], 'K': combo[2], 'pred': predicted} for combo, predicted in vals]
)
print("Predicted yield range when varying NPK:", vals_df['pred'].min(), "->", vals_df['pred'].max())
print(vals_df.groupby(['N']).pred.agg(['min','max','mean']).head())


N unique values: [18 19] count: 2
P unique values: [18 19] count: 2
K unique values: [18 19] count: 2
Predicted yield range when varying NPK: 1215.0822666666666 -> 1224.1114000000002
            min          max         mean
N                                        
10  1215.082267  1217.849733  1216.720067
15  1215.082267  1217.849733  1216.720067
19  1221.343933  1224.111400  1222.981733
20  1221.343933  1224.111400  1222.981733
25  1221.343933  1224.111400  1222.981733


In [None]:
import pandas as pd, math

# Load the fertilizer lookup table. The path is relative to the working
# directory, unlike the absolute FERT_CSV path used by the earlier cell.
fert = pd.read_csv("Final_Fertilizer.csv")
# Lower-cased, trimmed district key used for case-insensitive lookups below.
fert['District_norm'] = fert['District'].str.lower().str.strip()

def parse_products(prod_text):
    """Extract every ``N-P-K`` grade found in a product description.

    ``prod_text`` is converted with ``str()`` first, so any value is accepted.
    Returns a list of ``(n, p, k)`` integer tuples in order of appearance;
    the list is empty when no ``digits-digits-digits`` pattern occurs.
    """
    import re
    grade_pattern = re.compile(r'(\d+)[-](\d+)[-](\d+)')
    grades = []
    for hit in grade_pattern.finditer(str(prod_text)):
        grades.append(tuple(int(part) for part in hit.groups()))
    return grades

def choose_closest_product(district, target_npk):
    """Pick the listed product whose N-P-K grade is nearest to ``target_npk``.

    Rows of ``fert`` are selected by the normalised district name, every grade
    parsed from their product text becomes a candidate, and the candidate with
    the smallest Euclidean distance to ``target_npk`` (first three components)
    wins; on ties the earliest candidate is kept.

    Returns a dict with keys ``npk`` (the grade tuple), ``text`` (the full
    product text of the source row) and ``company``, or ``None`` when the
    district has no parsable product.
    """
    wanted = district.lower().strip()
    district_rows = fert[fert['District_norm'] == wanted]
    candidates = [
        {
            'npk': grade,
            'text': record['Typical NPK Ratio (kg/ha) of Products'],
            'company': record['Recommended Fertilizer Companies'],
        }
        for _, record in district_rows.iterrows()
        for grade in parse_products(record['Typical NPK Ratio (kg/ha) of Products'])
    ]
    if len(candidates) == 0:
        return None

    def gap(candidate):
        grade = candidate['npk']
        return math.sqrt(sum((grade[axis] - target_npk[axis]) ** 2 for axis in range(3)))

    return min(candidates, key=gap)

def dosage_for_product(npk_tuple, target_total_kg_per_ha=120):
    """Compute nutrients delivered by applying a product at a given rate.

    ``npk_tuple`` holds the product grade as percentages (N, P, K);
    ``target_total_kg_per_ha`` is the amount of product applied per hectare.
    Returns a dict with the product rate and the delivered N, P and K in
    kg/ha, each rounded to one decimal place.
    """
    dosage = {'product_kg_per_ha': target_total_kg_per_ha}
    for position, nutrient in enumerate("NPK"):
        fraction = npk_tuple[position] / 100.0
        dosage[f'delivered_{nutrient}_kg_per_ha'] = round(target_total_kg_per_ha * fraction, 1)
    return dosage

# Demo: closest listed product to the model's suggested N-P-K, and what a
# 120 kg/ha application of it delivers.
best = choose_closest_product("Balasore", (20,19,22))
if best is None:
    # choose_closest_product returns None when the district has no parsable
    # product; indexing it would raise a TypeError.
    print("No fertilizer product found for district 'Balasore'.")
else:
    dos = dosage_for_product(best['npk'], target_total_kg_per_ha=120)
    print(best, dos)


{'npk': (19, 19, 19), 'text': '20-40-20 (PPL), 18-46-0 (DAP), 19-19-19 (NPK)', 'company': 'Paradeep Phosphates, IFCO'} {'product_kg_per_ha': 120, 'delivered_N_kg_per_ha': 22.8, 'delivered_P_kg_per_ha': 22.8, 'delivered_K_kg_per_ha': 22.8}


In [None]:
# Load the raw pesticide table; `recommend_pesticide` below reads this
# notebook-level `pest` frame.
pest = pd.read_csv("Final_Pesticide.csv")
# Lower-cased, trimmed crop key for case-insensitive matching.
pest['Crop_norm'] = pest['Crop'].str.lower().str.strip()

def recommend_pesticide(crop, district=None, data=None):
    """Return up to five distinct (pest, pesticide) pairs for a crop.

    crop: crop name, matched case-insensitively against ``Crop_norm``.
    district: optional district name. When the table has a ``District``
        column and some rows for the crop mention the district (plain
        substring match), only those rows are used; otherwise the crop-level
        list is returned instead of giving up.
    data: optional table to search; defaults to the notebook-level ``pest``.

    Returns a DataFrame with columns 'Major Pest' and 'Recommended Pesticide',
    or a message string when the crop has no rows at all.
    """
    table = pest if data is None else data
    crop_key = str(crop).lower().strip()
    candidates = table[table['Crop_norm'] == crop_key]
    if candidates.empty:
        return f"No pesticide data found for crop '{crop}'."

    if district and 'District' in candidates.columns:
        district_key = str(district).lower().strip()
        # fillna/astype keeps this working when the District column is empty
        # (all NaN); regex=False treats the district as literal text.
        in_district = (
            candidates['District'].fillna('').astype(str).str.lower()
            .str.contains(district_key, regex=False)
        )
        if in_district.any():
            candidates = candidates[in_district]

    return candidates[['Major Pest','Recommended Pesticide']].drop_duplicates().head(5)

# Demo lookup; as the cell's last expression the result is rendered as output.
recommend_pesticide("Rice","Balasore")


Unnamed: 0,Major Pest,Recommended Pesticide
0,Brown Plant Hopper,Fipronil; Buprofezin; Flonicamid
1,Leaf Folder,Imidacloprid; Buprofezin
2,Gall Midge,Flonicamid; Dichlorvos
3,Thrips,Neem Oil; NSKE 5%


In [None]:
import pandas as pd
import re

raw = pd.read_csv("/content/Final_Pesticide.csv")

# Trim stray whitespace from the headers.
raw.columns = [c.strip() for c in raw.columns]

# Guarantee every column used below exists (absent ones become empty strings).
for col in ["Crop","Major Disease","Disease Solution","Major Pest","Recommended Pesticide","Dose Instruction","District"]:
    if col not in raw.columns:
        raw[col] = ""

raw['Crop_norm'] = raw['Crop'].astype(str).str.strip()
raw['Major_Pest_norm'] = raw['Major Pest'].astype(str).str.strip().str.lower()
raw['Major_Disease_norm'] = raw['Major Disease'].astype(str).str.strip().str.lower()


def _clean_text(value):
    """Return ``value`` as trimmed text, mapping NaN/None to an empty string."""
    return "" if pd.isna(value) else str(value).strip()


# Explode each source row into one output row per recommended product.
# The loop uses no top-level name `pest`: rebinding it here would replace the
# DataFrame that `recommend_pesticide` reads with a plain string.
rows = []
for _, r in raw.iterrows():
    # NaN must not become the literal text "nan" for products or dose.
    rec = _clean_text(r['Recommended Pesticide'])
    dose = _clean_text(r['Dose Instruction'])
    parts = [p.strip() for p in re.split(r';|,', rec) if p.strip()] or ['']
    for prod in parts:
        rows.append({
            "Crop": r['Crop_norm'],
            "Major_Disease": r['Major Disease'],
            "Disease_Solution": r['Disease Solution'],
            "Major_Pest": r['Major Pest'],
            "Recommended_Pesticide": prod,
            "Dose_Instruction": dose if dose else "Follow product label",
            "District": r['District']
        })

# Explicit columns keep the frame well-formed even when the input has no rows.
pest_df = pd.DataFrame(rows, columns=[
    "Crop", "Major_Disease", "Disease_Solution", "Major_Pest",
    "Recommended_Pesticide", "Dose_Instruction", "District",
])

pest_df['Recommended_Pesticide'] = pest_df['Recommended_Pesticide'].str.replace(r'\s+', ' ', regex=True).str.strip()
pest_df['Major_Pest'] = pest_df['Major_Pest'].astype(str).str.strip()
pest_df['Crop'] = pest_df['Crop'].astype(str).str.strip()

pest_df.to_csv("/content/pesticide_clean2.csv", index=False)
# Message now names the file that is actually written.
print("Saved pesticide_clean2.csv (rows):", len(pest_df))
pest_df.head(10)


Saved pesticide_clean.csv (rows): 28


Unnamed: 0,Crop,Major_Disease,Disease_Solution,Major_Pest,Recommended_Pesticide,Dose_Instruction,District
0,Rice,Blast (Pyricularia grisea),Tricyclazole; Botanicals (Neem/Tulsi extracts),Brown Plant Hopper,Fipronil,Follow product label,
1,Rice,Blast (Pyricularia grisea),Tricyclazole; Botanicals (Neem/Tulsi extracts),Brown Plant Hopper,Buprofezin,Follow product label,
2,Rice,Blast (Pyricularia grisea),Tricyclazole; Botanicals (Neem/Tulsi extracts),Brown Plant Hopper,Flonicamid,Follow product label,
3,Rice,Bacterial Leaf Blight,Copper oxychloride; Streptocycline,Leaf Folder,Imidacloprid,Follow product label,
4,Rice,Bacterial Leaf Blight,Copper oxychloride; Streptocycline,Leaf Folder,Buprofezin,Follow product label,
5,Rice,Sheath Blight,Carbendazim + Mancozeb,Gall Midge,Flonicamid,Follow product label,
6,Rice,Sheath Blight,Carbendazim + Mancozeb,Gall Midge,Dichlorvos,Follow product label,
7,Rice,False Smut,Propiconazole; Carbendazim,Thrips,Neem Oil,Follow product label,
8,Rice,False Smut,Propiconazole; Carbendazim,Thrips,NSKE 5%,Follow product label,
9,Tomato,Damping Off/Collar Rot,Trichoderma spp.; Copper oxychloride,Fruit Borer,Imidacloprid,Follow product label,


In [None]:
import pandas as pd

# Reload the cleaned table written by the cleaning cell.
# NOTE: read_csv parses empty cells as NaN by default, so fields saved as "" come back as NaN.
pest_df = pd.read_csv("/content/pesticide_clean2.csv")

def _unique_text(values):
    """Distinct non-empty strings from ``values`` in first-seen order; NaN/None skipped."""
    seen = []
    for value in values:
        if pd.isna(value):
            continue
        text = str(value).strip()
        if text and text not in seen:
            seen.append(text)
    return seen


def recommend_pesticide_with_solution(crop, district=None, top_k_pests=3, data=None):
    """Build structured and printable pest/disease advice for a crop.

    crop: crop name, matched case-insensitively.
    district: optional district; rows mentioning it (plain substring match)
        are preferred, otherwise all rows for the crop are used.
    top_k_pests: maximum number of (disease, pest) groups returned.
    data: optional cleaned pesticide table; defaults to the notebook-level
        ``pest_df``.

    Returns ``{"status": "ok", "recommendations": [...], "text_blocks": [...]}``
    or ``{"status": "not_found", "message": ...}`` when the crop is unknown.

    Missing values (NaN after the CSV round-trip) are skipped when joining
    products, doses and solutions instead of raising TypeError, and rows with
    a missing disease or pest name are kept under an empty name rather than
    dropped by the group-by.
    """
    table = pest_df if data is None else data
    c = str(crop).strip().lower()
    d = str(district).strip().lower() if district else None

    cand = table[table['Crop'].astype(str).str.strip().str.lower() == c]
    if d and not cand.empty and 'District' in cand.columns:
        # fillna('') stops NaN turning into the text "nan" (which would match
        # districts such as "nan..."); regex=False matches the name literally.
        in_district = cand['District'].fillna('').astype(str).str.lower().str.contains(d, regex=False)
        if in_district.any():
            cand = cand[in_district]

    if cand.empty:
        return {"status":"not_found", "message": f"No pesticide data found for crop '{crop}'."}

    keyed = cand.assign(
        Major_Disease=cand['Major_Disease'].fillna(''),
        Major_Pest=cand['Major_Pest'].fillna(''),
    )
    out = []
    for (disease, pest_name), group in keyed.groupby(['Major_Disease','Major_Pest']):
        out.append({
            "Major_Disease": disease,
            "Major_Pest": pest_name,
            "Disease_Solution": "; ".join(_unique_text(group['Disease_Solution'])),
            "Recommended_Products": _unique_text(group['Recommended_Pesticide']),
            "Dose_Instructions": _unique_text(group['Dose_Instruction'])
        })
    out = out[:top_k_pests]

    formatted = []
    for item in out:
        text = f"Pest/Disease: {item['Major_Pest']} (Disease: {item['Major_Disease']})\n"
        text += f"  Disease solution / cultural practices: {item['Disease_Solution']}\n"
        text += f"  Recommended pesticides: {', '.join(item['Recommended_Products'])}\n"
        text += f"  Dose guidance: {'; '.join(item['Dose_Instructions'])}\n"
        text += "  Safety: Wear PPE; do not spray before rain; follow label for pre-harvest interval.\n"
        formatted.append(text)
    return {"status":"ok", "recommendations": out, "text_blocks": formatted}

# Demo: print the advice blocks, or the explanatory message when the crop is
# unknown (a "not_found" result has no 'text_blocks' key).
res = recommend_pesticide_with_solution("Rice", district="Balasore")
if res.get("status") == "ok":
    for t in res['text_blocks']:
        print(t)
else:
    print(res.get("message", "No recommendations available."))


Pest/Disease: Leaf Folder (Disease: Bacterial Leaf Blight)
  Disease solution / cultural practices: Copper oxychloride; Streptocycline
  Recommended pesticides: Imidacloprid, Buprofezin
  Dose guidance: Follow product label
  Safety: Wear PPE; do not spray before rain; follow label for pre-harvest interval.

Pest/Disease: Brown Plant Hopper (Disease: Blast (Pyricularia grisea))
  Disease solution / cultural practices: Tricyclazole; Botanicals (Neem/Tulsi extracts)
  Recommended pesticides: Fipronil, Buprofezin, Flonicamid
  Dose guidance: Follow product label
  Safety: Wear PPE; do not spray before rain; follow label for pre-harvest interval.

Pest/Disease: Thrips (Disease: False Smut)
  Disease solution / cultural practices: Propiconazole; Carbendazim
  Recommended pesticides: Neem Oil, NSKE 5%
  Dose guidance: Follow product label
  Safety: Wear PPE; do not spray before rain; follow label for pre-harvest interval.



In [None]:
# Quick data-quality checks on the cleaned pesticide table.
crop_names = pest_df['Crop']
print("Unique crops:", crop_names.nunique(), crop_names.unique())

# Number of distinct districts recorded per crop.
counts = (
    pest_df.groupby('Crop')['District']
    .nunique()
    .sort_values(ascending=False)
)
print(counts.head(20))

missing_dose = pest_df['Dose_Instruction'].isna().sum()
print("Rows missing dose:", missing_dose, " — set to default 'Follow label' if many.")


Unique crops: 10 ['Rice' 'Tomato' 'Potato' 'Brinjal' 'Groundnut' 'Mustard' 'Maize' 'Cotton'
 'Sugarcane' 'Sunflower']
Crop
Brinjal      0
Cotton       0
Groundnut    0
Maize        0
Mustard      0
Potato       0
Rice         0
Sugarcane    0
Sunflower    0
Tomato       0
Name: District, dtype: int64
Rows missing dose: 0  — set to default 'Follow label' if many.


In [None]:
import pandas as pd
# Reload the cleaned pesticide table that `format_pesticide_advisory` reads.
pest_df = pd.read_csv("/content/pesticide_clean2.csv")

def format_pesticide_advisory(crop, top_k=5):
    """Render a plain-text pesticide advisory for one crop.

    Rows of ``pest_df`` for the crop (case-insensitive, trimmed match) are
    grouped by (disease, pest). Each group becomes one text block listing the
    distinct solutions, products and dose notes in sorted order. Iteration
    stops once ``top_k`` blocks have been collected. Blocks are joined with a
    newline; a message string is returned when the crop has no rows.
    """
    wanted = str(crop).strip().lower()
    crop_rows = pest_df[pest_df['Crop'].str.lower().str.strip() == wanted]
    if crop_rows.empty:
        return f"No pesticide data found for crop '{crop}'."

    def distinct(frame, column, separator):
        # Sorted, de-duplicated string values of one column, joined for display.
        return separator.join(sorted(set(frame[column].astype(str).tolist())))

    advisories = []
    for (disease_name, pest_name), pair_rows in crop_rows.groupby(['Major_Disease','Major_Pest']):
        product_text = distinct(pair_rows, 'Recommended_Pesticide', ", ")
        dose_text = distinct(pair_rows, 'Dose_Instruction', "; ")
        solution_text = distinct(pair_rows, 'Disease_Solution', "; ")
        lines = [
            f"Pest/Disease: {pest_name}  (Disease: {disease_name})",
            f"  Disease solution / cultural practices: {solution_text}",
            f"  Recommended pesticides: {product_text}",
            f"  Dose guidance: {dose_text}",
            "  Safety: Wear PPE; do not spray before or during rain; follow product label and pre-harvest interval.",
        ]
        advisories.append("\n".join(lines) + "\n")
        if len(advisories) >= top_k:
            break
    return "\n".join(advisories)

# Demo: print up to six advisory blocks for rice.
print(format_pesticide_advisory("Rice", top_k=6))


Pest/Disease: Leaf Folder  (Disease: Bacterial Leaf Blight)
  Disease solution / cultural practices: Copper oxychloride; Streptocycline
  Recommended pesticides: Buprofezin, Imidacloprid
  Dose guidance: Follow product label
  Safety: Wear PPE; do not spray before or during rain; follow product label and pre-harvest interval.

Pest/Disease: Brown Plant Hopper  (Disease: Blast (Pyricularia grisea))
  Disease solution / cultural practices: Tricyclazole; Botanicals (Neem/Tulsi extracts)
  Recommended pesticides: Buprofezin, Fipronil, Flonicamid
  Dose guidance: Follow product label
  Safety: Wear PPE; do not spray before or during rain; follow product label and pre-harvest interval.

Pest/Disease: Thrips  (Disease: False Smut)
  Disease solution / cultural practices: Propiconazole; Carbendazim
  Recommended pesticides: NSKE 5%, Neem Oil
  Dose guidance: Follow product label
  Safety: Wear PPE; do not spray before or during rain; follow product label and pre-harvest interval.

Pest/Disease

In [None]:
import joblib, json
# Export the artifacts needed to serve predictions outside the notebook.
# Prefer the grid-search winner when the tuning cell has run: this file name
# is the one the tuned estimator was saved under earlier, and dumping the
# untuned baseline here would silently overwrite it.
final_model = search.best_estimator_ if 'search' in globals() else models['ExtraTrees']
joblib.dump(final_model, "crop_yield_model.pkl")
# Save the columns the estimators were actually fitted on (equal to X.columns
# unless TOP_K_FEATURES restricted the training frame).
joblib.dump(list(top_features), "feature_cols.pkl")
joblib.dump(le_dict, "encoders.pkl")


['encoders.pkl']