In [None]:
!pip install pandas numpy scikit-learn faker joblib matplotlib seaborn streamlit

import pandas as pd
import numpy as np
import random
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.metrics import (
    classification_report, confusion_matrix, accuracy_score,
    mean_squared_error, r2_score
)

import joblib, json, math


In [None]:
# --- Synthetic dataset generation -------------------------------------------
# Builds a small synthetic farm table, saves it to CSV and previews it.
# Generation is driven by a seeded random.Random instance so that a
# Restart & Run All reproduces the same rows (and therefore the same metrics).
DATA_SYNTHETIC = Path("data/agriculture_suitability.csv")
# parents=True lets the output path be nested without a separate mkdir.
DATA_SYNTHETIC.parent.mkdir(parents=True, exist_ok=True)

SEED = 42
n_rows = 200
crop_types = ["Wheat", "Rice", "Maize", "Cotton", "Sugarcane"]
soil_types = ["Loamy", "Sandy", "Clay", "Alluvial", "Black"]
irrigation_types = ["Canal", "Tube-well", "Rain-fed", "Drip"]
seasons = ["Kharif", "Rabi", "Zaid"]

# Per-crop multiplier applied to the farm area to obtain the simulated yield.
# Defined once here instead of being rebuilt on every loop iteration.
base_yields = {"Wheat": 3.0, "Rice": 4.5, "Maize": 3.8, "Cotton": 2.5, "Sugarcane": 6.0}
# Soil types that can satisfy the suitability rule.
suitable_soils = ["Loamy", "Alluvial", "Black"]

columns = [
    "Farm_ID", "Crop_Type", "Farm_Area_acres", "Irrigation_Type",
    "Fertilizer_Used_tons", "Pesticide_Used_kg", "Yield_tons",
    "Soil_Type", "Season", "Water_Usage_cubic_meters", "Suitability"
]


def generate_farm_records(n_rows, seed=SEED):
    """Generate synthetic farm rows in the order given by ``columns``.

    Parameters
    ----------
    n_rows : int
        Number of farms to simulate; IDs run from FARM_0001 upwards.
    seed : int, optional
        Seed for the private random generator. The same seed always yields
        the same records.

    Returns
    -------
    list[list]
        One list per farm, matching ``columns`` positionally.
    """
    # A private generator keeps the notebook's global `random` state untouched.
    rng = random.Random(seed)
    records = []
    for i in range(1, n_rows + 1):
        crop = rng.choice(crop_types)
        soil = rng.choice(soil_types)
        irrigation = rng.choice(irrigation_types)
        season = rng.choice(seasons)

        farm_area = round(rng.uniform(1, 50), 2)
        fertilizer = round(rng.uniform(0.1, 5.0), 2)
        pesticide = round(rng.uniform(0.5, 20.0), 2)
        water_usage = round(rng.uniform(100, 10000), 2)

        # Yield scales with area, with +/-40% random variation around the base.
        yield_tons = round(base_yields[crop] * farm_area * rng.uniform(0.6, 1.4), 2)

        # Rule-based label: all four conditions must hold.
        # farm_area is drawn from [1, 50], so the division is always safe.
        suitable = (
            (soil in suitable_soils)
            and (water_usage > 500)
            and (fertilizer >= 0.5)
            and (yield_tons / farm_area > 2.0)
        )
        suitability = "Suitable" if suitable else "Not Suitable"

        records.append([
            f"FARM_{i:04d}", crop, farm_area, irrigation, fertilizer, pesticide,
            yield_tons, soil, season, water_usage, suitability
        ])
    return records


data = generate_farm_records(n_rows)

df = pd.DataFrame(data, columns=columns)
df.to_csv(DATA_SYNTHETIC, index=False)
df.head()

In [None]:
# --- Model training ---------------------------------------------------------
# Trains a suitability classifier and a yield regressor on the saved dataset,
# then persists both pipelines plus the feature lists used to build them.
df = pd.read_csv(DATA_SYNTHETIC)

# Features exclude the row identifier and both prediction targets.
X = df.drop(columns=["Farm_ID", "Suitability", "Yield_tons"])
y_clf = df["Suitability"].map({"Not Suitable": 0, "Suitable": 1})
y_reg = df["Yield_tons"]

numeric_cols = ["Farm_Area_acres", "Fertilizer_Used_tons", "Pesticide_Used_kg", "Water_Usage_cubic_meters"]
categorical_cols = ["Crop_Type", "Irrigation_Type", "Soil_Type", "Season"]


def make_preprocessor():
    """Return a new, unfitted ColumnTransformer for the feature columns.

    Numeric columns are standardised and categorical columns are one-hot
    encoded (unknown categories are ignored at predict time).

    A fresh instance is built on every call so that each pipeline owns its
    own preprocessing state: Pipeline.fit fits its steps in place, so sharing
    one transformer object between two pipelines would let the second fit
    overwrite what the first pipeline relies on.
    """
    return ColumnTransformer(
        transformers=[
            ("num", StandardScaler(), numeric_cols),
            ("cat", OneHotEncoder(handle_unknown="ignore", sparse_output=False), categorical_cols),
        ])


clf_pipeline = Pipeline([
    ("preprocessor", make_preprocessor()),
    ("classifier", LogisticRegression(max_iter=500))
])

reg_pipeline = Pipeline([
    ("preprocessor", make_preprocessor()),
    ("regressor", LinearRegression())
])

# A single split keeps features and both targets row-aligned.
X_train, X_test, yc_train, yc_test, yr_train, yr_test = train_test_split(
    X, y_clf, y_reg, test_size=0.2, random_state=42
)

clf_pipeline.fit(X_train, yc_train)
reg_pipeline.fit(X_train, yr_train)

# Kept for any later cell that inspects the fitted preprocessing step.
preprocessor = clf_pipeline.named_steps["preprocessor"]

Path("models").mkdir(exist_ok=True)
joblib.dump(clf_pipeline, "models/suitability_pipeline.joblib")
joblib.dump(reg_pipeline, "models/yield_pipeline.joblib")
# The feature lists are saved so a consumer can select and order input columns.
with open("models/metadata.json", "w", encoding="utf-8") as f:
    json.dump({"numeric_features": numeric_cols, "categorical_features": categorical_cols}, f, indent=2)

print("✅ Models trained and saved.")


In [None]:
# Classification evaluation
y_pred_clf = clf_pipeline.predict(X_test)
print("Classification Accuracy:", accuracy_score(yc_test, y_pred_clf))
print(classification_report(yc_test, y_pred_clf))

# Pin the label order so the matrix is always 2x2 and lines up with the tick
# labels, even if one class is absent from the test split and predictions.
class_names = ["Not Suitable", "Suitable"]
cm = confusion_matrix(yc_test, y_pred_clf, labels=[0, 1])

fig, ax = plt.subplots()
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues",
            xticklabels=class_names,
            yticklabels=class_names,
            ax=ax)
# confusion_matrix puts true labels on rows and predictions on columns.
ax.set(title="Confusion Matrix", xlabel="Predicted", ylabel="Actual")
plt.show()

# Regression evaluation
y_pred_reg = reg_pipeline.predict(X_test)
rmse = math.sqrt(mean_squared_error(yr_test, y_pred_reg))
r2 = r2_score(yr_test, y_pred_reg)
print(f"Regression RMSE: {rmse:.2f}")
print(f"Regression R²: {r2:.2f}")


In [None]:
# Spot-check both fitted pipelines on five held-out rows (fixed draw).
sample = X_test.sample(n=5, random_state=1)
pred_suit = clf_pipeline.predict(sample)
pred_yield = reg_pipeline.predict(sample)

# Translate the 0/1 class predictions back into readable labels.
suitability_labels = []
for flag in pred_suit:
    suitability_labels.append("Suitable" if flag == 1 else "Not Suitable")

# assign() returns a new frame, so the sampled rows themselves stay untouched.
sample_out = sample.assign(
    Predicted_Suitability=suitability_labels,
    Predicted_Yield=pred_yield,
)
sample_out


In [None]:
# Source of the Streamlit front-end, written to app.py below.
# The app loads the saved pipelines and metadata from models/, accepts a CSV
# upload, renames known alternative column headers, and shows/downloads the
# predicted suitability and yield.
# The artifact loader is cached with the resource cache rather than the data
# cache: fitted models are shared objects that should be loaded once and
# reused, not serialised and copied on every call.
app_code = """
import streamlit as st
import pandas as pd
import joblib
from pathlib import Path
import json

MODELS_DIR = Path("models")
SUIT_MODEL = MODELS_DIR / "suitability_pipeline.joblib"
YIELD_MODEL = MODELS_DIR / "yield_pipeline.joblib"
META_PATH = MODELS_DIR / "metadata.json"

st.set_page_config(page_title="Agriculture Suitability & Yield Predictor", layout="wide")

COLUMN_MAP = {
    "Farm_Area(acres)": "Farm_Area_acres",
    "Fertilizer_Used(tons)": "Fertilizer_Used_tons",
    "Pesticide_Used(kg)": "Pesticide_Used_kg",
    "Water_Usage(cubic meters)": "Water_Usage_cubic_meters",
}

@st.cache_resource
def load_artifacts():
    if not SUIT_MODEL.exists() or not YIELD_MODEL.exists() or not META_PATH.exists():
        raise FileNotFoundError("Model artifacts not found. Run training first.")
    clf = joblib.load(SUIT_MODEL)
    reg = joblib.load(YIELD_MODEL)
    meta = json.loads(META_PATH.read_text())
    return clf, reg, meta

def predict(df):
    clf, reg, meta = load_artifacts()
    df = df.rename(columns=COLUMN_MAP)
    features = meta["numeric_features"] + meta["categorical_features"]
    missing = [c for c in features if c not in df.columns]
    if missing:
        st.error(f"Missing required columns: {missing}")
        return None
    X = df[features]
    suit_pred = clf.predict(X)
    yield_pred = reg.predict(X)
    df_out = df.copy()
    df_out["Predicted_Suitability"] = ["Suitable" if p == 1 else "Not Suitable" for p in suit_pred]
    df_out["Predicted_Yield_tons"] = yield_pred
    return df_out

def main():
    st.title("🌾 Agriculture Suitability & Yield Predictor")
    st.markdown("Upload a CSV file with the same columns as training data.")

    try:
        clf, reg, meta = load_artifacts()
        st.sidebar.write("Detected features:")
        st.sidebar.write("Numeric:", meta["numeric_features"])
        st.sidebar.write("Categorical:", meta["categorical_features"])
    except Exception as e:
        st.sidebar.error(str(e))
        return

    uploaded = st.file_uploader("Upload CSV", type=["csv"])
    if uploaded:
        df = pd.read_csv(uploaded)
        st.subheader("Preview")
        st.dataframe(df.head(10))
        if st.button("Run Predictions"):
            df_out = predict(df)
            if df_out is not None:
                st.subheader("Predictions")
                st.dataframe(df_out.head(20))
                csv = df_out.to_csv(index=False).encode("utf-8")
                st.download_button("Download predictions CSV", csv, file_name="predictions.csv")

if __name__ == "__main__":
    main()
"""

# Explicit UTF-8 because the source contains non-ASCII characters (emoji).
with open("app.py", "w", encoding="utf-8") as f:
    f.write(app_code)

print("✅ app.py file created. Run it from terminal using: streamlit run app.py")
