
# Crop Yield Prediction — Colab Notebook
This notebook trains **Linear Regression**, **Random Forest**, and **XGBoost** models on a crop yield dataset, reports **R²** and **RMSE** for each on a held-out test split, and saves the best-scoring model. It mirrors the steps in the accompanying PDF.

> **How to use:** Open this notebook in **Google Colab** and run each cell from top to bottom.


In [None]:

# Install dependencies (Colab-friendly)
!pip install -q xgboost scikit-learn pandas matplotlib joblib pytest


In [None]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
import joblib

print("Libraries imported.")


In [None]:

# Load dataset
# Primary: public dataset (YBI Foundation), read as a raw CSV over HTTPS.
# Fallback: generate a small synthetic dataset if loading from the URL fails
# for any reason (the loader below catches every Exception).

# "%20" is the URL-encoded space in the file name "Crop Yield.csv".
DATA_URL = "https://raw.githubusercontent.com/ybifoundation/Dataset/main/Crop%20Yield.csv"

def _synthetic_crop_data(n, seed):
    """Build a reproducible toy crop-yield frame with ``n`` rows.

    The draws are made in a fixed order (Temperature, Rainfall, Soil_Type,
    Fertilizer, Pesticide, then the noise term) so a given seed always yields
    the same frame.
    """
    rng = np.random.default_rng(seed)
    df = pd.DataFrame({
        "Temperature": rng.normal(26, 4, n),
        "Rainfall": rng.normal(900, 150, n),
        "Soil_Type": rng.choice(["Sandy", "Loam", "Clay"], size=n),
        "Fertilizer": rng.normal(80, 15, n),
        "Pesticide": rng.normal(5, 1.5, n),
    })
    # Additive per-soil offset, then a linear combination of the numeric
    # features plus Gaussian noise (toy relation, for demo purposes only).
    soil_bonus = df["Soil_Type"].map({"Sandy": -2, "Loam": 3, "Clay": 1}).astype(float)
    df["Crop_Yield"] = (
        0.5 * df["Temperature"]
        + 0.01 * df["Rainfall"]
        + 0.2 * df["Fertilizer"]
        - 0.1 * df["Pesticide"]
        + soil_bonus
        + rng.normal(0, 2.5, n)
    )
    return df


def load_data(url=None, n_synthetic=300, seed=42):
    """Load the crop-yield data, falling back to synthetic data on failure.

    Parameters
    ----------
    url : str, optional
        CSV location handed to ``pd.read_csv``. When omitted, the module-level
        ``DATA_URL`` is used (looked up at call time).
    n_synthetic : int, default 300
        Number of rows to generate if the fallback is used.
    seed : int, default 42
        Seed for the fallback's random generator.

    Returns
    -------
    pandas.DataFrame
        The parsed CSV, or - if reading fails for any reason - a synthetic
        frame with the columns Temperature, Rainfall, Soil_Type, Fertilizer,
        Pesticide and Crop_Yield.
    """
    source = DATA_URL if url is None else url
    try:
        df = pd.read_csv(source)
        print("Loaded dataset from URL:", source)
        return df
    except Exception as e:
        # Deliberately broad: any failure (network, HTTP, parsing) should still
        # leave the notebook with data to run on. The reason is printed rather
        # than swallowed so the fallback is never silent.
        print("Failed to load remote dataset. Reason:", e)
        print("Falling back to a small synthetic dataset (for demo).")
        return _synthetic_crop_data(n_synthetic, seed)

# Load the data once; the cells below read this frame (and label-encode its
# text columns in place).
df = load_data()
# Bare last expression: shows the first rows as the cell's rich output.
df.head()


In [None]:

# Encode categorical variables and split
TARGET_COL = 'Crop_Yield'

# Fail early with a readable message instead of an opaque error from `drop`.
if TARGET_COL not in df.columns:
    raise KeyError(
        f"Target column {TARGET_COL!r} not found; available columns: {list(df.columns)}"
    )

# Text-like columns: plain object columns, pandas' dedicated string dtype and
# categoricals. An object-only filter skips the latter two, which would then
# reach the models un-encoded.
categorical_cols = df.select_dtypes(include=['object', 'string', 'category']).columns

# One fitted encoder per column, so every column's label <-> integer mapping
# stays available (e.g. via `label_encoders[col].classes_`). Reusing a single
# encoder keeps only the mapping of the last column it was fitted on.
# NOTE: re-running this cell on an already-encoded `df` finds no text columns,
# so `label_encoders` comes back empty; re-run the loading cell first.
label_encoders = {}
le = LabelEncoder()  # keeps the name `le` defined even when nothing needs encoding
for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

X = df.drop(columns=[TARGET_COL])
y = df[TARGET_COL]

# 80/20 hold-out split; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train.shape, X_test.shape


In [None]:

# Candidate regressors, keyed by the label used in the printed report.
xgb_params = dict(
    n_estimators=300,
    max_depth=6,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    n_jobs=-1,
)
models = {
    "Linear Regression": LinearRegression(),
    "Random Forest": RandomForestRegressor(n_estimators=200, random_state=42),
    "XGBoost": XGBRegressor(**xgb_params),
}

# `results` collects one (name, R², RMSE) tuple per model; the best_* names
# track the model with the highest test-set R² seen so far.
results = []
best_name = None
best_score = -np.inf
best_model = None

for name in models:
    model = models[name]
    # fit() returns the estimator itself, so fitting and predicting can be chained.
    y_pred = model.fit(X_train, y_train).predict(X_test)
    r2 = r2_score(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    results.append((name, r2, rmse))
    print(f"\n{name}", f"R² Score: {r2:.4f}", f"RMSE: {rmse:.4f}", sep="\n")
    # Strictly-greater comparison: on a tie the earlier model is kept.
    if not (r2 > best_score):
        continue
    best_score = r2
    best_name = name
    best_model = model

print(f"\nBest model: {best_name} (R²={best_score:.4f})")


In [None]:

# Save best model
# The relative path means the file is written to the current working directory.
# NOTE: joblib files are pickle-based - only load them back from sources you trust.
joblib.dump(best_model, "model.pkl")
print("Saved best model to model.pkl")

# Simple feature importance/coefficients plot (if available)
def plot_importances(model, feature_names):
    """Bar-plot a fitted model's feature importances or absolute coefficients.

    Parameters
    ----------
    model : object
        Fitted estimator. ``feature_importances_`` is used when present;
        otherwise the absolute values of ``coef_`` (averaged over the first
        axis when ``coef_`` has more than one dimension).
    feature_names : sequence of str
        One label per feature, in the same order as the model's features.

    Returns
    -------
    None
        The figure is shown as a side effect. If the model exposes neither
        attribute, a message is printed and nothing is plotted.

    Raises
    ------
    ValueError
        If the number of names does not match the number of values.
    """
    if hasattr(model, "feature_importances_"):
        values = np.atleast_1d(np.asarray(model.feature_importances_, dtype=float))
        title = "Feature Importance"
    elif hasattr(model, "coef_"):
        coef = np.abs(np.atleast_1d(np.asarray(model.coef_, dtype=float)))
        values = coef if coef.ndim == 1 else coef.mean(axis=0)
        title = "Absolute Coefficients"
    else:
        # Nothing to plot is not an error: report it and return normally.
        # Raising SystemExit here would abort the whole cell (or interpreter).
        print("This model doesn't expose importances/coefficients.")
        return

    if len(feature_names) != len(values):
        raise ValueError(
            f"Got {len(feature_names)} feature names for {len(values)} values."
        )

    # Largest value first.
    order = np.argsort(values)[::-1]
    positions = np.arange(len(values))
    fig, ax = plt.subplots(figsize=(6, 4))
    ax.bar(positions, values[order])
    ax.set_xticks(positions)
    ax.set_xticklabels(np.asarray(feature_names)[order], rotation=45, ha="right")
    ax.set_title(title)
    fig.tight_layout()
    plt.show()

# Plot the selected model, labelling the bars with the training-frame column names.
plot_importances(best_model, X_train.columns.tolist())


In [None]:

# Quick 'tests' inline (simulate CI checks)
# Written as explicit if/raise rather than `assert`, so the checks still run
# when Python is started with -O (which strips assert statements).
null_count = int(df.isnull().sum().sum())
if null_count != 0:
    raise AssertionError("Data contains nulls!")
print("Test passed: No null values in dataset.")

# Check minimum performance (soft check: a low score only prints a warning and
# never stops the notebook, whichever dataset was loaded).
# On the real dataset, this should be reasonably high; synthetic may vary
MIN_R2 = 0.70
if best_score > MIN_R2:
    print("Test passed: R² above threshold.")
else:
    print("Warning:", f"R² too low: {best_score:.2f}")



### Notes
- When you run this in Colab, the dataset will be fetched from the public URL.  
- If the URL is temporarily unavailable, the notebook will **fall back to a synthetic dataset** so you can still see outputs.
- The trained best model is saved as `model.pkl` in the Colab runtime.
