In [1]:
import os
import pickle
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import mean_squared_error, r2_score

# Models
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, StackingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor


In [2]:
! pip install six




[notice] A new release of pip is available: 23.0.1 -> 25.1.1
[notice] To update, run: C:\Users\majji\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [3]:
# Sanity check: confirm that `six` can be imported by this kernel and show its version.
import six
print(six.__version__)

1.17.0


In [4]:
import sys
# Run pip through the interpreter that is executing this notebook (sys.executable),
# so `kaggle` and `six` are installed for this kernel's Python.
!{sys.executable} -m pip install kaggle six





[notice] A new release of pip is available: 23.0.1 -> 25.1.1
[notice] To update, run: C:\Users\majji\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [5]:
# Environment diagnostics: version of the `python` found on PATH, every python
# executable the shell can locate (`where` is the Windows lookup command), and
# the metadata of the installed `six` package.
!python --version
!where python
!pip show six

Python 3.10.11
c:\Users\majji\AppData\Local\Microsoft\WindowsApps\python.exe
C:\Users\majji\AppData\Local\Programs\Python\Python311\python.exe
Name: six
Version: 1.17.0
Summary: Python 2 and 3 compatibility utilities
Home-page: https://github.com/benjaminp/six
Author: Benjamin Peterson
Author-email: benjamin@python.org
License: MIT
Location: c:\users\majji\appdata\local\packages\pythonsoftwarefoundation.python.3.10_qbz5n2kfra8p0\localcache\local-packages\python310\site-packages
Requires: 
Required-by: catboost, kaggle, python-dateutil


In [6]:
from kaggle.api.kaggle_api_extended import KaggleApi
import os
import zipfile

# Create the Kaggle client and authenticate it
api = KaggleApi()
api.authenticate()

# Download folder and the path where the dataset archive is expected to land
dataset_path = "data/"
zip_path = os.path.join(dataset_path, "agriculture-crop-yield.zip")

# The download target must exist before anything is written into it
os.makedirs(dataset_path, exist_ok=True)

# Fetch the archive only; extraction is handled explicitly below
print("📦 Downloading from Kaggle...")
api.dataset_download_files(
    'samuelotiattakorah/agriculture-crop-yield',
    path=dataset_path,
    unzip=False,
)

# Extract the archive if it arrived, then remove it to save disk space
if not os.path.exists(zip_path):
    print("❌ ZIP file not found after download.")
else:
    print("📂 Unzipping downloaded file...")
    with zipfile.ZipFile(zip_path, 'r') as archive:
        archive.extractall(dataset_path)
    print("✅ Unzipped successfully!")

    os.remove(zip_path)


📦 Downloading from Kaggle...
Dataset URL: https://www.kaggle.com/datasets/samuelotiattakorah/agriculture-crop-yield
📂 Unzipping downloaded file...
✅ Unzipped successfully!


In [7]:
# Load dataset
# Raw string: keeps the Windows backslashes literal instead of relying on
# invalid escape sequences such as "\p" or "\M" (a SyntaxWarning on Python 3.12+).
# The resulting path is identical to the original one.
# NOTE(review): this absolute path only exists on the author's machine; the
# download cell extracts into the relative "data/" folder — consider reading
# from there instead once the extracted file name is confirmed.
df = pd.read_csv(r"D:\project\ML\Crop_Yield_Prediction\data\Crop_Yield.csv")

In [8]:
# Column groups that receive dedicated preprocessing
numerical_cols = ["Rainfall_mm", "Temperature_Celsius", "Days_to_Harvest"]
categorical_cols = ["Region", "Soil_Type", "Crop", "Weather_Condition"]

# Standardise the numeric features in place; the fitted scaler is kept so the
# same transformation can be applied to new records later.
# NOTE(review): the scaler is fitted on the full frame before the train/test
# split, so test rows contribute to the scaling statistics.
scaler = StandardScaler()
df[numerical_cols] = scaler.fit_transform(df[numerical_cols])

# One LabelEncoder per categorical column, stored under the column's name
label_encoders = {col: LabelEncoder() for col in categorical_cols}
for col, le in label_encoders.items():
    df[col] = le.fit_transform(df[col])

# Turn the two flag columns into integers
for col in ("Fertilizer_Used", "Irrigation_Used"):
    df[col] = df[col].astype(int)

# Optional: ask pandas to downcast the 64-bit numeric columns to reduce memory usage
for col in df.select_dtypes(include=["float64", "int64"]).columns:
    df[col] = pd.to_numeric(df[col], downcast="float")


In [9]:
# Persist the fitted preprocessing objects so they can be reloaded for prediction
os.makedirs("data", exist_ok=True)
for pkl_path, fitted_obj in (
    ("data/label_encoders.pkl", label_encoders),
    ("data/scaler.pkl", scaler),
):
    with open(pkl_path, "wb") as f:
        pickle.dump(fitted_obj, f)


In [10]:
# Target is the yield column; every remaining column is used as a feature
y = df["Yield_tons_per_hectare"]
X = df.drop(columns=["Yield_tons_per_hectare"])

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [12]:
# Candidate regressors, keyed by the name used in the printed report and in the
# pickle file name under models/.
models = {
    "Linear": LinearRegression(),
    "Ridge": Ridge(),
    "GradientBoosting": GradientBoostingRegressor(),
    "XGBoost": XGBRegressor(),
    "LightGBM": LGBMRegressor(),
    "CatBoost": CatBoostRegressor(verbose=0)
}

os.makedirs("models", exist_ok=True)

# Train and evaluate each model on the held-out split, then persist it
for name, model in models.items():
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    r2 = r2_score(y_test, preds)
    rmse = np.sqrt(mean_squared_error(y_test, preds))
    print(f"{name:15} R²: {r2:.4f} | RMSE: {rmse:.4f}")
    # Context manager guarantees the pickle is flushed and the handle closed,
    # even if serialisation fails part-way through.
    with open(f"models/{name}.pkl", "wb") as f:
        pickle.dump(model, f)

Linear          R²: 0.9130 | RMSE: 0.5008
Ridge           R²: 0.9130 | RMSE: 0.5008
GradientBoosting R²: 0.9125 | RMSE: 0.5024
XGBoost         R²: 0.9124 | RMSE: 0.5025
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006288 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 624
[LightGBM] [Info] Number of data points in the train set: 800000, number of used features: 9
[LightGBM] [Info] Start training from score 4.649019
LightGBM        R²: 0.9128 | RMSE: 0.5013
CatBoost        R²: 0.9128 | RMSE: 0.5013


In [13]:
def predict_yield_selected_models(input_data):
    """Predict crop yield for a single record with each saved model.

    The fitted label encoders and scaler are loaded from ``data/`` and applied
    to the record; every model pickle listed below is then loaded from
    ``models/`` and asked for a prediction.

    Args:
        input_data: Mapping of feature name to raw value for one record. It
            must contain every name in ``feature_order``; keys may be supplied
            in any order and unrecognised keys are ignored.

    Returns:
        dict: model name (pickle file name without ``.pkl``) mapped to the
        first element of that model's ``predict`` output.

    Raises:
        KeyError: if a required feature is missing from ``input_data``.
        FileNotFoundError: if a preprocessing or model pickle is absent.
    """
    import pickle
    import pandas as pd
    import os

    categorical_cols = ["Region", "Soil_Type", "Crop", "Weather_Condition"]
    numerical_cols = ["Rainfall_mm", "Temperature_Celsius", "Days_to_Harvest"]
    flag_cols = ["Fertilizer_Used", "Irrigation_Used"]
    # Column order handed to the models. NOTE(review): assumed to match the
    # column order of the training frame (it is the order used by the example
    # call in this notebook) — confirm if the training columns ever change.
    feature_order = [
        "Region",
        "Soil_Type",
        "Crop",
        "Rainfall_mm",
        "Temperature_Celsius",
        "Fertilizer_Used",
        "Irrigation_Used",
        "Weather_Condition",
        "Days_to_Harvest",
    ]

    # Fail early with a clear message instead of a bare column lookup error.
    missing = [name for name in feature_order if name not in input_data]
    if missing:
        raise KeyError(f"input_data is missing required feature(s): {missing}")

    # Load preprocessing objects.
    # SECURITY: pickle.load can execute arbitrary code; only load files that
    # were written by this notebook.
    with open("data/label_encoders.pkl", "rb") as f:
        label_encoders = pickle.load(f)
    with open("data/scaler.pkl", "rb") as f:
        scaler = pickle.load(f)

    # One-row frame with a fixed column order, independent of the key order
    # of ``input_data``.
    input_df = pd.DataFrame([input_data], columns=feature_order)

    # Apply the same preprocessing that was fitted on the training data
    for col in categorical_cols:
        input_df[col] = label_encoders[col].transform(input_df[col])

    input_df[numerical_cols] = scaler.transform(input_df[numerical_cols])

    # astype(int) converts the flag columns element-wise; calling int() on a
    # whole Series is deprecated in pandas and emits a FutureWarning.
    for col in flag_cols:
        input_df[col] = input_df[col].astype(int)

    # Every model pickled by the training cell
    model_files = [
        "Linear.pkl",
        "Ridge.pkl",
        "GradientBoosting.pkl",
        "LightGBM.pkl",
        "CatBoost.pkl",
        "XGBoost.pkl"
    ]

    predictions = {}

    for model_file in model_files:
        model_name = os.path.splitext(model_file)[0]
        with open(os.path.join("models", model_file), "rb") as f:
            model = pickle.load(f)
        # The file is closed before predicting; only the loaded model is needed.
        predictions[model_name] = model.predict(input_df)[0]

    return predictions


In [14]:
if __name__ == "__main__":
    # One raw (un-encoded, un-scaled) record to run through every saved model
    example_data = {
        "Region": "West",
        "Soil_Type": "Sandy",
        "Crop": "Cotton",
        "Rainfall_mm": 897.08,
        "Temperature_Celsius": 27.68,
        "Fertilizer_Used": False,
        "Irrigation_Used": True,
        "Weather_Condition": "Cloudy",
        "Days_to_Harvest": 122,
    }

    predictions = predict_yield_selected_models(example_data)

    print("\n🌾 Crop Yield Predictions (tons/ha):")
    # The loop variable is `model_name`, not `model`, so this cell does not
    # overwrite the notebook-level `model` estimator left by the training cell.
    for model_name, yield_value in predictions.items():
        print(f"{model_name:>20}: {yield_value:.5f}")



🌾 Crop Yield Predictions (tons/ha):
              Linear: 6.23728
               Ridge: 6.23728
    GradientBoosting: 6.23112
            LightGBM: 6.26000
            CatBoost: 6.22566
             XGBoost: 6.14534


  input_df["Fertilizer_Used"] = int(input_df["Fertilizer_Used"])
  input_df["Irrigation_Used"] = int(input_df["Irrigation_Used"])
