In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, r2_score
import joblib

# Train a gradient-boosting regressor that predicts crop yield from the
# categorical and numerical columns of dataset.csv, report hold-out metrics,
# and persist the model together with its fitted preprocessors.

# Load dataset
df = pd.read_csv("dataset.csv")

# Selecting all relevant columns for prediction
categorical_features = ["Crop", "Sowing Month", "Harvest Month", "State", "District"]
# NOTE(review): "Cultivation Area (ha)" and "Production Volume (tons)" may
# jointly determine the target (tons / ha). If so, the model is largely
# learning a division and the near-perfect score is target leakage; confirm
# these columns are actually available at prediction time before relying on it.
numerical_features = [
    "Rainfall (mm)",
    "Mean Temp (°C)",
    "Humidity (%)",
    "Soil pH",
    "Nitrogen (kg/ha)",
    "Phosphorus (kg/ha)",
    "Potassium (kg/ha)",
    "Market Price (₹/kg)",
    "Cultivation Area (ha)",
    "Production Volume (tons)",
]
target = "Crop Yield (tons/ha)"

# Split the rows BEFORE fitting any preprocessing. Fitting the encoder/scaler
# on the full dataset lets statistics of the test rows (means, variances,
# category vocabulary) leak into training and inflates the reported metrics.
# Same test_size/random_state as before, so the row partition is unchanged.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Handling categorical data (vocabulary learned from training rows only;
# categories seen only in the test rows encode as all zeros via 'ignore')
encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
encoder.fit(train_df[categorical_features])
feature_names = encoder.get_feature_names_out(categorical_features)
X_cat = encoder.transform(df[categorical_features])
# Carry df's index explicitly so the frames stay row-aligned with y.
X_cat_df = pd.DataFrame(X_cat, columns=feature_names, index=df.index)

# Handling numerical data (mean/std learned from training rows only)
scaler = StandardScaler()
scaler.fit(train_df[numerical_features])
X_num = scaler.transform(df[numerical_features])
X_num_df = pd.DataFrame(X_num, columns=numerical_features, index=df.index)

# Combining categorical and numerical features
X = pd.concat([X_cat_df, X_num_df], axis=1)
y = df[target]

# Splitting data (select the rows chosen by the split above)
X_train, X_test = X.loc[train_df.index], X.loc[test_df.index]
y_train, y_test = y.loc[train_df.index], y.loc[test_df.index]

# Train model
model = GradientBoostingRegressor(n_estimators=300, learning_rate=0.05, max_depth=7, random_state=42)
model.fit(X_train, y_train)

# Evaluate model on the held-out rows
y_pred = model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"Mean Absolute Error: {mae}")
print(f"R² Score: {r2}")

# Save model & encoders (inference must apply the encoder and scaler to raw
# inputs and concatenate categorical-then-numerical, as done for X above)
joblib.dump(model, "crop_yield_model.pkl")
joblib.dump(encoder, "onehot_encoder.pkl")
joblib.dump(scaler, "scaler.pkl")
print("Model saved successfully!")


Mean Absolute Error: 0.017553575996291766
R² Score: 0.9996975726784321
Model saved successfully!
