In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import joblib
import os

# Work from the project directory so that relative paths used afterwards
# resolve against it.
project_dir = r"D:\Cropyieldintership"
os.chdir(project_dir)
print("Current working directory:", os.getcwd())

# Load the dataset, failing early with an explicit message when the CSV is
# not where it is expected.
data_path = r"D:\Cropyieldintership\data\crop_yield_predicted.csv"
if os.path.exists(data_path):
    df = pd.read_csv(data_path)
else:
    raise FileNotFoundError(f"File not found at {data_path}")

print("Dataset loaded successfully:", df.shape)

# Turn the categorical crop type into integer labels when the column exists.
# The fitted encoder stays available as `le`.
if 'Crop_Type' in df.columns:
    le = LabelEncoder()
    crop_type_codes = le.fit_transform(df['Crop_Type'])
    df['Crop_Type_encoded'] = crop_type_codes

# Feature columns: the five numeric inputs, plus the encoded crop type
# whenever that column is present in the frame.
X_cols = (
    ['Temperature', 'Humidity', 'Soil_Quality', 'NPK_Ratio', 'Fertility_Index']
    + (['Crop_Type_encoded'] if 'Crop_Type_encoded' in df.columns else [])
)

# Model inputs and regression target.
y = df['Crop_Yield']
X = df[X_cols]

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

def _regression_scores(y_true, y_pred):
    """Return ``(rmse, mae, r2)`` for one set of predictions."""
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    return rmse, mae, r2


# Candidate regressors, keyed by the display name used in the report.
# Dict order is the training order and decides ties during selection.
models = {
    "Random Forest": RandomForestRegressor(
        n_estimators=200,
        max_depth=10,
        random_state=42,
    ),
    "XGBoost": XGBRegressor(
        n_estimators=200,
        max_depth=5,
        learning_rate=0.1,
        random_state=42,
    ),
    "Linear Regression": LinearRegression(),
}

# Running record of the best candidate seen so far (highest test R2).
best_r2, best_model_name, best_model = -np.inf, None, None

# Fit every candidate, report train/test metrics and remember the winner.
# NOTE(review): the winner is picked on the same test split whose score is
# reported, so that score is likely optimistic -- consider a separate
# validation split if an unbiased estimate is needed.
for name, model in models.items():
    model.fit(X_train, y_train)

    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    train_rmse, train_mae, train_r2 = _regression_scores(y_train, y_train_pred)
    test_rmse, test_mae, test_r2 = _regression_scores(y_test, y_test_pred)

    # "Accuracy" here is simply R2 expressed as a percentage.
    train_acc = train_r2 * 100
    test_acc = test_r2 * 100

    print(f"\n{name} Performance:")
    print(f"Training -> RMSE: {train_rmse:.2f}, MAE: {train_mae:.2f}, R2: {train_r2:.2f}, Accuracy: {train_acc:.2f}%")
    print(f"Testing  -> RMSE: {test_rmse:.2f}, MAE: {test_mae:.2f}, R2: {test_r2:.2f}, Accuracy: {test_acc:.2f}%")

    # Strict comparison: on equal test R2 the earlier candidate is kept.
    if test_r2 > best_r2:
        best_r2, best_model_name, best_model = test_r2, name, model

# Re-score the winning model on both splits. The loop variables hold the
# values of the last candidate trained, not necessarily the winner, so the
# predictions and scores are recomputed here.
y_train_pred = best_model.predict(X_train)
train_r2 = r2_score(y_train, y_train_pred)
train_acc = train_r2 * 100

y_test_pred = best_model.predict(X_test)
test_r2 = r2_score(y_test, y_test_pred)
test_acc = test_r2 * 100

# Summary of the selected model ("Accuracy" is R2 as a percentage).
print(
    f"\n✅ Selected Model: {best_model_name}",
    f"Training R2: {train_r2:.2f}, Training Accuracy: {train_acc:.2f}%",
    f"Testing R2: {test_r2:.2f}, Testing Accuracy: {test_acc:.2f}%",
    sep="\n",
)

# Save the selected model (path is relative to the current working directory).
os.makedirs("model", exist_ok=True)
joblib.dump(best_model, "model/crop_yield_model.pkl")
print("Model saved as 'model/crop_yield_model.pkl'")

# Also persist the fitted crop-type encoder. The model was trained on the
# integer codes it produced, so anything loading the model later needs the
# same mapping to turn a crop name into the 'Crop_Type_encoded' feature.
# `le` only exists when the dataset has a 'Crop_Type' column, hence the guard.
if 'Crop_Type' in df.columns:
    joblib.dump(le, "model/crop_type_encoder.pkl")
    print("Encoder saved as 'model/crop_type_encoder.pkl'")


Current working directory: D:\Cropyieldintership
Dataset loaded successfully: (36520, 7)

Random Forest Performance:
Training -> RMSE: 4.50, MAE: 2.56, R2: 0.97, Accuracy: 96.94%
Testing  -> RMSE: 4.71, MAE: 2.77, R2: 0.97, Accuracy: 96.67%

XGBoost Performance:
Training -> RMSE: 3.65, MAE: 2.25, R2: 0.98, Accuracy: 97.99%
Testing  -> RMSE: 3.95, MAE: 2.44, R2: 0.98, Accuracy: 97.66%

Linear Regression Performance:
Training -> RMSE: 15.30, MAE: 11.89, R2: 0.65, Accuracy: 64.62%
Testing  -> RMSE: 15.38, MAE: 11.99, R2: 0.65, Accuracy: 64.52%

✅ Selected Model: XGBoost
Training R2: 0.98, Training Accuracy: 97.99%
Testing R2: 0.98, Testing Accuracy: 97.66%
Model saved as 'model/crop_yield_model.pkl'
