In [1]:
# This installs the known compatible versions and creates our recipe file
!pip install scikit-learn==1.6.1 tensorflow==2.16.1 numpy==1.26.4 joblib==1.4.2
!pip freeze > requirements.txt



In [5]:
# --- 1. Import Necessary Libraries ---
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import joblib
import os

print("--- Starting Yield Model Training with Crop Encoding ---")

# Seed NumPy's global RNG before any random draw. The yield column and the extra
# barley/millet rows below are generated with np.random.*, so without a seed the training
# data, the saved model and the reported metrics would differ on every run.
np.random.seed(42)

# Directory that receives the serialized model and encoder; no error if it already exists.
os.makedirs('models', exist_ok=True)

# --- 2. Load the Base Dataset ---
# Read relative to the current working directory.
# NOTE(review): later steps assume columns N, P, K, temperature, humidity, ph, rainfall
# and label are present in this file — confirm against the CSV.
df = pd.read_csv('Crop_recommendation.csv')

# --- 3. Synthesize Yield Data ---
# Baseline yield per crop label, used to fabricate the 'yield' target column.
# NOTE(review): the unit of these numbers is not stated anywhere in this notebook — confirm.
crop_yield_data = dict(
    rice=3.8, maize=3.5, chickpea=0.9, kidneybeans=1.2,
    pigeonpeas=0.8, mothbeans=0.5, mungbean=0.6, blackgram=0.7,
    lentil=0.8, cotton=1.5, jute=2.2, coffee=0.7,
    pomegranate=10.5, banana=60.0, mango=7.5, grapes=22.0,
    orange=15.0, papaya=40.0, coconut=9.0, apple=8.0,
    watermelon=25.0, muskmelon=20.0,
)


def synthesize_yield(row):
    """Return a synthetic yield for one row: the crop's baseline with random jitter.

    Parameters
    ----------
    row : mapping-like
        Anything that supports ``row['label']``; the label is looked up in
        ``crop_yield_data``.

    Returns
    -------
    float
        The baseline plus a uniform random variation in [-15%, +15%) of the baseline,
        rounded to 2 decimals. A label missing from ``crop_yield_data`` falls back to
        a baseline of 0, so the result is 0.
    """
    baseline = crop_yield_data.get(row['label'], 0)
    # Exactly one draw from NumPy's global RNG per call, even for unknown labels.
    jitter_fraction = np.random.uniform(-0.15, 0.15)
    return round(baseline + baseline * jitter_fraction, 2)
# Add (or overwrite) the 'yield' column on df in place. apply(axis=1) passes each row to
# synthesize_yield, so one random draw is consumed per row, in row order.
df['yield'] = df.apply(synthesize_yield, axis=1)
print("Step 1: Yield column synthesized.")

# --- 4. Add Extra Crops (Barley & Millet) ---
def _sample_extra_crop(label, n, p, k, temperature, humidity, ph, rainfall, base_yield):
    """Draw one synthetic row for `label`.

    n, p, k are (low, high) pairs for np.random.randint (high is excluded); the remaining
    pairs are (low, high) bounds for np.random.uniform. Draws are made in column order
    (N, P, K, temperature, humidity, ph, rainfall, yield), and the yield is `base_yield`
    scaled by a uniform factor in [0.85, 1.15), rounded to 2 decimals.
    """
    record = {}
    record['N'] = np.random.randint(*n)
    record['P'] = np.random.randint(*p)
    record['K'] = np.random.randint(*k)
    record['temperature'] = np.random.uniform(*temperature)
    record['humidity'] = np.random.uniform(*humidity)
    record['ph'] = np.random.uniform(*ph)
    record['rainfall'] = np.random.uniform(*rainfall)
    record['label'] = label
    record['yield'] = round(base_yield * np.random.uniform(0.85, 1.15), 2)
    return record


# Sampling ranges for the two crops that are added on top of the base dataset.
_extra_crop_profiles = (
    dict(label='barley', n=(30, 50), p=(20, 40), k=(15, 25),
         temperature=(18, 22), humidity=(60, 70), ph=(7.5, 8.0),
         rainfall=(20, 40), base_yield=2.5),
    dict(label='millet', n=(20, 40), p=(20, 40), k=(15, 25),
         temperature=(26, 30), humidity=(50, 60), ph=(6.5, 7.5),
         rainfall=(40, 60), base_yield=1.8),
)

# 100 rounds, each producing one barley row followed by one millet row (200 rows total).
new_data = [
    _sample_extra_crop(**profile)
    for _ in range(100)
    for profile in _extra_crop_profiles
]
df_new = pd.DataFrame(new_data)

# Stack the synthetic rows under the base data and renumber the index from 0.
df_expanded = pd.concat([df, df_new], ignore_index=True)
print("Step 2: Expanded dataset created.")

# --- 5. Encode Crop Labels ---
# One-hot encode the crop name into a dense array. handle_unknown='ignore' makes a later
# transform() emit an all-zero row for a crop not seen during fit instead of raising.
encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
label_frame = df_expanded[['label']]
encoded_labels = encoder.fit_transform(label_frame)

# Called without arguments, get_feature_names_out() derives the names from the fitted
# input column, so the one-hot columns are named like 'label_rice', 'label_barley', ...
onehot_columns = encoder.get_feature_names_out()
encoded_labels_df = pd.DataFrame(encoded_labels, columns=onehot_columns)

# Feature matrix: every column except 'label' and 'yield', followed by the one-hot columns.
# Both parts get a fresh 0..n-1 index so they line up row by row when joined side by side.
base_features = df_expanded.drop(columns=['label', 'yield']).reset_index(drop=True)
onehot_features = encoded_labels_df.reset_index(drop=True)
X = pd.concat([base_features, onehot_features], axis=1)

# Regression target.
y = df_expanded['yield']
print("Step 3: Features and labels prepared.")

# --- 6. Split Dataset ---
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the split stable.
split_options = dict(test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)
print("Step 4: Train/test split complete.")

# --- 7. Train the Model ---
# 200 boosting stages; every other hyper-parameter is left at the library default.
gbr_params = dict(n_estimators=200, random_state=42)
yield_model = GradientBoostingRegressor(**gbr_params)
print("Step 5: Training Gradient Boosting Regressor...")
yield_model.fit(X_train, y_train)
print("         ...Training complete.")

# --- 8. Evaluate Model ---
# Score the held-out rows. RMSE is the square root of the mean squared error, so it is
# in the same unit as the 'yield' column.
y_pred = yield_model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print(
    "Step 6: Model Evaluation",
    f"    R² Score : {r2:.2f}",
    f"    RMSE     : {rmse:.2f}",
    f"    MAE      : {mae:.2f}",
    sep="\n",
)

# --- 9. Save Model and Encoder ---
# Persist the fitted regressor together with the fitted encoder: whoever loads the model
# needs the same encoder to rebuild the one-hot crop columns it was trained on.
for artifact, destination in (
    (yield_model, 'models/yield_predictor_v2.pkl'),
    (encoder, 'models/crop_label_encoder.pkl'),
):
    joblib.dump(artifact, destination)
print("Step 7: Model and encoder saved.")

# --- 10. Test Prediction ---
# No test prediction is run in this notebook. Any prediction code added here must be
# updated to build its input with the new one-hot column names produced by the encoder
# (for example 'label_rice', 'label_barley', etc.).
# Such a check would only be for testing in the notebook; the main.py file will handle
# real predictions correctly.

--- Starting Yield Model Training with Crop Encoding ---
Step 1: Yield column synthesized.
Step 2: Expanded dataset created.
Step 3: Features and labels prepared.
Step 4: Train/test split complete.
Step 5: Training Gradient Boosting Regressor...
         ...Training complete.
Step 6: Model Evaluation
    R² Score : 0.98
    RMSE     : 2.06
    MAE      : 1.10
Step 7: Model and encoder saved.
