In [None]:
# This installs the known compatible versions and creates our recipe file
!pip install scikit-learn==1.6.1 tensorflow==2.16.1 numpy==1.26.4 joblib==1.4.2
!pip freeze > requirements.txt



In [None]:
import pandas as pd
import numpy as np

# Load the original dataset
df = pd.read_csv('Crop_recommendation.csv')

# --- Add Soil Type ---
# Soil types and the crops assigned to each (simplified for this example).
soil_map = {
    'Loamy': ['rice', 'jute', 'maize', 'lentil', 'chickpea', 'kidneybeans', 'pigeonpeas'],
    # 'coconut' has a price in price_map but previously had no soil entry, which left
    # soil_type as NaN for those rows.
    # NOTE(review): 'Sandy' is a simplifying choice for coconut -- confirm with domain data.
    'Sandy': ['watermelon', 'muskmelon', 'mothbeans', 'mungbean', 'coconut'],
    'Clayey': ['cotton', 'blackgram'],
    'Black': ['soybean', 'cotton'],  # Some crops thrive in multiple types
    'Red': ['coffee', 'pomegranate', 'mango', 'grapes', 'orange', 'papaya', 'apple', 'banana']
}

# Reverse lookup: crop -> soil. A crop listed under several soils keeps the LAST
# soil it appears under (dict insertion order), so 'cotton' resolves to 'Black'.
crop_to_soil = {crop: soil for soil, crops in soil_map.items() for crop in crops}
df['soil_type'] = df['label'].map(crop_to_soil)

# Fail loudly instead of silently writing NaN soil types into the training data.
unmapped_crops = sorted(df.loc[df['soil_type'].isna(), 'label'].dropna().unique())
if unmapped_crops:
    raise ValueError(f"No soil type defined in soil_map for crop(s): {unmapped_crops}")

# --- Add Market Price (Synthesized) ---
# Base prices (in INR per quintal); get_price() adds +/-10% random variation.
price_map = {
    'rice': 2200, 'maize': 2100, 'chickpea': 5400, 'kidneybeans': 7000,
    'pigeonpeas': 7000, 'mothbeans': 8500, 'mungbean': 8550, 'blackgram': 6950,
    'lentil': 6000, 'cotton': 7000, 'jute': 5000, 'coffee': 8000,
    'pomegranate': 15000, 'banana': 1500, 'mango': 4000, 'grapes': 5000,
    'orange': 3500, 'papaya': 1200, 'coconut': 2500, 'apple': 9000,
    'watermelon': 1000, 'muskmelon': 1800, 'soybean': 4600
}

# Base price used for any crop that has no entry in price_map.
DEFAULT_BASE_PRICE = 2000

# Seeded generator: re-running this cell recreates it, so the synthesized prices
# (and therefore the enriched CSV) are the same on every run.
PRICE_SEED = 42
price_rng = np.random.default_rng(PRICE_SEED)


def get_price(crop, rng=None):
    """Return a synthesized market price for ``crop``.

    The price is the crop's base price from ``price_map`` (or
    ``DEFAULT_BASE_PRICE`` when the crop is not listed) multiplied by a factor
    drawn uniformly from [0.9, 1.1), rounded to 2 decimals.

    Args:
        crop: Crop name, as found in the dataset's ``label`` column.
        rng: Optional ``numpy.random.Generator`` to draw from. Defaults to the
            cell-level seeded ``price_rng``.

    Returns:
        The randomized price as a float rounded to 2 decimal places.
    """
    if rng is None:
        rng = price_rng
    base_price = price_map.get(crop, DEFAULT_BASE_PRICE)
    return round(base_price * rng.uniform(0.9, 1.1), 2)

# Attach a synthesized price to every row, looked up from its crop label.
df['market_price'] = df['label'].map(get_price)

# Write the enriched frame to a new CSV, leaving the row index out of the file.
df.to_csv('Crop_recommendation_enriched.csv', index=False)

# Report success, then show the first rows of the enriched frame.
print("✅ Enriched dataset created successfully: 'Crop_recommendation_enriched.csv'")
print("\nNew columns added:")
preview = df.head()
print(preview)

✅ Enriched dataset created successfully: 'Crop_recommendation_enriched.csv'

New columns added:
    N   P   K  temperature   humidity        ph    rainfall label soil_type  \
0  90  42  43    20.879744  82.002744  6.502985  202.935536  rice     Loamy   
1  85  58  41    21.770462  80.319644  7.038096  226.655537  rice     Loamy   
2  60  55  44    23.004459  82.320763  7.840207  263.964248  rice     Loamy   
3  74  35  40    26.491096  80.158363  6.980401  242.864034  rice     Loamy   
4  78  42  42    20.130175  81.604873  7.628473  262.717340  rice     Loamy   

   market_price  
0       2291.78  
1       2161.35  
2       2245.89  
3       1994.49  
4       2167.71  


In [None]:
# --- 1. Install Libraries and Import ---
!pip install xgboost scikit-learn pandas

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder # Import LabelEncoder
from sklearn.pipeline import Pipeline
import xgboost as xgb
import joblib

# --- 2. Load the Enriched Dataset ---
df = pd.read_csv('Crop_recommendation_enriched.csv')

# --- 3. Prepare the Data for Training ---
# The crop name is the target; every other column is kept as a candidate feature.
y = df['label']
X = df.drop(columns=['label'])

# Encode the text labels as integers for training.
le = LabelEncoder().fit(y)
y_encoded = le.transform(y)

# Persist the fitted encoder, which holds the integer <-> crop-name mapping.
joblib.dump(le, 'label_encoder.pkl')
print("✅ LabelEncoder saved to label_encoder.pkl")

# Feature groups, split by how each is fed to the model.
# NOTE(review): 'soil_type' and 'market_price' were both synthesized from 'label'
# when the enriched CSV was built (soil via a crop -> soil lookup, price as the
# crop's base price +/-10%). They therefore leak the target, so a near-perfect
# test accuracy is expected and says little about real-world performance.
# Confirm these values are genuinely available at prediction time before relying on it.
numerical_features = ['N', 'P', 'K', 'temperature', 'humidity', 'ph', 'rainfall', 'market_price']
categorical_features = ['soil_type']

# Numeric columns pass through untouched; the categorical column is one-hot encoded.
feature_transformers = [
    ('num', 'passthrough', numerical_features),
    ('cat', OneHotEncoder(), categorical_features),
]
preprocessor = ColumnTransformer(transformers=feature_transformers)

# --- 4. Create and Train the XGBoost Model using a Pipeline ---
# Preprocessing and the classifier are bundled in one Pipeline so the same
# column handling is applied at fit and at predict time.
# `use_label_encoder` is no longer passed: the installed xgboost reports it as an
# unused parameter ('Parameters: { "use_label_encoder" } are not used.'), and the
# labels are already integer-encoded before training.
model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', xgb.XGBClassifier(
        objective='multi:softmax',
        n_estimators=150,
        learning_rate=0.1,
        max_depth=5,
        random_state=42,
        eval_metric='mlogloss'
    ))
])

# Split the data, using the integer-encoded labels.
# `stratify` keeps each crop's share equal in the train and test sets, so no class
# ends up under-represented in the 20% hold-out by chance.
# NOTE(review): stratification requires at least 2 rows per crop -- confirm for this dataset.
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
)

# Train the entire pipeline (preprocessing + classifier) on the training split
print("Training the full pipeline with XGBoost model...")
model_pipeline.fit(X_train, y_train)
print("Training complete!")

# --- 5. Evaluate and Save the Model ---
model_filename = 'crop_recommender_xgb.pkl'

# Score the fitted pipeline on the held-out test split and report it as a percentage.
accuracy = model_pipeline.score(X_test, y_test)
print(f"Model Accuracy on enriched data: {accuracy * 100:.2f}%")

# Serialize the whole pipeline (preprocessor + classifier) to a single file.
joblib.dump(model_pipeline, model_filename)
print(f"Enriched XGBoost model saved to {model_filename}")

✅ LabelEncoder saved to label_encoder.pkl
Training the full pipeline with XGBoost model...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Training complete!
Model Accuracy on enriched data: 100.00%
Enriched XGBoost model saved to crop_recommender_xgb.pkl
