#### Crop Recommendation using XGBoost

In [175]:
import pandas as pd
import numpy as np
import random
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
from xgboost import XGBClassifier
import re

loading the dataset

In [178]:
df = pd.read_csv('crop_recommendation_dataset.csv')
print(df.head())

  Crop_Name Soil_Type Rainfall_Requirement Temperature_Range Previous_Crop  \
0      Rice      Loam         1701-1764 mm           29-33°C         Maize   
1      Rice      Clay         1677-1686 mm           25-30°C          Rice   
2      Rice      Loam         1247-1878 mm           25-42°C         Jowar   
3      Rice      Clay         1554-1321 mm           25-30°C        Pulses   
4      Rice      Clay         1148-2114 mm           29-41°C         Wheat   

   Next_Crop Month_Planted   Month_Harvested Growth_Duration       Region  \
0     Coffee   June-August  October-December        149 days  West Bengal   
1     Coffee   June-August  October-December        120 days  West Bengal   
2     Pulses   June-August  October-December        144 days  West Bengal   
3     Pulses   June-August  October-December        112 days   Tamil Nadu   
4  Sugarcane   June-August  October-December        132 days  West Bengal   

   Soil_pH  Irrigation  
0      5.9           1  
1      5.7        

data preprocessing and label encoding

In [183]:
# Parse ranges
def parse_range(value, unit):
    numbers = re.findall(r'\d+', value)
    return int(numbers[0]), int(numbers[1])

df[['Rainfall_Min', 'Rainfall_Max']] = df['Rainfall_Requirement'].apply(lambda x: pd.Series(parse_range(x, 'mm')))
df[['Temp_Min', 'Temp_Max']] = df['Temperature_Range'].apply(lambda x: pd.Series(parse_range(x, '°C')))
df['Growth_Duration'] = df['Growth_Duration'].str.replace(' days', '').astype(int)

# New derived features
df['Rainfall_Range'] = df['Rainfall_Max'] - df['Rainfall_Min']
df['Temp_Range'] = df['Temp_Max'] - df['Temp_Min']
df['Temp_Avg'] = (df['Temp_Min'] + df['Temp_Max']) / 2

# Seasonal encoding
month_to_season = {
    'March': 'Spring', 'April': 'Spring', 'May': 'Spring',
    'June': 'Monsoon', 'July': 'Monsoon', 'August': 'Monsoon',
    'September': 'Autumn', 'October': 'Autumn', 'November': 'Autumn',
    'December': 'Winter', 'January': 'Winter', 'February': 'Winter'
}
df['Season_Planted'] = df['Month_Planted'].apply(lambda x: month_to_season[x.split('-')[0]])
df['Season_Harvested'] = df['Month_Harvested'].apply(lambda x: month_to_season[x.split('-')[0]])

# Crop rotation feature
df['Rotation_Pair'] = df['Previous_Crop'] + '_' + df['Next_Crop']

# Encode categorical variables
categorical_cols = ['Soil_Type', 'Previous_Crop', 'Next_Crop', 'Season_Planted', 'Season_Harvested', 'Region', 'Rotation_Pair']
df_encoded = pd.get_dummies(df[categorical_cols], drop_first=True)
df = pd.concat([df.drop(categorical_cols + ['Rainfall_Requirement', 'Temperature_Range', 'Month_Planted', 'Month_Harvested'], axis=1), df_encoded], axis=1)

defining the target variable

In [186]:
# Step 3: Define Features and Target
X = df.drop(columns=['Crop_Name'])
y = df['Crop_Name']

In [188]:
# Encode target
le_y = LabelEncoder()
y_encoded = le_y.fit_transform(y)

train and test split

In [191]:
# Step 4: Split the Data
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded)

##### XGBoost model training

In [197]:
# Step 5: Train XGBoost Model
xgb_model = XGBClassifier(
    objective='multi:softmax',
    num_class=len(y.unique()),
    learning_rate=0.1,
    n_estimators=200,
    max_depth=6,
    min_child_weight=1,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    eval_metric='mlogloss'
)
xgb_model.fit(X_train, y_train)

In [199]:
# Step 6: Make Predictions
y_pred = xgb_model.predict(X_test)

model evaluation

In [202]:
# Step 7: Evaluate the Model
accuracy = accuracy_score(y_test, y_pred)
print(f"\nAccuracy: {accuracy * 100:.2f}%")
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=le_y.classes_))


Accuracy: 97.00%

Classification Report:
              precision    recall  f1-score   support

      Coffee       1.00      1.00      1.00        10
      Cotton       1.00      1.00      1.00        10
   Groundnut       1.00      0.90      0.95        10
       Jowar       0.82      0.90      0.86        10
       Maize       0.90      0.90      0.90        10
      Pulses       1.00      1.00      1.00        10
        Rice       1.00      1.00      1.00        10
   Sugarcane       1.00      1.00      1.00        10
         Tea       1.00      1.00      1.00        10
       Wheat       1.00      1.00      1.00        10

    accuracy                           0.97       100
   macro avg       0.97      0.97      0.97       100
weighted avg       0.97      0.97      0.97       100



In [204]:
# Step 8: Feature Importance
feature_importance = pd.DataFrame({
    'Feature': X.columns,
    'Importance': xgb_model.feature_importances_
}).sort_values(by='Importance', ascending=False)
print("\nFeature Importance (Top 10):")
print(feature_importance.head(10))


Feature Importance (Top 10):
                    Feature  Importance
41            Region_Punjab    0.170394
42         Region_Rajasthan    0.118747
43        Region_Tamil Nadu    0.105729
46       Region_West Bengal    0.098343
39    Region_Madhya Pradesh    0.081225
36           Region_Haryana    0.078413
45     Region_Uttar Pradesh    0.040664
33  Season_Harvested_Winter    0.033248
34             Region_Assam    0.029491
37         Region_Karnataka    0.022661


sample testing

In [207]:
# Step 9: Example Prediction
sample_input = X_test.iloc[0].values.reshape(1, -1)
predicted_crop = le_y.inverse_transform(xgb_model.predict(sample_input))[0]
print(f"\nPredicted Crop for sample input: {predicted_crop}")


Predicted Crop for sample input: Rice
