In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


# Built-in fallback sample, written row-wise (one record per soil type) and
# pivoted into the column-oriented dict that pd.DataFrame consumes.
sample_columns = ('Soil_Type', 'Nutrition_Value', 'Temperature', 'Humidity', 'Crop')
sample_rows = (
    ('Clay', 30, 25, 60, 'Wheat'),
    ('Loam', 40, 28, 70, 'Maize'),
    ('Silt', 25, 20, 50, 'Rice'),
    ('Sand', 35, 30, 80, 'Barley'),
    ('Pit', 45, 22, 45, 'Corn'),
    ('Chalk', 30, 28, 75, 'Soybean'),
)
data = {
    column: list(values)
    for column, values in zip(sample_columns, zip(*sample_rows))
}

# Prefer the CSV on disk; only a missing file triggers the in-memory sample.
try:
    df = pd.read_csv('crop_prediction_dataset.csv')
except FileNotFoundError:
    print("CSV file not found. Using sample dataset.")
    df = pd.DataFrame(data)
else:
    print("Dataset loaded from CSV file")

# Quick sanity summary of whatever was loaded.
print(f"Dataset shape: {df.shape}")
print(f"Dataset columns: {df.columns.tolist()}")

# Convert the categorical Soil_Type column into one-hot indicator columns
# (named "Soil_Type_<value>"); the numeric columns are left untouched.
df = pd.get_dummies(df, columns=['Soil_Type'])

# Split the dataset into features (X) and target variable (y)
X = df.drop('Crop', axis=1)
y = df['Crop']

# Hold out 20% of the rows for testing (fixed seed for reproducibility).
# A stratified split keeps the crop proportions equal in both halves, but
# scikit-learn rejects it with ValueError when a class has fewer than two rows
# or the test set is smaller than the number of classes -- which is exactly the
# case for the 6-row built-in sample. Fall back to a plain random split then,
# and say so, instead of crashing. Genuine problems (e.g. an empty dataset)
# still surface because the fallback call raises them again.
try:
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )
except ValueError as split_error:
    print(f"Stratified split not possible ({split_error}); using a plain random split.")
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

print(f"Training set size: {X_train.shape[0]}")
print(f"Testing set size: {X_test.shape[0]}")

# Build a 100-tree random forest with a fixed seed and fit it on the training
# split in one step (fit() hands back the fitted estimator).
rf_classifier = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
).fit(X_train, y_train)

# Score the held-out rows.
y_pred = rf_classifier.predict(X_test)

# Overall fraction of test rows predicted correctly.
accuracy = accuracy_score(y_test, y_pred)
print(f"\nModel Accuracy: {accuracy:.2f} ({accuracy*100:.2f}%)")

# Per-crop precision / recall / F1.
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Counts of true crop (rows) versus predicted crop (columns).
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

# Rank the input columns by the forest's importance score, highest first.
feature_importance = pd.DataFrame(
    {
        'feature': X.columns,
        'importance': rf_classifier.feature_importances_,
    }
)
feature_importance = feature_importance.sort_values(by='importance', ascending=False)

print("\nFeature Importance:")
print(feature_importance)

# Separate the one-hot soil indicator columns (get_dummies names them
# "Soil_Type_<value>") from the numeric features the user enters directly.
# Deriving both lists from X.columns keeps this cell in sync with whatever
# soil categories the loaded dataset actually contained.
soil_prefix = "Soil_Type_"
soil_columns = [col for col in X.columns if col.startswith(soil_prefix)]
feature_names = [col for col in X.columns if col not in soil_columns]

print("\n" + "="*50)
print("CROP PREDICTION FOR NEW INPUT")
print("="*50)

# Numeric features: re-prompt on anything float() cannot parse instead of
# aborting the whole cell with a ValueError.
user_input = {}
for feature in feature_names:
    while True:
        raw_value = input(f"Enter value for {feature}: ")
        try:
            user_input[feature] = float(raw_value)
            break
        except ValueError:
            print(f"'{raw_value}' is not a valid number, please try again.")

# Soil type: match case-insensitively against the categories seen in training.
# An unrecognised value would otherwise encode as all zeros and silently
# produce a prediction for a soil the model has never seen, so re-prompt.
soil_lookup = {col[len(soil_prefix):].casefold(): col for col in soil_columns}
soil_choices = "/".join(col[len(soil_prefix):] for col in soil_columns)
selected_soil_column = None
while soil_lookup and selected_soil_column is None:
    soil_type_input = input(f"Enter Soil Type ({soil_choices}): ").strip()
    selected_soil_column = soil_lookup.get(soil_type_input.casefold())
    if selected_soil_column is None:
        print(f"Unknown soil type '{soil_type_input}'. Choose one of: {soil_choices}")

# Assemble a single-row frame: numeric inputs plus a 1 in the chosen soil
# column and 0 in every other soil column.
user_row = dict(user_input)
for soil_column in soil_columns:
    user_row[soil_column] = 1 if soil_column == selected_soil_column else 0

# Reorder columns to match the order used during training
user_df = pd.DataFrame([user_row])[X.columns]

# Make prediction on user input
prediction = rf_classifier.predict(user_df)
prediction_proba = rf_classifier.predict_proba(user_df)

# The reported confidence is the highest class probability for this row.
top_probability = prediction_proba[0].max()
print(f"\nPredicted Crop: {prediction[0]}")
print(f"Prediction Confidence: {top_probability:.2f} ({top_probability*100:.2f}%)")

# Show probability for all classes (predict_proba columns follow classes_)
print("\nProbabilities for all crops:")
for crop, crop_probability in zip(rf_classifier.classes_, prediction_proba[0]):
    print(f"{crop}: {crop_probability:.3f} ({crop_probability*100:.1f}%)")

Dataset loaded from CSV file
Dataset shape: (1000, 5)
Dataset columns: ['Soil_Type', 'Nutrition_Value', 'Temperature', 'Humidity', 'Crop']
Training set size: 800
Testing set size: 200

Model Accuracy: 0.72 (72.00%)

Classification Report:
              precision    recall  f1-score   support

      Barley       0.91      0.91      0.91        33
        Corn       0.63      0.56      0.59        34
       Maize       0.59      0.47      0.52        34
        Rice       1.00      0.97      0.98        33
     Soybean       0.47      0.55      0.51        33
       Wheat       0.72      0.88      0.79        33

    accuracy                           0.72       200
   macro avg       0.72      0.72      0.72       200
weighted avg       0.72      0.72      0.72       200


Confusion Matrix:
[[30  0  0  0  0  3]
 [ 0 19  2  0  7  6]
 [ 0  4 16  0 13  1]
 [ 0  0  1 32  0  0]
 [ 0  7  7  0 18  1]
 [ 3  0  1  0  0 29]]

Feature Importance:
           feature  importance
2         Humidity  

In [5]:
# Display the column labels of the single-row user-input frame; it was
# reordered to X.columns before being passed to the classifier.
user_df.columns

Index(['Nutrition_Value', 'Temperature', 'Humidity', 'Soil_Type_Chalk',
       'Soil_Type_Clay', 'Soil_Type_Loam', 'Soil_Type_Pit', 'Soil_Type_Sand',
       'Soil_Type_Silt'],
      dtype='object')

In [6]:
import joblib

# Persist the fitted random forest with joblib. dump() returns the list of
# file names it wrote, which is what this cell displays.
joblib.dump(value=rf_classifier, filename='random_forest_model.pkl')


['random_forest_model.pkl']

In [7]:
import pickle
# Write a second copy of the fitted model using the standard-library pickle
# format; the context manager closes the file once the dump finishes.
with open('model.pkl', mode='wb') as model_file:
    pickle.dump(rf_classifier, model_file)