In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import joblib

# Load dataset
df = pd.read_csv("Crop_Recommendation.csv")

# Define numerical and target columns
numerical_data = ['Nitrogen', 'Phosphorus', 'Potassium', 'Temperature', 'Humidity', 'pH_Value', 'Rainfall']
x = df[numerical_data]
y = df['Crop']

# Standardize data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(x)  # Scaling 7 features

# Apply PCA
pca = PCA(n_components=5)  # Reduce dimensions to 5
X_pca = pca.fit_transform(X_scaled)

# Convert PCA results to DataFrame (optional for visualization)
pca_df = pd.DataFrame(X_pca, columns=[f'PC{i+1}' for i in range(pca.n_components_)])
print("PCA Transformed Data:\n", pca_df.head())

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_pca, y, test_size=0.2, random_state=42)

# Train Random Forest Classifier
rf_model = RandomForestClassifier(n_estimators=100, max_depth=8, random_state=42)
rf_model.fit(X_train, y_train)

# Make predictions
y_pred = rf_model.predict(X_test)  # ✅ FIXED ERROR

# Model evaluation
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

print("Accuracy:", accuracy)
print("Classification Report:\n", report)
print("Confusion Matrix:\n", conf_matrix)

# Save trained models & scaler
joblib.dump(rf_model, "rf_recommendation.pkl")  # Save trained Random Forest model
joblib.dump(pca, "pca_recommendation.pkl")      # Save PCA transformation
joblib.dump(scaler, "scaler_recommendation.pkl")  # ✅ FIXED: Save scaler instead of X_scaled

print(f"Before PCA, StandardScaler Input Shape: {X_scaled.shape}")  # Should print (num_samples, 7)


PCA Transformed Data:
         PC1       PC2       PC3       PC4       PC5
0 -0.582869 -0.844586  1.373343 -1.614129  0.308224
1 -0.474635 -0.784895  1.252178 -1.792762  1.107745
2 -0.634068 -0.694522  1.179332 -1.818106  2.523263
3 -1.047920 -1.087658  1.393351 -0.982401  1.448781
4 -0.873258 -0.658673  1.455685 -2.335012  1.959633
Accuracy: 0.8886363636363637
Classification Report:
               precision    recall  f1-score   support

       Apple       1.00      1.00      1.00        23
      Banana       1.00      1.00      1.00        21
   Blackgram       0.79      0.75      0.77        20
    ChickPea       0.96      0.96      0.96        26
     Coconut       0.96      1.00      0.98        27
      Coffee       0.87      0.76      0.81        17
      Cotton       0.94      0.88      0.91        17
      Grapes       1.00      1.00      1.00        14
        Jute       0.69      0.96      0.80        23
 KidneyBeans       0.90      0.95      0.93        20
      Lentil     

In [3]:
df.shape

(2200, 8)