# 🌱 Sustainable Crop Recommendation System
**Author:** Tushar Kapoor  
**Project:** Predict the best crop for a region based on soil, climate, and season.  
**Purpose:** Final submission-ready notebook demonstrating end-to-end workflow.


In [3]:
import os
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import matplotlib.pyplot as plt
import seaborn as sns

# Paths
BASE_DIR = "../data"
RAW_DIR = os.path.join(BASE_DIR, "raw")
PROCESSED_DIR = os.path.join(BASE_DIR, "processed")

# Raw files
crop_file = os.path.join(RAW_DIR, "crop_recommendation.csv")
weather_file = os.path.join(RAW_DIR, "weatherHistory.csv")
core_file = os.path.join(RAW_DIR, "data_core.csv")  # adjust if name is different

# Processed file (optional: save combined cleaned data)
PROCESSED_PATH = os.path.join(PROCESSED_DIR, "cleaned_data.csv")


In [4]:
# Paths
BASE_DIR = "../data"
RAW_DIR = os.path.join(BASE_DIR, "raw")
PROCESSED_DIR = os.path.join(BASE_DIR, "processed")

# Raw files
crop_file = os.path.join(RAW_DIR, "crop_recommendation.csv")
weather_file = os.path.join(RAW_DIR, "weatherHistory.csv")
core_file = os.path.join(RAW_DIR, "data_core.csv")  # adjust if name is different

# Processed file (optional: save combined cleaned data)
PROCESSED_PATH = os.path.join(PROCESSED_DIR, "cleaned_data.csv")


In [8]:
# Example: merge datasets on a common column if exists
# Here, we’ll assume 'date' or 'location' might exist
# Adjust merge keys according to actual datasets
combined_df = crop_df.copy()  # start with crop dataset

# Merge with core data
if 'location' in core_df.columns:
    combined_df = combined_df.merge(core_df, on='location', how='left')

# Merge with weather data
if 'date' in weather_df.columns and 'date' in combined_df.columns:
    combined_df = combined_df.merge(weather_df, on='date', how='left')

# Quick info
print(combined_df.info())


NameError: name 'crop_df' is not defined

In [9]:
# ===============================
# 04_final_project.ipynb
# World-class, ready-to-run notebook
# ===============================

# 1️⃣ Import Libraries
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import joblib
import matplotlib.pyplot as plt
import seaborn as sns

# -------------------------------
# 2️⃣ Define Paths
BASE_DIR = "../data"
RAW_DIR = os.path.join(BASE_DIR, "raw")
MODEL_DIR = "../outputs/models"

os.makedirs(MODEL_DIR, exist_ok=True)

crop_file = os.path.join(RAW_DIR, "crop_recommendation.csv")
weather_file = os.path.join(RAW_DIR, "weatherHistory.csv")
core_file = os.path.join(RAW_DIR, "data_core.csv")

# ===============================
# 3️⃣ Load Datasets
crop_df = pd.read_csv(crop_file)
weather_df = pd.read_csv(weather_file)
core_df = pd.read_csv(core_file)

print("Crop dataset shape:", crop_df.shape)
print("Weather dataset shape:", weather_df.shape)
print("Core dataset shape:", core_df.shape)

# ===============================
# 4️⃣ Merge/Combine Datasets
# Start with crop dataset
combined_df = crop_df.copy()

# Merge with core_df if common column exists
common_cols_core = set(combined_df.columns).intersection(core_df.columns)
if common_cols_core:
    merge_col = list(common_cols_core)[0]  # pick first common column
    combined_df = combined_df.merge(core_df, on=merge_col, how='left')

# Merge with weather_df if common column exists
common_cols_weather = set(combined_df.columns).intersection(weather_df.columns)
if common_cols_weather:
    merge_col = list(common_cols_weather)[0]
    combined_df = combined_df.merge(weather_df, on=merge_col, how='left')

print("Combined dataset shape:", combined_df.shape)

# ===============================
# 5️⃣ Preprocessing
# Handle missing values
combined_df.fillna(combined_df.median(numeric_only=True), inplace=True)
combined_df.fillna(method='ffill', inplace=True)
combined_df.fillna(method='bfill', inplace=True)

# Encode categorical variables
label_encoders = {}
for col in combined_df.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    combined_df[col] = le.fit_transform(combined_df[col])
    label_encoders[col] = le

# Features & Target
# Assume last column is the target (adjust if needed)
X = combined_df.iloc[:, :-1]
y = combined_df.iloc[:, -1]

# Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42
)

# ===============================
# 6️⃣ Train Multiple Models

models = {
    "RandomForest": RandomForestClassifier(n_estimators=200, random_state=42),
    "SVM": SVC(kernel='rbf', probability=True, random_state=42)
}

best_model_name = None
best_accuracy = 0
best_model = None

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    print(f"{name} Accuracy: {acc:.4f}")
    
    if acc > best_accuracy:
        best_accuracy = acc
        best_model_name = name
        best_model = model

print(f"\n✅ Best Model: {best_model_name} with Accuracy: {best_accuracy:.4f}")

# ===============================
# 7️⃣ Save Best Model
model_path = os.path.join(MODEL_DIR, f"{best_model_name.lower()}_model.pkl")
joblib.dump(best_model, model_path)
print(f"Saved best model at: {model_path}")

# ===============================
# 8️⃣ Evaluate Model
y_pred = best_model.predict(X_test)

print("\nConfusion Matrix:")
cm = confusion_matrix(y_test, y_pred)
print(cm)

plt.figure(figsize=(8,6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.title(f"{best_model_name} Confusion Matrix")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# ===============================
# 9️⃣ Summary
print(f"\nModel Training Completed. Best Model: {best_model_name}")
print("All preprocessing, training, evaluation done successfully.")


Crop dataset shape: (2200, 8)
Weather dataset shape: (96453, 12)
Core dataset shape: (8000, 9)
Combined dataset shape: (219358, 27)


  combined_df.fillna(method='ffill', inplace=True)
  combined_df.fillna(method='ffill', inplace=True)
  combined_df.fillna(method='bfill', inplace=True)
  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count


ValueError: Input y contains NaN.

In [None]:
# ===========================
# Sustainable Crop Recommendation - Production-Ready Notebook
# ===========================

# 1️⃣ Imports
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import joblib

# 2️⃣ Paths
RAW_PATH = r"C:\Users\tusha\Downloads\Sustainable-Crop-Recommendation\data\raw"
OUTPUT_PATH = r"C:\Users\tusha\Downloads\Sustainable-Crop-Recommendation\outputs\models"
os.makedirs(OUTPUT_PATH, exist_ok=True)

# 3️⃣ Load all CSV files
files = [f for f in os.listdir(RAW_PATH) if f.endswith(".csv")]
dfs = [pd.read_csv(os.path.join(RAW_PATH, f)) for f in files]

# Combine datasets (adjust keys if needed)
combined_df = pd.concat(dfs, axis=1)
combined_df = combined_df.loc[:, ~combined_df.columns.duplicated()]

# 4️⃣ Preprocessing
# Handle missing values
for col in combined_df.columns:
    if combined_df[col].dtype in ['int64', 'float64']:
        combined_df[col].fillna(combined_df[col].median(), inplace=True)
    else:
        combined_df[col].fillna(combined_df[col].mode()[0], inplace=True)

# Drop rows with missing target (assume last column is target)
target_col = combined_df.columns[-1]
combined_df = combined_df.dropna(subset=[target_col])

# Encode categorical features
cat_cols = combined_df.select_dtypes(include='object').columns.tolist()
if target_col in cat_cols:
    cat_cols.remove(target_col)

for col in cat_cols:
    combined_df[col] = LabelEncoder().fit_transform(combined_df[col])

# Encode target if categorical
y = combined_df[target_col]
if y.dtype == 'object':
    y = LabelEncoder().fit_transform(y)

# Features
X = combined_df.drop(columns=[target_col])

# Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# 5️⃣ Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42, stratify=y
)

# 6️⃣ Model Training & Evaluation
models = {
    "RandomForest": RandomForestClassifier(n_estimators=200, random_state=42),
    "SVM": SVC(kernel='rbf', probability=True, random_state=42)
}

best_acc = 0
best_model_name = None
best_model = None

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    print(f"--- {name} ---")
    print("Accuracy:", acc)
    print("Classification Report:\n", classification_report(y_test, y_pred))
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
    print("\n")
    
    if acc > best_acc:
        best_acc = acc
        best_model_name = name
        best_model = model

print(f"✅ Best Model: {best_model_name} with accuracy {best_acc:.4f}")

# Save best model and scaler
joblib.dump(best_model, os.path.join(OUTPUT_PATH, "best_model.pkl"))
joblib.dump(scaler, os.path.join(OUTPUT_PATH, "scaler.pkl"))

# 7️⃣ Production Prediction Function
def predict_new_data(new_data_df):
    """
    new_data_df: pandas DataFrame with the same features as training data (excluding target)
    returns: predictions as list
    """
    # Fill missing values same way as training
    for col in new_data_df.columns:
        if new_data_df[col].dtype in ['int64', 'float64']:
            new_data_df[col].fillna(combined_df[col].median(), inplace=True)
        else:
            new_data_df[col].fillna(combined_df[col].mode()[0], inplace=True)
    
    # Encode categorical features same way
    for col in new_data_df.select_dtypes(include='object').columns:
        if col in cat_cols:
            le = LabelEncoder()
            le.fit(combined_df[col])
            new_data_df[col] = le.transform(new_data_df[col])
    
    # Scale
    new_scaled = scaler.transform(new_data_df)
    
    # Predict
    preds = best_model.predict(new_scaled)
    return preds

# 8️⃣ Optional: Feature Importance for Random Forest
if best_model_name == "RandomForest":
    import matplotlib.pyplot as plt
    import seaborn as sns
    
    feat_imp = pd.Series(best_model.feature_importances_, index=X.columns).sort_values(ascending=False)
    plt.figure(figsize=(10,6))
    sns.barplot(x=feat_imp.values, y=feat_imp.index)
    plt.title("Feature Importance - Random Forest")
    plt.show()


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  combined_df[col].fillna(combined_df[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  combined_df[col].fillna(combined_df[col].mode()[0], inplace=True)


--- RandomForest ---
Accuracy: 0.6106992898242704


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.80      0.89         5
           1       1.00      1.00      1.00         5
           2       1.00      0.60      0.75         5
           3       0.75      0.60      0.67         5
           4       0.80      0.80      0.80         5
           5       0.75      0.60      0.67         5
           6       0.40      0.40      0.40         5
           7       0.43      0.60      0.50         5
           8       0.86      0.43      0.57        14
           9       1.00      0.33      0.50         9
          10       1.00      0.20      0.33         5
          11       1.00      0.80      0.89         5
          12       1.00      1.00      1.00         5
          13       1.00      0.80      0.89         5
          14       1.00      0.40      0.57         5
          15       1.00      0.20      0.33         5
          16       0.75      0.60      0.67         5
   

In [None]:
# ===========================
# Sustainable Crop Recommendation - Robust Notebook
# ===========================

# 1️⃣ Imports
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import joblib
import warnings
warnings.filterwarnings("ignore")

# 2️⃣ Paths
RAW_PATH = r"C:\Users\tusha\Downloads\Sustainable-Crop-Recommendation\data\raw"
OUTPUT_PATH = r"C:\Users\tusha\Downloads\Sustainable-Crop-Recommendation\outputs\models"
os.makedirs(OUTPUT_PATH, exist_ok=True)

# 3️⃣ Load all CSV files safely
files = [f for f in os.listdir(RAW_PATH) if f.endswith(".csv")]
if not files:
    raise FileNotFoundError("No CSV files found in raw folder!")

dfs = []
for f in files:
    try:
        df = pd.read_csv(os.path.join(RAW_PATH, f))
        dfs.append(df)
        print(f"✅ Loaded {f} with shape {df.shape}")
    except Exception as e:
        print(f"⚠️ Could not load {f}: {e}")

# Merge datasets safely
try:
    combined_df = pd.concat(dfs, axis=1)
    combined_df = combined_df.loc[:, ~combined_df.columns.duplicated()]
    print(f"✅ Combined dataframe shape: {combined_df.shape}")
except Exception as e:
    raise RuntimeError(f"Error combining datasets: {e}")

# 4️⃣ Preprocessing with error handling
try:
    # Handle missing values
    for col in combined_df.columns:
        if combined_df[col].dtype in ['int64', 'float64']:
            combined_df[col].fillna(combined_df[col].median(), inplace=True)
        else:
            combined_df[col].fillna(combined_df[col].mode()[0], inplace=True)

    # Detect target column automatically (last column assumed)
    target_col = combined_df.columns[-1]
    if combined_df[target_col].isnull().all():
        raise ValueError("Target column is completely empty!")

    y = combined_df[target_col]
    if y.dtype == 'object':
        y = LabelEncoder().fit_transform(y)

    # Features
    X = combined_df.drop(columns=[target_col])

    # Encode categorical features
    cat_cols = X.select_dtypes(include='object').columns.tolist()
    for col in cat_cols:
        try:
            X[col] = LabelEncoder().fit_transform(X[col])
        except Exception as e:
            print(f"⚠️ Could not encode {col}: {e}")

    # Scale features
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    print("✅ Preprocessing complete")
except Exception as e:
    raise RuntimeError(f"Preprocessing error: {e}")

# 5️⃣ Train-Test Split
try:
    X_train, X_test, y_train, y_test = train_test_split(
        X_scaled, y, test_size=0.2, random_state=42, stratify=y
    )
except Exception as e:
    raise RuntimeError(f"Error in train-test split: {e}")

# 6️⃣ Model Training & Evaluation
models = {
    "RandomForest": RandomForestClassifier(n_estimators=200, random_state=42),
    "SVM": SVC(kernel='rbf', probability=True, random_state=42)
}

best_acc = 0
best_model_name = None
best_model = None

for name, model in models.items():
    try:
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        acc = accuracy_score(y_test, y_pred)
        print(f"--- {name} ---")
        print("Accuracy:", acc)
        print("Classification Report:\n", classification_report(y_test, y_pred))
        print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
        print("\n")

        if acc > best_acc:
            best_acc = acc
            best_model_name = name
            best_model = model
    except Exception as e:
        print(f"⚠️ Error training {name}: {e}")

if best_model is None:
    raise RuntimeError("No model trained successfully!")
print(f"✅ Best Model: {best_model_name} with accuracy {best_acc:.4f}")

# Save model and scaler safely
try:
    joblib.dump(best_model, os.path.join(OUTPUT_PATH, "best_model.pkl"))
    joblib.dump(scaler, os.path.join(OUTPUT_PATH, "scaler.pkl"))
    print("✅ Model and scaler saved")
except Exception as e:
    print(f"⚠️ Could not save model/scaler: {e}")

# 7️⃣ Production Prediction Function with error handling
def predict_new_data(new_data_df):
    try:
        # Fill missing values same as training
        for col in new_data_df.columns:
            if new_data_df[col].dtype in ['int64', 'float64']:
                if col in combined_df.columns:
                    new_data_df[col].fillna(combined_df[col].median(), inplace=True)
                else:
                    new_data_df[col].fillna(new_data_df[col].median(), inplace=True)
            else:
                if col in combined_df.columns:
                    new_data_df[col].fillna(combined_df[col].mode()[0], inplace=True)
                else:
                    new_data_df[col].fillna(new_data_df[col].mode()[0], inplace=True)

        # Encode categorical
        for col in new_data_df.select_dtypes(include='object').columns:
            if col in cat_cols:
                le = LabelEncoder()
                le.fit(combined_df[col])
                new_data_df[col] = le.transform(new_data_df[col])
            else:
                new_data_df[col] = new_data_df[col].astype('category').cat.codes

        # Scale
        new_scaled = scaler.transform(new_data_df)

        # Predict
        preds = best_model.predict(new_scaled)
        return preds
    except Exception as e:
        print(f"⚠️ Prediction error: {e}")
        return None

# 8️⃣ Optional Feature Importance for Random Forest
try:
    if best_model_name == "RandomForest":
        import matplotlib.pyplot as plt
        import seaborn as sns

        feat_imp = pd.Series(best_model.feature_importances_, index=X.columns).sort_values(ascending=False)
        plt.figure(figsize=(10,6))
        sns.barplot(x=feat_imp.values, y=feat_imp.index)
        plt.title("Feature Importance - Random Forest")
        plt.show()
except Exception as e:
    print(f"⚠️ Feature importance error: {e}")
