In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score
import sys
import joblib # <-- Import joblib to save the model


In [2]:

def load_data(filepath):
    """
    Loads the crop yield data from a CSV file.

    Args:
        filepath: Path to the CSV file; passed unchanged to ``pd.read_csv``.

    Returns:
        pandas.DataFrame holding the file's contents.

    Raises:
        SystemExit: with exit code 1 when the file does not exist or any other
            error occurs while reading it. The error is printed first.
    """
    try:
        df = pd.read_csv(filepath)
    except FileNotFoundError:
        print(f"Error: The file '{filepath}' was not found.")
        # Report the path that was actually requested instead of a hard-coded
        # file name, so the hint stays correct for any input file.
        print(f"Please make sure '{filepath}' exists (relative paths are resolved "
              "against the current working directory).")
        sys.exit(1) # Exit the script if file not found
    except Exception as e:
        print(f"An error occurred while loading the data: {e}")
        sys.exit(1)

    print(f"Successfully loaded data from '{filepath}'.")
    print(f"Data shape: {df.shape}")
    return df

def preprocess_data(df):
    """
    Cleans the data and separates features (X) and target (y).

    Rows containing any missing value are dropped, 'Yield' becomes the target,
    and 'Production' (when present) is removed from the features because it
    would leak the target.

    Args:
        df: DataFrame with a 'Yield' column plus the categorical and numeric
            feature columns listed in this function.

    Returns:
        Tuple ``(X, y, categorical_features, numeric_features)`` where ``X`` is
        ``df`` without 'Yield' (and without 'Production'), ``y`` is the 'Yield'
        column, and the two lists name the columns of each feature type.

    Raises:
        KeyError: if ``df`` has no 'Yield' column (raised by pandas).
        ValueError: if any expected feature column is absent from ``df``.
    """
    print("\n--- Preprocessing Data ---")

    # 1. Handle missing values (if any)
    # nan_count is the number of missing cells; dropna() removes every row
    # that contains at least one of them.
    nan_count = df.isnull().sum().sum()
    if nan_count > 0:
        print(f"Found and dropping {nan_count} missing values.")
        df = df.dropna()
    else:
        print("No missing values found. Data is clean.")

    # 2. Define Features (X) and Target (y)

    # --- CRITICAL STEP: Prevent Data Leakage ---
    # 'Yield' is often calculated as 'Production' / 'Area'.
    # Using 'Production' to predict 'Yield' would be "cheating" as it
    # already contains the answer. We MUST drop it.
    if 'Production' in df.columns:
        print("Dropping 'Production' column to prevent data leakage.")
        X = df.drop(['Yield', 'Production'], axis=1)
    else:
        X = df.drop('Yield', axis=1)

    y = df['Yield']

    # 3. Identify feature types
    categorical_features = ['Crop', 'Season', 'State']
    numeric_features = ['Crop_Year', 'Area', 'Annual_Rainfall', 'Fertilizer', 'Pesticide']
    all_features = categorical_features + numeric_features

    # Fail early, with the offending names, when the data lacks a column the
    # returned feature lists refer to.
    absent_features = [col for col in all_features if col not in X.columns]
    if absent_features:
        raise ValueError(
            f"Input data is missing required feature columns: {absent_features}"
        )

    # Columns present in the data that are not part of the declared features
    extra_cols = set(X.columns) - set(all_features)
    if extra_cols:
        print(f"Warning: The following columns are in the data but not used in the model: {extra_cols}")

    print(f"Target (y): 'Yield'")
    print(f"Categorical Features (X): {categorical_features}")
    print(f"Numerical Features (X): {numeric_features}")

    return X, y, categorical_features, numeric_features


In [3]:

def build_model_pipeline(categorical_features, numeric_features,
                         n_estimators=100, max_depth=15, random_state=42):
    """
    Creates a scikit-learn pipeline to process data and train the model.

    Args:
        categorical_features: Names of the columns to one-hot encode.
        numeric_features: Names of the columns passed to the model unchanged.
        n_estimators: Number of trees in the random forest.
        max_depth: Maximum depth of each tree.
        random_state: Seed that makes the forest reproducible.

    Returns:
        An unfitted ``Pipeline`` with two steps: 'preprocessor' (a
        ``ColumnTransformer``) and 'regressor' (a ``RandomForestRegressor``).
    """
    print("\n--- Building ML Pipeline ---")

    # Create a transformer for numerical features
    # For Random Forest, scaling isn't necessary, so we'll just use 'passthrough'
    # If we were using an SVM or Neural Net, we'd use StandardScaler() here.
    numeric_transformer = 'passthrough'

    # Create a transformer for categorical features
    # This will convert text (like 'Assam', 'Rice') into numbers (0s and 1s).
    # handle_unknown='ignore' keeps prediction from failing on a category that
    # was not present during fitting. The encoder stays wrapped in a Pipeline
    # under the step name 'onehot' because show_feature_importance looks it up
    # by that name.
    categorical_transformer = Pipeline(steps=[
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])

    # Create a "preprocessor" that applies the correct transformer to the correct columns
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_features),
            ('cat', categorical_transformer, categorical_features)
        ],
        # Drop columns that were not declared as features. Passing them through
        # would hand undeclared (possibly non-numeric) columns to the regressor
        # and leave the importances without matching names in
        # show_feature_importance.
        remainder='drop'
    )

    # --- Define the Model ---
    # We use a RandomForestRegressor.
    # n_jobs=-1 uses all available CPU cores for training.
    model = RandomForestRegressor(
        n_estimators=n_estimators,
        random_state=random_state,
        n_jobs=-1,
        max_depth=max_depth,
    )

    # --- Create the Full Pipeline ---
    # This pipeline will first run the preprocessor, then train the model.
    ml_pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('regressor', model)
    ])

    print("Pipeline built successfully.")
    return ml_pipeline

def train_and_evaluate(ml_pipeline, X, y, test_size=0.2, random_state=42):
    """
    Splits data, trains the model, and evaluates its performance.

    Args:
        ml_pipeline: Unfitted estimator exposing ``fit`` and ``predict``.
        X: Feature table.
        y: Target values aligned with ``X``.
        test_size: Share of the samples held out for evaluation
            (default 0.2, i.e. an 80% / 20% split).
        random_state: Seed for the split, so the evaluation is reproducible.

    Returns:
        The same ``ml_pipeline`` object, now fitted on the training split.
    """
    print("\n--- Training and Evaluating Model ---")

    # 1. Split data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    print(f"Training data size: {X_train.shape[0]} samples")
    print(f"Testing data size:  {X_test.shape[0]} samples")

    # 2. Train the model
    print("Training model... (This may take a moment)")
    ml_pipeline.fit(X_train, y_train)
    print("Model training complete.")

    # 3. Evaluate the model on the held-out split only
    print("\n--- Model Evaluation (on Test Data) ---")
    y_pred = ml_pipeline.predict(X_test)

    # R-squared: "Coefficient of Determination". 1.0 is a perfect score.
    # It explains how much of the variance in Yield is "explained" by our features.
    r2 = r2_score(y_test, y_pred)

    # Mean Absolute Error (MAE): The average error in our prediction.
    # E.g., an MAE of 0.5 means our predictions are, on average,
    # off by 0.5 (in whatever unit 'Yield' is in).
    mae = mean_absolute_error(y_test, y_pred)

    print(f"R-squared (R²):     {r2:.4f}")
    print(f"Mean Absolute Error (MAE): {mae:.4f}")

    # Scores between 0.5 and 0.8 inclusive produce no extra message.
    if r2 < 0.5:
        print("Warning: R-squared is low. The model may not be capturing all patterns.")
    elif r2 > 0.8:
        print("Excellent! R-squared is high. The model explains the data well.")

    return ml_pipeline # Return the trained pipeline

def save_model(pipeline, filepath):
    """
    Persists the trained ML pipeline to disk with joblib.

    A failure is reported on stdout rather than raised, so the caller keeps
    running either way.

    Args:
        pipeline: The object to serialize.
        filepath: Destination path handed to ``joblib.dump``.
    """
    print("\n--- Saving Model ---")
    try:
        joblib.dump(pipeline, filepath)
        status = f"Model successfully saved to '{filepath}'"
    except Exception as dump_error:
        status = f"Error saving model: {dump_error}"
    print(status)

def show_feature_importance(ml_pipeline, categorical_features, numeric_features, top_n=20):
    """
    Extracts and displays the most important features from the trained model.

    Feature names are rebuilt as the numeric feature names followed by the
    one-hot encoded names, which is the order in which the preprocessor lists
    its 'num' and 'cat' transformers.

    Args:
        ml_pipeline: Fitted pipeline with the steps 'preprocessor' and
            'regressor'; the preprocessor must hold a 'cat' transformer whose
            'onehot' step provides ``get_feature_names_out``.
        categorical_features: Names of the one-hot encoded input columns.
        numeric_features: Names of the numeric input columns.
        top_n: How many features to print, most important first (default 20).

    Returns:
        None. The ranking, or an error message, is printed.
    """
    print(f"\n--- Top {top_n} Most Important Features ---")

    # Get the components from the pipeline
    preprocessor = ml_pipeline.named_steps['preprocessor']
    regressor = ml_pipeline.named_steps['regressor']

    # Get the raw feature importances
    importances_raw = regressor.feature_importances_

    # Get the feature names *after* one-hot encoding
    try:
        onehot_features = list(preprocessor.named_transformers_['cat']
                               .named_steps['onehot']
                               .get_feature_names_out(categorical_features))

        # Combine numeric and one-hot encoded feature names; list() lets
        # numeric_features be any sequence, not only a list.
        all_feature_names = list(numeric_features) + onehot_features

        # Create a pandas Series for easy sorting. This raises when the number
        # of names differs from the number of importances.
        importances = pd.Series(importances_raw, index=all_feature_names)

        # Sort and display the top entries
        top_features = importances.sort_values(ascending=False).head(top_n)

        print(top_features)

    except Exception as e:
        print(f"Could not retrieve feature names. Error: {e}")
        # Only a missing get_feature_names_out points to an outdated
        # scikit-learn; the method was introduced in release 1.0.
        if isinstance(e, AttributeError):
            print("Note: Feature importance display requires scikit-learn 1.0 or newer.")



In [4]:
# --- Main execution ---
# Runs the whole workflow in order: load the CSV, split it into features and
# target, build the pipeline, train and evaluate it, print the feature
# importances, and save the fitted pipeline with joblib.
if __name__ == "__main__":
    
    # Define the path to your data
    # Both paths are relative, so they resolve against the current working directory.
    data_filepath = 'crop_yield.csv'
    model_save_path = 'crop_yield_model.joblib'
    
    # 1. Load Data
    # load_data() calls sys.exit(1) when the file cannot be read, so nothing
    # below runs in that case.
    df = load_data(data_filepath)
    
    # 2. Preprocess Data
    # Returns the feature table, the 'Yield' target and the two lists of column names.
    X, y, cat_features, num_features = preprocess_data(df)
    
    # 3. Build Pipeline
    pipeline = build_model_pipeline(cat_features, num_features)
    
    # 4. Train and Evaluate
    # train_and_evaluate() returns the pipeline object it was given, after
    # fitting it, so `pipeline` and `trained_pipeline` are the same object.
    trained_pipeline = train_and_evaluate(pipeline, X, y)
    
    # 5. Show what the model learned
    show_feature_importance(trained_pipeline, cat_features, num_features)
    
    # 6. Save the trained model for the API
    # save_model() prints an error instead of raising, so the completion
    # message below is printed even when saving fails.
    save_model(trained_pipeline, model_save_path)
    
    print("\n--- ML Workflow Complete ---")



Successfully loaded data from 'crop_yield.csv'.
Data shape: (19689, 10)

--- Preprocessing Data ---
No missing values found. Data is clean.
Dropping 'Production' column to prevent data leakage.
Target (y): 'Yield'
Categorical Features (X): ['Crop', 'Season', 'State']
Numerical Features (X): ['Crop_Year', 'Area', 'Annual_Rainfall', 'Fertilizer', 'Pesticide']

--- Building ML Pipeline ---
Pipeline built successfully.

--- Training and Evaluating Model ---
Training data size: 15751 samples
Testing data size:  3938 samples
Training model... (This may take a moment)
Model training complete.

--- Model Evaluation (on Test Data) ---
R-squared (R²):     0.9795
Mean Absolute Error (MAE): 10.1345
Excellent! R-squared is high. The model explains the data well.

--- Top 20 Most Important Features ---
Crop_Coconut            0.846147
Annual_Rainfall         0.031777
Pesticide               0.021613
Area                    0.021390
State_Karnataka         0.019742
Fertilizer              0.013082
St