In [1]:
pip install pandas numpy scikit-learn joblib


Note: you may need to restart the kernel to use updated packages.


In [2]:
# Imports
import pandas as pd
import numpy as np
import logging
import os
import traceback
import pickle

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

# Setup logging
log_folder = "logs"
os.makedirs(log_folder, exist_ok=True)
log_file = os.path.join(log_folder, "crop_price_prediction.log")

logging.basicConfig(
    filename=log_file,
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)

def log_exception():
    """Logs the last exception with traceback info."""
    exc_info = traceback.format_exc()
    logging.error(f"Exception occurred: {exc_info}")

try:
    # Load dataset
    dataset_path = "India_Crop_Price_Prediction_Dataset.csv"
    df = pd.read_csv(dataset_path)
    logging.info("Dataset loaded successfully.")
    df = df.drop(columns=['Year'])
    # Features and target
    X = df.drop(columns=["Next_Year_Price"])
    y = df["Next_Year_Price"]
    logging.info("Features and target variable separated.")

    # Identify categorical and numerical columns
    categorical_cols = ["Crop", "Region"]
    numerical_cols = [col for col in X.columns if col not in categorical_cols]
    logging.info(f"Categorical columns: {categorical_cols}")
    logging.info(f"Numerical columns: {numerical_cols}")

    # Preprocessing for numerical and categorical data
    numeric_transformer = StandardScaler()
    categorical_transformer = OneHotEncoder(drop='first', handle_unknown='ignore')

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numerical_cols),
            ('cat', categorical_transformer, categorical_cols)
        ]
    )

    # Random Forest Regression pipeline
    rf_pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', RandomForestRegressor(n_estimators=100, random_state=42))
    ])

    # Split the data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )
    logging.info("Data split into training and testing sets.")

    # Train Random Forest Regression
    rf_pipeline.fit(X_train, y_train)
    y_pred_rf = rf_pipeline.predict(X_test)
    logging.info("Random Forest Regression model trained and predictions made.")

    # Evaluation function
    def evaluate_model(y_true, y_pred, model_name):
        r2 = r2_score(y_true, y_pred)
        mae = mean_absolute_error(y_true, y_pred)
        mse = mean_squared_error(y_true, y_pred)
        rmse = np.sqrt(mse)

        logging.info(f"{model_name} - R2: {r2:.4f}, MAE: {mae:.4f}, MSE: {mse:.4f}, RMSE: {rmse:.4f}")

        print(f"\n----- {model_name} -----")
        print("R2 Score:", r2)
        print("Mean Absolute Error:", mae)
        print("Mean Squared Error:", mse)
        print("Root Mean Squared Error:", rmse)

    # Evaluate Random Forest Regression
    evaluate_model(y_test, y_pred_rf, "Random Forest Regression")

    # Confusion matrix equivalent for regression (using bins)
    def regression_confusion_matrix(y_true, y_pred, bins=5, model_name="Model"):
        y_true_binned = pd.cut(y_true, bins=bins, labels=False)
        y_pred_binned = pd.cut(y_pred, bins=bins, labels=False)

        conf_matrix = pd.crosstab(y_true_binned, y_pred_binned, rownames=['Actual Bin'], colnames=['Predicted Bin'])

        logging.info(f"{model_name} - Regression confusion matrix:\n{conf_matrix}")

        print(f"\nConfusion Matrix (Binned) - {model_name}")
        print(conf_matrix)

    regression_confusion_matrix(y_test, y_pred_rf, bins=5, model_name="Random Forest Regression")

    # ---------- SAVE MODEL AND PREPROCESSORS SEPARATELY ---------- #

    # Folder to save models
    models_folder = r"C:\Users\VINIL\Desktop\Market_price_ML\flask_app\models"
    os.makedirs(models_folder, exist_ok=True)

    # Define file paths
    model_path = os.path.join(models_folder, "random_forest_model.pkl")
    encoder_path = os.path.join(models_folder, "onehot_encoder.pkl")
    scaler_path = os.path.join(models_folder, "standard_scaler.pkl")

    # Extract trained components from the pipeline
    trained_preprocessor = rf_pipeline.named_steps['preprocessor']
    trained_numeric_scaler = trained_preprocessor.named_transformers_['num']
    trained_categorical_encoder = trained_preprocessor.named_transformers_['cat']

    trained_rf_model = rf_pipeline.named_steps['regressor']

    # Save Random Forest model
    with open(model_path, "wb") as model_file:
        pickle.dump(trained_rf_model, model_file)

    # Save the OneHotEncoder
    with open(encoder_path, "wb") as encoder_file:
        pickle.dump(trained_categorical_encoder, encoder_file)

    # Save the StandardScaler
    with open(scaler_path, "wb") as scaler_file:
        pickle.dump(trained_numeric_scaler, scaler_file)

    logging.info("✅ Random Forest model and preprocessing objects saved successfully.")
    print("✅ Model and encoders saved successfully in:", models_folder)

except Exception as e:
    log_exception()
    print(f"An error occurred: {e}")



----- Random Forest Regression -----
R2 Score: 0.9253205705440292
Mean Absolute Error: 183.40730519480522
Mean Squared Error: 50941.98953409091
Root Mean Squared Error: 225.70332193853707

Confusion Matrix (Binned) - Random Forest Regression
Predicted Bin   0   1   2   3   4
Actual Bin                       
0              49  10   0   0   0
1               1  81  13   0   0
2               0   8  34  21   0
3               0   0   8  35  18
4               0   0   0   8  22
✅ Model and encoders saved successfully in: C:\Users\VINIL\Desktop\Market_price_ML\flask_app\models
