In [None]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer, StandardScaler
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.utils.validation import check_is_fitted
import numpy as np
import pandas as pd

# Custom transformer for frequency encoding
class FrequencyEncoder(BaseEstimator, TransformerMixin):
    """Replace every category with its relative frequency seen during ``fit``.

    Each column is encoded independently: ``fit`` records, per column, the
    share of rows taken by each category (``value_counts(normalize=True)``),
    and ``transform`` maps every value to that share.

    Attributes
    ----------
    freq_dict_ : dict
        ``{column: {category: relative_frequency}}``. Created by ``fit`` only.

    Notes
    -----
    * There is deliberately no ``__init__`` that pre-populates ``freq_dict_``.
      ``check_is_fitted`` treats the presence of a trailing-underscore
      attribute as proof of fitting, so creating it at construction time
      would make an unfitted encoder look fitted.
    * Categories that were not present during ``fit`` are encoded as NaN.
    """

    def fit(self, X, y=None):
        """Learn the per-column category frequencies.

        Parameters
        ----------
        X : array-like or DataFrame of shape (n_samples, n_features)
            Categorical input columns.
        y : ignored
            Present for scikit-learn API compatibility.

        Returns
        -------
        self : FrequencyEncoder
        """
        X_df = pd.DataFrame(X)  # Accept numpy arrays as well as DataFrames
        self.freq_dict_ = {
            col: X_df[col].value_counts(normalize=True).to_dict()
            for col in X_df.columns
        }
        return self

    def transform(self, X, y=None):
        """Encode ``X`` with the frequencies learned in ``fit``.

        Parameters
        ----------
        X : array-like or DataFrame of shape (n_samples, n_features)
            Must expose the same column labels that were seen in ``fit``.
        y : ignored
            Present for scikit-learn API compatibility.

        Returns
        -------
        numpy.ndarray of shape (n_samples, n_features)
            Relative frequencies; NaN for categories unseen during ``fit``.

        Raises
        ------
        sklearn.exceptions.NotFittedError
            If ``fit`` has not been called.
        """
        # Name the attribute explicitly so the check cannot be satisfied by
        # any other trailing-underscore attribute.
        check_is_fitted(self, "freq_dict_")
        X_transformed = pd.DataFrame(X).copy()  # Never mutate the caller's data
        for col in X_transformed.columns:
            X_transformed[col] = X_transformed[col].map(self.freq_dict_[col])
        return X_transformed.values  # Return numpy array

# Single-step pipelines wrapping each categorical encoder so they can be
# plugged into the ColumnTransformer below under a stable step name.
# sparse_output=False makes OneHotEncoder return a dense array.
freq_enc_pipeline = Pipeline(steps=[('freq_enc', FrequencyEncoder())])
one_hot_enc_pipeline = Pipeline(steps=[('one_hot_enc', OneHotEncoder(sparse_output=False))])

# Compute per-hectare pesticide and fertilizer ratios and log-transform them (standard scaling is applied afterwards, in ratio_pipeline)
def compute_ratio_log_drop(df):
    """Derive log-scaled per-area pesticide and fertilizer features.

    Adds ``Log_Pesticide_per_hectar`` and ``Log_Fertilizer_per_hectar``
    (``log1p`` of the respective column divided by ``Area``) and removes the
    raw ``Pesticide``, ``Fertilizer`` and ``Area`` columns. The input frame is
    left untouched; a new DataFrame is returned.
    """
    pesticide_ratio = df["Pesticide"] / df["Area"]
    fertilizer_ratio = df["Fertilizer"] / df["Area"]
    return df.assign(
        Log_Pesticide_per_hectar=np.log1p(pesticide_ratio),
        Log_Fertilizer_per_hectar=np.log1p(fertilizer_ratio),
    ).drop(columns=["Pesticide", "Fertilizer", "Area"])

# Log-transform annual rainfall (standard scaling is applied afterwards, in log_pipiline)
def log_transform_drop(df):
    """Swap ``Annual_Rainfall`` for its ``log1p`` transform.

    Returns a new DataFrame in which ``Log_annual_rainfall`` has been added
    and the raw ``Annual_Rainfall`` column removed; the input is not modified.
    """
    rainfall_log = np.log1p(df["Annual_Rainfall"])
    return df.assign(Log_annual_rainfall=rainfall_log).drop(columns=["Annual_Rainfall"])

# Numeric branch 1: derive the log per-area ratios, then standardize them.
ratio_pipeline = make_pipeline(FunctionTransformer(compute_ratio_log_drop), StandardScaler())

# Numeric branch 2: log-transform rainfall, then standardize it.
# NOTE(review): "log_pipiline" looks like a typo for "log_pipeline"; the name is
# left unchanged here because other cells may reference it.
log_pipiline = make_pipeline(FunctionTransformer(log_transform_drop), StandardScaler())

# Route each group of input columns to its own sub-pipeline.
# No `remainder` argument is passed, so ColumnTransformer falls back to its
# default ('drop'): columns not listed below are not forwarded.
preprocessing = ColumnTransformer([
    # Pesticide / Fertilizer per Area -> log1p -> StandardScaler
    ("compute_ratio_log_and_scale", ratio_pipeline, ["Pesticide", "Area", "Fertilizer"]),
    # Annual_Rainfall -> log1p -> StandardScaler
    ("log_transform_and_scale", log_pipiline, ["Annual_Rainfall"]),
    # Crop, State -> relative frequency of each category
    ("freq_enc", freq_enc_pipeline, ["Crop", "State"]),
    # Season -> dense one-hot columns
    ("one_hot_enc", one_hot_enc_pipeline, ["Season"])
])


In [None]:
import joblib 

# SECURITY: joblib.load unpickles arbitrary objects, so only load model files
# that come from a trusted source.
# NOTE(review): presumably the pickled model references FrequencyEncoder and the
# helper functions defined in the first cell, so that cell has to run before
# this one — confirm.
model_3_loaded = joblib.load("Model_vesrion_3_Log.pkl")

In [17]:
def cleaning(path='crop_yield.csv', min_yield=0.1, max_yield=50, excluded_crops=('Coconut',)):
    """Load the crop-yield CSV and keep only the rows used for evaluation.

    Called with no arguments this behaves exactly like the original
    hard-coded version; the parameters only expose the former constants.

    Parameters
    ----------
    path : str or path-like, default 'crop_yield.csv'
        CSV file with a header row and an ``ID`` column used as the index.
    min_yield : float, default 0.1
        Exclusive lower bound on ``Yield``.
    max_yield : float, default 50
        Inclusive upper bound on ``Yield``.
    excluded_crops : iterable of str, default ('Coconut',)
        ``Crop`` values to remove (exact string match).

    Returns
    -------
    pandas.DataFrame
        The data without ``Crop_Year`` and ``Production``, restricted to
        ``min_yield < Yield <= max_yield`` and to crops not in
        ``excluded_crops``.
    """
    raw = pd.read_csv(path, header=0, index_col='ID')
    # Non-mutating drop: no in-place edits, so re-running the cell is idempotent.
    trimmed = raw.drop(columns=['Crop_Year', 'Production'])
    yield_in_range = (trimmed['Yield'] > min_yield) & (trimmed['Yield'] <= max_yield)
    # list(...) lets callers pass any iterable (tuple, set, generator).
    crop_allowed = ~trimmed['Crop'].isin(list(excluded_crops))
    return trimmed[yield_in_range & crop_allowed]

In [23]:
# Load the cleaned dataset, then split off the target column so that only
# the feature columns are handed to the model.
new_data = cleaning()

new_data_for_pred = new_data.drop(columns='Yield')

In [24]:
# Run the loaded model on every cleaned row.
# NOTE(review): if crop_yield.csv is the data the model was trained on, the
# metrics computed from these predictions are in-sample — confirm.
predictions = model_3_loaded.predict(new_data_for_pred)

In [25]:
# Pair actual and predicted values side by side. The "_3" suffix has to match
# the Actual_{model_name} / Predicted_{model_name} lookup done in
# calculate_metrics_as_df.
df_check = pd.DataFrame({"Actual_3": new_data['Yield'],"Predicted_3": predictions})

In [28]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, mean_absolute_percentage_error
from math import sqrt


def calculate_metrics_as_df(df, model_name):
    """Summarize regression errors for one model as a single-row DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``Actual_<model_name>`` and
        ``Predicted_<model_name>``.
    model_name : str or int
        Suffix identifying the model's columns; also used in the ``Model``
        label of the result.

    Returns
    -------
    pandas.DataFrame
        One row with columns ``Model``, ``MAE``, ``MSE``, ``RMSE``, ``R2``
        and ``MAPE``.
    """
    y_true = df[f'Actual_{model_name}']
    y_pred = df[f'Predicted_{model_name}']

    abs_error = mean_absolute_error(y_true, y_pred)
    sq_error = mean_squared_error(y_true, y_pred)
    root_sq_error = sqrt(sq_error)
    determination = r2_score(y_true, y_pred)
    pct_error = mean_absolute_percentage_error(y_true, y_pred)

    summary_row = {
        'Model': f'Model_{model_name}',
        'MAE': abs_error,
        'MSE': sq_error,
        'RMSE': root_sq_error,
        'R2': determination,
        'MAPE': pct_error,
    }
    return pd.DataFrame([summary_row])


# Score model 3 on the actual/predicted pairs collected in df_check.
metrics = calculate_metrics_as_df(df_check, 3)

# Bare last expression so the notebook renders the metrics table.
metrics

Unnamed: 0,Model,MAE,MSE,RMSE,R2,MAPE
0,Model_3,2.003518,31.477605,5.610491,0.081636,0.375837
