<a href="https://colab.research.google.com/github/SahilLokhande2604/Crop_Yield_Prediction/blob/main/Deep_Learning_Model_Crop_Yield_Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install scikeras

Collecting scikeras
  Downloading scikeras-0.13.0-py3-none-any.whl.metadata (3.1 kB)
Downloading scikeras-0.13.0-py3-none-any.whl (26 kB)
Installing collected packages: scikeras
Successfully installed scikeras-0.13.0


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
import tensorflow as tf
from tensorflow.keras import layers
from scikeras.wrappers import KerasRegressor

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv1D, MaxPooling1D, Flatten, Dropout

# Load the dataset (adjust the path when running outside Colab)
df = pd.read_csv("/content/crop_yield.csv")

# 'Yield' is the regression target; every other column is a candidate feature
X = df.drop('Yield', axis=1)
y = df['Yield']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Columns routed to each preprocessing branch
# NOTE(review): 'Area' and 'Production' are both used as features; if 'Yield' is
# derived from them (e.g. Production / Area) this leaks the target — confirm.
numeric_features = ['Crop_Year', 'Area', 'Production', 'Annual_Rainfall', 'Fertilizer', 'Pesticide']
categorical_features = ['Season', 'Crop', 'State']

preprocessor = ColumnTransformer(
    transformers=[
        # Standardize numeric columns to zero mean / unit variance
        ('num', StandardScaler(), numeric_features),
        # handle_unknown='ignore' encodes a category that was not seen during fit
        # as all zeros instead of raising when the pipeline predicts on new rows
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])

# Fit on the training split only, to learn how many columns the transformed matrix has
preprocessed_X_train = preprocessor.fit_transform(X_train)
input_shape = preprocessed_X_train.shape[1]


In [None]:
#  Neural network model
def build_model():
    """Build and compile the feed-forward regression network.

    Reads the module-level ``input_shape`` (the number of columns produced by
    the fitted preprocessor), so that variable must exist before this is called.

    Returns:
        A compiled ``tf.keras.Sequential`` model with one linear output unit,
        optimized with Adam on mean squared error and reporting MAE.
    """
    model = tf.keras.Sequential([
        # layers.Input(shape=...) replaces InputLayer(input_shape=...), whose
        # `input_shape` argument is deprecated in Keras 3
        layers.Input(shape=(input_shape,)),
        layers.Dense(128, activation='relu'),
        layers.Dropout(0.2),
        layers.Dense(64, activation='relu'),
        layers.Dropout(0.2),
        layers.Dense(32, activation='relu'),
        layers.Dense(1)  # Output layer for regression
    ])

    model.compile(optimizer='adam', loss='mse', metrics=['mae'])
    return model

# Build and train pipeline
# EarlyStopping watches 'val_loss', which Keras only reports when validation data
# exists. validation_split below holds out the last 20% of the training rows for
# that purpose; without it the callback never triggers and all epochs run.
early_stop = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', KerasRegressor(
        model=build_model,
        epochs=100,
        batch_size=32,
        validation_split=0.2,
        callbacks=[early_stop],
    )),
])

# Train the model (the preprocessor is re-fitted on X_train as part of the pipeline)
pipeline.fit(X_train, y_train)

# Predictions on the held-out test split
y_pred = pipeline.predict(X_test)




Epoch 1/100
[1m493/493[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 4ms/step - loss: 731609.5000 - mae: 86.3943
Epoch 2/100
[1m 11/493[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m2s[0m 5ms/step - loss: 138671.6406 - mae: 57.4619 

  current = self.get_monitor_value(logs)


[1m493/493[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - loss: 253444.6094 - mae: 57.7482
Epoch 3/100
[1m493/493[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - loss: 89379.0547 - mae: 29.9137
Epoch 4/100
[1m493/493[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 7ms/step - loss: 29666.1719 - mae: 20.5023
Epoch 5/100
[1m493/493[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 6ms/step - loss: 31300.7656 - mae: 16.9383
Epoch 6/100
[1m493/493[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 6ms/step - loss: 27334.5742 - mae: 17.3800
Epoch 7/100
[1m493/493[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - loss: 43203.2617 - mae: 18.8264
Epoch 8/100
[1m493/493[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 9ms/step - loss: 30781.3477 - mae: 17.1379
Epoch 9/100
[1m493/493[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 8ms/step - loss: 23573.8926 - mae: 15.7825
Epoch 10/100
[1m493/493[0m [32m━━━━━━━━━

In [None]:
from sklearn.metrics import mean_squared_error, r2_score

# Predict once on the whole test set. Every metric and sample below reuses this
# array, so the cell does not depend on a `y_pred` left over from another cell
# and the network is not run a second time.
y_pred = pipeline.predict(X_test)

# Evaluate
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
print(f'Root Mean Squared Error: {rmse}')

# Show the first few test rows next to their predictions
n_samples = min(5, len(X_test))  # never index past a very small test set
samples = X_test.iloc[:n_samples]
predictions = y_pred[:n_samples]

# Display the input data, predicted yield, actual yield, and absolute error
for i in range(n_samples):
    actual = y_test.iloc[i]
    print(f"\nSample {i+1}:")
    print(f"Input: {samples.iloc[i].to_dict()}")
    print(f"Predicted yield: {predictions[i]:.2f}")
    print(f"Actual yield: {actual:.2f}")
    print(f"Absolute error: {abs(predictions[i] - actual):.2f}")

# Mean squared error (already computed above)
print(f"\nMean Squared Error (MSE): {mse:.2f}")

# Calculate R-squared score
r2 = r2_score(y_test, y_pred)
print(f"R-squared (R²) score: {r2:.2f}")

Root Mean Squared Error: 116.27109388194101
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step

Sample 1:
Input: {'Crop': 'Peas & beans (Pulses)', 'Crop_Year': 2016, 'Season': 'Kharif     ', 'State': 'Jammu and Kashmir', 'Area': 210.0, 'Production': 1010, 'Annual_Rainfall': 902.8, 'Fertilizer': 32182.5, 'Pesticide': 73.5}
Predicted yield: 1.88
Actual yield: 3.82
Absolute error: 1.94

Sample 2:
Input: {'Crop': 'Maize', 'Crop_Year': 1999, 'Season': 'Rabi       ', 'State': 'Odisha', 'Area': 8270.0, 'Production': 10280, 'Annual_Rainfall': 1484.3, 'Fertilizer': 877695.1, 'Pesticide': 2232.9}
Predicted yield: 1.13
Actual yield: 1.40
Absolute error: 0.26

Sample 3:
Input: {'Crop': 'Potato', 'Crop_Year': 2016, 'Season': 'Winter     ', 'State': 'Meghalaya', 'Area': 6341.0, 'Production': 44026, 'Annual_Rainfall': 3837.1, 'Fertilizer': 971758.25, 'Pesticide': 2219.35}
Predicted yield: 6.05
Actual yield: 6.52
Absolute error: 0.47

Sample 4:
Input: {'Crop': 'Ragi', 'Crop_Year':

# Spearman’s Rank Correlation

In [None]:
from scipy.stats import spearmanr

# Rank-based (monotonic) agreement between observed and predicted yields
spearman_result = spearmanr(y_test, y_pred)
spearman_corr, spearman_p = spearman_result
print(
    f"Spearman Correlation: {spearman_corr:.4f}, P-value: {spearman_p:.4f}"
)


Spearman Correlation: 0.5317, P-value: 0.0000


# Pearson Correlation Coefficient

In [None]:
from scipy.stats import pearsonr

# Linear association between observed and predicted yields
pearson_corr, pearson_p = pearsonr(y_test, y_pred)
print(f"Pearson Correlation Coefficient: {pearson_corr:.4f}, P-value: {pearson_p:.4f}")

# Decide at the 5% significance level
print(
    "Reject the null hypothesis: Significant linear relationship exists."
    if pearson_p < 0.05
    else "Fail to reject the null hypothesis: No significant linear relationship."
)


Pearson Correlation Coefficient: 0.9916, P-value: 0.0000
Reject the null hypothesis: Significant linear relationship exists.


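Pearson measures the linear association and Spearman the monotonic (rank-order) association between actual and predicted yields. The two differ sharply above (0.99 vs. 0.53), which may indicate that a few extreme yield values dominate the Pearson coefficient.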

# Chi-Square Test

The chi-square test of independence checks whether actual and predicted yields, each binned into Low/Medium/High tertiles, are associated.

In [None]:
from scipy.stats import chi2_contingency

# Discretize both series into tertiles. Each series is cut at its OWN quantiles,
# so "Low"/"Medium"/"High" are relative to that series, not shared thresholds.
bin_labels = ["Low", "Medium", "High"]
actual_bins = pd.qcut(y_test, q=3, labels=bin_labels)
predicted_bins = pd.qcut(y_pred, q=3, labels=bin_labels)

# Cross-tabulate actual vs. predicted bins
contingency_table = pd.crosstab(actual_bins, predicted_bins)

# Chi-square test of independence on the contingency table
chi2_stat, chi2_p, dof, expected = chi2_contingency(contingency_table)
print(f"Chi-Square Statistic: {chi2_stat:.4f}, P-value: {chi2_p:.4f}")

# Decide at the 5% significance level
chi2_verdict = (
    "Reject the null hypothesis: Observed and predicted values are dependent."
    if chi2_p < 0.05
    else "Fail to reject the null hypothesis: Observed and predicted values are independent."
)
print(chi2_verdict)


Chi-Square Statistic: 1034.1082, P-value: 0.0000
Reject the null hypothesis: Observed and predicted values are dependent.


# Shapiro-Wilk Test for Normality

In [None]:
# Prediction error per test row (actual minus predicted)
residuals = y_test.sub(y_pred)


In [None]:
from scipy.stats import shapiro

# Shapiro-Wilk: the null hypothesis is that the residuals come from a normal distribution
shapiro_stat, shapiro_p = shapiro(residuals)
print(f"Shapiro-Wilk Statistic: {shapiro_stat:.4f}, P-value: {shapiro_p:.4f}")

# A p-value above 0.05 means normality is not rejected at the 5% level
normality_verdict = {
    True: "Residuals are normally distributed.",
    False: "Residuals are not normally distributed.",
}
print(normality_verdict[bool(shapiro_p > 0.05)])


Shapiro-Wilk Statistic: 0.0624, P-value: 0.0000
Residuals are not normally distributed.


# Wilcoxon Signed-Rank Test

In [None]:
from scipy.stats import wilcoxon

# Paired differences (actual minus predicted) fed to the signed-rank test
paired_diff = y_test - y_pred
wilcoxon_stat, wilcoxon_p = wilcoxon(paired_diff)
print(f"Wilcoxon Statistic: {wilcoxon_stat:.4f}, P-value: {wilcoxon_p:.4f}")

# Decide at the 5% significance level
wilcoxon_verdict = (
    "Reject the null hypothesis: Median difference is significant."
    if wilcoxon_p < 0.05
    else "Fail to reject the null hypothesis: Median difference is not significant."
)
print(wilcoxon_verdict)


Wilcoxon Statistic: 2715758.0000, P-value: 0.0000
Reject the null hypothesis: Median difference is significant.


# Friedman Test

In [None]:
from scipy.stats import friedmanchisquare

# Seeded generator so the simulated predictions, and therefore the test
# statistic, are identical on every run
friedman_rng = np.random.default_rng(42)

model_1_preds = y_pred  # Example: First model predictions
# Simulated second model: the true yields plus Gaussian noise (standard deviation 5).
# This is a placeholder, not a trained model.
model_2_preds = y_test + friedman_rng.normal(scale=5, size=len(y_test))

# Friedman test across the three paired series (actual, model 1, simulated model 2)
friedman_stat, friedman_p = friedmanchisquare(y_test, model_1_preds, model_2_preds)
print(f"Friedman Statistic: {friedman_stat:.4f}, P-value: {friedman_p:.4f}")
if friedman_p < 0.05:
    print("Reject the null hypothesis: Significant differences between conditions.")
else:
    print("Fail to reject the null hypothesis: No significant differences between conditions.")


Friedman Statistic: 135.6877, P-value: 0.0000
Reject the null hypothesis: Significant differences between conditions.
