In [1]:
pip install matplotlib

Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_predict
from sklearn.preprocessing import OneHotEncoder, RobustScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error


In [3]:
df = pd.read_csv('/workspaces/Agricultural_Yield_Prediction/agriculture_dataset (1).csv')

df.head()

Unnamed: 0,Farm_ID,Crop_Type,Farm_Area(acres),Irrigation_Type,Fertilizer_Used(tons),Pesticide_Used(kg),Yield(tons),Soil_Type,Season,Water_Usage(cubic meters)
0,F001,Cotton,329.4,Sprinkler,8.14,2.21,14.44,Loamy,Kharif,76648.2
1,F002,Carrot,18.67,Manual,4.77,4.36,42.91,Peaty,Kharif,68725.54
2,F003,Sugarcane,306.03,Flood,2.91,0.56,33.44,Silty,Kharif,75538.56
3,F004,Tomato,380.21,Rain-fed,3.32,4.35,34.08,Silty,Zaid,45401.23
4,F005,Tomato,135.56,Sprinkler,8.33,4.48,43.28,Clay,Zaid,93718.69


In [4]:
df.info()

df.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 10 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Farm_ID                    50 non-null     object 
 1   Crop_Type                  50 non-null     object 
 2   Farm_Area(acres)           50 non-null     float64
 3   Irrigation_Type            50 non-null     object 
 4   Fertilizer_Used(tons)      50 non-null     float64
 5   Pesticide_Used(kg)         50 non-null     float64
 6   Yield(tons)                50 non-null     float64
 7   Soil_Type                  50 non-null     object 
 8   Season                     50 non-null     object 
 9   Water_Usage(cubic meters)  50 non-null     float64
dtypes: float64(5), object(5)
memory usage: 4.0+ KB


Unnamed: 0,Farm_Area(acres),Fertilizer_Used(tons),Pesticide_Used(kg),Yield(tons),Water_Usage(cubic meters)
count,50.0,50.0,50.0,50.0,50.0
mean,254.9638,4.9054,2.398,27.0592,56724.2956
std,139.417782,2.732689,1.438613,13.345789,27264.992053
min,12.5,0.5,0.14,3.86,5869.75
25%,135.71,2.4375,0.9725,16.19,37818.1525
50%,281.98,5.045,2.33,28.97,54097.075
75%,368.1075,6.885,3.4175,37.86,82240.0325
max,483.88,9.96,4.99,48.02,94754.73


In [5]:
# 2. Minimal Feature Engineering (Avoid overfitting!)
df['Water_Efficiency'] = df['Yield(tons)'] / (df['Water_Usage(cubic meters)'] + 1)
df['Fertilizer_Efficiency'] = df['Yield(tons)'] / (df['Fertilizer_Used(tons)'] + 1)


In [6]:
# 3. Target: Keep original yield (no normalization for small data)
X = df.drop(['Yield(tons)', 'Farm_ID'], axis=1)
y = df['Yield(tons)']

In [7]:
# 4. Train-Test Split (80-20 for small data)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)


In [8]:
# 5. Simple Preprocessing
numeric_features = ['Farm_Area(acres)', 'Fertilizer_Used(tons)',
                   'Pesticide_Used(kg)', 'Water_Usage(cubic meters)','Water_Efficiency', 'Fertilizer_Efficiency']
categorical_features = ['Crop_Type', 'Irrigation_Type', 'Soil_Type', 'Season']

In [9]:
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.linear_model import LinearRegression

# Define preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', RobustScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])

# Define multiple models
models = {
    "Gradient Boosting": GradientBoostingRegressor(),
    "Random Forest": RandomForestRegressor(),
    "Linear Regression": LinearRegression()
}

# Evaluate each model
results = {}
for name, model in models.items():
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('model', model)
    ])
    
    # Cross-validation
    y_pred = cross_val_predict(pipeline, X_train, y_train, cv=5)
    
    # Train and predict
    pipeline.fit(X_train, y_train)
    test_pred = pipeline.predict(X_test)
    
    # Evaluation
    train_r2 = r2_score(y_train, y_pred)
    test_r2 = r2_score(y_test, test_pred)
    train_mae = mean_absolute_error(y_train, y_pred)
    test_mae = mean_absolute_error(y_test, test_pred)
    train_rmse = mean_squared_error(y_train, y_pred)
    test_rmse = mean_squared_error(y_test, test_pred)
    avg_r2 = (train_r2 + test_r2) / 2  

    results[name] = {
        'Train R²': train_r2,
        'Test R²': test_r2,
        'Avg R²': avg_r2,
        'Model': pipeline
    }

#best model by Avg R²
best_model_name = max(results, key=lambda k: results[k]['Avg R²'])
best_model = results[best_model_name]['Model']

# Print results
for model, scores in results.items():
    print(f"{model}: Train R² = {scores['Train R²']:.4f}, Test R² = {scores['Test R²']:.4f}")
    print("-" * 40)

# Display best model
print(f"🏆 Best Model: {best_model_name}")
print(f"Train R²: {results[best_model_name]['Train R²']:.4f}")
print(f"Test R²: {results[best_model_name]['Test R²']:.4f}")

Gradient Boosting: Train R² = 0.6570, Test R² = 0.6420
----------------------------------------
Random Forest: Train R² = 0.6809, Test R² = 0.5720
----------------------------------------
Linear Regression: Train R² = -0.5185, Test R² = 0.6129
----------------------------------------
🏆 Best Model: Gradient Boosting
Train R²: 0.6570
Test R²: 0.6420


In [11]:
import joblib

# Save the best model to a file
joblib.dump(best_model, 'best_model.joblib')

print("✅ Best model saved as 'best_model.joblib'")


✅ Best model saved as 'best_model.joblib'
