In [2]:
import pandas as pd
import numpy as np
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.preprocessing import FunctionTransformer, StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [3]:
# Read cleaned_data.csv (path is relative to the current working directory) into a DataFrame
data = pd.read_csv(filepath_or_buffer='cleaned_data.csv')


In [4]:
# Feature groups used by the preprocessor:
#   columns_to_transform -> log1p, then standardized
#   columns_to_scale     -> all numeric features; the ones NOT in
#                           columns_to_transform are standardized only
#   categorical_features -> one-hot encoded
columns_to_transform = ['Area', 'Production', 'Fertilizer', 'Pesticide']
columns_to_scale = ['Crop_Year', 'Annual_Rainfall', 'Area', 'Production', 'Fertilizer', 'Pesticide']
categorical_features = ['Crop', 'State', 'Season']

# Separate features and target variable
# NOTE(review): if 'Yield' is derived from 'Production' and 'Area' in this
# dataset, keeping 'Production' as a feature leaks the target — confirm.
target_column = 'Yield'
X = data.drop(target_column, axis=1)
y = data[target_column]

# log1p computes log(1 + x); validate=True makes the transformer check its
# input and convert it to a 2-D array before np.log1p is applied.
log_transformer = FunctionTransformer(np.log1p, validate=True)

# Column-wise preprocessing. Any column of X that is not listed in one of the
# three groups below is dropped (ColumnTransformer's default remainder='drop').
preprocessor = ColumnTransformer(
    transformers=[
        # Numeric columns that are log-transformed first, then standardized
        ('log_and_scale', Pipeline(steps=[
            ('log', log_transformer),
            ('scale', StandardScaler())
        ]), columns_to_transform),
        # Remaining numeric columns: standardized only
        ('scale_only', StandardScaler(), [col for col in columns_to_scale if col not in columns_to_transform]),
        # handle_unknown='ignore': a category that was absent when the encoder
        # was fitted (e.g. a rare Crop/State/Season missing from a CV training
        # fold) is encoded as all zeros instead of raising at transform time.
        # With the default ('error') such fits fail, and GridSearchCV's default
        # error_score turns them into NaN scores.
        ('cat', OneHotEncoder(sparse_output=False, handle_unknown='ignore'), categorical_features)
    ]
)

# Full pipeline: preprocessing followed by an XGBoost regressor with default settings
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('model', XGBRegressor())])


In [5]:




# Hold out 20% of the rows as a test set; rows are shuffled with a fixed seed
# so the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=True,
    random_state=42,
)







In [6]:
# Hyperparameter search space for the pipeline's 'model' step (the
# 'model__' prefix routes each setting to that step).
# 8 parameters x 3 values each = 3**8 = 6561 candidates; with cv=5 that is
# 32,805 fits plus the final refit, so this cell is expensive to run.
param_grid = dict(
    model__colsample_bytree=[0.7, 0.8, 0.9],
    model__gamma=[0.1, 0.3, 0.5],
    model__learning_rate=[0.01, 0.1, 0.2],
    model__max_depth=[3, 5, 7],
    model__n_estimators=[100, 200, 300],
    model__reg_alpha=[0, 0.1, 0.5],
    model__reg_lambda=[0.5, 1.0, 1.5],
    model__subsample=[0.7, 0.8, 0.9],
)

# Exhaustive 5-fold cross-validated search, ranked by R², using all available cores
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='r2',
    cv=5,
    n_jobs=-1,
)

# Run the search on the training split only
grid_search.fit(X_train, y_train)

# Report the winning parameter combination
best_params = grid_search.best_params_
print("Best parameters found: ", best_params)

Best parameters found:  {'model__colsample_bytree': 0.9, 'model__gamma': 0.1, 'model__learning_rate': 0.2, 'model__max_depth': 5, 'model__n_estimators': 100, 'model__reg_alpha': 0, 'model__reg_lambda': 1.5, 'model__subsample': 0.7}


In [7]:
# Test-set predictions from the best pipeline found by the search
y_pred = grid_search.best_estimator_.predict(X_test)

# R² on both splits: grid_search.score applies the search's scoring ('r2')
# to the best estimator
train_r2 = grid_search.score(X_train, y_train)
test_r2 = grid_search.score(X_test, y_test)

# Error metrics on the held-out test set
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

# One metric per line
print(
    f"Training R²: {train_r2}",
    f"Test R²: {test_r2}",
    f"MAE: {mae}",
    f"MSE: {mse}",
    f"RMSE: {rmse}",
    sep="\n",
)


Training R²: 0.9999529307841566
Test R²: 0.9463286499455389
MAE: 36.33320310525322
MSE: 106454.95379035227
RMSE: 326.2743535590137
