In [11]:
import pandas as pd
import xgboost as xgb
from xgboost.callback import EarlyStopping
from sklearn.model_selection import train_test_split
from sklearn.metrics import root_mean_squared_error
import numpy as np

# Loading Data Set
df = pd.read_csv('./backend/prediction-model/Dataset/Agriculture_price_dataset.csv')

# Converting to datetime format
df['Price Date'] = pd.to_datetime(df['Price Date'])
                                     
# Striping and sorting data to maintain the order
df.columns = df.columns.str.strip()
df = df.sort_values('Price Date')

df.head()

Unnamed: 0,STATE,District Name,Market Name,Commodity,Variety,Grade,Min_Price,Max_Price,Modal_Price,Price Date
0,Maharashtra,nashik,Lasalgaon(Niphad),Wheat,Maharashtra 2189,FAQ,2172.0,2399.0,2300.0,2023-06-06
1241,Uttar Pradesh,bijnor,Chaandpur,Tomato,Hybrid,FAQ,600.0,700.0,650.0,2023-06-06
1240,Jammu & Kashmir,jammu,Batote,Tomato,Other,FAQ,1800.0,2200.0,2000.0,2023-06-06
1239,Gujarat,dahod,Dahod,Wheat,147 Average,FAQ,2500.0,2700.0,2600.0,2023-06-06
1238,Madhya Pradesh,guna,Guna(F&V),Tomato,Other,FAQ,350.0,530.0,410.0,2023-06-06


In [6]:
# Creating time-based features
df['year'] = df['Price Date'].dt.year
df['month'] = df['Price Date'].dt.month
df['day'] = df['Price Date'].dt.day
df['dayofweek'] = df['Price Date'].dt.dayofweek

df = pd.get_dummies(df, columns=['STATE', 'District Name', 'Market Name', 'Commodity', 'Variety', 'Grade'])

# Pridicting 'Modal Price' with defined features (X) and target (y)
X = df.drop(['Price Date', 'Min_Price', 'Max_Price', 'Modal_Price'], axis=1)
y = df['Modal_Price']

X.head()

Unnamed: 0,year,month,day,dayofweek,STATE_ Punjab,STATE_Andhra Pradesh,STATE_Assam,STATE_Bihar,STATE_Chandigarh,STATE_Chattisgarh,...,Variety_Tomato,Variety_WH-542,Variety_White,Grade_FAQ,Grade_Large,Grade_Local,Grade_Medium,Grade_Non-FAQ,Grade_Ref grade-1,Grade_Ref grade-2
0,2023,6,6,1,False,False,False,False,False,False,...,False,False,False,True,False,False,False,False,False,False
1241,2023,6,6,1,False,False,False,False,False,False,...,False,False,False,True,False,False,False,False,False,False
1240,2023,6,6,1,False,False,False,False,False,False,...,False,False,False,True,False,False,False,False,False,False
1239,2023,6,6,1,False,False,False,False,False,False,...,False,False,False,True,False,False,False,False,False,False
1238,2023,6,6,1,False,False,False,False,False,False,...,False,False,False,True,False,False,False,False,False,False


In [7]:
# Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Convert to DMatrix — XGBoost's internal format
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

In [9]:
# XGBoost parameters
params = {
    'max_depth': 7,
    'eta': 0.01,
    'subsample': 0.8,
    'objective': 'reg:squarederror',
    'eval_metric': 'rmse',
    'seed': 42
}

# Train model
evals = [(dtrain, 'train'), (dtest, 'eval')]
model = xgb.train(params, dtrain, num_boost_round=1000, evals=evals, verbose_eval=100)

[0]	train-rmse:1969.74158	eval-rmse:2172.01479
[100]	train-rmse:1322.46124	eval-rmse:1608.93325
[200]	train-rmse:1145.79163	eval-rmse:1477.08872
[300]	train-rmse:1039.57941	eval-rmse:1420.93354
[400]	train-rmse:922.12867	eval-rmse:1394.70423
[500]	train-rmse:842.56089	eval-rmse:1384.67671
[600]	train-rmse:806.76335	eval-rmse:1380.27614
[700]	train-rmse:782.30680	eval-rmse:1376.77857
[800]	train-rmse:764.35024	eval-rmse:1373.04187
[900]	train-rmse:747.19257	eval-rmse:1370.86454
[999]	train-rmse:735.48751	eval-rmse:1367.87233


In [12]:
# Predict and evaluate
y_pred = model.predict(dtest)
rmse = root_mean_squared_error(y_test, y_pred)
model.save_model("./backend/prediction-model/Model/xgb_price_model.json")
print(f"Test RMSE: {rmse:.4f}")

Test RMSE: 1367.8723
