In [14]:
import pandas as pd
import xgboost as xgb
from xgboost.callback import EarlyStopping
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import numpy as np

# Loading Data Set
df = pd.read_csv('Agriculture_price_dataset.csv')

# Converting to datetime format
df['Price Date'] = pd.to_datetime(df['Price Date'])
                                     
# Striping and sorting data to maintain the order
df.columns = df.columns.str.strip()
df = df.sort_values('Price Date')

df.head()

Unnamed: 0,STATE,District Name,Market Name,Commodity,Variety,Grade,Min_Price,Max_Price,Modal_Price,Price Date
0,Maharashtra,nashik,Lasalgaon(Niphad),Wheat,Maharashtra 2189,FAQ,2172.0,2399.0,2300.0,2023-06-06
1241,Uttar Pradesh,bijnor,Chaandpur,Tomato,Hybrid,FAQ,600.0,700.0,650.0,2023-06-06
1240,Jammu & Kashmir,jammu,Batote,Tomato,Other,FAQ,1800.0,2200.0,2000.0,2023-06-06
1239,Gujarat,dahod,Dahod,Wheat,147 Average,FAQ,2500.0,2700.0,2600.0,2023-06-06
1238,Madhya Pradesh,guna,Guna(F&V),Tomato,Other,FAQ,350.0,530.0,410.0,2023-06-06


In [10]:
# Creating time-based features
df['year'] = df['Price Date'].dt.year
df['month'] = df['Price Date'].dt.month
df['day'] = df['Price Date'].dt.day
df['dayofweek'] = df['Price Date'].dt.dayofweek

df = pd.get_dummies(df, columns=['STATE', 'District Name', 'Market Name', 'Commodity', 'Variety', 'Grade'])

# Pridicting 'Modal Price' with defined features (X) and target (y)
X = df.drop(['Price Date', 'Min_Price', 'Max_Price', 'Modal_Price'], axis=1)
y = df['Modal_Price']

X.head()

Unnamed: 0,year,month,day,dayofweek,STATE_ Punjab,STATE_Andhra Pradesh,STATE_Assam,STATE_Bihar,STATE_Chandigarh,STATE_Chattisgarh,...,Variety_Tomato,Variety_WH-542,Variety_White,Grade_FAQ,Grade_Large,Grade_Local,Grade_Medium,Grade_Non-FAQ,Grade_Ref grade-1,Grade_Ref grade-2
0,2023,6,6,1,False,False,False,False,False,False,...,False,False,False,True,False,False,False,False,False,False
1241,2023,6,6,1,False,False,False,False,False,False,...,False,False,False,True,False,False,False,False,False,False
1240,2023,6,6,1,False,False,False,False,False,False,...,False,False,False,True,False,False,False,False,False,False
1239,2023,6,6,1,False,False,False,False,False,False,...,False,False,False,True,False,False,False,False,False,False
1238,2023,6,6,1,False,False,False,False,False,False,...,False,False,False,True,False,False,False,False,False,False


In [11]:
# Calculate split point
split_point = int(len(df) * 0.8)

# Spliting data into training and testing data (Chronologically)
X_train, X_test = X[:split_point], X[split_point:]
y_train, y_test = y[:split_point], y[split_point:]

In [None]:
reg = xgb.XGBRegressor(n_estimators= 1000,
                             max_depth= 7,
                             learning_rate= 0.01,
                             subsample= 0.8,
                             random_state= 42)

early_stopping = EarlyStopping(rounds= 50,
                                            metric_name= 'rmse',
                                            data_name= 'validation_1')

reg.fit(X_train, y_train, 
        eval_set= [(X_train, y_train), (X_test, y_test)],
        callbacks=[early_stopping],
        verbose= 100)

TypeError: XGBModel.fit() got an unexpected keyword argument 'early_stopping_rounds'

In [None]:
# Make predictions on the test set
preds = reg.predict(X_test)

# Calculate the error
# RMSE (Root Mean Squared Error) is a common metric. It tells you, on average, how far your predictions are from the actual values.
rmse = np.sqrt(mean_squared_error(y_test, preds))
print(f"Root Mean Squared Error (RMSE): {rmse}")

# You can also look at a few predictions vs actual values
results = pd.DataFrame({'Actual': y_test, 'Predicted': preds})
print("\nSample of predictions vs actual values:")
print(results.head())