In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
import pickle
import os

# Training pipeline: load the crop price CSV, derive calendar features from
# the date, fit a StandardScaler + XGBRegressor on (year, month, min_price,
# max_price) -> modal_price, persist both artifacts under models/, and print
# hold-out error metrics.

# Load Data from CSV
df = pd.read_csv('../data/crop_price_data.csv')

# Fail fast if the CSV lacks any column this pipeline was written against
required_columns = ['commodity_name', 'state', 'district', 'market', 'min_price', 'max_price', 'modal_price', 'date']
if not all(column in df.columns for column in required_columns):
    raise ValueError("DataFrame does not contain all required columns")

# Inspect the date column to understand its format
print(df['date'].head(10))

# Convert 'date' to datetime with automatic format detection;
# errors='coerce' turns unparseable values into NaT instead of raising
df['date'] = pd.to_datetime(df['date'], errors='coerce')

# Check for any NaT values that indicate conversion issues
if df['date'].isna().any():
    print("Warning: Some dates could not be converted.")

# Extract year and month (NaT rows yield NaN here and are removed just below)
df['year'] = df['date'].dt.year
df['month'] = df['date'].dt.month

# Drop rows whose date failed to parse: they have no usable year/month feature
df = df.dropna(subset=['date'])

# Features and Labels
X = df[['year', 'month', 'min_price', 'max_price']]
y = df['modal_price']

# Split Data (70% train / 30% test, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Feature Scaling: fit on the training split only so test statistics do not leak
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Save the fitted scaler to a .pkl file
os.makedirs('models', exist_ok=True)
with open('models/scaler.pkl', 'wb') as file:
    pickle.dump(scaler, file)

# Initialize and Train Model
model = XGBRegressor(objective='reg:squarederror')
model.fit(X_train_scaled, y_train)

# Save the trained model to a .pkl file
with open('models/xgb_model.pkl', 'wb') as file:
    pickle.dump(model, file)

# Predict
y_pred = model.predict(X_test_scaled)

# Evaluate
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
print("Mean Absolute Error:", mae)
print("Mean Squared Error:", mse)
# RMSE is derived from the MSE rather than via `squared=False`: that argument
# was deprecated in scikit-learn 1.4 and removed in 1.6, where it raises
# TypeError. The square root works on every scikit-learn version.
print("Root Mean Squared Error:", mse ** 0.5)

# Function for Price Prediction
def predict_price(year, month, min_price, max_price):
    """Predict the modal price for a single observation.

    The scaler and the model are re-read from ``models/scaler.pkl`` and
    ``models/xgb_model.pkl`` on every call, so the prediction always reflects
    whatever artifacts are currently on disk.

    NOTE: the artifacts are loaded with ``pickle``; only use files produced
    by a trusted training run.

    Args:
        year: Calendar year of the observation.
        month: Calendar month of the observation.
        min_price: Minimum price for the observation.
        max_price: Maximum price for the observation.

    Returns:
        The first (and only) element of the model's prediction output.
    """
    feature_names = ['year', 'month', 'min_price', 'max_price']
    sample = pd.DataFrame([[year, month, min_price, max_price]], columns=feature_names)

    # Scaler first, then model -- same order in which they are applied below.
    artifacts = []
    for artifact_path in ('models/scaler.pkl', 'models/xgb_model.pkl'):
        with open(artifact_path, 'rb') as handle:
            artifacts.append(pickle.load(handle))
    fitted_scaler, fitted_model = artifacts

    scaled_sample = fitted_scaler.transform(sample)
    return fitted_model.predict(scaled_sample)[0]

# Example Usage: price for February 2024 with a 1100-1400 min/max range
example_price = predict_price(2024, 2, 1100, 1400)
print("Predicted Price:", example_price)


0    2019-05-22
1    2019-05-22
2    2019-05-22
3    2019-05-22
4    2019-05-22
5    2019-05-22
6    2019-05-22
7    2019-05-22
8    2019-05-22
9    2019-05-22
Name: date, dtype: object
Mean Absolute Error: 1.37893174806677
Mean Squared Error: 494.8394067854872
Root Mean Squared Error: 22.244986104412096
Predicted Price: 345.88925


