In [1]:
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from joblib import dump, load
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split, TimeSeriesSplit
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

In [2]:
# Read the raw rice price records and expose the CSV's 't' column as 'date'
df = (
    pd.read_csv("../data/foods_grains/rice.csv")
    .rename(columns={'t': 'date'})
)

In [3]:
# Parse the month/day/year strings into timestamps and put the rows in
# chronological order (the TimeSeriesSplit cross-validation further down
# consumes the rows in their stored order).
df = (
    df.assign(date=pd.to_datetime(df['date'], format='%m/%d/%Y'))
    .sort_values(by='date')
)

print(f"Data period: {df['date'].min()} to {df['date'].max()}")
print(f"Total records: {len(df)}")

Data period: 2020-01-02 00:00:00 to 2025-09-20 00:00:00
Total records: 1669


In [4]:
# Derive calendar features from the parsed 'date' column.
# DayOfWeek follows pandas' convention (Monday=0 ... Sunday=6); WeekOfYear is
# the ISO 8601 week number.
df = df.assign(
    Year=df['date'].dt.year,
    Month=df['date'].dt.month,
    Day=df['date'].dt.day,
    DayOfWeek=df['date'].dt.dayofweek,
    WeekOfYear=df['date'].dt.isocalendar().week,
)

In [5]:
# Summarise column dtypes and non-null counts now that the date features exist
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1669 entries, 291 to 1489
Data columns (total 17 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   date           1669 non-null   datetime64[ns]
 1   cmdty          1669 non-null   object        
 2   market_id      1669 non-null   int64         
 3   market_name    1669 non-null   object        
 4   state_id       1669 non-null   int64         
 5   state_name     1669 non-null   object        
 6   district_id    1669 non-null   int64         
 7   district_name  1669 non-null   object        
 8   variety        1669 non-null   object        
 9   p_min          1669 non-null   int64         
 10  p_max          1669 non-null   int64         
 11  p_modal        1669 non-null   int64         
 12  Year           1669 non-null   int32         
 13  Month          1669 non-null   int32         
 14  Day            1669 non-null   int32         
 15  DayOfWeek      1669 non-

In [6]:

# Integer-encode the district and market names, one LabelEncoder per column.
# The fitted encoders are kept as notebook globals so later cells can persist
# them and read their classes_.
le_district, le_market = LabelEncoder(), LabelEncoder()

df = df.assign(
    district_encoded=le_district.fit_transform(df['district_name']),
    market_encoded=le_market.fit_transform(df['market_name']),
)

In [7]:
# Save these encoders for the API.
# Each file is opened in a context manager so the handle is flushed and closed
# as soon as the pickle is written, rather than staying open (and possibly
# unflushed) for the lifetime of the kernel.
with open('../models/Ricedistrict_encoder.pkl', 'wb') as district_file:
    pickle.dump(le_district, district_file)
with open('../models/Ricemarket_encoder.pkl', 'wb') as market_file:
    pickle.dump(le_market, market_file)

In [8]:
# Data cleaning - keep only rows whose three price columns are all inside
# plausible bounds: modal strictly between 500 and 10000, min and max strictly
# between 0 and 10000. Rows with a missing price fail the comparisons and are
# dropped as well.
print(f"Before cleaning: {len(df)} records")
within_bounds = (
    df['p_modal'].gt(500) & df['p_modal'].lt(10000)
    & df['p_min'].gt(0) & df['p_min'].lt(10000)
    & df['p_max'].gt(0) & df['p_max'].lt(10000)
)
df = df[within_bounds]
print(f"After cleaning: {len(df)} records")

Before cleaning: 1669 records
After cleaning: 1668 records


In [9]:
# Handle missing values: approximate an absent p_min as 90% of the modal price
# and an absent p_max as 110% of it.
# NOTE(review): the range filter in the cleaning cell already rejects rows whose
# p_min/p_max is NaN, so this is presumably only a safeguard - confirm.
for column, factor in (('p_min', 0.9), ('p_max', 1.1)):
    df[column] = df[column].fillna(df['p_modal'] * factor)

In [10]:
# Feature set - the 9 inputs the API expects, in this column order
features = [
    # Location IDs
    'market_id', 'state_id', 'district_id',
    # Price range
    'p_min', 'p_max',
    # Date components
    'Year', 'Month', 'Day',
    # Encoded district
    'district_encoded',
]

# The modal price is the quantity being predicted
target = 'p_modal'

X, y = df[features], df[target]

print(f"Features used: {features}")
print(f"Feature matrix shape: {X.shape}")

Features used: ['market_id', 'state_id', 'district_id', 'p_min', 'p_max', 'Year', 'Month', 'Day', 'district_encoded']
Feature matrix shape: (1668, 9)


In [11]:
# Time-based split: the 0.8 quantile of the dates is the cut-off, so roughly
# the earliest 80% of records train the model and the most recent 20% test it.
split_date = df['date'].quantile(0.8)
test_mask = df['date'] >= split_date
train_mask = df['date'] < split_date

# X and y share df's index, so the boolean masks align row-for-row
X_train, y_train = X.loc[train_mask], y.loc[train_mask]
X_test, y_test = X.loc[test_mask], y.loc[test_mask]

print(f"Train period: {df[train_mask]['date'].min()} to {df[train_mask]['date'].max()}")
print(f"Test period: {df[test_mask]['date'].min()} to {df[test_mask]['date'].max()}")
print(f"Train size: {len(X_train)}, Test size: {len(X_test)}")

Train period: 2020-01-02 00:00:00 to 2024-04-27 00:00:00
Test period: 2024-04-28 00:00:00 to 2025-09-20 00:00:00
Train size: 1334, Test size: 334


In [12]:
# Preprocessing pipeline: fill gaps with the column median, then standardise
my_pipeline = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy="median")),
        ('scaler', StandardScaler()),
    ]
)

In [13]:
# Learn the imputation medians and scaling statistics from the training rows
# only, then push both splits through the same fitted transforms.
my_pipeline.fit(X_train)
X_train_prepared = my_pipeline.transform(X_train)
X_test_prepared = my_pipeline.transform(X_test)

print(f"Prepared train shape: {X_train_prepared.shape}")
print(f"Prepared test shape: {X_test_prepared.shape}")

Prepared train shape: (1334, 9)
Prepared test shape: (334, 9)


In [14]:
# Train model: a depth-limited random forest with a fixed seed so repeated
# runs build the same trees
model = RandomForestRegressor(
    random_state=42,
    n_estimators=100,
    min_samples_split=5,
    max_depth=10,
)

model.fit(X_train_prepared, y_train)

In [15]:
# Evaluate model on the data it was fitted on and on the held-out period

# Training split
y_pred_train = model.predict(X_train_prepared)
train_rmse = np.sqrt(mean_squared_error(y_true=y_train, y_pred=y_pred_train))
train_mae = mean_absolute_error(y_true=y_train, y_pred=y_pred_train)
train_r2 = r2_score(y_true=y_train, y_pred=y_pred_train)

# Test split
y_pred_test = model.predict(X_test_prepared)
test_rmse = np.sqrt(mean_squared_error(y_true=y_test, y_pred=y_pred_test))
test_mae = mean_absolute_error(y_true=y_test, y_pred=y_pred_test)
test_r2 = r2_score(y_true=y_test, y_pred=y_pred_test)

print("\n📊 Model Performance:")
print(f"Train RMSE: {train_rmse:.2f}")
print(f"Test RMSE: {test_rmse:.2f}")
print(f"Train MAE: {train_mae:.2f}")
print(f"Test MAE: {test_mae:.2f}")
print(f"Train R²: {train_r2:.4f}")
print(f"Test R²: {test_r2:.4f}")


📊 Model Performance:
Train RMSE: 61.53
Test RMSE: 73.34
Train MAE: 22.39
Test MAE: 27.34
Train R²: 0.9924
Test R²: 0.9830


In [16]:
# Feature importance: pair every input column with the forest's importance
# score and list the most influential first
feature_importance = (
    pd.DataFrame({'feature': features, 'importance': model.feature_importances_})
    .sort_values(by='importance', ascending=False)
)

print("\n🔍 Feature Importance:")
print(feature_importance)


🔍 Feature Importance:
            feature  importance
4             p_max    0.778674
3             p_min    0.216188
7               Day    0.002728
5              Year    0.001356
6             Month    0.000969
0         market_id    0.000085
2       district_id    0.000000
1          state_id    0.000000
8  district_encoded    0.000000


In [17]:
# Time-ordered cross-validation over the training period.
# The preprocessing steps are cross-validated together with the model on the
# raw training features, so the imputer and scaler are re-fitted inside every
# fold using that fold's training rows only. Scoring the already-prepared
# matrix instead would let statistics computed from later rows leak into the
# earlier folds.
# cross_val_score clones the estimator for each fold, so the fitted
# `my_pipeline` and `model` objects are left untouched.
tscv = TimeSeriesSplit(n_splits=5)
cv_estimator = Pipeline(my_pipeline.steps + [('model', model)])
cv_scores = cross_val_score(cv_estimator, X_train, y_train,
                            scoring='neg_mean_squared_error', cv=tscv)
# The scorer returns negated MSE; flip the sign before taking the root
cv_rmse_scores = np.sqrt(-cv_scores)
print(f"\n📈 Time Series CV RMSE: {cv_rmse_scores.mean():.2f} (±{cv_rmse_scores.std():.2f})")


📈 Time Series CV RMSE: 157.52 (±74.97)


In [19]:
# Save model and pipeline.
# The model is written in both joblib and pickle formats; the fitted
# preprocessing pipeline is pickled alongside it.
# Files are opened in context managers so each handle is flushed and closed
# once its pickle is written instead of being left open.
dump(model, '../models/rice_model.joblib')
with open('../models/rice_model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)
with open('../models/rice_preprocessor.pkl', 'wb') as preprocessor_file:
    pickle.dump(my_pipeline, preprocessor_file)

print("\n💾 Model and pipeline saved successfully!")


💾 Model and pipeline saved successfully!


In [20]:
# Smoke test: run the first three held-out rows through the fitted
# preprocessing pipeline and the model, then compare with the true prices
sample_data = X_test.head(3)
print(f"\n🧪 Sample test data shape: {sample_data.shape}")
prepared_sample = my_pipeline.transform(sample_data)
predictions = model.predict(prepared_sample)

print("Sample predictions:", predictions)
print("Actual values:", y_test.head(3).values)


🧪 Sample test data shape: (3, 9)
Sample predictions: [4608.0985     4295.61286109 4505.34909576]
Actual values: [4600 4300 4600]


In [21]:
# Create mappings for the API.
# district name -> integer code assigned by the fitted district encoder
district_mapping = {
    district: code for code, district in enumerate(le_district.classes_)
}
# market name -> first market_id recorded for that market
market_mapping = (
    df.groupby('market_name')['market_id']
    .first()
    .to_dict()
)

print(f"\n🌍 District mappings: {list(district_mapping.keys())[:5]}...")
print(f"🏪 Market mappings: {list(market_mapping.keys())[:5]}...")


🌍 District mappings: ['Bhandara']...
🏪 Market mappings: ['Bhandara', 'Tumsar']...
