In [52]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, TimeSeriesSplit
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from joblib import dump, load
import pickle

In [53]:
df = pd.read_csv("Wheat.csv")
df = df.rename(columns={'t': 'date'})

In [54]:
df.head(1)

Unnamed: 0,date,cmdty,market_id,market_name,state_id,state_name,district_id,district_name,variety,p_min,p_max,p_modal
0,12/31/2020,Wheat,165,Nandurbar,27,Maharashtra,497,Nandurbar,Other,1500.0,1905,1807.0


In [55]:
df.head()

Unnamed: 0,date,cmdty,market_id,market_name,state_id,state_name,district_id,district_name,variety,p_min,p_max,p_modal
0,12/31/2020,Wheat,165,Nandurbar,27,Maharashtra,497,Nandurbar,Other,1500.0,1905,1807.0
1,12/31/2020,Wheat,174,Sangli,27,Maharashtra,531,Sangli,Other,2050.0,2850,2450.0
2,12/31/2020,Wheat,2498,Kurdwadi(Modnimb),27,Maharashtra,526,Solapur,Other,1300.0,1551,1400.0
3,12/31/2020,Wheat,1467,Mangal Wedha,27,Maharashtra,526,Solapur,Other,1710.0,2210,1850.0
4,12/31/2020,Wheat,176,Solapur,27,Maharashtra,526,Solapur,Sharbati,2305.0,2940,2500.0


In [56]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 120148 entries, 0 to 120147
Data columns (total 12 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   date           120148 non-null  object 
 1   cmdty          120148 non-null  object 
 2   market_id      120148 non-null  int64  
 3   market_name    120148 non-null  object 
 4   state_id       120148 non-null  int64  
 5   state_name     120148 non-null  object 
 6   district_id    120148 non-null  int64  
 7   district_name  120148 non-null  object 
 8   variety        120148 non-null  object 
 9   p_min          120111 non-null  float64
 10  p_max          120148 non-null  int64  
 11  p_modal        120116 non-null  float64
dtypes: float64(2), int64(4), object(6)
memory usage: 11.0+ MB


In [57]:
# Convert date properly
df['date'] = pd.to_datetime(df['date'], format='%m/%d/%Y')
df = df.sort_values('date')  # CRITICAL: Sort by date first!

print(f"Data period: {df['date'].min()} to {df['date'].max()}")
print(f"Total records: {len(df)}")

Data period: 2020-01-01 00:00:00 to 2025-09-20 00:00:00
Total records: 120148


In [58]:
# Extract date features
df['Year'] = df['date'].dt.year
df['Month'] = df['date'].dt.month
df['Day'] = df['date'].dt.day
df['DayOfWeek'] = df['date'].dt.dayofweek
df['WeekOfYear'] = df['date'].dt.isocalendar().week

In [59]:

# Encode categorical variables
le_district = LabelEncoder()
le_market = LabelEncoder()

df['district_encoded'] = le_district.fit_transform(df['district_name'])
df['market_encoded'] = le_market.fit_transform(df['market_name'])

In [60]:
# Save these encoders for the API
pickle.dump(le_district, open('Wheatdistrict_encoder.pkl', 'wb'))
pickle.dump(le_market, open('Wheatmarket_encoder.pkl', 'wb'))

In [61]:
# Data cleaning - remove unrealistic prices
print(f"Before cleaning: {len(df)} records")
df = df[(df['p_modal'] > 500) & (df['p_modal'] < 10000)]
df = df[(df['p_min'] > 0) & (df['p_min'] < 10000)]
df = df[(df['p_max'] > 0) & (df['p_max'] < 10000)]
print(f"After cleaning: {len(df)} records")

Before cleaning: 120148 records
After cleaning: 120045 records


In [62]:
# Handle missing values
df['p_min'] = df['p_min'].fillna(df['p_modal'] * 0.9)  # p_min ≈ 90% of modal
df['p_max'] = df['p_max'].fillna(df['p_modal'] * 1.1)  # p_max ≈ 110% of modal

In [63]:
# ✅ CORRECT FEATURE SET - All 9 features your API expects
features = [
    'market_id', 'state_id', 'district_id',  # Location IDs
    'p_min', 'p_max',                        # Price range
    'Year', 'Month', 'Day',                  # Date components
    'district_encoded'                       # Encoded district
]

target = 'p_modal'

X = df[features]
y = df[target]

print(f"Features used: {features}")
print(f"Feature matrix shape: {X.shape}")

Features used: ['market_id', 'state_id', 'district_id', 'p_min', 'p_max', 'Year', 'Month', 'Day', 'district_encoded']
Feature matrix shape: (120045, 9)


In [64]:
# ✅ PROPER TIME-BASED SPLIT
split_date = df['date'].quantile(0.8)  # 80% for training, 20% for testing
train_mask = df['date'] < split_date
test_mask = df['date'] >= split_date

X_train, X_test = X[train_mask], X[test_mask]
y_train, y_test = y[train_mask], y[test_mask]

print(f"Train period: {df[train_mask]['date'].min()} to {df[train_mask]['date'].max()}")
print(f"Test period: {df[test_mask]['date'].min()} to {df[test_mask]['date'].max()}")
print(f"Train size: {len(X_train)}, Test size: {len(X_test)}")

Train period: 2020-01-01 00:00:00 to 2024-07-12 00:00:00
Test period: 2024-07-13 00:00:00 to 2025-09-20 00:00:00
Train size: 96016, Test size: 24029


In [65]:
# Create pipeline
my_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy="median")),
    ('scaler', StandardScaler()),
])

In [66]:
# Fit pipeline on training data only
X_train_prepared = my_pipeline.fit_transform(X_train)
X_test_prepared = my_pipeline.transform(X_test)

print(f"Prepared train shape: {X_train_prepared.shape}")
print(f"Prepared test shape: {X_test_prepared.shape}")

Prepared train shape: (96016, 9)
Prepared test shape: (24029, 9)


In [67]:
# Train model
model = RandomForestRegressor(
    n_estimators=100,
    random_state=42,
    max_depth=10,
    min_samples_split=5
)

model.fit(X_train_prepared, y_train)

In [68]:
# Evaluate model
y_pred_train = model.predict(X_train_prepared)
y_pred_test = model.predict(X_test_prepared)

train_rmse = np.sqrt(mean_squared_error(y_train, y_pred_train))
test_rmse = np.sqrt(mean_squared_error(y_test, y_pred_test))
train_mae = mean_absolute_error(y_train, y_pred_train)
test_mae = mean_absolute_error(y_test, y_pred_test)
train_r2 = r2_score(y_train, y_pred_train)
test_r2 = r2_score(y_test, y_pred_test)

print("\n📊 Model Performance:")
print(f"Train RMSE: {train_rmse:.2f}")
print(f"Test RMSE: {test_rmse:.2f}")
print(f"Train MAE: {train_mae:.2f}")
print(f"Test MAE: {test_mae:.2f}")
print(f"Train R²: {train_r2:.4f}")
print(f"Test R²: {test_r2:.4f}")


📊 Model Performance:
Train RMSE: 81.52
Test RMSE: 88.62
Train MAE: 47.15
Test MAE: 50.69
Train R²: 0.9769
Test R²: 0.9414


In [69]:
# Feature importance
feature_importance = pd.DataFrame({
    'feature': features,
    'importance': model.feature_importances_
}).sort_values('importance', ascending=False)

print("\n🔍 Feature Importance:")
print(feature_importance)


🔍 Feature Importance:
            feature  importance
4             p_max    0.814310
3             p_min    0.178005
0         market_id    0.002626
5              Year    0.001610
2       district_id    0.001366
8  district_encoded    0.001003
6             Month    0.000857
7               Day    0.000224
1          state_id    0.000000


In [70]:
# Time Series Cross Validation
from sklearn.model_selection import cross_val_score, TimeSeriesSplit

tscv = TimeSeriesSplit(n_splits=5)
cv_scores = cross_val_score(model, X_train_prepared, y_train, 
                           scoring='neg_mean_squared_error', cv=tscv)
cv_rmse_scores = np.sqrt(-cv_scores)
print(f"\n📈 Time Series CV RMSE: {cv_rmse_scores.mean():.2f} (±{cv_rmse_scores.std():.2f})")


📈 Time Series CV RMSE: 101.14 (±13.28)


In [71]:
# Save model and pipeline
dump(model, 'wheat_model.joblib')
pickle.dump(model, open('wheat_model.pkl', 'wb'))
pickle.dump(my_pipeline, open('wheat_preprocessor.pkl', 'wb'))

print("\n💾 Model and pipeline saved successfully!")


💾 Model and pipeline saved successfully!


In [72]:
# Test prediction with correct features
sample_data = X_test.iloc[:3]
print(f"\n🧪 Sample test data shape: {sample_data.shape}")
prepared_sample = my_pipeline.transform(sample_data)
predictions = model.predict(prepared_sample)

print("Sample predictions:", predictions)
print("Actual values:", y_test.iloc[:3].values)


🧪 Sample test data shape: (3, 9)
Sample predictions: [2405.29285179 2675.8993096  2784.55370483]
Actual values: [2404. 2625. 2575.]


In [73]:
# Create mappings for the API
district_mapping = dict(zip(le_district.classes_, range(len(le_district.classes_))))
market_mapping = df.groupby('market_name')['market_id'].first().to_dict()

print(f"\n🌍 District mappings: {list(district_mapping.keys())[:5]}...")
print(f"🏪 Market mappings: {list(market_mapping.keys())[:5]}...")


🌍 District mappings: ['Ahmadnagar', 'Akola', 'Amravati', 'Aurangabad', 'Bhandara']...
🏪 Market mappings: ['Aarni', 'Achalpur', 'Ahmednagar', 'Ahmedpur', 'Akhadabalapur']...
