In [1]:
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from joblib import dump, load
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split, TimeSeriesSplit
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

In [2]:
# Read the raw grape price records and expose the 't' column under the name 'date'.
df = pd.read_csv("../data/fruits/grapes.csv").rename(columns={'t': 'date'})

In [3]:

# Parse the raw 'date' values into datetimes, reading ambiguous day/month
# order day-first. Values that cannot be parsed are coerced to NaT rather than
# raising, so they can be counted and inspected below.
# The deprecated `infer_datetime_format` flag is intentionally not passed:
# since pandas 2.0 it has no effect and only triggers a UserWarning.
df['date'] = pd.to_datetime(df['date'], errors='coerce', dayfirst=True)

# Check how many rows were successfully converted
total = len(df)
converted = df['date'].notna().sum()
failed = total - converted

print(f"✅ Successfully converted {converted}/{total} dates.")
if failed > 0:
    # Show a sample of the rows whose date ended up as NaT.
    print(f"⚠️ {failed} rows could not be parsed — check these:")
    print(df[df['date'].isna()].head())

# Sort chronologically; the later time-based split and TimeSeriesSplit both
# rely on rows being in date order.
df = df.sort_values('date')

# Extract date-based features from the parsed column.
df['Year'] = df['date'].dt.year
df['Month'] = df['date'].dt.month
df['Day'] = df['date'].dt.day
df['DayOfWeek'] = df['date'].dt.dayofweek
df['WeekOfYear'] = df['date'].dt.isocalendar().week  # ISO-8601 week number

print("\n📅 Date features extracted successfully!")
print(df[['date', 'Year', 'Month', 'Day', 'DayOfWeek', 'WeekOfYear']].head())


✅ Successfully converted 6072/6072 dates.

📅 Date features extracted successfully!
          date  Year  Month  Day  DayOfWeek  WeekOfYear
924 2020-01-01  2020      1    1          2           1
923 2020-01-01  2020      1    1          2           1
922 2020-01-01  2020      1    1          2           1
921 2020-01-01  2020      1    1          2           1
919 2020-01-02  2020      1    2          3           1


  df['date'] = pd.to_datetime(df['date'], errors='coerce', dayfirst=True, infer_datetime_format=True)


In [4]:
# Derive calendar components from the parsed 'date' column.
# The date-parsing cell already derives these same five columns, so this
# recomputation overwrites them with identical values.
date_parts = df['date'].dt
df = df.assign(
    Year=date_parts.year,
    Month=date_parts.month,
    Day=date_parts.day,
    DayOfWeek=date_parts.dayofweek,
    WeekOfYear=date_parts.isocalendar().week,
)

In [5]:
# Print a summary of the frame: index range, column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6072 entries, 924 to 4839
Data columns (total 18 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   date           6072 non-null   datetime64[ns]
 1   cmdty          6072 non-null   object        
 2   market_id      6072 non-null   int64         
 3   market_name    6072 non-null   object        
 4   state_id       6072 non-null   int64         
 5   state_name     6072 non-null   object        
 6   district_id    6072 non-null   int64         
 7   district_name  6072 non-null   object        
 8   variety        6072 non-null   object        
 9   p_min          6072 non-null   int64         
 10  p_max          6072 non-null   int64         
 11  p_modal        6072 non-null   int64         
 12  commodity      6072 non-null   object        
 13  Year           6072 non-null   int32         
 14  Month          6072 non-null   int32         
 15  Day            6072 non-

In [6]:

# Fit one LabelEncoder per categorical name column, then store the resulting
# integer codes next to the original names.
le_district = LabelEncoder().fit(df['district_name'])
le_market = LabelEncoder().fit(df['market_name'])

df['district_encoded'] = le_district.transform(df['district_name'])
df['market_encoded'] = le_market.transform(df['market_name'])

In [8]:
# Save the fitted encoders for the API.
# Each file is opened in a `with` block so the handle is flushed and closed as
# soon as the dump finishes, instead of being left open until garbage collection.
with open('../models/grapesdistrict_encoder.pkl', 'wb') as district_file:
    pickle.dump(le_district, district_file)
with open('../models/grapesmarket_encoder.pkl', 'wb') as market_file:
    pickle.dump(le_market, market_file)

In [9]:
# Data cleaning: keep only rows whose three prices all fall inside the accepted
# open intervals -- 500 < p_modal < 10000, 0 < p_min < 10000, 0 < p_max < 10000.
print(f"Before cleaning: {len(df)} records")
price_in_range = (
    df['p_modal'].gt(500) & df['p_modal'].lt(10000)
    & df['p_min'].gt(0) & df['p_min'].lt(10000)
    & df['p_max'].gt(0) & df['p_max'].lt(10000)
)
df = df[price_in_range]
print(f"After cleaning: {len(df)} records")

Before cleaning: 6072 records
After cleaning: 4478 records


In [10]:
# Fill gaps in the price range from the modal price:
# a missing p_min becomes 90% of p_modal, a missing p_max becomes 110%.
for price_column, modal_factor in (('p_min', 0.9), ('p_max', 1.1)):
    df[price_column] = df[price_column].fillna(df['p_modal'] * modal_factor)

In [11]:
# Model inputs: nine columns, grouped by meaning and kept in a fixed order.
# Per the original note, this is the feature set the API is expected to send.
location_features = ['market_id', 'state_id', 'district_id']  # Location IDs
price_features = ['p_min', 'p_max']                           # Price range
calendar_features = ['Year', 'Month', 'Day']                  # Date components
features = location_features + price_features + calendar_features + ['district_encoded']

# The modal price is the value being predicted.
target = 'p_modal'

X = df.loc[:, features]
y = df.loc[:, target]

print(f"Features used: {features}")
print(f"Feature matrix shape: {X.shape}")

Features used: ['market_id', 'state_id', 'district_id', 'p_min', 'p_max', 'Year', 'Month', 'Day', 'district_encoded']
Feature matrix shape: (4478, 9)


In [12]:
# Chronological hold-out: rows dated strictly before the 80th-percentile date
# form the training set; rows on or after that date form the test set.
split_date = df['date'].quantile(0.8)
train_mask = df['date'].lt(split_date)
test_mask = df['date'].ge(split_date)

X_train, y_train = X.loc[train_mask], y.loc[train_mask]
X_test, y_test = X.loc[test_mask], y.loc[test_mask]

# Report the date span and size of each partition.
train_dates = df.loc[train_mask, 'date']
test_dates = df.loc[test_mask, 'date']
print(f"Train period: {train_dates.min()} to {train_dates.max()}")
print(f"Test period: {test_dates.min()} to {test_dates.max()}")
print(f"Train size: {len(X_train)}, Test size: {len(X_test)}")

Train period: 2020-01-01 00:00:00 to 2024-11-28 00:00:00
Test period: 2024-11-30 00:00:00 to 2025-09-19 00:00:00
Train size: 3582, Test size: 896


In [13]:
# Preprocessing pipeline: median imputation followed by standardisation.
preprocessing_steps = [
    ('imputer', SimpleImputer(strategy="median")),
    ('scaler', StandardScaler()),
]
my_pipeline = Pipeline(steps=preprocessing_steps)

In [14]:
# Learn the imputation and scaling statistics from the training rows only,
# then apply the fitted transformers to both partitions.
my_pipeline.fit(X_train)
X_train_prepared = my_pipeline.transform(X_train)
X_test_prepared = my_pipeline.transform(X_test)

print(f"Prepared train shape: {X_train_prepared.shape}")
print(f"Prepared test shape: {X_test_prepared.shape}")

Prepared train shape: (3582, 9)
Prepared test shape: (896, 9)


In [15]:
# Random forest hyperparameters; random_state is fixed so repeated fits match.
rf_params = dict(
    n_estimators=100,
    random_state=42,
    max_depth=10,
    min_samples_split=5,
)
model = RandomForestRegressor(**rf_params)

# Fit on the preprocessed training partition.
model.fit(X_train_prepared, y_train)

In [16]:
# Score the fitted model on both partitions.
y_pred_train = model.predict(X_train_prepared)
y_pred_test = model.predict(X_test_prepared)

# (actual, predicted) pairs in train, test order; each metric is computed for both.
scored_pairs = [(y_train, y_pred_train), (y_test, y_pred_test)]

train_rmse, test_rmse = [
    np.sqrt(mean_squared_error(actual, predicted)) for actual, predicted in scored_pairs
]
train_mae, test_mae = [
    mean_absolute_error(actual, predicted) for actual, predicted in scored_pairs
]
train_r2, test_r2 = [
    r2_score(actual, predicted) for actual, predicted in scored_pairs
]

print("\n📊 Model Performance:")
print(f"Train RMSE: {train_rmse:.2f}")
print(f"Test RMSE: {test_rmse:.2f}")
print(f"Train MAE: {train_mae:.2f}")
print(f"Test MAE: {test_mae:.2f}")
print(f"Train R²: {train_r2:.4f}")
print(f"Test R²: {test_r2:.4f}")


📊 Model Performance:
Train RMSE: 134.87
Test RMSE: 174.48
Train MAE: 53.66
Test MAE: 81.22
Train R²: 0.9932
Test R²: 0.9903


In [17]:
# Pair each feature name with the forest's importance score and rank the
# pairs from most to least important.
importance_rows = list(zip(features, model.feature_importances_))
feature_importance = pd.DataFrame(
    importance_rows, columns=['feature', 'importance']
).sort_values(by='importance', ascending=False)

print("\n🔍 Feature Importance:")
print(feature_importance)


🔍 Feature Importance:
            feature  importance
4             p_max    0.795215
3             p_min    0.188510
0         market_id    0.007152
7               Day    0.002621
8  district_encoded    0.002181
2       district_id    0.001905
6             Month    0.001621
5              Year    0.000795
1          state_id    0.000000


In [18]:
# Time-series cross-validation on the training partition.
# `cross_val_score` and `TimeSeriesSplit` come from the notebook's import cell
# rather than being re-imported here, so all dependencies stay in one place.
# Each fold validates on rows positioned after the rows it trains on, which
# matches time order because the frame was sorted by date before the split.
tscv = TimeSeriesSplit(n_splits=5)
cv_scores = cross_val_score(
    model, X_train_prepared, y_train,
    scoring='neg_mean_squared_error', cv=tscv,
)
# The scorer reports negated MSE, so flip the sign before taking the root.
cv_rmse_scores = np.sqrt(-cv_scores)
print(f"\n📈 Time Series CV RMSE: {cv_rmse_scores.mean():.2f} (±{cv_rmse_scores.std():.2f})")


📈 Time Series CV RMSE: 258.17 (±51.88)


In [19]:
# Save the trained model (in both joblib and pickle formats) and the fitted
# preprocessing pipeline.
# The pickle files are opened in `with` blocks so each handle is flushed and
# closed as soon as the dump finishes, instead of being left open.
dump(model, '../models/grapes_model.joblib')
with open('../models/grapes_model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)
with open('../models/grapes_preprocessor.pkl', 'wb') as preprocessor_file:
    pickle.dump(my_pipeline, preprocessor_file)

print("\n💾 Model and pipeline saved successfully!")


💾 Model and pipeline saved successfully!


In [20]:
# Smoke test: push the first three held-out rows through the fitted
# preprocessor and the model, then show predictions next to the true prices.
sample_size = 3
sample_data = X_test.head(sample_size)
print(f"\n🧪 Sample test data shape: {sample_data.shape}")
prepared_sample = my_pipeline.transform(sample_data)
predictions = model.predict(prepared_sample)

print("Sample predictions:", predictions)
print("Actual values:", y_test.head(sample_size).to_numpy())


🧪 Sample test data shape: (3, 9)
Sample predictions: [8000.         5513.3430493  6994.48724343]
Actual values: [8000 5500 7000]


In [21]:
# Lookup tables for the API:
#   district name -> integer code assigned by the fitted district encoder
#   market name   -> first market_id seen for that market in the frame
district_mapping = {name: code for code, name in enumerate(le_district.classes_)}
first_market_ids = df.groupby('market_name')['market_id'].first()
market_mapping = first_market_ids.to_dict()

print(f"\n🌍 District mappings: {list(district_mapping.keys())[:5]}...")
print(f"🏪 Market mappings: {list(market_mapping.keys())[:5]}...")


🌍 District mappings: ['Ahmadnagar', 'Amravati', 'Aurangabad', 'Chandrapur', 'Jalgaon']...
🏪 Market mappings: ['Ahmednagar', 'Amrawati(Frui & Veg. Market)', 'Bhusaval', 'Chandrapur(Ganjwad)', 'Chattrapati Sambhajinagar']...
