# Crop Market Analysis and Price Prediction

This notebook provides an end-to-end analysis of crop market data, including:
1. Data Preprocessing and EDA
2. Feature Engineering
3. Model Training and Evaluation
4. Market Trend Analysis
5. Seasonal Pattern Analysis
6. Region-based Recommendations

In [1]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.express as px
from datetime import datetime, timedelta

# ML libraries
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import xgboost as xgb
import lightgbm as lgb
import pickle

# Set random seed for reproducibility
np.random.seed(42)

Matplotlib is building the font cache; this may take a moment.


## 1. Data Loading and Preprocessing

In [3]:
# Load the dataset
data = pd.read_csv('../data/processed/tomato_MAH_Pune_features.csv', parse_dates=['date'])

# Display basic information about the dataset
print("Dataset Info:")
print(data.info())

print("\nFirst few rows:")
print(data.head())

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 365 entries, 0 to 364
Data columns (total 14 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   date           365 non-null    datetime64[ns]
 1   market_id      365 non-null    object        
 2   market_name    365 non-null    object        
 3   price          365 non-null    float64       
 4   price_lag_1    364 non-null    float64       
 5   price_lag_7    358 non-null    float64       
 6   price_lag_14   351 non-null    float64       
 7   price_lag_30   335 non-null    float64       
 8   price_ma_7     359 non-null    float64       
 9   price_ma_30    336 non-null    float64       
 10  precipitation  365 non-null    float64       
 11  temp_max       365 non-null    float64       
 12  temp_min       365 non-null    float64       
 13  humidity       365 non-null    float64       
dtypes: datetime64[ns](1), float64(11), object(2)
memory usage: 4

In [4]:
# Add time-based features
data['month'] = data['date'].dt.month
data['year'] = data['date'].dt.year
data['day_of_week'] = data['date'].dt.dayofweek
data['quarter'] = data['date'].dt.quarter

# Print available features
print("Features available for modeling:")
print(data.columns.tolist())

Features available for modeling:
['date', 'market_id', 'market_name', 'price', 'price_lag_1', 'price_lag_7', 'price_lag_14', 'price_lag_30', 'price_ma_7', 'price_ma_30', 'precipitation', 'temp_max', 'temp_min', 'humidity', 'month', 'year', 'day_of_week', 'quarter']


In [11]:
# Add advanced features for better prediction confidence

# 1. Price volatility features
data['price_volatility_7d'] = data['price'].rolling(window=7).std()
data['price_volatility_30d'] = data['price'].rolling(window=30).std()

# 2. Price momentum indicators
data['price_momentum_7d'] = data['price'] / data['price'].shift(7) - 1
data['price_momentum_30d'] = data['price'] / data['price'].shift(30) - 1

# 3. Seasonal decomposition features
from statsmodels.tsa.seasonal import seasonal_decompose

# Ensure the date index is regular
data_temp = data.set_index('date')['price'].asfreq('D')
decomposition = seasonal_decompose(data_temp, period=30, extrapolate_trend='freq')

data['seasonal_factor'] = decomposition.seasonal
data['trend'] = decomposition.trend
data['residual'] = decomposition.resid

# 4. Market supply-demand indicators
data['supply_pressure'] = (data['price'] < data['price_ma_30']).astype(int)
data['demand_pressure'] = (data['price'] > data['price_ma_30']).astype(int)

# 5. Weather interaction features
data['temp_humidity_interaction'] = data['temp_max'] * data['humidity']
data['rain_temp_interaction'] = data['precipitation'] * data['temp_max']

# 6. Cyclical features
data['day_sin'] = np.sin(2 * np.pi * data['day_of_week']/7)
data['day_cos'] = np.cos(2 * np.pi * data['day_of_week']/7)
data['month_sin'] = np.sin(2 * np.pi * data['month']/12)
data['month_cos'] = np.cos(2 * np.pi * data['month']/12)

# Handle any missing values from the calculations
data = data.fillna(method='bfill')

print("New features added for model enhancement:")
new_features = ['price_volatility_7d', 'price_volatility_30d', 
                'price_momentum_7d', 'price_momentum_30d',
                'seasonal_factor', 'trend', 'residual',
                'supply_pressure', 'demand_pressure',
                'temp_humidity_interaction', 'rain_temp_interaction',
                'day_sin', 'day_cos', 'month_sin', 'month_cos']
print("\n".join(f"- {feature}" for feature in new_features))

New features added for model enhancement:
- price_volatility_7d
- price_volatility_30d
- price_momentum_7d
- price_momentum_30d
- seasonal_factor
- trend
- residual
- supply_pressure
- demand_pressure
- temp_humidity_interaction
- rain_temp_interaction
- day_sin
- day_cos
- month_sin
- month_cos



DataFrame.fillna with 'method' is deprecated and will raise in a future version. Use obj.ffill() or obj.bfill() instead.



## 2. Data Splitting

In [5]:
# Define features and target
features = ['month', 'year', 'day_of_week', 'quarter', 
           'price_lag_1', 'price_lag_7', 'price_lag_14', 'price_lag_30',
           'price_ma_7', 'price_ma_30',
           'temp_max', 'temp_min', 'humidity', 'precipitation']

X = data[features]
y = data['price']

# First split: separate test set
X_temp, X_test, y_temp, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Second split: separate training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.25, random_state=42)

print(f"Training set size: {len(X_train)}")
print(f"Validation set size: {len(X_val)}")
print(f"Test set size: {len(X_test)}")

Training set size: 219
Validation set size: 73
Test set size: 73


## 3. Model Training and Evaluation

In [6]:
# Train XGBoost model
xgb_model = xgb.XGBRegressor(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=5,
    random_state=42
)

xgb_model.fit(X_train, y_train)

# Train LightGBM model
lgb_model = lgb.LGBMRegressor(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=5,
    random_state=42
)

lgb_model.fit(X_train, y_train)

# Save models
with open('../models/xgb_model.pkl', 'wb') as f:
    pickle.dump(xgb_model, f)

with open('../models/lgb_model.pkl', 'wb') as f:
    pickle.dump(lgb_model, f)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000598 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 756
[LightGBM] [Info] Number of data points in the train set: 219, number of used features: 13
[LightGBM] [Info] Start training from score 45.189862


In [7]:
# Evaluate models
def evaluate_model(model, X, y, dataset_name):
    predictions = model.predict(X)
    mse = mean_squared_error(y, predictions)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y, predictions)
    r2 = r2_score(y, predictions)
    
    print(f"{dataset_name} Metrics:")
    print(f"RMSE: {rmse:.2f}")
    print(f"MAE: {mae:.2f}")
    print(f"R2 Score: {r2:.4f}\n")
    
    return predictions

print("XGBoost Results:")
xgb_train_pred = evaluate_model(xgb_model, X_train, y_train, "Training")
xgb_val_pred = evaluate_model(xgb_model, X_val, y_val, "Validation")
xgb_test_pred = evaluate_model(xgb_model, X_test, y_test, "Test")

print("\nLightGBM Results:")
lgb_train_pred = evaluate_model(lgb_model, X_train, y_train, "Training")
lgb_val_pred = evaluate_model(lgb_model, X_val, y_val, "Validation")
lgb_test_pred = evaluate_model(lgb_model, X_test, y_test, "Test")

XGBoost Results:
Training Metrics:
RMSE: 0.27
MAE: 0.20
R2 Score: 0.9989

Validation Metrics:
RMSE: 3.80
MAE: 3.01
R2 Score: 0.7673

Test Metrics:
RMSE: 3.57
MAE: 2.74
R2 Score: 0.7953


LightGBM Results:
Training Metrics:
RMSE: 1.44
MAE: 1.15
R2 Score: 0.9695

Validation Metrics:
RMSE: 3.51
MAE: 2.78
R2 Score: 0.8007

Test Metrics:
RMSE: 3.20
MAE: 2.57
R2 Score: 0.8359



## 4. Market Trend Analysis

In [8]:
# Create price trend visualization
fig = go.Figure()

# Historical price
fig.add_trace(
    go.Scatter(
        x=data['date'],
        y=data['price'],
        name='Actual Price',
        line=dict(color='blue')
    )
)

# Moving averages
fig.add_trace(
    go.Scatter(
        x=data['date'],
        y=data['price_ma_7'],
        name='7-Day MA',
        line=dict(color='red', dash='dash')
    )
)

fig.add_trace(
    go.Scatter(
        x=data['date'],
        y=data['price_ma_30'],
        name='30-Day MA',
        line=dict(color='green', dash='dot')
    )
)

fig.update_layout(
    title='Tomato Price Trends',
    xaxis_title='Date',
    yaxis_title='Price (₹/Quintal)',
    template='plotly_white'
)

fig.show()

## 5. Seasonal Pattern Analysis

In [9]:
# Create seasonal analysis plots
fig = make_subplots(rows=2, cols=2,
                    subplot_titles=('Monthly Price Box Plot',
                                   'Quarterly Price Trends',
                                   'Price vs Temperature',
                                   'Price vs Humidity'))

# Monthly patterns
monthly_data = data.groupby('month')['price'].agg(['mean', 'std']).reset_index()
fig.add_trace(
    go.Box(x=data['month'], y=data['price'], name='Monthly Prices'),
    row=1, col=1
)

# Quarterly patterns
quarterly_data = data.groupby('quarter')['price'].mean().reset_index()
fig.add_trace(
    go.Bar(x=quarterly_data['quarter'], y=quarterly_data['price'], name='Quarterly Avg'),
    row=1, col=2
)

# Temperature correlation
fig.add_trace(
    go.Scatter(x=data['temp_max'], y=data['price'], mode='markers',
               name='Price vs Temp', marker=dict(size=5)),
    row=2, col=1
)

# Humidity correlation
fig.add_trace(
    go.Scatter(x=data['humidity'], y=data['price'], mode='markers',
               name='Price vs Humidity', marker=dict(size=5)),
    row=2, col=2
)

fig.update_layout(height=800, title_text='Seasonal Patterns and Weather Correlations')
fig.show()

## 6. Region-based Analysis and Recommendations

In [10]:
def generate_recommendations(region_data):
    """Generate recommendations based on market analysis"""
    
    # Calculate key metrics
    current_price = region_data['price'].iloc[-1]
    avg_price = region_data['price'].mean()
    price_volatility = region_data['price'].std()
    
    # Analyze seasonal patterns
    monthly_avg = region_data.groupby('month')['price'].mean()
    best_month = monthly_avg.idxmax()
    worst_month = monthly_avg.idxmin()
    
    # Generate recommendations
    recommendations = []
    
    if current_price > avg_price * 1.2:
        recommendations.append("Current prices are above average - consider selling")
    elif current_price < avg_price * 0.8:
        recommendations.append("Current prices are below average - consider storing if possible")
    
    # Add seasonal recommendations
    recommendations.append(f"Best selling month historically: {best_month}")
    recommendations.append(f"Avoid selling in month: {worst_month}")
    
    return recommendations

# Example for Pune region
recommendations = generate_recommendations(data)
print("Recommendations for Pune region:")
for rec in recommendations:
    print(f"- {rec}")

Recommendations for Pune region:
- Current prices are above average - consider selling
- Best selling month historically: 11
- Avoid selling in month: 1


In [12]:
# Enhanced Market Analysis and Recommendations System

def analyze_market_conditions(region_data, current_date=None):
    """
    Analyze market conditions and provide comprehensive recommendations
    """
    if current_date is None:
        current_date = region_data['date'].max()
    
    # 1. Market Trend Analysis
    recent_data = region_data[region_data['date'] <= current_date].tail(30)
    current_price = recent_data['price'].iloc[-1]
    avg_price = recent_data['price'].mean()
    price_trend = recent_data['price_momentum_30d'].iloc[-1]
    volatility = recent_data['price_volatility_30d'].iloc[-1]
    
    # 2. Seasonal Pattern Analysis
    month = pd.to_datetime(current_date).month
    monthly_avg = region_data.groupby('month')['price'].mean()
    best_months = monthly_avg.nlargest(3).index.tolist()
    worst_months = monthly_avg.nsmallest(3).index.tolist()
    
    # 3. Supply-Demand Analysis
    supply_pressure = recent_data['supply_pressure'].mean()
    demand_pressure = recent_data['demand_pressure'].mean()
    
    # 4. Generate Recommendations
    recommendations = {
        'market_status': {
            'current_price': current_price,
            'avg_price': avg_price,
            'price_trend': 'Increasing' if price_trend > 0 else 'Decreasing',
            'volatility': volatility
        },
        'seasonal_insights': {
            'best_months': best_months,
            'worst_months': worst_months,
            'current_month_rank': monthly_avg.rank()[month]
        },
        'market_pressure': {
            'supply_pressure': supply_pressure,
            'demand_pressure': demand_pressure
        },
        'recommendations': []
    }
    
    # Add specific recommendations
    if current_price > avg_price * 1.2:
        recommendations['recommendations'].append({
            'type': 'sell',
            'urgency': 'high',
            'message': 'Current prices are significantly above average. Consider selling now.'
        })
    elif current_price < avg_price * 0.8:
        recommendations['recommendations'].append({
            'type': 'hold',
            'urgency': 'medium',
            'message': 'Prices are below average. Consider storing if possible.'
        })
    
    if month in best_months:
        recommendations['recommendations'].append({
            'type': 'timing',
            'urgency': 'high',
            'message': 'Current month historically has high prices. Good time to sell.'
        })
    
    if supply_pressure > 0.7:
        recommendations['recommendations'].append({
            'type': 'distribution',
            'urgency': 'high',
            'message': 'High supply pressure detected. Consider exploring alternative markets.'
        })
    
    return recommendations

# Generate example recommendations
example_recommendations = analyze_market_conditions(data)

print("Market Analysis and Recommendations:")
print("\n1. Market Status:")
for key, value in example_recommendations['market_status'].items():
    print(f"- {key.replace('_', ' ').title()}: {value}")

print("\n2. Seasonal Insights:")
for key, value in example_recommendations['seasonal_insights'].items():
    print(f"- {key.replace('_', ' ').title()}: {value}")

print("\n3. Recommendations:")
for rec in example_recommendations['recommendations']:
    print(f"- [{rec['urgency'].upper()}] {rec['message']}")

Market Analysis and Recommendations:

1. Market Status:
- Current Price: 59.50188022990729
- Avg Price: 49.33907484902877
- Price Trend: Decreasing
- Volatility: 7.318850577198946

2. Seasonal Insights:
- Best Months: [11, 12, 10]
- Worst Months: [1, 2, 3]
- Current Month Rank: 11.0

3. Recommendations:
- [HIGH] Current prices are significantly above average. Consider selling now.
- [HIGH] Current month historically has high prices. Good time to sell.


In [13]:
# Interactive Market Analysis Dashboard

def create_market_dashboard(region_data, current_date=None):
    """
    Create interactive visualizations for market analysis
    """
    if current_date is None:
        current_date = region_data['date'].max()
    
    # 1. Price Trends with Confidence Intervals
    fig = go.Figure()
    
    # Historical prices
    fig.add_trace(
        go.Scatter(
            x=region_data['date'],
            y=region_data['price'],
            name='Actual Price',
            line=dict(color='blue')
        )
    )
    
    # Add moving averages
    fig.add_trace(
        go.Scatter(
            x=region_data['date'],
            y=region_data['price_ma_30'],
            name='30-Day Trend',
            line=dict(color='red', dash='dash')
        )
    )
    
    # Add volatility bands
    fig.add_trace(
        go.Scatter(
            x=region_data['date'],
            y=region_data['price_ma_30'] + 2*region_data['price_volatility_30d'],
            name='Upper Volatility Band',
            line=dict(color='gray', dash='dot'),
            showlegend=False
        )
    )
    
    fig.add_trace(
        go.Scatter(
            x=region_data['date'],
            y=region_data['price_ma_30'] - 2*region_data['price_volatility_30d'],
            name='Lower Volatility Band',
            line=dict(color='gray', dash='dot'),
            fill='tonexty',
            fillcolor='rgba(128, 128, 128, 0.1)',
            showlegend=False
        )
    )
    
    fig.update_layout(
        title='Price Trends with Confidence Bands',
        xaxis_title='Date',
        yaxis_title='Price (₹/Quintal)',
        hovermode='x unified'
    )
    
    # 2. Create seasonal pattern visualization
    season_fig = go.Figure()
    
    monthly_stats = region_data.groupby('month').agg({
        'price': ['mean', 'std']
    }).reset_index()
    
    season_fig.add_trace(
        go.Scatter(
            x=monthly_stats['month'],
            y=monthly_stats['price']['mean'],
            name='Average Price',
            line=dict(color='blue')
        )
    )
    
    # Add confidence bands
    season_fig.add_trace(
        go.Scatter(
            x=monthly_stats['month'],
            y=monthly_stats['price']['mean'] + monthly_stats['price']['std'],
            name='Price Range',
            line=dict(color='gray', dash='dot'),
            showlegend=False
        )
    )
    
    season_fig.add_trace(
        go.Scatter(
            x=monthly_stats['month'],
            y=monthly_stats['price']['mean'] - monthly_stats['price']['std'],
            name='Price Range',
            fill='tonexty',
            fillcolor='rgba(128, 128, 128, 0.1)',
            line=dict(color='gray', dash='dot'),
            showlegend=False
        )
    )
    
    season_fig.update_layout(
        title='Seasonal Price Patterns',
        xaxis_title='Month',
        yaxis_title='Price (₹/Quintal)',
        xaxis=dict(tickmode='array', ticktext=['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 
                                             'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'],
                  tickvals=list(range(1, 13)))
    )
    
    # 3. Create supply-demand pressure gauge
    recent_data = region_data[region_data['date'] <= current_date].tail(30)
    supply_pressure = recent_data['supply_pressure'].mean()
    
    gauge_fig = go.Figure(go.Indicator(
        mode="gauge+number",
        value=supply_pressure * 100,
        title={'text': "Market Supply Pressure"},
        gauge={
            'axis': {'range': [0, 100]},
            'bar': {'color': "red"},
            'steps': [
                {'range': [0, 33], 'color': "green"},
                {'range': [33, 66], 'color': "yellow"},
                {'range': [66, 100], 'color': "red"}
            ],
            'threshold': {
                'line': {'color': "black", 'width': 4},
                'thickness': 0.75,
                'value': supply_pressure * 100
            }
        }
    ))
    
    return fig, season_fig, gauge_fig

# Generate and display the dashboard
price_trend_fig, seasonal_fig, pressure_gauge = create_market_dashboard(data)

print("Displaying interactive market analysis dashboard...")
price_trend_fig.show()
seasonal_fig.show()
pressure_gauge.show()

Displaying interactive market analysis dashboard...
