# Crop Market Analysis and Demand Forecasting - EDA

This notebook performs exploratory data analysis on crop market data to understand trends, seasonality, and factors affecting crop prices. We'll analyze synthetic data first and then apply the same analysis to real data.

## Objectives
1. Analyze market price trends and seasonality
2. Identify key features affecting crop prices
3. Create market indicators and ratios
4. Visualize patterns and relationships
5. Prepare data for model development

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from scipy import stats
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.stattools import adfuller
import warnings
# Suppress every warning so library notices do not clutter the notebook output.
# NOTE(review): this is a blanket filter and also hides warnings that may
# matter (e.g. deprecations) — consider narrowing it by category.
warnings.filterwarnings('ignore')

# Set random seed for reproducibility
# Seeds NumPy's global RNG; none of the cells visible here draw random
# numbers, so this presumably serves code elsewhere — TODO confirm.
np.random.seed(42)

# Load and Prepare Data

We'll start by loading our synthetic data and preparing it for analysis. This includes:
1. Loading price and weather data
2. Basic data cleaning and formatting
3. Creating initial market indicators

In [None]:
# Load price and weather data
def load_data(prices_path='../data/raw/mandi_prices.csv',
              weather_path='../data/raw/weather.csv'):
    """Read the price and weather CSV files into DataFrames.

    Both files must contain a ``date`` column, which is parsed to datetimes.

    Parameters
    ----------
    prices_path : str or path-like, optional
        Location of the price CSV. Defaults to the notebook's raw-data path.
    weather_path : str or path-like, optional
        Location of the weather CSV. Defaults to the notebook's raw-data path.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(prices, weather)`` in that order.
    """
    prices = pd.read_csv(prices_path, parse_dates=['date'])
    weather = pd.read_csv(weather_path, parse_dates=['date'])
    return prices, weather

prices, weather = load_data()

# Print the same overview for each dataset: heading, date span, row count,
# then the first rows. The tuples carry the exact heading/rule text per dataset.
for _heading, _rule, _sample_heading, _frame in (
    ("Price Data Overview:", "-------------------",
     "\nSample of price data:", prices),
    ("\nWeather Data Overview:", "---------------------",
     "\nSample of weather data:", weather),
):
    print(_heading)
    print(_rule)
    print(f"Time Range: {_frame['date'].min()} to {_frame['date'].max()}")
    print(f"Number of records: {len(_frame)}")
    print(_sample_heading)
    display(_frame.head())

In [None]:
# Create market indicators
def create_market_indicators(df, price_col='modal_price', ma_windows=(7, 14, 30),
                             rsi_period=14, momentum_lags=(5, 20)):
    """Add price-derived technical indicators to a date-ordered copy of ``df``.

    The input frame is not modified: ``sort_values`` returns a new frame and
    all indicator columns are written to that copy.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``date`` column and the column named by ``price_col``.
    price_col : str, optional
        Column holding the price series the indicators are computed from.
    ma_windows : iterable of int, optional
        Rolling window lengths (in rows). For each window ``w`` the columns
        ``MA_{w}`` (rolling mean) and ``price_volatility_{w}d`` (rolling
        standard deviation) are added.
    rsi_period : int, optional
        Number of price changes averaged for the ``RSI`` column.
    momentum_lags : iterable of int, optional
        For each lag ``k`` the column ``momentum_{k}d`` (price minus the price
        ``k`` rows earlier) is added.

    Returns
    -------
    pandas.DataFrame
        The sorted frame with ``price_change``, ``price_pct_change``, the
        moving-average/volatility columns, ``RSI`` and the momentum columns.
    """
    # mergesort is stable, so rows sharing a date keep their input order and
    # the row-to-row differences below are reproducible.
    df = df.sort_values('date', kind='mergesort')
    price = df[price_col]

    # Price changes
    df['price_change'] = price.diff()
    df['price_pct_change'] = price.pct_change() * 100

    # Moving averages and rolling volatility
    for window in ma_windows:
        rolling = price.rolling(window=window)
        df[f'MA_{window}'] = rolling.mean()
        df[f'price_volatility_{window}d'] = rolling.std()

    # Relative Strength Index (RSI)
    def calculate_rsi(series, periods=14):
        delta = series.diff()
        # clip() keeps the leading NaN of diff(), so the RSI stays NaN until
        # `periods` real price changes exist instead of counting the undefined
        # first change as a zero gain/loss.
        gain = delta.clip(lower=0).rolling(window=periods).mean()
        loss = delta.clip(upper=0).abs().rolling(window=periods).mean()
        # loss == 0 with gain > 0 yields rs = inf, which maps to RSI = 100.
        rs = gain / loss
        return 100 - (100 / (1 + rs))

    df['RSI'] = calculate_rsi(price, rsi_period)

    # Price momentum
    for lag in momentum_lags:
        df[f'momentum_{lag}d'] = price.diff(lag)

    return df

# Compute the indicators on a copy of the raw price frame
prices_with_indicators = create_market_indicators(prices.copy())

# Preview the first rows, including the newly added indicator columns
print("Market Indicators Created:", "-------------------------", sep="\n")
display(prices_with_indicators.head())

# Time Series Analysis

Let's analyze the time series components of our price data:
1. Trend
2. Seasonality
3. Residual noise
4. Stationarity test

This will help us understand the underlying patterns in the price movements.

In [None]:
# Perform time series decomposition
def analyze_time_series(df, period=30):
    """Plot a seasonal decomposition of the modal price and run an ADF test.

    Shows a four-panel plotly figure (original series, trend, seasonal and
    residual components) and prints the Augmented Dickey-Fuller statistic,
    p-value and critical values. Nothing is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``date`` and ``modal_price`` columns.
    period : int, optional
        Seasonal period, in rows, passed to ``seasonal_decompose``.
    """
    # Index by date in chronological order: the decomposition and the ADF test
    # both interpret row order as time order, and callers may pass an unsorted
    # frame. mergesort keeps rows with equal dates in their input order.
    ts_data = df.set_index('date')['modal_price'].sort_index(kind='mergesort')

    # Decompose the time series (statsmodels' default model is used)
    decomposition = seasonal_decompose(ts_data, period=period)

    # One subplot row per component, in display order
    components = (
        ('Original', ts_data.values),
        ('Trend', decomposition.trend),
        ('Seasonal', decomposition.seasonal),
        ('Residual', decomposition.resid),
    )
    fig = make_subplots(
        rows=len(components),
        cols=1,
        subplot_titles=tuple(label for label, _ in components),
    )
    for row, (label, values) in enumerate(components, start=1):
        fig.add_trace(go.Scatter(x=ts_data.index, y=values, name=label), row=row, col=1)

    # Update layout
    fig.update_layout(height=800, title_text="Time Series Decomposition")
    fig.show()

    # Perform Augmented Dickey-Fuller test (missing prices are dropped first)
    adf_result = adfuller(ts_data.dropna())
    print('\nAugmented Dickey-Fuller Test:')
    print('---------------------------')
    print(f'ADF Statistic: {adf_result[0]}')
    print(f'p-value: {adf_result[1]}')
    print('Critical values:')
    for key, value in adf_result[4].items():
        print(f'\t{key}: {value}')

In [None]:
# Run the time series analysis
# The raw `prices` frame is passed (not `prices_with_indicators`);
# analyze_time_series reads only its 'date' and 'modal_price' columns.
analyze_time_series(prices)

# Market Analysis and Feature Relationships

Now let's analyze the relationships between different market indicators and weather data:
1. Correlation analysis
2. Price volatility patterns
3. Weather impact on prices
4. Market momentum indicators

In [None]:
# Join weather onto the price/indicator rows by date; the left join keeps
# every row of prices_with_indicators.
market_data = pd.merge(prices_with_indicators, weather, on='date', how='left')

# Pairwise correlations between price, indicator and weather columns
corr_features = [
    'modal_price', 'MA_7', 'MA_30', 'RSI', 'price_volatility_30d',
    'temp_max', 'temp_min', 'precipitation', 'humidity',
]
correlation_matrix = market_data.loc[:, corr_features].corr()

# Heatmap of the correlation matrix, with the colour scale centred on zero
fig = go.Figure()
fig.add_trace(go.Heatmap(
    z=correlation_matrix,
    x=correlation_matrix.columns,
    y=correlation_matrix.columns,
    colorscale='RdBu',
    zmid=0,
))
fig.update_layout(title='Feature Correlation Heatmap', height=600, width=800)
fig.show()

# Correlation of every feature with the modal price, strongest positive first
print("\nCorrelations with Modal Price:", "------------------------------", sep="\n")
correlations = correlation_matrix['modal_price'].sort_values(ascending=False)
print(correlations)

In [None]:
# Plot price (left axis) against 30-day rolling volatility (right axis)
fig = go.Figure(data=[
    go.Scatter(
        x=market_data['date'],
        y=market_data['modal_price'],
        name='Price',
        line=dict(color='blue'),
    ),
    go.Scatter(
        x=market_data['date'],
        y=market_data['price_volatility_30d'],
        name='30-day Volatility',
        line=dict(color='red', dash='dash'),
        yaxis='y2',  # drawn against the secondary axis defined below
    ),
])

# Secondary y-axis overlays the primary one and sits on the right-hand side
fig.update_layout(
    title='Price vs Volatility Over Time',
    xaxis_title='Date',
    yaxis_title='Price',
    yaxis2={'title': 'Volatility', 'overlaying': 'y', 'side': 'right'},
    height=500,
)
fig.show()

# Summary statistics of the 30-day volatility column
print("\nVolatility Statistics:", "---------------------", sep="\n")
volatility_stats = market_data['price_volatility_30d'].describe()
print(volatility_stats)

# Save Processed Data for Modeling

Now that we've completed our analysis, let's save the processed data with all our computed features for use in the modeling notebook. We'll also save some key statistics and findings that will guide our modeling approach.

In [None]:
# Save processed data
import json
from pathlib import Path

output_path = '../data/processed/market_data_with_features.csv'
results_path = '../data/processed/analysis_results.json'

# Create the output directory if it is missing; both to_csv and open() fail
# when the parent directory does not exist.
for _target in (output_path, results_path):
    Path(_target).parent.mkdir(parents=True, exist_ok=True)

market_data.to_csv(output_path, index=False)
print(f"Saved processed data to {output_path}")

# Save key statistics and findings
# NOTE(review): NaN values (e.g. a correlation with a constant column) are
# written by json.dump as the non-standard token NaN — confirm consumers accept it.
analysis_results = {
    'time_range': {
        'start': market_data['date'].min().strftime('%Y-%m-%d'),
        'end': market_data['date'].max().strftime('%Y-%m-%d')
    },
    'price_stats': market_data['modal_price'].describe().to_dict(),
    'volatility_stats': market_data['price_volatility_30d'].describe().to_dict(),
    'key_correlations': correlation_matrix['modal_price'].to_dict()
}

with open(results_path, 'w', encoding='utf-8') as f:
    json.dump(analysis_results, f, indent=4)
print("\nSaved analysis results to analysis_results.json")