In [None]:
# Comprehensive EDA for Tomato Prices & Meteorological Data

import os, zipfile, glob
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.tsa.seasonal import seasonal_decompose

sns.set_theme(style="whitegrid")
data_dir = r"C:\Users\neera\OneDrive\Desktop\New Project"

# 1. Load Tomato Data from centre wise data ZIP
# Unpack the archive into a sibling folder, then read every CSV it contains.
zip_path = os.path.join(data_dir, "centre wise data.zip")
extract_path = os.path.join(data_dir, "centre_wise")
os.makedirs(extract_path, exist_ok=True)
with zipfile.ZipFile(zip_path, 'r') as z:
    z.extractall(extract_path)

# Search recursively (the archive may wrap its files in a folder) and sort the
# result so the load order does not depend on the filesystem's listing order.
csv_files = sorted(
    glob.glob(os.path.join(extract_path, "**", "*.csv"), recursive=True)
)
if not csv_files:
    raise FileNotFoundError(f"No CSV files found under {extract_path!r}")

# Combine all extracted files rather than keeping only the first one listed.
# NOTE(review): parse_dates relies on pandas' default date inference; if
# Arrival_Date is stored day-first (dd/mm/yyyy), confirm it parses as intended.
tomato_raw = pd.concat(
    [pd.read_csv(path, parse_dates=["Arrival_Date"]) for path in csv_files],
    ignore_index=True,
)

# Keep tomato rows only (case-insensitive match on the commodity name).
tomato = tomato_raw[tomato_raw['Commodity'].str.lower()=="tomato"].copy()
tomato['Price'] = tomato['Modal_x0020_Price']
tomato['Date'] = tomato['Arrival_Date']
# Snap every arrival date to the first day of its month so rows can be
# grouped per calendar month.
tomato['Month'] = tomato['Date'].dt.to_period("M").dt.to_timestamp()

# Mean price per (State, Month), indexed by the month-start date.
price_monthly = (
    tomato.groupby(['State','Month'])['Price']
           .mean()
           .reset_index()
           .rename(columns={'Month':'Date'})
           .set_index('Date')
)

# 2. Load Meteorological Data (NASA POWER)
weather_raw = pd.read_csv(os.path.join(data_dir,"nasa_power_daily.csv"))

# The date column is located by name: the first header containing 'YYYY' or
# 'Date'. Fail with a readable message instead of a bare IndexError.
date_col = next(
    (c for c in weather_raw.columns if 'YYYY' in c or 'Date' in c),
    None,
)
if date_col is None:
    raise ValueError(
        "nasa_power_daily.csv has no column whose name contains 'YYYY' or 'Date'"
    )

# Values are read as YYYYMMDD; anything that does not fit becomes NaT.
weather_raw['Date'] = pd.to_datetime(weather_raw[date_col].astype(str), format='%Y%m%d', errors='coerce')

# Newer NASA POWER exports name the precipitation column PRECTOTCORR instead
# of PRECTOT; accept either. Falls back to 'PRECTOT' so a missing column still
# raises a KeyError that names the originally expected header.
rain_col = next(
    (c for c in ('PRECTOT', 'PRECTOTCORR') if c in weather_raw.columns),
    'PRECTOT',
)

# Daily temperature and rainfall indexed by date; rows whose date could not be
# parsed are dropped so the index contains no NaT entries.
weather = (
    weather_raw.dropna(subset=['Date'])
               .set_index('Date')
               [['T2M', rain_col]]
               .rename(columns={'T2M':'Temperature', rain_col:'Rainfall'})
)

# 3. Merge Price & Weather
# price_monthly is keyed by month-start timestamps while weather holds daily
# readings. Joining them directly would pair each month's mean price with the
# weather of the 1st of that month only, so the daily readings are averaged
# per calendar month first and both sides then describe the same period.
weather_month = weather.index.to_period("M").to_timestamp()
weather_monthly = weather.groupby(weather_month).mean().rename_axis('Date')

# Inner join keeps only months present on both sides; dropna removes rows with
# any missing price or weather value.
data = price_monthly.join(weather_monthly, how='inner').dropna()
states = data['State'].unique()

# 4A. Time Series Plot by State
# One stacked panel per state. squeeze=False makes subplots() always return a
# 2-D array of Axes; without it a single state yields a bare Axes object and
# the zip() below fails. ravel() flattens it back to one Axes per state.
fig, axes = plt.subplots(
    len(states), 1, figsize=(14, 4*len(states)), squeeze=False
)
axes = axes.ravel()
for ax, st in zip(axes, states):
    df = data[data['State']==st]
    sns.lineplot(x=df.index, y='Price', data=df, ax=ax)
    ax.set_title(f"Monthly Tomato Price — {st}")
    ax.set_ylabel("Price (₹)")
plt.tight_layout()

# 4B. Price vs. Rainfall & Temperature
# Two side-by-side scatter plots of price against each weather variable,
# coloured by state.
fig, ax = plt.subplots(1, 2, figsize=(14,6))
scatter_specs = (
    ('Rainfall', "Price vs. Rainfall"),
    ('Temperature', "Price vs. Temperature"),
)
for panel, (feature, title) in zip(ax, scatter_specs):
    sns.scatterplot(data=data, x=feature, y='Price', hue='State', ax=panel)
    panel.set_title(title)
plt.tight_layout()

# 4C. Monthly Boxplot Seasonality
# Tag each row with its calendar month (1-12) taken from the date index, then
# draw one box per month and state.
data['Month_Num'] = data.index.month
fig, ax = plt.subplots(figsize=(12,6))
sns.boxplot(data=data, x='Month_Num', y='Price', hue='State', ax=ax)
ax.set_title("Monthly Price Distribution by State")
# Anchor the legend outside the plotting area, to the right of the axes.
ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
fig.tight_layout()

# 4D. Correlation Heatmap
# Pairwise correlation between price and the two weather variables, drawn as
# an annotated heatmap.
corr_columns = ['Price','Rainfall','Temperature']
corr = data[corr_columns].corr()
fig, ax = plt.subplots(figsize=(6,5))
sns.heatmap(corr, ax=ax, annot=True, cmap="coolwarm")
ax.set_title("Feature Correlation")

# 4E. Seasonal Decomposition for First State
# Additive decomposition of the first state's monthly price series using a
# 12-month seasonal period.
st0 = states[0]
series = data[data['State']==st0]['Price']
period = 12

# seasonal_decompose needs at least two complete cycles and raises ValueError
# otherwise; skip with a message so the rest of the analysis still runs.
# NOTE(review): months missing from the series are not filled in here, so the
# observations are treated as consecutive — confirm there are no gaps.
if len(series) < 2 * period:
    print(
        f"Skipping seasonal decomposition for {st0}: "
        f"need at least {2 * period} monthly observations, found {len(series)}."
    )
else:
    decomp = seasonal_decompose(series, model='additive', period=period)
    fig = decomp.plot()
    fig.set_size_inches(12,8)
    plt.suptitle(f"Seasonal Decomposition — {st0}", y=1.02)
    plt.tight_layout()

# 5. Summary Statistics
# Descriptive statistics of price, one row per state.
price_by_state = data.groupby('State')['Price']
summary = price_by_state.describe()
print(summary)
