In [1]:
import pandas as pd
import numpy as np

import os
# kagglehub provides dataset_download(), used below to fetch the dataset
import kagglehub

# Fetch the dataset through kagglehub and report where the files were placed
path = kagglehub.dataset_download("neuromusic/avocado-prices")
print("Path to dataset files:", path)

# Read "avocado.csv", parse the Date column as datetimes, and order the rows
# by region and then by date with a fresh 0..n-1 index
df = (
    pd.read_csv(os.path.join(path, "avocado.csv"))
    .assign(Date=lambda frame: pd.to_datetime(frame['Date']))
    .sort_values(['region', 'Date'])
    .reset_index(drop=True)
)

# Sanity checks: count fully duplicated rows, then missing values per column
print(f"Duplicate rows: {df.duplicated().sum()}")
print("Missing values:\n", df.isna().sum())

# Aggregate to weekly data per region and avocado type.
# NOTE: freq='W-MON' buckets rows into weeks that end on a Monday and labels
# each bucket with that Monday, so the resulting dates are bin labels and are
# not necessarily the original observation dates.
# AveragePrice is a plain (unweighted) mean of the rows in each bucket; all
# volume and bag columns are summed.
df_weekly = df.groupby(['region', 'type', pd.Grouper(key='Date', freq='W-MON')]).agg({
    'AveragePrice': 'mean',
    'Total Volume': 'sum',
    '4046': 'sum',
    '4225': 'sum',
    '4770': 'sum',
    'Total Bags': 'sum',
    'Small Bags': 'sum',
    'Large Bags': 'sum',
    'XLarge Bags': 'sum'
}).reset_index()

# Rename Date to date for consistency
df_weekly = df_weekly.rename(columns={'Date': 'date'})

# Save cleaned data.
# DataFrame.to_csv does not create missing parent directories, so make sure
# the target folder exists first; otherwise a fresh checkout without ../data
# fails here with an OSError.
output_path = '../data/avocado_weekly.csv'
os.makedirs(os.path.dirname(output_path), exist_ok=True)
df_weekly.to_csv(output_path, index=False)

print(f"Weekly data shape: {df_weekly.shape}")
print(f"Date range: {df_weekly['date'].min()} to {df_weekly['date'].max()}")

  from .autonotebook import tqdm as notebook_tqdm


Path to dataset files: C:\Users\Purv Patel\.cache\kagglehub\datasets\neuromusic\avocado-prices\versions\1
Duplicate rows: 0
Missing values:
 Unnamed: 0      0
Date            0
AveragePrice    0
Total Volume    0
4046            0
4225            0
4770            0
Total Bags      0
Small Bags      0
Large Bags      0
XLarge Bags     0
type            0
year            0
region          0
dtype: int64
Weekly data shape: (18249, 12)
Date range: 2015-01-05 00:00:00 to 2018-03-26 00:00:00
