In [None]:
```xml
<VSCode.Cell id="#VSC-4e90a66f" language="python">
import sys
sys.path.insert(0, '..')
sys.path.insert(0, '../src')


import yfinance as yf
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from src.data.preprocess import extend_market_data

# Yahoo Finance symbols for the futures series analysed in this notebook.
corn_ticker = "ZC=F"   # Corn Futures (CBOT)
# NOTE(review): "ZS=F" is Yahoo Finance's symbol for CBOT soybean futures, not
# ethanol. The value is left unchanged so existing results do not shift;
# confirm the intended instrument before relying on `ethanol_data`.
ethanol_ticker = "ZS=F"

HISTORY_START = "2014-01-01"
HISTORY_END = "2024-12-31"


def load_futures_history(symbol, start=HISTORY_START, end=HISTORY_END):
    """Download and prepare the price history for one futures symbol.

    Parameters
    ----------
    symbol : str
        Yahoo Finance ticker symbol.
    start, end : str
        Date bounds forwarded unchanged to ``yf.Ticker.history``.

    Returns
    -------
    tuple
        ``(ticker, frame)``: the ``yf.Ticker`` handle and the output of
        ``extend_market_data`` with its ``expiry`` column removed.

    Raises
    ------
    ValueError
        If the extended frame has no ``expiry`` column.
    """
    ticker = yf.Ticker(symbol)
    extended = extend_market_data(ticker.history(start=start, end=end))
    kept_columns = list(extended.columns)
    kept_columns.remove('expiry')
    return ticker, extended[kept_columns]


corn, corn_data = load_futures_history(corn_ticker)
ethanol, ethanol_data = load_futures_history(ethanol_ticker)

# The original cell left `features` bound to the ethanol column list; keep that
# name defined so the notebook namespace is unchanged.
features = list(ethanol_data.columns)


</VSCode.Cell>
<VSCode.Cell id="#VSC-cb745e17" language="python">
# Volume against calendar date; each year is drawn as its own line so that
# consecutive years get different colours from the default cycle.
fig, ax = plt.subplots(figsize=(16, 6))
ax.set_title("Corn Nearest to Expiration Futures Volume")
for yr in corn_data['Year'].unique():
    rows_for_year = corn_data[corn_data['Year'] == yr]
    sns.lineplot(data=rows_for_year, x='Date', y='Volume', ax=ax)
plt.show()

# Volume against day of year, one hue per year, to overlay the years.
fig, ax = plt.subplots(figsize=(16, 6))
sns.lineplot(data=corn_data, x='Day_Of_Year', y='Volume', hue='Year', palette='Set1', ax=ax)
plt.show()

# Volume against the DTE column, pooled over all years.
fig, ax = plt.subplots(figsize=(16, 6))
sns.lineplot(data=corn_data, x='DTE', y='Volume', ax=ax)
plt.show()
</VSCode.Cell>
<VSCode.Cell id="#VSC-71b8b949" language="python">
# Mean volume for every (Year, Month) pair, flattened back to plain columns
# so seaborn can use 'Month' and 'Year' directly.
monthly_average = (
    corn_data
    .groupby(['Year', 'Month'])['Volume']
    .mean()
    .reset_index()
)

fig, ax = plt.subplots(figsize=(16, 6))
# Title corrected: this notebook analyses corn, not SRW wheat.
ax.set_title("Corn Nearest to Expiration Futures Monthly Average Volume")
sns.barplot(data=monthly_average, x='Month', y='Volume', hue='Year', palette='Set1', ax=ax)
plt.show()
</VSCode.Cell>
<VSCode.Cell id="#VSC-69c5932d" language="markdown">
Trading volume in March, May, July, September, and December is low, which is expected because these are the expiration months of corn futures contracts. However, August trading volume is also low relative to the other months.
</VSCode.Cell>
<VSCode.Cell id="#VSC-a61cd42e" language="python">
# Close price against calendar date; each year is drawn as its own line so
# that consecutive years get different colours from the default cycle.
fig, ax = plt.subplots(figsize=(16, 6))
# Title corrected: this notebook analyses corn, not SRW wheat.
ax.set_title("Corn Nearest to Expiration Futures Close Price")
for yr in corn_data['Year'].unique():
    rows_for_year = corn_data[corn_data['Year'] == yr]
    sns.lineplot(data=rows_for_year, x='Date', y='Close', ax=ax)
plt.show()

# Close price against day of year, one hue per year, to overlay the years.
fig, ax = plt.subplots(figsize=(16, 6))
sns.lineplot(data=corn_data, x='Day_Of_Year', y='Close', hue='Year', palette='Set1', ax=ax)
plt.show()
</VSCode.Cell>
<VSCode.Cell id="#VSC-4ba6b4d2" language="python">
log_returns = corn_data['Log_Return']

# Histogram of the log returns with a kernel-density overlay, plus the
# summary statistics printed alongside it.
fig, ax = plt.subplots(figsize=(16, 6))
sns.histplot(log_returns, kde=True, ax=ax)
print(log_returns.describe())
plt.show()

# The same series drawn as a line against its index.
fig, ax = plt.subplots(figsize=(16, 6))
sns.lineplot(log_returns, ax=ax)
plt.show()
</VSCode.Cell>
<VSCode.Cell id="#VSC-e6240c55" language="markdown">
There is a weak seasonal component with a period of around 250 days.
</VSCode.Cell>
<VSCode.Cell id="#VSC-58dc1b4a" language="python">
from scipy.signal import periodogram

# Power spectral density estimate of the close-price series (NaNs dropped first).
frequencies, power = periodogram(corn_data['Close'].dropna())

# The first bin is the zero frequency. Converting it to a period would be a
# division by zero (infinite period plus a RuntimeWarning), so only the
# strictly positive frequencies are plotted.
positive = frequencies > 0
periods = 1 / frequencies[positive]

plt.figure(figsize=(10, 5))
plt.plot(periods, power[positive])
plt.xlabel("Period (days)")
plt.ylabel("Power")
plt.title("Periodogram")
plt.xlim(0, 400)  # Limit x-axis to reasonable periods
plt.show()
</VSCode.Cell>
<VSCode.Cell id="#VSC-287dde85" language="python">
from src.data import DataLoader
dl = DataLoader()

# --- National production -----------------------------------------------------
production_raw = dl.get_production_data("CORN", 2014, national_level=True, raw=True)

# Keep only the annual, all-domain grain production rows measured in bushels.
is_annual_grain_total = (
    (production_raw.unit_desc == 'BU')
    & (production_raw.short_desc == 'CORN, GRAIN - PRODUCTION, MEASURED IN BU')
    & (production_raw.domain_desc == 'TOTAL')
    & (production_raw.reference_period_desc == 'YEAR')
)
production_columns = [
    'state_name', 'Value', 'unit_desc', 'year',
    'reference_period_desc', 'source_desc', 'short_desc',
]
# .copy() makes this an independent frame, so the column assignments below do
# not write into a slice of `production_raw` (SettingWithCopyWarning).
production_national = production_raw.loc[is_annual_grain_total, production_columns].copy()

production_national['year'] = pd.to_numeric(production_national['year'])
# 'Value' holds strings with thousands separators; strip the commas, then
# convert, turning anything non-numeric into NaN.
production_national['Value'] = pd.to_numeric(
    production_national['Value'].str.replace(',', '', regex=False),
    errors='coerce',
)

# --- National stocks ---------------------------------------------------------
stocks_national_raw = dl.get_stocks_data('CORN', 2014, True)
stocks_national_raw['end_month'] = stocks_national_raw['end_month'].astype(int)

# Select the total / off-farm / on-farm stock columns and give them short names.
stocks_national = stocks_national_raw[[
    'year',
    'end_month',
    'CORN, GRAIN - STOCKS, MEASURED IN BU',
    'CORN, OFF FARM, GRAIN - STOCKS, MEASURED IN BU',
    'CORN, ON FARM, GRAIN - STOCKS, MEASURED IN BU',
]].rename(columns={
    'CORN, GRAIN - STOCKS, MEASURED IN BU': 'Total',
    'CORN, ON FARM, GRAIN - STOCKS, MEASURED IN BU': 'ON_FARM',
    'CORN, OFF FARM, GRAIN - STOCKS, MEASURED IN BU': 'OFF_FARM',
})
</VSCode.Cell>
<VSCode.Cell id="#VSC-335e814c" language="python">
import matplotlib.cm as cm

# Map the numeric `end_month` values to ordered month labels so the bars can be
# grouped and sorted in calendar order.
month_map = {3: "Mar", 6: "Jun", 9: "Sep", 12: "Dec"}
months = ['Mar', 'Jun', 'Sep', 'Dec']
stocks_national['month'] = pd.Categorical(
    stocks_national['end_month'].map(month_map),
    categories=months,
    ordered=True,
)
stocks_national = stocks_national.sort_values(['month', 'year'])

years = sorted(stocks_national['year'].unique())
num_years = len(years)
bar_width = 0.08
positions = np.arange(len(months))

# One colour per year. `matplotlib.cm.get_cmap` was deprecated in Matplotlib 3.7
# and removed in 3.9; `plt.get_cmap` takes the same (name, lut) arguments.
colors = plt.get_cmap('tab10', num_years)

fig, axes = plt.subplots(nrows=2, figsize=(10, 6))

# --- Top panel: stocks per month label, one stacked bar per year ------------
ax = axes[0]
for i, year in enumerate(years):
    df_year = stocks_national[stocks_national['year'] == year]

    # Only include months that actually exist in this year's data
    valid_months = df_year['month'].tolist()
    month_indices = [months.index(m) for m in valid_months]

    # Centre the cluster of per-year bars on each month's tick position.
    offset = (i - num_years / 2) * bar_width + bar_width / 2
    x = np.array(month_indices) + offset

    # Values come out in the same row order as `valid_months`.
    on_farm_vals = df_year['ON_FARM'].values
    off_farm_vals = df_year['OFF_FARM'].values

    # Off-farm at the bottom (solid), on-farm stacked on top (faded); only the
    # solid bar carries the legend label so each year appears once.
    ax.bar(x, off_farm_vals, width=bar_width, color=colors(i), label=f'{year}')
    ax.bar(x, on_farm_vals, width=bar_width, bottom=off_farm_vals, color=colors(i), alpha=0.4)

ax.set_xticks(positions)
ax.set_xticklabels(months)
ax.set_ylabel("Corn Stocks (bushels)")
# Title corrected: the data plotted here are corn stocks, not wheat.
ax.set_title("Quarterly U.S. Corn Stocks by Year (Off-Farm (Solid) + On-Farm (Faded) Breakdown)")
ax.legend(title="Year", bbox_to_anchor=(1.01, 1), loc='upper left')

# --- Bottom panel: national production per year -----------------------------
sns.lineplot(x='year', y='Value', data=production_national, ax=axes[1])
axes[1].set_title("Corn Production by Year")
axes[1].set_ylabel("Corn Production (bushels)")
axes[1].set_xlabel("Year")

plt.savefig('../imgs/corn/production_stock.png')
plt.show()
</VSCode.Cell>
<VSCode.Cell id="#VSC-2f8d47db" language="python">
production_state_raw = dl.get_production_data("CORN", 2014, national_level=False, raw=True)

year = 2024
# Coerce `year` to numeric before comparing: the other cells in this notebook
# convert the raw `year` column with pd.to_numeric, so it may arrive as strings,
# in which case a direct `== 2024` would silently match nothing.
is_selected_year = pd.to_numeric(production_state_raw['year'], errors='coerce') == year
production_state = production_state_raw.loc[
    production_state_raw.unit_desc.isin(['BU', 'PCT BY TYPE'])
    & (production_state_raw.reference_period_desc == 'YEAR')
    & is_selected_year,
    ['state_alpha', 'state_name', 'Value', 'unit_desc'],
].copy()  # independent frame: the assignment below must not target a slice

# 'Value' holds strings with thousands separators; strip the commas, then
# convert, turning anything non-numeric into NaN.
production_state['Value'] = pd.to_numeric(
    production_state['Value'].str.replace(',', '', regex=False),
    errors='coerce',
)


fig, axes = plt.subplots(ncols=1, figsize=(10, 5.6))

# Plot corn production by state
ax = axes

# Rank states by bushels only. Rows with unit 'PCT BY TYPE' are percentages and
# must not be summed together with bushel quantities in the "OTHERS" slice.
# The name `winter_production_state` is kept because a later cell reads it.
winter_production_state = (
    production_state[production_state['unit_desc'] == 'BU']
    .sort_values('Value', ascending=False)
)
threshold = 48000000  # bushels; states below this are pooled into "OTHERS"
major = winter_production_state[winter_production_state['Value'] >= threshold]
other = winter_production_state[winter_production_state['Value'] < threshold]
other_sum = other.Value.sum()

df_pie = pd.concat([
    major[['state_name', 'Value']],
    pd.DataFrame({'state_name': ['OTHERS'], 'Value': [other_sum]})
])

ax.set_title(f"{year} U.S. Corn Production by State")
ax.pie(df_pie.Value, labels=df_pie.state_name, autopct='%1.1f%%')

plt.show()
</VSCode.Cell>
<VSCode.Cell id="#VSC-4667891c" language="python">
# Request the state-level (national_level=False) raw condition records once per
# year for 2014 through 2025, then stack the yearly frames into one.
yearly_condition_frames = [
    dl.get_condition_data('CORN', 2014, exact_year=season, national_level=False, raw=True)
    for season in range(2014, 2026)
]
condition_state_raw = pd.concat(yearly_condition_frames)

</VSCode.Cell>
<VSCode.Cell id="#VSC-6248ff92" language="python">
# Nine largest producing states (first nine rows of the production ranking built
# in the state-production cell) and each state's share of their combined
# production. .copy() makes this an independent frame so adding 'weight' does
# not write into a slice (SettingWithCopyWarning).
SRW_states_of_interest = winter_production_state[:9][['state_name', 'Value']].copy()
SRW_states_of_interest['weight'] = SRW_states_of_interest['Value'] / SRW_states_of_interest['Value'].sum()

condition_state_raw['year'] = pd.to_numeric(condition_state_raw['year'])
condition_state_raw['Value'] = pd.to_numeric(condition_state_raw['Value'], errors='coerce')
raw_data = condition_state_raw[
    condition_state_raw.state_name.isin(SRW_states_of_interest['state_name'])
]

# One row per (week, year, state) with one column per `unit_desc` value.
condition_state = (
    raw_data
    .pivot(index=['week_ending', 'year', 'state_name', 'end_code'], columns='unit_desc', values='Value')
    .reset_index()
    .rename(columns={'end_code': 'week_number', 'week_ending': 'date'})
)
condition_state['date'] = pd.to_datetime(condition_state['date'])

# 1-2. `raw_data` is already restricted to the states of interest, so the
# pivoted frame needs no second filter; attach each state's weight directly.
condition_srw = condition_state.merge(SRW_states_of_interest, on='state_name', how='left')

# 3. Compute weighted condition percentages
conditions = ['PCT EXCELLENT', 'PCT GOOD', 'PCT FAIR', 'PCT POOR', 'PCT VERY POOR']
weighted_columns = [f'{c}_weighted' for c in conditions]
for col in conditions:
    condition_srw[col] = pd.to_numeric(condition_srw[col], errors='coerce')
    condition_srw[f'{col}_weighted'] = condition_srw[col] * condition_srw['weight']

# 4. Sum the weighted values across states for each report date
weekly_national = (
    condition_srw
    .groupby(['date'])[weighted_columns]
    .sum()
    .reset_index()
)

# Total of the five weighted categories per date (added left to right).
weekly_national['sum'] = sum(weekly_national[c] for c in weighted_columns)

# Keep dates whose categories add up to at least 80, then rescale each category
# so the five sum to 100. Presumably a low total means some states did not
# report that week -- TODO confirm against the source data.
df = weekly_national[weekly_national['sum'] >= 80].copy()
for condition in conditions:
    df[f'{condition}_weighted'] = df[f'{condition}_weighted'] * 100 / df['sum']

plt.figure(figsize=(16, 6))
plt.stackplot(
    df['date'],
    [df[f'{col}_weighted'] for col in ['PCT EXCELLENT', 'PCT GOOD']],
    labels=['Excellent', 'Good'],
    alpha=0.85
)
plt.title("Weighted Corn Crop Condition")  # typo fixed: "Corp" -> "Crop"
plt.legend(loc='upper center', ncol=2)
plt.show()

# Combined "good or better" share, joined to the price series by date.
df['Good'] = df['PCT EXCELLENT_weighted'] + df['PCT GOOD_weighted']
df = df.rename(columns={'date': 'Date'})
# NOTE(review): the merge needs `Date` to have the same dtype/timezone in both
# frames; confirm what `extend_market_data` produces.
merged = (
    pd.merge(df, corn_data, how='outer', on='Date')
    .sort_values('Date')
    .ffill()
)

# Forward price changes. `shift` counts rows of `merged` (price dates plus any
# report dates missing from the price data), not calendar days.
horizons = {
    'next_day_increment': (1, 'next day'),
    'next_3day_increment': (3, 'next 3 day'),
    'next_7day_increment': (7, 'next 7 day'),
}
for column, (steps, _) in horizons.items():
    merged[column] = merged['Close'].shift(-steps) - merged['Close']

data = (
    merged.loc[merged['Date'].isin(df['Date']), ['Date', 'Good', 'Close', *horizons]]
    .dropna()
    .copy()
)
# Change in condition versus the previous retained report. NOTE(review): this
# difference is taken across the whole series, so the first report of a year is
# compared with the last report of the previous year.
data['condition_increment'] = data['Good'].diff()

for column, (_, phrase) in horizons.items():
    print(f"The correlation between the crop condition change and the {phrase} price increment is {data['condition_increment'].corr(data[column]):.4f}")
</VSCode.Cell>
```