In [None]:
import pandas as pd
import yfinance as yf
import matplotlib.pyplot as plt
import numpy as np

from src.data import DataLoader
from src.data.preprocess import extend_market_data

In [None]:
# Pull the daily price history of the "ZW=F" ticker from Yahoo Finance, discard the
# equity-only columns, and hand the result to the project's preprocessing helper.
# NOTE(review): extend_market_data presumably adds the Year / Month / expiry /
# Log_Return columns that later cells rely on — confirm in src.data.preprocess.
SRW = yf.Ticker("ZW=F")
SRW_data = (
    SRW.history(start="2014-01-01")
    .drop(columns=['Dividends', 'Stock Splits'])
)
market_data = extend_market_data(SRW_data)

In [None]:
# Display the extended market table as a quick visual sanity check.
market_data

In [None]:
# Project data-access helper imported from src.data.
# NOTE: a later cell imports torch.utils.data.DataLoader under the same name, so
# re-running this cell after that import will construct the wrong class.
dl = DataLoader()
# Raw wheat production records, requested with the year argument 2014.
# NOTE(review): the positional `True` is presumably the national-level flag (compare the
# `national_level=` keyword passed to get_condition_data further down) — confirm.
production_raw = dl.get_production_data("WHEAT", 2014, True, raw=True)

In [None]:
# National wheat production in bushels, reduced to one row per year.
# The filtered selection is copied explicitly so that the column assignments below
# operate on an independent frame instead of a slice of `production_raw`
# (avoids pandas' SettingWithCopyWarning / ambiguous chained assignment).
production_national = production_raw.loc[
    (production_raw.unit_desc == 'BU') &
    (production_raw.short_desc == 'WHEAT - PRODUCTION, MEASURED IN BU') &
    (production_raw.domain_desc == 'TOTAL'),
    ['state_name', 'Value', 'unit_desc', 'year', 'source_desc', 'short_desc'],
].copy()

# 'Value' arrives as text with thousands separators; strip the commas, then convert.
# Anything that still fails to parse becomes NaN.
production_national['year'] = pd.to_numeric(production_national['year'])
production_national['Value'] = pd.to_numeric(
    production_national['Value'].str.replace(',', '', regex=False),
    errors='coerce',
)

# Average all matching records of a year into a single 'Production' figure.
production_national = (
    production_national
    .groupby(by='year')
    .agg({'Value': 'mean'})
    .rename(columns={'Value': 'Production'})
)
production_national

In [None]:
# State-level winter-wheat production in bushels.
production_state_raw = dl.get_production_data("WHEAT", 2014, raw=True)

# Keep only winter-wheat production rows with an unspecified domain category.
# (An alternative filter that also admitted class_desc == 'ALL CLASSES' was tried and
# dropped.) The selection is copied so the assignments below do not write into a
# slice of the raw frame (avoids pandas' SettingWithCopyWarning).
production_state = production_state_raw.loc[
    (production_state_raw.unit_desc == 'BU') &
    (production_state_raw.class_desc == 'WINTER') &
    (production_state_raw.domaincat_desc == 'NOT SPECIFIED') &
    (production_state_raw.short_desc == 'WHEAT, WINTER - PRODUCTION, MEASURED IN BU'),
    ['state_name', 'Value', 'unit_desc', 'year', 'source_desc', 'class_desc', 'reference_period_desc', 'short_desc'],
].copy()

# 'Value' arrives as text with thousands separators; strip the commas, then convert.
# Anything that still fails to parse becomes NaN.
production_state['year'] = pd.to_numeric(production_state['year'])
production_state['Value'] = pd.to_numeric(
    production_state['Value'].str.replace(',', '', regex=False),
    errors='coerce',
)

In [None]:
# Mean 2022 winter-wheat production per state, largest producer first; states whose
# mean is NaN are dropped. The pie chart shows each state's share.
production_2022 = (
    production_state[production_state.year == 2022]
    .groupby(by='state_name')
    .agg({'Value': 'mean'})
    .sort_values('Value', ascending=False)
    .dropna()
)
plt.pie(production_2022['Value'], labels=production_2022.index)

In [None]:
# The ten largest individually reported producers of 2022.
# The aggregate 'OTHER STATES' row is filtered out instead of being removed with
# list.remove(), which raised ValueError whenever that row was not among the
# first eleven entries.
states_of_interest = [
    state for state in production_2022.index if state != 'OTHER STATES'
][:10]
states_of_interest

In [None]:
# Wheat stocks table from the project loader.
# NOTE(review): the positional `True` is presumably the national-level flag — confirm.
stocks_national_raw = dl.get_stocks_data('WHEAT', 2014, True)
# 'end_month' is cast to int so it can be matched against the market table's month column.
stocks_national_raw['end_month'] = stocks_national_raw['end_month'].astype(int)

# Keep the period keys plus the three stock series, under short column names.
stocks_national = stocks_national_raw[[
    'year',
    'end_month',
    'WHEAT - STOCKS, MEASURED IN BU',
    'WHEAT, OFF FARM - STOCKS, MEASURED IN BU',
    'WHEAT, ON FARM - STOCKS, MEASURED IN BU',
]].rename(columns={
    'WHEAT - STOCKS, MEASURED IN BU': 'Total',
    'WHEAT, OFF FARM - STOCKS, MEASURED IN BU': 'OFF_FARM',
    'WHEAT, ON FARM - STOCKS, MEASURED IN BU': 'ON_FARM',
})
stocks_national

In [None]:
def _load_winter_condition(start_year, **query_kwargs):
    """Fetch state-level wheat condition records and pivot the winter-wheat rows.

    Parameters
    ----------
    start_year : int
        Year argument forwarded to ``dl.get_condition_data``.
    **query_kwargs
        Extra keyword arguments for the loader (e.g. ``exact_year=2015``).

    Returns
    -------
    pandas.DataFrame
        Indexed by ``week_ending`` with ``year``, ``state_name``, ``week_number``
        and one column per ``unit_desc`` value.
    """
    raw = dl.get_condition_data('WHEAT', start_year, national_level=False, raw=True, **query_kwargs)
    raw['year'] = pd.to_numeric(raw['year'])
    # Unparseable values become NaN rather than raising.
    raw['Value'] = pd.to_numeric(raw['Value'], errors='coerce')
    winter_rows = raw[raw.class_desc == 'WINTER']
    return (
        winter_rows
        .pivot(index=['week_ending', 'year', 'state_name', 'end_code'], columns='unit_desc', values='Value')
        .reset_index()
        .set_index('week_ending')
        .rename(columns={'end_code': 'week_number'})
    )


# Due to the API constraint, 2014-2017 are fetched one year at a time and the
# 2018-onwards data in a single request; all frames are concatenated once.
condition_frames = [
    _load_winter_condition(2014, exact_year=year) for year in range(2014, 2018)
]
condition_frames.append(_load_winter_condition(2018))

condition_state = pd.concat(condition_frames, axis=0).sort_index()
condition_state

# Combine all data into a single DataFrame

In [None]:
# Build the modelling table: start from the market data (its index becomes a regular
# column, referenced below as 'Date') and attach per-state production, per-state crop
# condition and national stocks.
# NOTE(review): the merges assume market_data carries 'Year', 'Month' and 'expiry'
# columns and that its index is named 'Date' — presumably set by extend_market_data; confirm.
data = market_data.copy().reset_index()
for state in states_of_interest:
    # Yearly SURVEY production of this state: one column per reference period,
    # prefixed with the state name, joined onto every market row of the same year.
    yearly_production_data = production_state[
        (production_state.state_name==state) &
        (production_state.source_desc=='SURVEY')
    ].pivot(index='year', columns='reference_period_desc', values='Value').reset_index().set_index('year').add_prefix(state+'_')
    data = pd.merge(data, yearly_production_data, how='left', left_on='Year', right_on='year')

    # Weekly condition percentages of this state, prefixed with the state name and
    # indexed by the week-ending date converted to datetime.
    weekly_condition_data = condition_state[
        condition_state.state_name==state
    ][['PCT EXCELLENT', 'PCT FAIR', 'PCT GOOD', 'PCT POOR', 'PCT VERY POOR']].add_prefix(state+'_')
    weekly_condition_data.index = pd.to_datetime(weekly_condition_data.index)

    # Outer merge: week-ending dates that are not in the market table create extra rows.
    # They are kept for now so the forward fill below can carry their values onto the
    # following market dates.
    # NOTE(review): this assumes 'Date' and the week-ending index are comparable
    # (same timezone awareness) — confirm against extend_market_data.
    data=pd.merge(data, weekly_condition_data, how='outer', left_on='Date', right_index=True)

# Quarterly stocks data, matched on (year, month); the outer merge may again add rows.
data = pd.merge(data, stocks_national, how='outer', left_on=['Year', 'Month'], right_on=['year', 'end_month'])

# Drop redundant columns, sort by date, and forward-fill so each row carries the most
# recent value seen in every column.
data = data.drop(['year', 'end_month', 'expiry'], axis=1)
data.sort_values(by='Date', inplace=True)
data = data.ffill(axis=0)

# Remove the extra dates created by the outer merges: keep only dates present in market_data.
data = data[data.Date.isin(market_data.index)]

# Target column: log return of Close over 7 rows (shift(7) counts rows, not calendar days).
data['7d_log_return'] = np.log(data['Close']/data['Close'].shift(7))
# data['Target'] = data['7d_log_return'].shift(-7)

# Any row that still has a NaN in any column is discarded (dropna without a subset),
# e.g. rows before the first available value of a forward-filled column.
data.dropna(inplace=True)
data.set_index('Date', inplace=True)
data

In [None]:
# List the columns of the assembled table; the target '7d_log_return' was added last.
data.columns

In [None]:
# Summary statistics of the target column.
data['7d_log_return'].describe()

In [None]:
import seaborn as sns
# Distribution of the target with a kernel-density overlay.
# `kde` is a boolean flag; the previous string 'True' only worked because any
# non-empty string is truthy (the string 'False' would have enabled it as well).
sns.histplot(data=data, x='7d_log_return', kde=True)

In [None]:
import statsmodels.api as sm
# Partial autocorrelation plot of the target, requested with lags = 20.
sm.graphics.tsa.plot_pacf(data['7d_log_return'], lags = 20)
plt.show()

# Building an LSTM model using the above data

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
import torch.optim as optim
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import r2_score

device = torch.accelerator.current_accelerator().type if torch.accelerator.is_available() else "cpu"
print(f"Using {device} device")

In [None]:
def create_sequences(data, seq_length, target_col=-1):
    """Cut a 2-D table into overlapping input windows and next-row targets.

    Window ``i`` consists of rows ``i .. i + seq_length - 1``; its target is the
    value in column ``target_col`` of row ``i + seq_length`` (the row right after
    the window).

    Parameters
    ----------
    data : array-like of shape (n_rows, n_features)
        Rows in chronological order. Converted with ``np.asarray``, so arrays,
        nested lists and DataFrames are all accepted.
    seq_length : int
        Number of consecutive rows per window.
    target_col : int, default -1
        Column holding the target. The default keeps the original convention
        that the target is the last column.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        ``sequences`` of shape (n_rows - seq_length, seq_length, n_features) and
        ``targets`` of shape (n_rows - seq_length,). Both are empty when there
        are not more than ``seq_length`` rows.
    """
    data = np.asarray(data)
    n_windows = len(data) - seq_length
    sequences = [data[start:start + seq_length] for start in range(n_windows)]
    targets = [data[start + seq_length][target_col] for start in range(n_windows)]
    return np.array(sequences), np.array(targets)

TEST_SIZE = 500   # number of most recent rows held out for testing
SEQ_LENGTH = 60   # rows per input window
BATCH_SIZE = 32

# Chronological split: everything except the last TEST_SIZE rows is training data.
data_train = data[:-TEST_SIZE]
data_test = data[-TEST_SIZE:]

# The scaler is fitted on the training rows only, so no test information leaks in.
# From here on data_train / data_test are scaled NumPy arrays.
scaler = MinMaxScaler(feature_range=(-1, 1))
data_train = scaler.fit_transform(data_train)
data_test = scaler.transform(data_test)
data_scaled = np.vstack([data_train, data_test])

# Windows are cut from the concatenated table, so the first test windows look back
# into the tail of the training rows. The last TEST_SIZE targets are exactly the
# last column of data_test.
X, y = create_sequences(data_scaled, SEQ_LENGTH)

# create_sequences already returns (n_windows, SEQ_LENGTH, n_features); check the
# shape instead of forcing it with a hard-coded feature count, which would fail or
# silently scramble the data if the number of columns ever changed.
assert X.shape[1:] == (SEQ_LENGTH, data_scaled.shape[1])

# Convert to PyTorch tensors
X_train = torch.tensor(X[:-TEST_SIZE], dtype=torch.float32).to(device)
y_train = torch.tensor(y[:-TEST_SIZE], dtype=torch.float32).to(device)
X_test = torch.tensor(X[-TEST_SIZE:], dtype=torch.float32).to(device)
y_test = torch.tensor(y[-TEST_SIZE:], dtype=torch.float32).to(device)

# Create DataLoader (torch.utils.data.DataLoader at this point in the notebook).
# Only the training batches are shuffled.
train_loader = DataLoader(TensorDataset(X_train, y_train), batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(TensorDataset(X_test, y_test), batch_size=BATCH_SIZE, shuffle=False)


In [None]:
class LSTMModel(nn.Module):
    """Stacked LSTM followed by a linear read-out of the final time step.

    Parameters
    ----------
    input_dim : int
        Number of features per time step.
    hidden_dim : int
        Size of the LSTM hidden state.
    num_layers : int
        Number of stacked LSTM layers.
    output_dim : int
        Number of values predicted per sequence.
    """

    def __init__(self, input_dim=128, hidden_dim=64, num_layers=2, output_dim=1):
        super().__init__()
        # batch_first=True: inputs are laid out as (batch, seq_length, input_dim).
        self.lstm = nn.LSTM(input_dim, hidden_dim, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Map a batch of sequences to predictions of shape (batch, output_dim)."""
        hidden_states, _ = self.lstm(x)
        # Only the hidden state of the last time step feeds the linear layer.
        final_step = hidden_states[:, -1, :]
        return self.fc(final_step)

# Initialize the model. The input width is taken from the training tensor
# (batch, seq_length, n_features) rather than relying on the hard-coded default of
# 128, so the model keeps matching the data when the number of columns changes.
model = LSTMModel(input_dim=X_train.shape[-1]).to(device)

In [None]:
# Mean-squared-error loss on the scaled target, optimised with Adam.
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
EPOCHS = 1000
for epoch in range(EPOCHS):
    model.train()
    epoch_loss = 0
    for batch_x, batch_y in train_loader:
        optimizer.zero_grad()
        # Remove only the trailing output dimension: (batch, 1) -> (batch,).
        # A bare .squeeze() would also collapse a final batch of size 1 to a 0-d
        # tensor, which no longer matches batch_y's shape.
        y_pred = model(batch_x).squeeze(-1)
        loss = criterion(y_pred, batch_y)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()

    # Report the mean batch loss every 10 epochs.
    if (epoch+1) % 10 == 0:
        print(f'Epoch [{epoch+1}/{EPOCHS}], Loss: {epoch_loss/len(train_loader):.6f}')

In [None]:
# Switch to inference mode and predict the whole held-out set in one forward pass.
model.eval()
with torch.no_grad():
    test_output = model(X_test)
y_pred_test = test_output.squeeze().cpu().numpy()

# Column vectors that are still in the scaler's units; despite the "_rescaled"
# suffix no inverse transform has been applied at this point.
y_pred_test_rescaled = y_pred_test.reshape(-1, 1)
y_test_rescaled = y_test.cpu().numpy().reshape(-1, 1)

# Actual vs. predicted target over the last 500 dates of the table.
test_dates = data.index[-500:]
plt.figure(figsize=(12, 6))
plt.plot(test_dates, y_test_rescaled, label='Actual')
plt.plot(test_dates, y_pred_test_rescaled, label='Predicted')
plt.title("LSTM Model Predictions on testing set (without inverse transform)")
plt.ylabel('7D log return')
plt.legend()
plt.show()

print(f"r2 score {r2_score(y_test_rescaled, y_pred_test_rescaled):.4f}")

In [None]:
# Undo the MinMax scaling. The scaler works on the full set of columns, so the
# predictions replace the last (target) column of the scaled test block before the
# inverse transform; only that last column is read back afterwards.
# NOTE: this rebinds y_test from a tensor to a NumPy array, so the previous
# evaluation cell cannot be re-run after this one.
scaled_with_predictions = np.column_stack([data_test[:, :-1], y_pred_test])
res1 = scaler.inverse_transform(scaled_with_predictions)
res2 = scaler.inverse_transform(data_test)
y_pred = res1[:, [-1]]
y_test = res2[:, [-1]]

# Zoom in on the final 50 dates.
recent_dates = data.index[-50:]
plt.figure(figsize=(12, 6))
plt.plot(recent_dates, y_test[-50:], label='Actual')
plt.plot(recent_dates, y_pred[-50:], label='Predicted')
plt.title("LSTM Model Predictions on testing set (with inverse transform)")
plt.ylabel('7D log return')
plt.legend()
plt.show()

print(f"r2 score {r2_score(y_test, y_pred):.4f}")

In [None]:
# Turn predicted 7-row log returns back into prices:
#   predicted Close_t = exp(predicted return_t) * Close_{t-7 rows}
close_7_rows_back = data.Close.shift(7)[-500:].to_numpy().reshape(-1, 1)
price_predicted = np.exp(y_pred) * close_7_rows_back
actual_price = data.Close[-500:]

test_dates = data.index[-500:]
plt.figure(figsize=(12, 6))
plt.title("Predicted vs. Actual price")
plt.plot(test_dates, actual_price, label='Actual')
plt.plot(test_dates, price_predicted, label='Predicted')
plt.legend()
print(f"r2: {r2_score(actual_price, price_predicted)}")

def directional_accuracy(y_true, y_pred):
    """Percentage of steps where the predicted series moves the same way as the actual one.

    For every consecutive pair of points, the sign of the actual change is compared
    with the sign of the predicted change. A step counts as correct only when both
    changes are non-zero and share a sign.

    Parameters
    ----------
    y_true, y_pred : array-like
        Series of equal length. Inputs are flattened first, so column vectors of
        shape (n, 1) are handled the same as 1-D arrays.

    Returns
    -------
    float
        Directional accuracy in percent (0-100), or NaN when fewer than two
        points are supplied (no step to evaluate).

    Raises
    ------
    ValueError
        If the two series do not have the same number of elements.
    """
    # np.diff works along the last axis; flattening keeps an (n, 1) column from
    # being differenced across its single column.
    y_true = np.asarray(y_true).ravel()
    y_pred = np.asarray(y_pred).ravel()
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"y_true and y_pred must have the same length, got {y_true.size} and {y_pred.size}"
        )
    if y_true.size < 2:
        return float('nan')

    actual_change = np.diff(y_true)      # y_t+1 - y_t
    predicted_change = np.diff(y_pred)   # ŷ_t+1 - ŷ_t
    correct_directions = np.sum((actual_change * predicted_change) > 0)
    return correct_directions / len(actual_change) * 100  # Convert to percentage

# Directional accuracy of the reconstructed price path against the actual closes;
# the (500, 1) prediction column is flattened to 1-D for the comparison.
da = directional_accuracy(actual_price.to_numpy(), price_predicted.ravel())
print(f"Directional Accuracy: {da:.2f}%")

In [None]:
# Sanity table: the inverse-transformed test target should reproduce the raw
# '7d_log_return' column, and exp(return) * Close shifted by 7 rows should
# reproduce Close.
predicted_returns = y_pred.reshape(-1,)
actual_returns = y_test.reshape(-1,)

double_check = pd.DataFrame({
    "predicted 7d log return": predicted_returns,
    "7d log return from y test": actual_returns,
    "from raw data": data['7d_log_return'][-500:].to_numpy(),
    # Close 7 rows before each of the last 500 rows.
    "Close_shifted": data['Close'][-507:-7].to_numpy(),
})

base_close = double_check['Close_shifted']
double_check['predicted_price'] = np.exp(predicted_returns) * base_close
double_check['actual_price_calculated'] = np.exp(actual_returns) * base_close
double_check['Close'] = data['Close'][-500:].to_numpy()

double_check

In [None]:
# One-row log returns implied by the actual and by the reconstructed price paths.
# No extra scaling is applied to the predicted series here.
actual_price_values = actual_price.to_numpy()
actual_logreturn = np.log(actual_price_values[1:] / actual_price_values[:-1])
predicted_logreturn = np.log(price_predicted[1:] / price_predicted[:-1])

plt.figure(figsize=(16, 6))
plt.plot(actual_logreturn, label='Actual', alpha=0.5)
plt.plot(predicted_logreturn, label='Predicted')
plt.legend()
plt.show()
print(f"r2: {r2_score(actual_logreturn, predicted_logreturn)}")

In [None]:
# Side-by-side table of predicted and actual prices; the frame takes its index from
# the `actual_price` Series.
pd.DataFrame({"predicted": price_predicted.reshape(-1,), "actual": actual_price})

In [None]:
# Pair the predicted and actual one-row log returns and report the fraction of rows
# where both have the same sign (a product of zero counts as a miss).
logreturn_df = pd.DataFrame({
    "predicted log return": predicted_logreturn.reshape(-1,),
    "actual log return": actual_logreturn,
})
same_sign = (logreturn_df['predicted log return'] * logreturn_df['actual log return']) > 0
print(f"directional accuracy: {same_sign.sum()/len(logreturn_df)}")
logreturn_df

In [None]:
# "Adjusted" prediction: apply the predicted one-row log return to the previous
# actual close instead of chaining from the 7-row-old close.
previous_close = data.Close[-500:-1]
price_predicted_adjusted = np.exp(predicted_logreturn.reshape(-1,)) * previous_close

plot_dates = data.index[-499:]
plt.figure(figsize=(12, 6))
plt.title("Predicted_adjusted vs. Actual price")
plt.plot(plot_dates, actual_price[1:].to_numpy(), label='Actual')
plt.plot(plot_dates, price_predicted_adjusted, label='Predicted')
plt.legend()
print(f"r2: {r2_score(actual_price[1:], price_predicted_adjusted)}")

In [None]:
# Compare the raw prediction, the adjusted prediction and their plain average
# ("adjusted_adjusted") against the actual close.
raw_prediction = price_predicted[1:].reshape(-1,)
adjusted_prediction = price_predicted_adjusted.to_numpy()

df = pd.DataFrame({
    "predicted": raw_prediction,
    "predicted_adjusted": adjusted_prediction,
    "actual": actual_price[1:].to_numpy(),
})
price_predicted_adjusted_adjusted = (adjusted_prediction.reshape(-1,) + raw_prediction)/2
df["adjusted_adjusted"] = price_predicted_adjusted_adjusted
df

In [None]:
from sklearn.metrics import r2_score
from sklearn.model_selection import TimeSeriesSplit, cross_val_score
from sklearn.linear_model import LinearRegression

In [None]:
# Linear-regression baseline: predict the NEXT row's 1-row log return from the
# current row's features.
#
# * shift(-1) pulls the following row's value up onto the current row. shift(1)
#   would attach the PREVIOUS row's return instead, which is not a forecast target.
# * The target is added to a derived frame via assign(), so the shared `data` table
#   keeps exactly the columns the MinMaxScaler was fitted on.
# * The estimator is bound to `linreg_model`, leaving `model` (the trained LSTM)
#   available to the prediction cells further down.
not_necessary_cols = ['Log_Return', '7d_log_return']

baseline_frame = data.assign(next_log_return=data['Log_Return'].shift(-1))
# dropna() removes the final row, whose next-row return is unknown.
data_train = baseline_frame.drop(columns=not_necessary_cols).dropna()

# Chronological split: the last 500 rows are held out.
X_train = data_train.drop(columns='next_log_return')[:-500]
y_train = data_train['next_log_return'][:-500]

X_test = data_train.drop(columns='next_log_return')[-500:]
y_test = data_train['next_log_return'][-500:]

linreg_model = LinearRegression()
linreg_model.fit(X_train, y_train)

y_pred = linreg_model.predict(X_test)

print(r2_score(y_test, y_pred))


In [None]:
# Inspect the predictions produced by the linear-regression cell above.
y_pred

# Predicting the next day's price

TODO: think about the scaling (see the weird "divide by 2" note above). Use the `price_predicted_adjusted` series constructed above.

In [None]:
from datetime import datetime, timedelta
def create_last_two_sequences(data, seq_length):
    """Stack the two most recent windows of ``seq_length`` rows.

    Element 0 is the window that ends one row before the end of ``data``;
    element 1 is the window that ends on the final row.

    Parameters
    ----------
    data : sliceable sequence of rows (e.g. a 2-D NumPy array)
    seq_length : int
        Number of rows per window.

    Returns
    -------
    numpy.ndarray
        The two windows stacked along a new leading axis.
    """
    previous_window = data[-seq_length - 1:-1]
    latest_window = data[-seq_length:]
    return np.array([previous_window, latest_window])

# Fail early with a clear message if `model` is not the trained network.
if not isinstance(model, nn.Module):
    raise TypeError("`model` is not the trained LSTM; re-run the LSTM cells before predicting")

# Scale the complete table with the scaler fitted on the training split and take
# the two most recent windows, shape (2, SEQ_LENGTH, n_features).
data_test = scaler.transform(data)
n_features = data_test.shape[1]
latest_windows = create_last_two_sequences(data_test, SEQ_LENGTH)

input_tensor = torch.tensor(latest_windows, dtype=torch.float32).to(device)
model.eval()
with torch.no_grad():
    # squeeze(-1): (2, 1) -> (2,)
    output = model(input_tensor).squeeze(-1).cpu().numpy()

# The scaler inverts whole rows, so the two predictions are placed in the last
# (target) column of an otherwise zero-filled block; only that column is read back.
# np.concatenate is used because the np.concat alias only exists in NumPy >= 2.0.
padded_output = np.concatenate([np.zeros((2, n_features - 1)), output.reshape(2, 1)], axis=1)
res1 = scaler.inverse_transform(padded_output)

# A window predicts the target of the row that follows it:
#   res1[0, -1] -> 7-row log return of the last observed row (window ends one row earlier)
#   res1[1, -1] -> 7-row log return of the next, not yet observed row
# The next row's return is relative to the close 7 rows before it, i.e. the 7th row
# from the end of the table, so the second prediction pairs with Close.iloc[-7].
next_log_return = res1[1, -1]
base_close = data['Close'].iloc[-7]
# Calendar day after the last row; weekends and holidays are not skipped.
next_date = (data.index[-1].date() + timedelta(days=1)).strftime('%Y-%m-%d')

print(f"The predicted next 7-day log return is {next_log_return:.4f}")
print(f"The predicted {next_date} SRW Wheat Future Close Price is {np.exp(next_log_return) * base_close:.4f}")

In [None]:
# Last column of the inverse-transformed block: the two predicted 7-row log returns.
res1[:,-1]

In [None]:
# Each predicted return applied to its own base close: the first prediction pairs with
# the 8th-from-last close, the second with the 7th-from-last.
np.exp(res1[:, -1]) * data.Close[-8:-6]

In [None]:
# Debug view of the zero-padded block passed to scaler.inverse_transform.
# np.concatenate replaces the NumPy >= 2.0-only alias np.concat, and the padding
# width comes from the fitted scaler instead of a hard-coded 127.
np.concatenate([np.zeros((2, scaler.n_features_in_ - 1)), output.reshape(2, 1)], axis=1)

In [None]:
# Debug view: the (2, 127) zero block used as padding for the non-target columns.
np.zeros(2*127).reshape(2, 127)

In [None]:
# Debug view: the two raw (still scaled) model outputs as a column vector.
output.reshape(2,1)