In [29]:
import numpy as np
import pandas as pd
import joblib

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.pipeline import Pipeline

In [30]:
# Load the modelling table. Paths are relative to the notebook's directory.
data_path = '../data/soybeans_model_input.csv'
data_df = pd.read_csv(data_path, low_memory=False)

# Feature columns (model inputs) and the three price targets predicted jointly.
parameters = ['STU, US', 'STU, AR', 'STU, BR', 'STU, Corn', 'Gold', 'DX', 'Crude', 'GDP (Bn USD)']
targets = ['Price_High', 'Price_Low', 'Price_Average']

# Keep only the modelling columns and drop every row with a missing feature or target.
combined_data = data_df[parameters + targets].dropna()

# Split the data into features (X) and targets (y)
X = combined_data[parameters]
y = combined_data[targets]

# Hold out 20% of the rows for the final evaluation; the seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scaling lives inside the pipeline, so it is re-fitted on each CV training fold
# and is applied automatically whenever the saved model predicts.
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('regressor', RandomForestRegressor(random_state=42))
])

# Hyperparameter grid; the `regressor__` prefix routes each entry to the pipeline step.
param_grid = {
    'regressor__n_estimators': [100, 200, 300, 500],
    'regressor__max_depth': [None, 10, 20, 30, 40],
    'regressor__min_samples_split': [2, 5, 10],
    'regressor__min_samples_leaf': [1, 2, 4],
    'regressor__max_features': [None, 'sqrt', 'log2'],
    'regressor__bootstrap': [True, False]
}

# Exhaustive search: 4 * 5 * 3 * 3 * 3 * 2 = 1080 candidates x 5 folds = 5400 fits.
# It is run exactly once: the search is deterministic (fixed random_state), so
# fitting it a second time only doubles the runtime without changing the result.
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1, verbose=1)
grid_search.fit(X_train, y_train)

# Best pipeline found by the search (GridSearchCV refits it on all of X_train by default).
best_model = grid_search.best_estimator_

# Print the best parameters
print("Best parameters found by GridSearchCV:")
print(grid_search.best_params_)

# Persist the whole pipeline (scaler + forest) so inference needs no separate scaler.
model_filename = './soybeans_model_v1.pkl'
joblib.dump(best_model, model_filename)
print(f"Model saved to {model_filename}")

# Evaluate on the held-out split; both metrics are aggregated over the three targets.
y_pred = best_model.predict(X_test)
mse_test = mean_squared_error(y_test, y_pred)
r2_test = r2_score(y_test, y_pred)
print(f"Test MSE: {mse_test:.4f}")
print(f"Test R²: {r2_test:.4f}")


Fitting 5 folds for each of 1080 candidates, totalling 5400 fits
Best parameters found by GridSearchCV:
{'regressor__bootstrap': False, 'regressor__max_depth': 10, 'regressor__max_features': 'sqrt', 'regressor__min_samples_leaf': 1, 'regressor__min_samples_split': 2, 'regressor__n_estimators': 200}
Model saved to ./soybeans_model_v1.pkl
Test MSE: 8521.6602
Test R²: 0.9080


'regressor__bootstrap': False,
'regressor__max_depth': 10,
'regressor__max_features': 'sqrt',
'regressor__min_samples_leaf': 1,
'regressor__min_samples_split': 2,
'regressor__n_estimators': 200


In [35]:
# Prediction function
def predict_prices(input, model):
    """Predict the high, low and average price for a single observation.

    Parameters
    ----------
    input : pandas.DataFrame or array-like
        Either a one-row DataFrame or a flat sequence/array holding the
        feature values of one observation. A DataFrame is handed to the model
        unchanged so its column names are preserved; a model fitted on a
        DataFrame can then match features by name instead of emitting a
        "X does not have valid feature names" warning. Anything else is
        reshaped to a single row of shape (1, n_features).
        (The parameter name shadows the builtin ``input``; it is kept so
        existing keyword callers keep working.)
    model : object
        Any fitted estimator whose ``predict`` returns one row of three
        values per sample.

    Returns
    -------
    tuple
        ``(high_price, low_price, avg_price)`` taken from the first (only)
        predicted row.

    Raises
    ------
    ValueError
        If ``input`` is a DataFrame that does not contain exactly one row.
    """
    if isinstance(input, pd.DataFrame):
        if len(input) != 1:
            raise ValueError(
                f"predict_prices expects exactly one row, got {len(input)}"
            )
        features = input
    else:
        features = np.array(input).reshape(1, -1)

    prediction = model.predict(features)
    high_price, low_price, avg_price = prediction[0]

    return high_price, low_price, avg_price

In [36]:
# Input files for the current forecast (paths relative to the notebook's directory).
wasde_path = '../data/input_data/wasde0824.csv'
gold_path = '../data/input_data/gcy.csv'
dx_path = '../data/input_data/dxy.csv'
crude_path = '../data/input_data/qay.csv'

wasde_data = pd.read_csv(wasde_path)
gold_data = pd.read_csv(gold_path)
dx_data = pd.read_csv(dx_path)
crude_data = pd.read_csv(crude_path)


def stocks_to_use_ratio(report_rows, region):
    """Return 'Ending Stocks' / 'Domestic Total' for `region` in `report_rows`.

    The first matching row of each attribute is used. Raises ValueError with a
    descriptive message when an attribute is missing or the use figure is zero,
    so a bad input file fails here rather than feeding a missing value to the model.
    """
    region_rows = report_rows[report_rows['Region'] == region]
    stock = region_rows.loc[region_rows['Attribute'] == 'Ending Stocks', 'Value']
    use = region_rows.loc[region_rows['Attribute'] == 'Domestic Total', 'Value']
    if stock.empty or use.empty:
        raise ValueError(
            f"No 'Ending Stocks' / 'Domestic Total' rows found for region {region!r}"
        )
    if use.iloc[0] == 0:
        raise ValueError(
            f"'Domestic Total' is zero for region {region!r}; stocks-to-use is undefined"
        )
    return stock.iloc[0] / use.iloc[0]


# Projected US corn balance sheet rows.
corn_data = wasde_data[
    (wasde_data['ReportTitle'] == 'World Corn Supply and Use') &
    (wasde_data['Region'] == 'United States') &
    (wasde_data['ProjEstFlag'] == 'Proj.')
]
stu_corn = stocks_to_use_ratio(corn_data, 'United States')

# Projected soybean balance sheet rows for all regions.
soybean_data = wasde_data[
    (wasde_data['ReportTitle'] == 'World Soybean Supply and Use') &
    (wasde_data['ProjEstFlag'] == 'Proj.')
]
stu_us = stocks_to_use_ratio(soybean_data, 'United States')
stu_ar = stocks_to_use_ratio(soybean_data, 'Argentina')
stu_br = stocks_to_use_ratio(soybean_data, 'Brazil')

# First row's 'Open' of each price file, taken by position rather than by index label.
# NOTE(review): the printed labels call this "Today's Open", which presumes the
# files are sorted newest-first - confirm against the data source.
gold_curr = gold_data['Open'].iloc[0]
dx_curr = dx_data['Open'].iloc[0]
crude_curr = crude_data['Open'].iloc[0]

# Mean 'Open' over the first 15 rows of each price file.
gold_avg = gold_data['Open'].head(15).mean()
dx_avg = dx_data['Open'].head(15).mean()
crude_avg = crude_data['Open'].head(15).mean()

# Hard-coded value for the 'GDP (Bn USD)' feature.
# NOTE(review): source and vintage of this figure are not recorded - confirm.
gdp = 28630.0

# The saved object is the full pipeline (scaler + regressor), so inputs are
# passed unscaled; no separate scaler.transform step is needed.
# joblib.load unpickles arbitrary code: only load model files you produced yourself.
model = joblib.load('./soybeans_model_v1.pkl')
parameters = ['STU, US', 'STU, AR', 'STU, BR', 'STU, Corn', 'Gold', 'DX', 'Crude', 'GDP (Bn USD)']

# One-row DataFrames with the training column names, in the training column order.
input_avg = pd.DataFrame(
    [[stu_us, stu_ar, stu_br, stu_corn, gold_avg, dx_avg, crude_avg, gdp]],
    columns=parameters,
)
input_curr = pd.DataFrame(
    [[stu_us, stu_ar, stu_br, stu_corn, gold_curr, dx_curr, crude_curr, gdp]],
    columns=parameters,
)

high_curr, low_curr, avg_curr = predict_prices(input_curr, model)
high, low, avg = predict_prices(input_avg, model)

print(f"Predicted High Price: {high}")
print(f"Predicted Low Price: {low}")
print(f"Predicted Average Price: {avg}")

print(f"Predicted High Price w/ Today's Open: {high_curr}")
print(f"Predicted Low Price w/ Today's Open: {low_curr}")
print(f"Predicted Average Price w/ Today's Open: {avg_curr}")

Predicted High Price: 1163.7844791958448
Predicted Low Price: 1093.4929920634922
Predicted Average Price: 1128.3738035913673
Predicted High Price w/ Today's Open: 1152.7626979166664
Predicted Low Price w/ Today's Open: 1087.40375
Predicted Average Price w/ Today's Open: 1118.1024522569448


