In [37]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.metrics import mean_squared_error
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
import time
import matplotlib.pyplot as plt

In [38]:
if torch.cuda.is_available():
    device = torch.device("cuda")
    print(f"GPU is available and will be used: {torch.cuda.get_device_name(0)}")
else:
    device = torch.device("cpu")
    print("GPU not found. The model will run on the CPU.")

GPU is available and will be used: NVIDIA GeForce RTX 4060


In [39]:
df = pd.read_csv('../Weather_Merged_CSVs/Banana.csv')
df.head()

Unnamed: 0,District Name,Market Name,Commodity,Variety,Grade,Min Price (Rs./Quintal),Max Price (Rs./Quintal),Modal Price (Rs./Quintal),Price Date,Day Of Week,lookback_temp_mean,lookback_precip_sum
0,Dharmapuri,AJattihalli(Farmers Market),Banana,Besrai,Local,4000.0,4500.0,4500.0,2025-06-18,2,27.779167,258.2
1,Dharmapuri,AJattihalli(Farmers Market),Banana,Besrai,Local,4000.0,4500.0,4500.0,2024-12-26,3,25.363333,578.0
2,Dharmapuri,AJattihalli(Farmers Market),Banana,Besrai,Local,4500.0,5000.0,5000.0,2025-06-01,6,27.2775,217.9
3,Dharmapuri,AJattihalli(Farmers Market),Banana,Besrai,Local,4500.0,5000.0,5000.0,2025-06-17,1,27.753333,256.6
4,Dharmapuri,AJattihalli(Farmers Market),Banana,Besrai,Local,4500.0,5000.0,5000.0,2024-11-18,0,26.224167,542.6


In [40]:
# Time Series Analysis 
df['Price Date'] = pd.to_datetime(df['Price Date'])
df.sort_values('Price Date', inplace=True)
df.set_index('Price Date', inplace=True)
df.head()

Unnamed: 0_level_0,District Name,Market Name,Commodity,Variety,Grade,Min Price (Rs./Quintal),Max Price (Rs./Quintal),Modal Price (Rs./Quintal),Day Of Week,lookback_temp_mean,lookback_precip_sum
Price Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2016-04-30,Coimbatore,Sulur,Banana,Other,Large,1200.0,1400.0,1300.0,5,27.2075,12.8
2018-05-07,Coimbatore,Sulur,Banana,Other,Large,3200.0,3400.0,3300.0,0,27.240833,117.6
2018-09-20,Theni,Chinnamanur,Banana,Poovan,Medium,1300.0,1600.0,1450.0,3,26.824167,309.3
2018-09-27,Theni,Chinnamanur,Banana,Poovan,Medium,1300.0,1600.0,1450.0,3,26.780833,359.3
2018-10-12,Theni,Chinnamanur,Banana,Other,Medium,1200.0,1400.0,1300.0,4,26.440833,505.7


In [41]:
categorical_cols = ['District Name', 'Market Name', 'Commodity', 'Variety', 'Grade']
for col in categorical_cols:
    encoder = LabelEncoder()
    # Fit the encoder to the column and transform the text to numbers (e.g., 'Coimbatore', 'Theni' -> 0, 1)
    df[col] = encoder.fit_transform(df[col])
df.head()

Unnamed: 0_level_0,District Name,Market Name,Commodity,Variety,Grade,Min Price (Rs./Quintal),Max Price (Rs./Quintal),Modal Price (Rs./Quintal),Day Of Week,lookback_temp_mean,lookback_precip_sum
Price Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2016-04-30,2,191,0,1,0,1200.0,1400.0,1300.0,5,27.2075,12.8
2018-05-07,2,191,0,1,0,3200.0,3400.0,3300.0,0,27.240833,117.6
2018-09-20,24,32,0,2,2,1300.0,1600.0,1450.0,3,26.824167,309.3
2018-09-27,24,32,0,2,2,1300.0,1600.0,1450.0,3,26.780833,359.3
2018-10-12,24,32,0,1,2,1200.0,1400.0,1300.0,4,26.440833,505.7


In [42]:
# Scales values between 0 and 1 for each column
scaler = MinMaxScaler(feature_range=(0, 1))
scaled_data = scaler.fit_transform(df)
df_scaled = pd.DataFrame(scaled_data, columns=df.columns)
df_scaled.head()

Unnamed: 0,District Name,Market Name,Commodity,Variety,Grade,Min Price (Rs./Quintal),Max Price (Rs./Quintal),Modal Price (Rs./Quintal),Day Of Week,lookback_temp_mean,lookback_precip_sum
0,0.057143,0.795833,0.0,0.333333,0.0,0.047428,0.013803,0.012803,0.833333,0.75952,0.0
1,0.057143,0.795833,0.0,0.333333,0.0,0.127476,0.033807,0.032807,0.0,0.761455,0.083049
2,0.685714,0.133333,0.0,0.666667,0.666667,0.051431,0.015803,0.014303,0.5,0.737262,0.234963
3,0.685714,0.133333,0.0,0.666667,0.666667,0.051431,0.015803,0.014303,0.5,0.734746,0.274586
4,0.685714,0.133333,0.0,0.333333,0.666667,0.047428,0.013803,0.012803,0.666667,0.715005,0.390601


In [43]:
def create_sequences(data, n_steps, target_column):
    X, y = [], []
    for i in range(len(data) - n_steps):
        X.append(data.iloc[i:(i + n_steps)].values)
        y.append(data.iloc[i + n_steps][target_column])
    return np.array(X), np.array(y).reshape(-1, 1)

In [44]:
class GRUModel(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, output_size, dropout_prob):
        super(GRUModel, self).__init__()
        self.gru = nn.GRU(input_size, hidden_size, num_layers, batch_first=True, dropout=dropout_prob)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        out, _ = self.gru(x)
        out = self.fc(out[:, -1, :])
        return out

In [45]:
# Tuning n_steps
n_steps_to_test = [7, 14, 21, 30, 60]
RMSE_results = {}
target_column = 'Modal Price (Rs./Quintal)'

for n_steps in n_steps_to_test:
    print("-" * 50)
    print(f"Testing with n_steps = {n_steps}")
    start_time = time.time()

    # 1. Create sequences for the current n_steps
    X, y = create_sequences(df_scaled, n_steps, target_column)

    # 2. Split data and create PyTorch Tensors
    split = int(0.8 * len(X))
    X_train, X_test = X[:split], X[split:]
    y_train, y_test = y[:split], y[split:]

    X_train_tensor = torch.from_numpy(X_train).float()
    y_train_tensor = torch.from_numpy(y_train).float()
    X_test_tensor = torch.from_numpy(X_test).float()
    y_test_tensor = torch.from_numpy(y_test).float()

    # 3. Create DataLoaders
    train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
    test_dataset = TensorDataset(X_test_tensor, y_test_tensor)
    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

    # 4. Initialize the Model, Loss, and Optimizer
    input_size = X_train.shape[2]
    model = GRUModel(input_size, hidden_size=50, num_layers=2, output_size=1, dropout_prob=0.2).to(device)
    criterion = nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

    # 5. Train the Model
    num_epochs = 50
    for epoch in range(num_epochs):
        model.train()
        for batch_X, batch_y in train_loader:
            batch_X, batch_y = batch_X.to(device), batch_y.to(device)
            outputs = model(batch_X)
            loss = criterion(outputs, batch_y)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

    # 6. Make Predictions
    model.eval()
    all_predictions = []
    with torch.no_grad():
        for batch_X, _ in test_loader:
            batch_X = batch_X.to(device)
            outputs = model(batch_X)
            all_predictions.append(outputs.cpu().numpy())
    predictions = np.concatenate(all_predictions)

    # 7. Inverse Transform to Get Actual Price Values
    price_col_index = df_scaled.columns.get_loc(target_column)
    
    # Inverse predictions
    dummy_pred = np.zeros((len(predictions), df_scaled.shape[1]))
    dummy_pred[:, price_col_index] = predictions.flatten()
    inversed_predictions = scaler.inverse_transform(dummy_pred)[:, price_col_index]
    
    # Inverse actual values
    dummy_actual = np.zeros((len(y_test), df_scaled.shape[1]))
    dummy_actual[:, price_col_index] = y_test.flatten()
    inversed_actual = scaler.inverse_transform(dummy_actual)[:, price_col_index]
    
    # 8. Calculate and Store RMSE
    rmse = np.sqrt(mean_squared_error(inversed_actual, inversed_predictions))
    mean_actual_price = np.mean(inversed_actual)
    nrmse = rmse / mean_actual_price
    RMSE_results[n_steps] = {'rmse': rmse, 'nrmse': nrmse}
    
    end_time = time.time()
    print(f"n_steps = {n_steps} | RMSE = {rmse:.2f} | nRMSE = {nrmse:.4f} | Time = {end_time - start_time:.2f}s")


--------------------------------------------------
Testing with n_steps = 7
n_steps = 7 | RMSE = 1584.31 | nRMSE = 0.2816 | Time = 117.62s
--------------------------------------------------
Testing with n_steps = 14
n_steps = 14 | RMSE = 1613.60 | nRMSE = 0.2868 | Time = 124.54s
--------------------------------------------------
Testing with n_steps = 21
n_steps = 21 | RMSE = 1583.14 | nRMSE = 0.2814 | Time = 120.84s
--------------------------------------------------
Testing with n_steps = 30
n_steps = 30 | RMSE = 1611.71 | nRMSE = 0.2865 | Time = 118.61s
--------------------------------------------------
Testing with n_steps = 60
n_steps = 60 | RMSE = 1584.62 | nRMSE = 0.2817 | Time = 117.19s


In [46]:
print("\n" + "="*60)
print("Hyperparameter Tuning Results:")
print("="*60)
print(f"{'n_steps':<10} | {'RMSE (Rs./Quintal)':<20} | {'nRMSE (%)':<15}")
print("-" * 55)

best_n_steps = -1
best_nrmse = float('inf')

for n_steps, metrics in RMSE_results.items():
    print(f"{n_steps:<10} | {metrics['rmse']:<20.2f} | {metrics['nrmse'] * 100:<15.2f}%")
    if metrics['nrmse'] < best_nrmse:
        best_nrmse = metrics['nrmse']
        best_n_steps = n_steps

print("-" * 55)
print(f"Best n_steps found: {best_n_steps} with an nRMSE of {best_nrmse * 100:.2f}%")
print("="*60)



Hyperparameter Tuning Results:
n_steps    | RMSE (Rs./Quintal)   | nRMSE (%)      
-------------------------------------------------------
7          | 1584.31              | 28.16          %
14         | 1613.60              | 28.68          %
21         | 1583.14              | 28.14          %
30         | 1611.71              | 28.65          %
60         | 1584.62              | 28.17          %
-------------------------------------------------------
Best n_steps found: 21 with an nRMSE of 28.14%
