# The Future of Harveston: Predicting Nature's Shifts

## Read Data

In [40]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# sMAPE
def smape(y_true, y_pred):
    return 100/len(y_true) * np.sum(2 * np.abs(y_pred - y_true) / (np.abs(y_true) + np.abs(y_pred) + 1e-8))

# Load datasets
train = pd.read_csv('train.csv', skip_blank_lines=True)
test = pd.read_csv('test.csv',skip_blank_lines=True)
print('Training and test data loaded.')
train.head(4)

Training and test data loaded.


Unnamed: 0,ID,Year,Month,Day,kingdom,latitude,longitude,Avg_Temperature,Avg_Feels_Like_Temperature,Temperature_Range,Feels_Like_Temperature_Range,Radiation,Rain_Amount,Rain_Duration,Wind_Speed,Wind_Direction,Evapotranspiration
0,1,1,4,1,Arcadia,24.280002,-37.22998,25.5,30.5,8.5,10.3,22.52,58.89,16,8.6,283,1.648659
1,2,1,4,1,Atlantis,22.979999,-37.32999,299.65,305.15,5.9,8.2,22.73,11.83,12,15.8,161,1.583094
2,3,1,4,1,Avalon,22.88,-37.130006,26.3,31.5,5.2,6.4,22.73,11.83,12,15.8,161,1.593309
3,4,1,4,1,Camelot,24.180003,-36.929994,24.0,28.4,8.2,10.7,22.67,75.27,16,6.4,346,1.638997


In [19]:
test.head(4)

Unnamed: 0,ID,Year,Month,Day,kingdom
0,84961,9,1,1,Arcadia
1,84962,9,1,1,Atlantis
2,84963,9,1,1,Avalon
3,84964,9,1,1,Camelot


## Data Preprocessing and Feature Engineering

In [44]:
#forward fill to handle initial missing values
train.ffill(inplace=True)
train['Year'] = train['Year'].apply(lambda x: x if x > 1000 else x + 2000)
train['date'] = pd.to_datetime(
    train['Year'].astype(str).str.zfill(4) + '-' +
    train['Month'].astype(str).str.zfill(2) + '-' +
    train['Day'].astype(str).str.zfill(2),
    errors='coerce'
)
for lag in [1, 2, 3]:
    train[f'Avg_Temperature_lag{lag}'] = train['Avg_Temperature'].shift(lag)
train.bfill(inplace=True)

print('Preprocessing and feature engineering completed.')


Preprocessing and feature engineering completed.


## Model Training

We define our target variables and use a RandomForestRegressor as an example regression model for each target.

In [46]:
target_vars = ['Avg_Temperature', 'Radiation', 'Rain_Amount', 'Wind_Speed', 'Wind_Direction']

models = {}
features = [col for col in train.columns if col not in ['ID', 'Year', 'Month', 'Day', 'kingdom', 'date'] + target_vars]

for target in target_vars:
    X = train[features]
    y = train[target]
    model = RandomForestRegressor(n_estimators=100, random_state=42)
    model.fit(X, y)
    models[target] = model
    print(f'Model trained for {target}')

Model trained for Avg_Temperature
Model trained for Radiation
Model trained for Rain_Amount
Model trained for Wind_Speed
Model trained for Wind_Direction


## Testing and Evaluation

Here we preprocess the test dataset similarly, perform predictions for each target, and compute sMAPE (for demonstration we use training data evaluation since true test targets are not available).

In [47]:
test.ffill(inplace=True)
test['Year'] = test['Year'].apply(lambda x: x if x > 1000 else x + 2000)
test['date'] = pd.to_datetime(
    test['Year'].astype(str).str.zfill(4) + '-' +
    test['Month'].astype(str).str.zfill(2) + '-' +
    test['Day'].astype(str).str.zfill(2),
    errors='coerce'
)
test_processed_list = []

for kingdom in test['kingdom'].unique():
    train_tail = train[train['kingdom'] == kingdom].tail(3)
    test_kingdom = test[test['kingdom'] == kingdom].copy()
    combined = pd.concat([train_tail, test_kingdom], ignore_index=True)

    for lag in [1, 2, 3]:
        combined[f'Avg_Temperature_lag{lag}'] = combined['Avg_Temperature'].shift(lag)

    test_combined = combined.iloc[len(train_tail):].copy()
    test_processed_list.append(test_combined)

test_processed = pd.concat(test_processed_list, ignore_index=True)

test_processed.bfill(inplace=True)

results = {}
for target in target_vars:
    X_test = test_processed[features]
    preds = models[target].predict(X_test)
    results[target] = preds
    train_preds = models[target].predict(train[features])
    score = smape(train[target].values, train_preds)
    print(f'sMAPE for {target}: {score:.2f}')


sMAPE for Avg_Temperature: 0.17
sMAPE for Radiation: 0.95
sMAPE for Rain_Amount: 15.17
sMAPE for Wind_Speed: 4.98
sMAPE for Wind_Direction: 11.02


## Submission File Creation

In [49]:
submission = test[['ID']].copy()
for target in target_vars:
    submission[target] = results[target]

submission.to_csv('submission.csv', index=False)
print('Submission file created as submission.csv')

AttributeError: 'RandomForestRegressor' object has no attribute 'save'