In [37]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.impute import SimpleImputer
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, Dense, Flatten
from tensorflow.keras.optimizers import Adam

# Load the dataset
data_path = 'train_data.csv'  # Update this path
data = pd.read_csv(data_path)

In [38]:
data.head()

Unnamed: 0,index,lat,lon,startdate,contest-pevpr-sfc-gauss-14d__pevpr,nmme0-tmp2m-34w__cancm30,nmme0-tmp2m-34w__cancm40,nmme0-tmp2m-34w__ccsm30,nmme0-tmp2m-34w__ccsm40,nmme0-tmp2m-34w__cfsv20,...,wind-vwnd-925-2010-11,wind-vwnd-925-2010-12,wind-vwnd-925-2010-13,wind-vwnd-925-2010-14,wind-vwnd-925-2010-15,wind-vwnd-925-2010-16,wind-vwnd-925-2010-17,wind-vwnd-925-2010-18,wind-vwnd-925-2010-19,wind-vwnd-925-2010-20
0,0,0.0,0.833333,9/1/14,237.0,29.02,31.64,29.57,30.73,29.71,...,-27.68,-37.21,8.32,9.56,-2.03,48.13,28.09,-13.5,11.9,4.58
1,1,0.0,0.833333,9/2/14,228.9,29.02,31.64,29.57,30.73,29.71,...,-21.13,-36.57,8.77,21.17,4.44,48.6,27.41,-23.77,15.44,3.42
2,2,0.0,0.833333,9/3/14,220.69,29.02,31.64,29.57,30.73,29.71,...,-10.72,-34.16,6.99,32.16,5.01,48.53,19.21,-33.16,15.11,4.82
3,3,0.0,0.833333,9/4/14,225.28,29.02,31.64,29.57,30.73,29.71,...,0.33,-31.04,6.17,39.66,-1.41,50.59,8.29,-37.22,18.24,9.74
4,4,0.0,0.833333,9/5/14,237.24,29.02,31.64,29.57,30.73,29.71,...,9.83,-31.8,7.47,38.62,-5.21,54.73,-2.58,-42.3,21.91,10.95


In [39]:
# Check for missing values in the dataset
missing_values = data.isnull().sum()
print("Missing values in each column:\n", missing_values)

Missing values in each column:
 index                                 0
lat                                   0
lon                                   0
startdate                             0
contest-pevpr-sfc-gauss-14d__pevpr    0
                                     ..
wind-vwnd-925-2010-16                 0
wind-vwnd-925-2010-17                 0
wind-vwnd-925-2010-18                 0
wind-vwnd-925-2010-19                 0
wind-vwnd-925-2010-20                 0
Length: 246, dtype: int64


In [40]:
# show the data types of the columns
data.dtypes

index                                   int64
lat                                   float64
lon                                   float64
startdate                              object
contest-pevpr-sfc-gauss-14d__pevpr    float64
                                       ...   
wind-vwnd-925-2010-16                 float64
wind-vwnd-925-2010-17                 float64
wind-vwnd-925-2010-18                 float64
wind-vwnd-925-2010-19                 float64
wind-vwnd-925-2010-20                 float64
Length: 246, dtype: object

In [41]:
# Convert 'startdate' to datetime format
data['startdate'] = pd.to_datetime(data['startdate'])

  data['startdate'] = pd.to_datetime(data['startdate'])


In [42]:
data.dtypes

index                                          int64
lat                                          float64
lon                                          float64
startdate                             datetime64[ns]
contest-pevpr-sfc-gauss-14d__pevpr           float64
                                           ...      
wind-vwnd-925-2010-16                        float64
wind-vwnd-925-2010-17                        float64
wind-vwnd-925-2010-18                        float64
wind-vwnd-925-2010-19                        float64
wind-vwnd-925-2010-20                        float64
Length: 246, dtype: object

In [43]:
# Assuming 'temp_day' and 'temp_night' are the average daytime and nighttime temperatures from your dataset
data['day_temp_optimal'] = data.apply(lambda x: 1 if 70 <= x['nmme0-tmp2m-34w__cancm30'] <= 80 else 0, axis=1)
data['night_temp_optimal_foliage'] = data.apply(lambda x: 1 if 60 <= x['nmme0-tmp2m-34w__ccsm30'] <= 68 else 0, axis=1)
data['night_temp_optimal_flowering'] = data.apply(lambda x: 1 if 55 <= x['nmme0-tmp2m-34w__ccsm30'] <= 60 else 0, axis=1)

# Assuming foliage plants are our  primary concern
data['overall_temp_optimal'] = data['day_temp_optimal'] & data['night_temp_optimal_foliage']

In [44]:
data.head()

Unnamed: 0,index,lat,lon,startdate,contest-pevpr-sfc-gauss-14d__pevpr,nmme0-tmp2m-34w__cancm30,nmme0-tmp2m-34w__cancm40,nmme0-tmp2m-34w__ccsm30,nmme0-tmp2m-34w__ccsm40,nmme0-tmp2m-34w__cfsv20,...,wind-vwnd-925-2010-15,wind-vwnd-925-2010-16,wind-vwnd-925-2010-17,wind-vwnd-925-2010-18,wind-vwnd-925-2010-19,wind-vwnd-925-2010-20,day_temp_optimal,night_temp_optimal_foliage,night_temp_optimal_flowering,overall_temp_optimal
0,0,0.0,0.833333,2014-09-01,237.0,29.02,31.64,29.57,30.73,29.71,...,-2.03,48.13,28.09,-13.5,11.9,4.58,0,0,0,0
1,1,0.0,0.833333,2014-09-02,228.9,29.02,31.64,29.57,30.73,29.71,...,4.44,48.6,27.41,-23.77,15.44,3.42,0,0,0,0
2,2,0.0,0.833333,2014-09-03,220.69,29.02,31.64,29.57,30.73,29.71,...,5.01,48.53,19.21,-33.16,15.11,4.82,0,0,0,0
3,3,0.0,0.833333,2014-09-04,225.28,29.02,31.64,29.57,30.73,29.71,...,-1.41,50.59,8.29,-37.22,18.24,9.74,0,0,0,0
4,4,0.0,0.833333,2014-09-05,237.24,29.02,31.64,29.57,30.73,29.71,...,-5.21,54.73,-2.58,-42.3,21.91,10.95,0,0,0,0


In [45]:
# Assuming the presence of columns for average wind speed ('wind_avg') and the optimal range flags from earlier
import numpy as np

# Temperature Stress Indicator: Difference from optimal range boundaries
data['temp_stress'] = np.where(data['day_temp_optimal'] == 0, abs(data[['nmme0-tmp2m-34w__cancm30', 'nmme0-tmp2m-34w__ccsm30']].mean(axis=1) - 75), 0)

# Wind Stress Indicator: High wind speeds (arbitrarily defining high wind speed as > 20 units)
data['wind_stress'] = np.where(data['wind-vwnd-925-2010-1'] > 20, 1, 0)  # This may need to refined based on more data from plant stress

# Combine into a composite plant stress indicator (simple example, more sophisticated methods could be applied)
data['plant_stress_indicator'] = data['temp_stress'] + data['wind_stress']

In [46]:
# Define the columns we need for the models
required_columns = [
    'nmme0-tmp2m-34w__cancm30', 'nmme0-tmp2m-34w__ccsm30',  # Temperature Forecasts
    'contest-pevpr-sfc-gauss-14d__pevpr',  # Potential Evaporation Rate
    'wind-vwnd-925-2010-1',  # Example wind speed column
    'startdate',  # Date/Time Information
    # Derived target variables
    'day_temp_optimal', 'night_temp_optimal_foliage', 'night_temp_optimal_flowering',
    'overall_temp_optimal', 'temp_stress', 'wind_stress', 'plant_stress_indicator'
]

# Filter the dataset to only include these columns
filtered_data = data[required_columns].copy()

# Display the first few rows to verify the filtering
print(filtered_data.head())

   nmme0-tmp2m-34w__cancm30  nmme0-tmp2m-34w__ccsm30  \
0                     29.02                    29.57   
1                     29.02                    29.57   
2                     29.02                    29.57   
3                     29.02                    29.57   
4                     29.02                    29.57   

   contest-pevpr-sfc-gauss-14d__pevpr  wind-vwnd-925-2010-1  startdate  \
0                              237.00               -107.46 2014-09-01   
1                              228.90               -105.73 2014-09-02   
2                              220.69               -102.51 2014-09-03   
3                              225.28                -96.11 2014-09-04   
4                              237.24                -89.19 2014-09-05   

   day_temp_optimal  night_temp_optimal_foliage  night_temp_optimal_flowering  \
0                 0                           0                             0   
1                 0                           0         

In [47]:
# Save the filtered dataset to a CSV file
filtered_data.to_csv('filtered_ecosmart_pot_data.csv', index=False)

In [48]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Load the preprocessed dataset
filtered_data = pd.read_csv('filtered_ecosmart_pot_data.csv')

# Define the subset size
subset_size = 1000

# Take a random sample of the data
subset_data = data.sample(n=subset_size, random_state=42)

# Define the rolling window size
window_size = 7

# Initialize lists to store evaluation metrics
accuracy_scores = []
precision_scores = []
recall_scores = []
f1_scores = []
confusion_matrices = []

# Iterate over the subset with the rolling window
for i in range(len(subset_data) - window_size + 1):
    # Extract data for the current rolling window
    window_data = subset_data.iloc[i:i+window_size]
    
    # Extract features and target variable for the current window
    X_train = window_data[['nmme0-tmp2m-34w__cancm30', 'nmme0-tmp2m-34w__ccsm30', 
                           'contest-pevpr-sfc-gauss-14d__pevpr', 'wind-vwnd-925-2010-1']]
    y_train = window_data['overall_temp_optimal']

    # Initialize the imputer with strategy 'mean'
    imputer = SimpleImputer(strategy='mean')

    # Fit the imputer on X_train
    imputer.fit(X_train)

    # Transform X_train with the imputer
    X_train_imputed = imputer.transform(X_train)
    
    # Initialize and train the Random Forest model
    model = RandomForestClassifier()
    model.fit(X_train_imputed, y_train)
    
    # Predict the target variable for the current window
    predictions = model.predict(X_train_imputed)
    
    # Calculate evaluation metrics
    accuracy = accuracy_score(y_train, predictions)
    precision = precision_score(y_train, predictions)
    recall = recall_score(y_train, predictions)
    f1 = f1_score(y_train, predictions)
    confusion = confusion_matrix(y_train, predictions)
    
    # Store the evaluation metrics
    accuracy_scores.append(accuracy)
    precision_scores.append(precision)
    recall_scores.append(recall)
    f1_scores.append(f1)
    confusion_matrices.append(confusion)

# Print the evaluation metrics
print("Accuracy:", accuracy_scores)
print("Precision:", precision_scores)
print("Recall:", recall_scores)
print("F1 Score:", f1_scores)
print("Confusion Matrix:", confusion_matrices)


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_

Accuracy: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))


In [49]:
rolling_window_results = pd.DataFrame({
    'Accuracy': accuracy_scores,
    'Precision': precision_scores,
    'Recall': recall_scores,
    'F1 Score': f1_scores,
    'Confusion Matrix': confusion_matrices
})

# Save the DataFrame to a CSV file
rolling_window_results.to_csv('rolling_window_results.csv', index=False)