# **Process for Wheat Yield Environment Data**

This step requires the file `IWIN_Weather_AgERA5_20210211.txt` to be located in the `source_data` folder. Once the file is in place, execute the notebook `1_env_processing.ipynb`.

In [1]:
import csv

def txt_to_csv(input_file, output_file):
    """Copy a comma-delimited text file to a CSV file.

    Every input line is stripped of surrounding whitespace, split on commas
    and written as one row through ``csv.writer``. The input is read
    completely before the output file is opened.

    Args:
        input_file: Path of the comma-delimited text file to read.
        output_file: Path of the CSV file to write. An existing file is
            overwritten, and the parent directory is created when missing.
    """
    import os

    with open(input_file, 'r') as txt_file:
        lines = txt_file.readlines()

    # open(..., 'w') does not create missing directories, so a fresh checkout
    # without the output folder would otherwise fail here.
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    with open(output_file, 'w', newline='') as csv_file:
        csv_writer = csv.writer(csv_file)
        csv_writer.writerows(line.strip().split(',') for line in lines)

if __name__ == "__main__":
    # Source text export of the weather data and the CSV copy to produce.
    input_txt_file = "source_data/IWIN_Weather_AgERA5_20210211.txt"
    output_csv_file = "output/IWIN_Weather_AgERA5_20210211.csv"

    txt_to_csv(input_file=input_txt_file, output_file=output_csv_file)


In [1]:
import pandas as pd

# Load the CSV copy of the full weather table.
df = pd.read_csv('output/IWIN_Weather_AgERA5_20210211.csv')

# Keep only records from 2002 onwards.
# NOTE(review): the output file name says 2003-2021 while the cut-off keeps
# 2002 as well - confirm which one is intended before changing either.
from_2002_onwards = df['Year'].ge(2002)
df_filtered = df.loc[from_2002_onwards]

df_filtered.to_csv('output/IWIN_Weather_AgERA5_2003-2021.csv', index=False)


In [None]:
import pandas as pd

# File paths
weather_data_file = 'output/IWIN_Weather_AgERA5_2003-2021.csv'
merged_data_file = '../1_Pheno/output/AllWithGidDropMissingValuesSowHarFilteredGidLocUnormalNoDuplicated.csv'

weather_data = pd.read_csv(weather_data_file)
merged_data = pd.read_csv(merged_data_file)

# Distinct location numbers that occur in the phenotype table.
unique_locations = merged_data['Loc_no'].drop_duplicates()

# Keep only the weather rows whose location occurs in the phenotype table.
filtered_weather_data = weather_data[weather_data['location'].isin(unique_locations)]

filtered_weather_data.to_csv('output/IWIN_Weather_AgERA5_2003-2021_Trimed.csv', index=False)

# Print a message with the number of rows and unique locations retained.
# The location count is taken from the filtered weather data: counting the
# phenotype locations instead would overstate it whenever some of them have
# no weather rows.
retained_locations = filtered_weather_data['location'].nunique()
print(f"筛选完成！共保留 {len(filtered_weather_data)} 行数据，涉及 {retained_locations} 个唯一地点。")

In [None]:
import pandas as pd
from datetime import datetime, timedelta

weather_data_file = 'output/IWIN_Weather_AgERA5_2003-2021_Trimed.csv'
merged_data_file = '../1_Pheno/output/AllWithGidDropMissingValuesSowHarFilteredGidLocUnormalNoDuplicated.csv'

# Days of weather kept before the sowing date, and the length of one bucket.
PRE_SOWING_DAYS = 14
DAYS_PER_WEEK = 7

# Weather variables exported for every weekly bucket; this order is also the
# order of the generated "Week{k}_{variable}" columns.
selected_weather_columns = [
    'Precipitation [mm]', 'Relative Humidity max [%]',
    'Relative Humidity min [%]', 'Shortwave Radiation [MJ/m2/d]',
    'TemperatureMax [C]', 'TemperatureMin [C]',
    'Vapor Pressure Deficit max [kPa]',
    'Wind Speed 2m [m/s]', 'Wind Speed 10m [m/s]'
]

weather_data = pd.read_csv(weather_data_file)
merged_data = pd.read_csv(merged_data_file)


def combine_date(year, month, day):
    """Build datetimes from separate year, month and day values."""
    return pd.to_datetime({'year': year, 'month': month, 'day': day})


def weekly_weather_features(location_weather, window_start, window_end):
    """Average one location's weather into 7-day buckets over a date window.

    Args:
        location_weather: Weather rows of a single location (with a 'Date'
            column), or None when the location has no weather rows.
        window_start: First date of the window (inclusive).
        window_end: Last date of the window (inclusive).

    Returns:
        A dict mapping "Week{k}_{variable}" to the bucket mean rounded to two
        decimals, or None when no weather row falls inside the window. The
        last bucket is averaged over however many days it contains.
    """
    if location_weather is None:
        return None

    in_window = location_weather[
        (location_weather['Date'] >= window_start) &
        (location_weather['Date'] <= window_end)
    ]
    if in_window.empty:
        return None

    in_window = in_window.sort_values('Date')
    # Bucket 0 starts at window_start; every bucket spans DAYS_PER_WEEK days.
    in_window = in_window.assign(
        WeekIndex=(in_window['Date'] - window_start).dt.days // DAYS_PER_WEEK
    )
    weekly_weather = in_window.groupby('WeekIndex').mean(numeric_only=True).round(2)

    features = {}
    for week_idx, week_data in weekly_weather.iterrows():
        for col in selected_weather_columns:
            features[f"Week{week_idx+1}_{col}"] = week_data[col]
    return features


# Convert sowing and harvest dates to datetime format
merged_data['SowDate'] = combine_date(
    merged_data['SowYear'], merged_data['SowMonth'], merged_data['SowDay']
)
merged_data['HarDate'] = combine_date(
    merged_data['HarYear'], merged_data['HarMonth'], merged_data['HarDay']
)

# Convert weather data date fields to datetime format
weather_data['Date'] = pd.to_datetime(
    weather_data[['Year', 'Month', 'Day']]
)

# Group the weather table by location once, so the loop below never has to
# rescan the whole table for every phenotype row.
weather_by_location = weather_data.groupby('location')
known_locations = set(weather_by_location.groups)

# Rows sharing location, sowing date and harvest date get identical weather
# features, so each distinct combination is computed only once.
features_by_trial = {}

# Iterate through the main dataset
results = []

for _, row in merged_data.iterrows():
    loc_no = row['Loc_no']
    sow_date = row['SowDate']
    har_date = row['HarDate']

    trial_key = (loc_no, sow_date, har_date)
    if trial_key not in features_by_trial:
        location_weather = (
            weather_by_location.get_group(loc_no)
            if loc_no in known_locations else None
        )
        days_before_sow = sow_date - timedelta(days=PRE_SOWING_DAYS)
        features_by_trial[trial_key] = weekly_weather_features(
            location_weather, days_before_sow, har_date
        )

    flattened_data = features_by_trial[trial_key]
    if flattened_data is None:
        # Rows without any weather data are left out of the output file.
        print(f"Warning: No weather data found for Loc_no={loc_no}, SowDate={sow_date}, HarvestDate={har_date}")
        continue

    results.append({**row.to_dict(), **flattened_data})

final_df = pd.DataFrame(results)
final_df.to_csv('output/YieldWeeklyWeather.csv', index=False)

In [8]:
import pandas as pd
import numpy as np
import pickle

data = pd.read_csv('output/YieldWeeklyWeather.csv')

# Number of weather variables stored for every week in the CSV.
FEATURES_PER_WEEK = 9

# Extract all columns starting from the specified column
start_column = "Week1_Precipitation [mm]"
subset_data = data.loc[:, start_column:]

# Fill missing values with 0
subset_data = subset_data.fillna(0)

# Every row has the same number of columns, so a single check covers the whole
# table. Failing here is clearer than writing an empty array that only breaks
# when it is unpacked later.
n_rows, n_values = subset_data.shape
if n_values % FEATURES_PER_WEEK != 0:
    raise ValueError(
        f"Expected a multiple of {FEATURES_PER_WEEK} weather columns from "
        f"'{start_column}' onwards, found {n_values}."
    )
n_weeks = n_values // FEATURES_PER_WEEK

# Reshape to (rows, weeks, variables).
# NOTE(review): this relies on the CSV columns being laid out week by week,
# each week holding its variables in the same order - confirm for new inputs.
data_array = subset_data.to_numpy(dtype=float).reshape(n_rows, n_weeks, FEATURES_PER_WEEK)

with open('output/YieldWeeklyWeather.pkl', 'wb') as f:
    pickle.dump(data_array, f)

In [9]:
# Normalized weather

import numpy as np
import pandas as pd
import pickle

# Load the 3D weather array from the PKL file.
# NOTE: pickle.load can execute arbitrary code, so only load files that were
# written by a trusted source.
with open('output/YieldWeeklyWeather.pkl', 'rb') as fh:
    matrix = pickle.load(fh)

N, W, H = matrix.shape

# Flatten the first two axes so every row holds the H values of one week.
matrix = matrix.reshape(N * W, H)

# Rows that are entirely zero are treated as missing and excluded from the
# statistics (presumably the zero filling of absent weeks - TODO confirm).
zero_rows = (matrix == 0).all(axis=1)
matrix[zero_rows] = np.nan

# Per-column mean and standard deviation, ignoring the NaN rows.
column_means = np.nanmean(matrix, axis=0)
column_stds = np.nanstd(matrix, axis=0)

# Standardize all columns with a non-zero spread in one step; columns whose
# standard deviation is exactly zero keep the initial value 0.
normalized_matrix = np.zeros_like(matrix)
has_spread = column_stds != 0.0
normalized_matrix[:, has_spread] = (
    (matrix[:, has_spread] - column_means[has_spread]) / column_stds[has_spread]
)

# Missing entries become 0 again after standardization.
missing_entries = np.isnan(normalized_matrix)
normalized_matrix[missing_entries] = 0

normalized_matrix = normalized_matrix.reshape(N, W, H)

with open('output/YieldWeeklyWeatherNormalized.pkl', 'wb') as fh:
    pickle.dump(normalized_matrix, fh)