In [13]:
from sklearn.preprocessing import LabelEncoder
import pandas as pd
# Copy dataset to avoid modifying original accidentally
merged = pd.read_csv("data/yield_climate_merged.csv")
data = merged.copy()

# Label encode crop and country
le_crop = LabelEncoder()
le_country = LabelEncoder()

data['crop_encoded'] = le_crop.fit_transform(data['crop'])
data['country_encoded'] = le_country.fit_transform(data['country'])

print("Encoding complete.")
print('\n')
print(data[['crop', 'crop_encoded']].drop_duplicates().head())
print('\n')
print(data[['country', 'country_encoded']].drop_duplicates().head(10))


Encoding complete.


            crop  crop_encoded
0   Maize (corn)             0
22          Rice             1
44    Sugar cane             2
66         Wheat             3


         country  country_encoded
0    Afghanistan                0
88       Algeria                1
154       Angola                2
242    Argentina                3
330    Australia                4
418   Azerbaijan                5
484   Bangladesh                6
572       Belize                7
638        Benin                8
704       Bhutan                9


In [4]:
data

Unnamed: 0,country,crop,year,yield_kg_ha,t2m,rad,rh2m,precip,lag_yield_kg_ha,crop_encoded,country_encoded
0,Afghanistan,Maize (corn),2002,2980.0,5.046154,20.841538,40.205385,5.49,2000.0,0,0
1,Afghanistan,Maize (corn),2003,840.0,4.636923,20.180769,44.924615,6.76,2980.0,0,0
2,Afghanistan,Maize (corn),2004,1600.0,5.232308,21.035385,42.566154,6.56,840.0,0,0
3,Afghanistan,Maize (corn),2005,1206.9,3.538462,20.149231,46.926154,8.85,1600.0,0,0
4,Afghanistan,Maize (corn),2006,2620.4,4.574615,20.161538,48.684615,12.50,1206.9,0,0
...,...,...,...,...,...,...,...,...,...,...,...
7932,Zimbabwe,Wheat,2019,3914.9,22.114615,22.482308,49.091538,15.05,2717.8,3,102
7933,Zimbabwe,Wheat,2020,4779.6,21.335385,20.869231,54.176154,28.38,3914.9,3,102
7934,Zimbabwe,Wheat,2021,5075.9,20.162308,21.315385,60.146923,24.58,4779.6,3,102
7935,Zimbabwe,Wheat,2022,5154.2,20.466154,21.053846,60.271538,27.40,5075.9,3,102


In [10]:
# Sort by year just to be safe
data = data.sort_values('year')

# Train = 2002–2017, Test = 2018–2023 {dropped 2001 during lag yield creation}
train = data[data['year'] <= 2017]
test  = data[data['year'] >  2017]

print("Train size:", len(train))
print("Test size:", len(test))

# Define features and target
features = ['t2m', 'precip', 'rad', 'rh2m', 'lag_yield_kg_ha',
            'crop_encoded', 'country_encoded']

target = 'yield_kg_ha'

X_train = train[features]
y_train = train[target]

X_test = test[features]
y_test = test[target]


Train size: 5785
Test size: 2152


In [11]:
data.head()

Unnamed: 0,country,crop,year,yield_kg_ha,t2m,rad,rh2m,precip,lag_yield_kg_ha,crop_encoded,country_encoded
6602,South Africa,Wheat,2002,2590.6,17.874615,22.388462,41.238462,8.14,2570.8,3,85
6624,Spain,Maize (corn),2002,9514.2,13.202308,16.440769,64.092308,16.55,9720.8,0,86
6514,Somalia,Wheat,2002,373.6,28.02,22.956923,54.958462,9.11,365.4,3,84
6536,South Africa,Maize (corn),2002,2851.6,17.874615,22.388462,41.238462,8.14,2437.1,0,85
6558,South Africa,Rice,2002,2356.3,17.874615,22.388462,41.238462,8.14,2285.7,1,85


In [12]:
data.to_csv("data/yield_climate_final.csv", index=False)
print("Saved final processed dataset to yield_climate_final.csv")


Saved final processed dataset to yield_climate_final.csv
