## Crop Yield Prediction using XGBoost

### Importing the libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error

### Loading the numpy binary dataset

In [2]:
## Data load
def _load_npy(filename):
    """Read one `.npy` array from a relative path with pickling disabled."""
    return np.load(filename, fix_imports=True, allow_pickle=False)


# Training inputs, the yield targets and the genotype cluster ids.
weather_data = _load_npy('inputs_weather_train.npy')
other_data = _load_npy('inputs_others_train.npy')
yield_data = _load_npy('yield_train.npy')
clusterID_genotype = _load_npy('clusterID_genotype.npy')

# Test inputs (no yield file is loaded for them).
weather_test_data = _load_npy('inputs_weather_test.npy')
other_test_data = _load_npy('inputs_others_test.npy')

### Scaling and Encoding

In [3]:
# Label encoder for the State column and a [0, 1] min-max scaler that is applied
# to the feature matrix further down.
encoder = LabelEncoder()
scaler = MinMaxScaler(feature_range=(0, 1))

# Column 2 of `other_data` is the State column. LabelEncoder works on a 1-D label
# array, so the column is passed as-is; reshaping it to a column vector made
# scikit-learn emit a DataConversionWarning.
# NOTE: the encoded codes overwrite the original labels in place, so run this cell
# only once per kernel -- a second run would refit the encoder on the codes.
other_data[:, 2] = encoder.fit_transform(other_data[:, 2])

  return f(*args, **kwargs)


In [5]:
# Aggregate the daily weather records into weekly sums and assemble the training frame.
# The weather array is unpacked as (samples, days, weather properties).
[length, days, prop_num] = weather_data.shape
weeks = days // 7  # whole weeks only; trailing days that do not fill a week are dropped

# main data part
# Group every sample's days as (weeks, 7, prop_num) and sum over the 7 days of each
# week in a single vectorised step. The property count comes from the data
# (`prop_num`) rather than a hard-coded 7, so inputs with a different number of
# weather properties are handled as well. The result is stored as float64, as before.
weather_data1 = (
    weather_data[:, :weeks * 7, :]
    .reshape(length, weeks, 7, prop_num)
    .sum(axis=2)
    .astype(np.float64)
)

# One flat row of weeks * prop_num weekly values per sample.
weather_data3 = weather_data1.reshape(length, weeks * prop_num)
weather_df = pd.DataFrame(weather_data3)
other_df = pd.DataFrame(other_data, columns=['MG', 'Genotype_ID', 'State', 'Year', 'Location'])
yield_df = pd.DataFrame(yield_data, columns=['Yield'])

# Combining the entire dataset (column-wise, aligned on the row index)
combined_df = pd.concat([weather_df, other_df, yield_df], axis = 1, join = 'inner')

In [5]:
# Keep the label-encoded State column aside; it is written back into X after scaling.
state = combined_df['State'].values

In [6]:
# Inspect the extracted state codes.
state

array(['7', '0', '25', ..., '6', '9', '3'], dtype=object)

In [7]:
# Feature matrix: every column except the trailing label column.
X = combined_df.iloc[:, :-1].to_numpy()
# Target: the trailing column (Yield), kept in its original units.
Y_real = combined_df.iloc[:, -1].to_numpy()
# Normalise the target by its largest value; predictions are scaled back with max_val.
max_val = Y_real.max()
Y = Y_real / max_val


In [8]:
# Fit the min-max scaler on the feature matrix, scale it, and store the result as float32.
X = np.asarray(scaler.fit_transform(X)).astype(np.float32)

In [9]:
# Put the label-encoded state codes back into the State column (third from the end:
# ..., State, Year, Location), replacing its min-max-scaled values. The string codes
# are converted to float32 on assignment.
X[:, -3] = state

### Splitting the dataset into training and test sets (ratio 9:1)

In [10]:
# Ordered 9:1 split: the first 90% of the rows train the model, the rest are held out.
n_rows = len(X)
training_size = int(n_rows * 0.90)
test_size = n_rows - training_size

X_train = X[:training_size]
X_test = X[training_size:n_rows]
Y_train = Y[:training_size]
Y_test = Y[training_size:n_rows]

### GridSearch CV for hyper-parameter tuning

In [11]:
##GridSearch CV for hyper-parameter-tuning.
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error
import time
import numpy as np
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold

model = XGBRegressor()

# Candidate values; every combination is evaluated by the grid search.
param_grid = {
             'max_depth': [8, 9],
             'n_estimators':[800,1000],
             'subsample': [0.6,0.7],
             'colsample_bytree': [0.8,0.9]}


# Shuffled, seeded 5-fold split. It is now actually handed to the search via `cv=`;
# before, it was created but never used, so the default splitter was applied.
kfold = KFold(n_splits=5, shuffle=True, random_state=10)
grid_search = GridSearchCV(model, param_grid, scoring="r2", n_jobs=-1, cv=kfold)

# Start the clock right before fitting so the elapsed time printed below is defined.
start_time = time.time()
grid_result = grid_search.fit(X_train, Y_train)

# summarize results: best mean cross-validated R^2, the winning parameters, elapsed seconds
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
print(time.time()-start_time)

In [12]:
# Final model with hand-picked hyper-parameters.
xgb=XGBRegressor(n_estimators=1500, max_depth=8, eta=0.1, subsample=1.0, colsample_bytree=0.8,
                 min_child_weight=7, reg_alpha=0.01)

xgb.fit(X_train, Y_train)

# Score on the held-out split. Targets and predictions are multiplied by max_val
# so the error is expressed in the original yield units.
test_predict=xgb.predict(X_test)
test_rmse = np.sqrt(mean_squared_error(Y_test *max_val, test_predict*max_val))
trainScore = test_rmse  # previous variable name, kept so existing references still work
# The metric comes from X_test / Y_test, so it is reported as a hold-out score
# rather than a training score.
print ('Hold-out RMSE', test_rmse)

Training MSE 6.242293070203708


In [13]:
# Show the fitted regressor together with its parameter settings.
xgb

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.8, enable_categorical=False,
             eta=0.1, gamma=0, gpu_id=-1, importance_type=None,
             interaction_constraints='', learning_rate=0.100000001,
             max_delta_step=0, max_depth=8, min_child_weight=7, missing=nan,
             monotone_constraints='()', n_estimators=1500, n_jobs=12,
             num_parallel_tree=1, predictor='auto', random_state=0,
             reg_alpha=0.01, reg_lambda=1, scale_pos_weight=1, subsample=1.0,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [14]:
# Aggregate the daily test weather records into weekly sums, mirroring the training step.
# The test array is unpacked as (samples, days, weather properties).
[length_test, days_test, prop_num_test] = weather_test_data.shape
weeks_test = days_test // 7  # whole weeks only; leftover days are dropped

# Every size below is derived from the test array itself. Previously the training
# `days` and a hard-coded 30 weeks were used, which only works when the test data
# happens to have exactly the same layout.
weather_test_data1 = (
    weather_test_data[:, :weeks_test * 7, :]
    .reshape(length_test, weeks_test, 7, prop_num_test)
    .sum(axis=2)
    .astype(np.float64)
)

# One flat row of weekly values per test sample. Stored under its own name so the
# training matrix `weather_data3` is no longer overwritten.
weather_test_data3 = weather_test_data1.reshape(length_test, weeks_test * prop_num_test)
weather_test_df = pd.DataFrame(weather_test_data3)

other_test_df = pd.DataFrame(other_test_data, columns=['MG', 'Genotype_ID', 'State', 'Year', 'Location'])

In [15]:
# Preview the non-weather test inputs before the State column is encoded.
other_test_df

Unnamed: 0,MG,Genotype_ID,State,Year,Location
0,3.0,3069.0,"""IA""",2010.0,41.0
1,4.0,2526.0,"""IN""",2004.0,154.0
2,3.0,636.0,"""IA""",2014.0,41.0
3,5.0,1350.0,"""MD""",2005.0,113.0
4,3.0,2983.0,"""IL""",2006.0,148.0
...,...,...,...,...,...
10332,1.0,4511.0,"""MI""",2013.0,64.0
10333,3.0,5106.0,"""NE""",2007.0,136.0
10334,2.0,5826.0,"""IL""",2008.0,148.0
10335,1.0,5466.0,"""SD""",2005.0,10.0


In [16]:
# Map the test-set state labels to integer codes with the encoder fitted on the training states.
# NOTE(review): the column is overwritten in place, so re-running this cell would feed the
# already-encoded values back into the encoder -- presumably that fails; run once per kernel.
other_test_df["State"] = encoder.transform(other_test_df["State"])

In [17]:
# Keep the encoded test states aside; they are written back into X_test after scaling.
test_states = other_test_df["State"].values
test_states

array([ 4,  6,  4, ...,  5, 24, 17])

In [18]:
# Sanity check: number of encoded test states.
len(test_states)

10337

In [19]:
# Join the weekly weather features with the other test inputs column-wise, aligned on the row index.
combined_test_df = pd.concat([weather_test_df, other_test_df], axis = 1, join = 'inner')
# NOTE(review): this rebinds X_test, which earlier held the 10% hold-out split;
# Y_test no longer corresponds to X_test after this line.
X_test = combined_test_df.values


In [20]:
# Sanity check: row count of the test feature matrix.
len(X_test)

10337

In [21]:
# Apply the already-fitted min-max scaler to the test features and store them as float32.
X_test = np.asarray(scaler.transform(X_test)).astype(np.float32)

In [22]:
# Put the encoded state codes back into the State column (third from the end),
# replacing its scaled values, as was done for the training features.
X_test[:, -3] = test_states
# The model outputs yields divided by max_val; multiply to get back to the original units.
test_predict=xgb.predict(X_test)*max_val

In [22]:
# Persist the test-set predictions (already scaled back by max_val) to disk.
np.save("test_predict-Final_best.npy",  test_predict)  # written as a .npy file at this relative path