<!--
import data_analytics.github as github
print(github.create_jupyter_notebook_header("markcrowe-com", "agriculture-data-analytics", "notebooks/notebook-3-03-ml-milk-production.ipynb", "master"))
-->
<table style="margin: auto;"><tr><td><a href="https://mybinder.org/v2/gh/markcrowe-com/agriculture-data-analytics/master?filepath=notebooks/notebook-3-03-ml-milk-production.ipynb" target="_parent"><img src="https://mybinder.org/badge_logo.svg" alt="Open In Binder"/></a></td><td>online editors</td><td><a href="https://colab.research.google.com/github/markcrowe-com/agriculture-data-analytics/blob/master/notebooks/notebook-3-03-ml-milk-production.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a></td></tr></table>

# Objective
Create the best machine learning model to predict Milk production value in Ireland according to the historical data from Central Statistics office CSO. AEA01 Value at Current Prices for Output, Input and Income in Agriculture Downloaded https://data.cso.ie/table/AEA01 

# Contents
    - Read data from Assets folder
    - Split to training / testing sets
    - Scale each set seperatly
    - Run Models
        - Define Hyper parameter tuning Cross Validation Grid or Random Search
        - Random Forest Regressor
        - XGBOOST Regressor
        - ANN
    - Save best model into Pickle file
    - Next step: Deploy selected model on a Streamlit webapp

### Setup

Import required third party Python libraries, import supporting functions and sets up data source file paths.

In [1]:
# Local
#!pip install -r script/requirements.txt
# Remote option
#!pip install -r https://raw.githubusercontent.com/tahirawwad/agriculture-data-analytics/requirements.txt
#Options: --quiet --user

In [2]:
from keras_tuner.tuners import RandomSearch
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from tensorflow import keras
from tensorflow.keras import layers
from xgboost import XGBRegressor
import numpy as np
import os
import pandas as pd
import pickle
import tensorflow as tf

### Load dataframe

In [3]:
df = pd.read_csv("./../artifacts/irish-milk-production-eda-output.csv")
print("data dimensions \n",df.shape)
print()
#print("data column info \n",df.info)
print()
print('Data sample\n')

df.sample(5)

data dimensions 
 (31, 55)


Data sample



Unnamed: 0,Year,Agricultural Output at Basic Prices,All Cereals,All Crops,All Livestock,All Livestock Products,All Livestock Products - Milk,All Livestock Products Other Products (excluding Milk),Compensation of Employees,Contract Work,...,Livestock - Horses,Livestock - Pig,Livestock - Poultry,Livestock - Sheep,Net Value Added at Basic Prices,Operating Surplus,Other Subsidies Less Taxes on Production,Subsidies less Taxes on Products,Subsidies on Products,Taxes on Products
29,2019,8516.4,327.2,1893.6,3372.4,2689.7,2608.6,81.1,586.2,461.2,...,255.5,543.0,169.2,260.8,1881.1,2924.8,1630.0,99.5,150.6,51.1
25,2015,7404.2,262.7,1737.3,3452.5,1949.4,1881.1,68.3,489.0,348.0,...,247.3,456.3,142.2,245.2,1666.3,2587.2,1409.9,-82.9,43.2,126.2
17,2007,5975.4,241.8,1632.7,2378.7,1716.1,1667.5,48.6,448.6,288.3,...,269.3,288.6,133.0,184.5,1045.8,2442.0,1844.9,-40.4,0.7,41.1
24,2014,7293.9,280.5,1747.5,3070.1,2151.3,2093.1,58.2,495.7,358.7,...,221.7,471.3,133.3,231.6,1411.1,2441.2,1525.8,-33.7,28.8,62.5
28,2018,8663.7,288.4,2126.0,3431.4,2639.2,2556.7,82.6,567.7,453.2,...,306.4,459.1,159.0,258.8,1708.0,2823.0,1682.8,13.9,65.1,51.2


## Production of Milk

In [4]:
## Extract milk production dataset
# drop redundunt columns

# extract milk dataset
df_milk = df[['Year',
              'All Livestock Products - Milk',
              'Taxes on Products',
              'Subsidies on Products',
              'Compensation of Employees',
              'Contract Work',
              'Entrepreneurial Income',
              'Factor Income',
              'Fixed Capital Consumption - Farm Buildings',
              'Fixed Capital Consumption - Machinery, Equipment, etc',
              'Interest less FISIM',
              'Operating Surplus',
              'Livestock - Cattle',
              'Livestock - Sheep',
              'Land Rental',
              'Intermediate Consumption - Contract Work',
              'Intermediate Consumption - Crop Protection Products',
              'Intermediate Consumption - Energy and Lubricants',
              'Intermediate Consumption - Feeding Stuffs',
              'Intermediate Consumption - Fertilisers',
              'Intermediate Consumption - Financial Intermediation Services Indirect',
              'Intermediate Consumption - Forage Plants',
              'Intermediate Consumption - Maintenance and Repairs',
              'Intermediate Consumption - Seeds',
              'Intermediate Consumption - Services',
              'Intermediate Consumption - Veterinary Expenses',
              'Intermediate Consumption - Other Goods (Detergents, Small Tools, etc)',
              'Intermediate Consumption - Other Goods and Services'
              
             ]]
# Assign year as index
df_milk.set_index('Year',drop=True,inplace=True)

print("Milk production dataset dimenssions \n", df_milk.shape)
print("Milk production dataset Sample \n")
df_milk.head()


Milk production dataset dimenssions 
 (31, 27)
Milk production dataset Sample 



Unnamed: 0_level_0,All Livestock Products - Milk,Taxes on Products,Subsidies on Products,Compensation of Employees,Contract Work,Entrepreneurial Income,Factor Income,Fixed Capital Consumption - Farm Buildings,"Fixed Capital Consumption - Machinery, Equipment, etc",Interest less FISIM,...,Intermediate Consumption - Feeding Stuffs,Intermediate Consumption - Fertilisers,Intermediate Consumption - Financial Intermediation Services Indirect,Intermediate Consumption - Forage Plants,Intermediate Consumption - Maintenance and Repairs,Intermediate Consumption - Seeds,Intermediate Consumption - Services,Intermediate Consumption - Veterinary Expenses,"Intermediate Consumption - Other Goods (Detergents, Small Tools, etc)",Intermediate Consumption - Other Goods and Services
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1990,1316.3,75.0,408.9,377.6,180.7,1577.2,2321.1,106.4,325.5,267.4,...,642.2,326.0,34.0,553.1,203.9,42.0,109.1,85.2,73.2,182.3
1991,1258.9,78.2,357.3,362.5,172.0,1440.2,2136.5,113.2,331.7,237.2,...,646.5,319.1,39.0,525.6,184.6,43.8,110.2,91.9,75.3,185.5
1992,1373.1,79.5,446.0,336.8,179.8,1842.6,2516.2,117.8,332.4,237.6,...,668.5,305.1,43.0,533.8,175.1,46.4,110.8,93.7,72.2,183.0
1993,1439.0,68.0,466.4,339.0,199.5,1985.6,2586.9,121.9,335.5,162.5,...,733.5,301.2,49.0,572.9,195.7,46.9,113.3,100.2,79.8,193.1
1994,1446.2,53.7,666.0,345.2,205.3,2051.7,2623.7,125.0,346.5,119.3,...,818.6,316.3,56.0,604.6,220.3,50.0,118.5,108.7,81.8,200.3


### Define 20% Training set 80% Test set

In [5]:
# define target & feature variables

X = df_milk.iloc[:,2:].values
Y = df_milk.iloc[:,1].values.reshape(-1,1)
print('features dimension ',np.shape(X))
print('target dimension ',np.shape(Y))

# impute mean value for NA's
#from sklearn.impute import SimpleImputer
imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
X = imp_mean.fit_transform(X)
Y = imp_mean.fit_transform(Y)


# split train test split 20
X_train, X_test,Y_train,Y_test = train_test_split(X,Y,test_size=0.2,random_state=2021)
print()
print('x_train dimension ', X_train.shape)
print('y_train dimension ', Y_train.shape)
print()
print('x_test dimension ', X_test.shape)
print('y_test dimension ', Y_test.shape)

features dimension  (31, 25)
target dimension  (31, 1)

x_train dimension  (24, 25)
y_train dimension  (24, 1)

x_test dimension  (7, 25)
y_test dimension  (7, 1)


### Scale & Transform

In [6]:
# Scale raining set and test set seperatly
scaler_x = MinMaxScaler()
scaler_y = MinMaxScaler()

# calculate mean and std of training set 
scaler_x.fit(X_train)
scaler_y.fit(Y_train)
scaler_x.fit(X_test)
scaler_y.fit(Y_test)

# apply scaler to data set
xtrain_scale = scaler_x.transform(X_train)
ytrain_scale = scaler_y.transform(Y_train)

xtest_scale = scaler_x.transform(X_test)
ytest_scale = scaler_y.transform(Y_test)

# fit and transform in one line
# scaler_x.fit_transform(X_train)

# remeber to inverse the scaling on model output
# scaler_x.inverse_transform(xtest_scale)

# create a score dataframe to store model scores
df_score = pd.DataFrame()
print(df_score)

Empty DataFrame
Columns: []
Index: []


### Model 1 RandomForest Regressor

#### Train RandomForest

In [7]:
# define Random Forest Regressor
rf_regressor_milk = RandomForestRegressor(random_state=2021)

# define list of Parameters
params_rf_milk = {'n_estimators':[100,500,800],
                  'criterion':['squared_error', 'absolute_error', 'poisson'],
                  'max_features':["auto", "sqrt", "log2"],
                  "bootstrap": [True, False]
                   }

# Hyper parameter tuning via Grid Search Cross Validation 
grid_rf_milk = GridSearchCV(estimator= rf_regressor_milk,
                          param_grid= params_rf_milk,
                          n_jobs=-1,
                          cv=5
                     )

# Fit the grid to scaled data
grid_rf_milk.fit(xtrain_scale,ytrain_scale.reshape(-1))

# print best training model & R squared score
print('Best training model ',grid_rf_milk.best_estimator_)
print('Best training model score, coefficient of determination R squared', grid_rf_milk.best_score_)

Best training model  RandomForestRegressor(bootstrap=False, max_features='log2', random_state=2021)
Best training model score, coefficient of determination R squared 0.19402807847158815


#### Predict RandomForest

In [8]:
# Predict Milk Production and unscale back to original values
y_predict = scaler_y.inverse_transform(grid_rf_milk.predict(xtest_scale).reshape(-1, 1))

print('predicted milk production values \n',y_predict)
print('actual milk production values \n',Y_test)

# Calculate Mean Absolute Error
MAE_rf = mean_absolute_error(Y_test,y_predict)
#print(MAE_rf)

# add model score to Score Dataframe
df_score = pd.DataFrame(data={'Model':'RandomForest',
                           'Score MAE':MAE_rf},index=['Model 1'])

print(df_score)

predicted milk production values 
 [[49.959]
 [44.913]
 [41.825]
 [62.06 ]
 [40.956]
 [38.219]
 [60.544]]
actual milk production values 
 [[36.8]
 [39.3]
 [25. ]
 [53.7]
 [27.9]
 [39.4]
 [48.8]]
                Model  Score MAE
Model 1  RandomForest   9.991143


### Model 2 XGBOOST Regressor

#### Train XGBOOST

In [9]:
# define XGBRegressor
xgb_regressor_milk = XGBRegressor(random_state=2021)

# define parameters space to loop over
params_xgb_milk = {'n_estimators':[20,40,80,160,340,500],
             'max_depth':[3,6,9],
             'gamma':[0.01,0.1],
             'learning_rate':[0.001,0.01,0.1,1]
             }

# Hyper parameter tuning via Grid Search Cross Validation 
grid_xgb_milk = GridSearchCV(estimator=xgb_regressor_milk,
                     param_grid=params_xgb_milk,
                     #n_jobs=-1,
                     scoring=['r2','neg_root_mean_squared_error'],
                     refit= 'r2',
                     n_jobs=-1,
                     cv=5,
                     verbose=4
                     )

# fit grid to training scaled set
grid_xgb_milk.fit(xtrain_scale,ytrain_scale);


# print best training model & R squared score
print('Best training model ',grid_xgb_milk.best_estimator_)
print('Best model Parameters',grid_xgb_milk.best_params_)
print('Best training model score, coefficient of determination R squared', grid_xgb_milk.best_score_)

Fitting 5 folds for each of 144 candidates, totalling 720 fits
Best training model  XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
             gamma=0.01, gpu_id=-1, importance_type=None,
             interaction_constraints='', learning_rate=0.1, max_delta_step=0,
             max_depth=6, min_child_weight=1, missing=nan,
             monotone_constraints='()', n_estimators=20, n_jobs=8,
             num_parallel_tree=1, predictor='auto', random_state=2021,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)
Best model Parameters {'gamma': 0.01, 'learning_rate': 0.1, 'max_depth': 6, 'n_estimators': 20}
Best training model score, coefficient of determination R squared -0.3097339440520266


#### Predict XGBOOST

In [10]:
# Predict Milk Production and unscale back to original values
y_predict = scaler_y.inverse_transform(grid_xgb_milk.predict(xtest_scale).reshape(-1, 1))

print('predicted milk production values \n',y_predict)
print('actual milk production values \n',Y_test)

# Calculate Mean Absolute Error
MAE_xgb = mean_absolute_error(Y_test,y_predict)
#print(MAE_xgb)

# add model score to Score Dataframe
df_score = pd.DataFrame(data = {'Model':['RandomForest','XGBOOST'],
                                'Score MAE': [MAE_rf,MAE_xgb]},
                        index=['Model 1','Model 2'])

print(df_score)

predicted milk production values 
 [[45.075523]
 [40.96675 ]
 [54.415245]
 [58.39248 ]
 [54.415245]
 [38.954105]
 [50.69726 ]]
actual milk production values 
 [[36.8]
 [39.3]
 [25. ]
 [53.7]
 [27.9]
 [39.4]
 [48.8]]
                Model  Score MAE
Model 1  RandomForest   9.991143
Model 2       XGBOOST  10.415486


In [11]:
# write the Grid Search results to csv to choose best model with least resource consumption

#GS_xgb_df_milk = pd.DataFrame(GS_xgb_milk.cv_results_)
#GS_xgb_df_milk = GS_xgb_df_milk.sort_values('rank_test_r2')
#GS_xgb_df_milk.to_csv('./../artifacts/grid-search-xgb-milk-results.csv')

## ANN Artificial Neural Network

#### Training & Keras Parameter Tuning

In [13]:
temp_directory: str = './../temp/ANN-tuner/'


# Define ANN model with Hyper paramter variable
def build_model(hp):
    model = keras.Sequential()
    for i in range(hp.Int('num_layers', 2, 23)):
        model.add(
            layers.Dense(units=hp.Int('units_' + str(i),
                                      min_value=23,
                                      max_value=600,
                                      step=32),
                         activation='relu'))
        model.add(layers.Dense(1, activation='linear'))
        model.compile(optimizer=keras.optimizers.Adam(
            hp.Choice('learning_rate', [1e-2, 1e-3, 1e-4])),
                      loss='mean_absolute_error',
                      metrics=['mean_absolute_error'])
        return model

#if os.path.isdir(temp_directory):
#    os.remove(temp_directory)


# create a directory to store each iteration of modelling
tuner = RandomSearch(build_model,
                     objective='val_mean_absolute_error',
                     max_trials=5,
                     executions_per_trial=3,
                     directory=temp_directory,
                     project_name='Milk production')

# Defined parameter space to search in
tuner.search_space_summary()

# train trial models and compare with validation set
tuner.search(xtrain_scale,
             ytrain_scale,
             epochs=50,
             validation_data=(xtest_scale, ytest_scale))

# print best 10 models according to val_mean_absolute_error
print('\n')
tuner.results_summary()

# get best model from training trials
bestANNModel = tuner.get_best_models(num_models=1)[0]

# fit best model to training scaled data and scaled test data
bestANNModel.fit(xtrain_scale,
                 ytrain_scale,
                 epochs=50,
                 validation_data=(xtest_scale, ytest_scale))

INFO:tensorflow:Reloading Oracle from existing project ./../temp/ANN-tuner/Milk production\oracle.json
INFO:tensorflow:Reloading Tuner from ./../temp/ANN-tuner/Milk production\tuner0.json
Search space summary
Default search space size: 3
num_layers (Int)
{'default': None, 'conditions': [], 'min_value': 2, 'max_value': 23, 'step': 1, 'sampling': None}
units_0 (Int)
{'default': None, 'conditions': [], 'min_value': 23, 'max_value': 600, 'step': 32, 'sampling': None}
learning_rate (Choice)
{'default': 0.01, 'conditions': [], 'values': [0.01, 0.001, 0.0001], 'ordered': True}
INFO:tensorflow:Oracle triggered exit


Results summary
Results in ./../temp/ANN-tuner/Milk production
Showing 10 best trials
Objective(name='val_mean_absolute_error', direction='min')
Trial summary
Hyperparameters:
num_layers: 11
units_0: 183
learning_rate: 0.01
Score: 0.11820712437232335
Trial summary
Hyperparameters:
num_layers: 2
units_0: 503
learning_rate: 0.01
Score: 0.12775792429844537
Trial summary
Hyperparamete

Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x14ff6d4f100>

In [14]:
# Predict Milk Production and unscale back to original values
y_predict = scaler_y.inverse_transform(bestANNModel.predict(xtest_scale).reshape(-1, 1))

print('predicted milk production values \n',y_predict)
print('actual milk production values \n',Y_test)

# Calculate Mean Absolute Error
MAE_ANN = mean_absolute_error(Y_test,y_predict)
#print(MAE_xgb)

# add model score to Score Dataframe
df_score = pd.DataFrame(data = {'Model':['RandomForest','XGBOOST','ANN'],
                                'Score MAE': [MAE_rf,MAE_xgb,MAE_ANN]},
                        index=['Model 1','Model 2','Model 3'])

print(df_score)

predicted milk production values 
 [[40.269165]
 [43.412926]
 [34.84264 ]
 [60.56095 ]
 [31.343634]
 [38.87374 ]
 [48.420654]]
actual milk production values 
 [[36.8]
 [39.3]
 [25. ]
 [53.7]
 [27.9]
 [39.4]
 [48.8]]
                Model  Score MAE
Model 1  RandomForest   9.991143
Model 2       XGBOOST  10.415486
Model 3           ANN   4.090703


# Pickle file
    Save trained model into binary pickle file to use the model later with new input data from web app

In [15]:
model_name = "milk-production"
directory = f'./../artifacts/{model_name}/'

# Dump/write Scaler into binary pickle
pickle.dump(scaler_x, open(f'{directory}pkl_scaler_x', 'wb'))

# Read pickle file into variable to use scaler
scaler_x_pkl_ann = pickle.load(open(f'{directory}pkl_scaler_x', 'rb'))

# Dump/write Scaler into binary pickle
pickle.dump(scaler_y, open(f'{directory}pkl_scaler_y', 'wb'))

# Read pickle file into variable to use scaler
scaler_y_pkl_ann = pickle.load(open(f'{directory}pkl_scaler_y', 'rb'))

In [16]:
# Dump/write model into binary pickle file in the current notebook directory
pickle.dump(bestANNModel, open(f'{directory}pkl_ann_milk', 'wb'))
# Read pickle file into variable to use model
model_pkl_ann = pickle.load(open(f'{directory}pkl_ann_milk', 'rb'))

INFO:tensorflow:Assets written to: ram://568a2d65-be89-4ac5-8d45-79442ce9013c/assets


NotFoundError: 

In [None]:
## Example using pickle file with saved ANN model

# take input from source as array
data_input_from_webapp = np.array([357.3, 362.5, 172., 1440.2, 2136.5])

# scale input with same scaler as used in model
scale_data_from_webapp = scaler_x.transform(
    data_input_from_webapp.reshape(1, -1))

# predict scaled value
scaled_prediction = bestANNModel.predict(scale_data_from_webapp)

# descale prediction back to normal value
prediction = scaler_y.inverse_transform(scaled_prediction)
print('\n Expected Milk Production is ', prediction[0][0])