# Omdena  - Milan Chapter Agrifoods
## AI for Sustainable agri-food systems: use of Satellite Imagery
### Tabular model for fruits in Italy 2006-2021
#### Author: Maria Fisher 


The main objective of this study is to gather information about crop production in Italy for the period 2006-2021. 

The crop dataset used in this study was downloaded from the Italian National Institute of Statistics (Istat).



In [None]:
import warnings 
warnings.filterwarnings("ignore")

import os
import pandas as pd
pd.options.display.float_format = "{:.2f}".format
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor
import seaborn as sns 
import scipy 
import sklearn
import geopandas as gpd
import pgeocode
import folium
import sys
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, iplot, plot

# Pre-processing data for modelling 

In [None]:
# Load the fruit dataset from a CSV in the working directory and display it.
fruits = pd.read_csv('fruits_model.csv')
fruits

Check for correlation

In [None]:
# Compute the pairwise correlation matrix, restricted to the numeric columns
# (non-numeric columns are filtered out by select_dtypes before .corr()).
corr_fruits = fruits.select_dtypes(include=[np.number]).corr()

In [None]:
# Display the correlation matrix computed above.
corr_fruits

In [None]:
sns.set_theme(style="white")

# Generate a mask for the upper triangle (diagonal included) so that each
# pair of variables is drawn only once.
# The builtin `bool` is used as dtype: the `np.bool` alias was deprecated in
# NumPy 1.20 and removed in 1.24, where it raises AttributeError.
mask = np.triu(np.ones_like(corr_fruits, dtype=bool))

# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(8, 9))

# Generate a custom diverging colormap
cmap = sns.diverging_palette(250, 10, as_cmap=True)

# Draw the heatmap with the mask and correct aspect ratio.
# NOTE: vmax=.3 saturates the colour scale at 0.3, so every correlation
# above that value is drawn with the same colour.
sns.heatmap(corr_fruits, mask=mask, cmap=cmap, vmax=.3, center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .5})



In [None]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the categorical columns; the generated columns are prefixed
# with 'City', 'Crop' and 'Fertilizer' respectively.
fruits_1hotEn = pd.get_dummies(
    fruits,
    columns=['City', 'Type_crop', 'Type_fertilizer'],
    prefix=['City', 'Crop', 'Fertilizer'],
)

# Move the target to the last column. `drop` returns a new DataFrame, so the
# assignment below does not write into a slice of `fruits_1hotEn` (the
# original `.loc[...]` slice followed by a column assignment triggers
# pandas' SettingWithCopyWarning).
features = fruits_1hotEn.drop(columns='production_tonnes')
features['production_tonnes'] = fruits_1hotEn['production_tonnes']
features.head()

In [None]:
# Print the column names, dtypes and non-null counts of the encoded frame.
features.info()

 
## Scaling Features

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Separate the target from the predictors; only the predictors are scaled.
y = features['production_tonnes']
X = features.drop(columns='production_tonnes')

# NOTE(review): the scaler is fitted on every row, before the hold-out sample
# and the train/test split are taken further down, so the held-out rows
# influence the scaling ranges.
scaler = MinMaxScaler()
features_no_prod = pd.DataFrame(
    scaler.fit_transform(X),
    index=y.index,
    columns=X.columns,
)

# Re-attach the unscaled target as the last column.
features_no_prod['production_tonnes'] = y

data = features_no_prod


In [None]:
# Preview the first rows of the scaled feature table (target in last column).
data.head()


Hold out 10% of the data for further tests

In [None]:
# Randomly keep 90% of the rows for modelling (fixed seed); the remaining
# 10% become the hold-out set.
data_model = data.sample(frac=0.9, random_state=786)
data_unseen = data.drop(index=data_model.index)

# Renumber both frames from 0 so the shuffled original index is discarded.
data_model = data_model.reset_index(drop=True)
data_unseen = data_unseen.reset_index(drop=True)

print(f'Data for Modeling: {data_model.shape}')
print(f'Unseen Data For Predictions: {data_unseen.shape}')


Save the unseen data for a further prediction test

In [None]:
# Persist the hold-out rows to disk. `DataFrame.to_csv` returns None when a
# path is given, so its result must not be assigned back to `data_unseen`
# (doing so would replace the DataFrame with None).
data_unseen.to_csv('fruits_ds_unseen.csv', index=False)

## Modelling

## Select best model

In [None]:
# Predictors: every column except the target. Target: production in tonnes.
X = data_model.drop(columns='production_tonnes')
y = data_model['production_tonnes']


In [None]:
from sklearn.model_selection import train_test_split

# Hold back 20% of the modelling data as a test set; the seed is fixed so the
# same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### Random Forest

In [None]:
# Build a single-step pipeline around a 200-tree random forest (fixed seed)
# and train it on the training split.
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import make_pipeline
import joblib

regressor_rf = make_pipeline(
    RandomForestRegressor(n_estimators=200, random_state=0)
)
regressor_rf.fit(X_train, y_train)


In [None]:
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score

# 10-fold cross-validation score on the training split.
# `Series.ravel()` is deprecated since pandas 2.2 (the data is already 1-D);
# `to_numpy()` is the documented replacement.
crossval_rf = cross_val_score(estimator=regressor_rf, X=X_train, y=y_train.to_numpy(), cv=10)

# R2 score on the training set
y_pred_rf_train = regressor_rf.predict(X_train)
r2_score_rf_train = r2_score(y_train, y_pred_rf_train)

# R2 score on the test set
y_pred_rf_test = regressor_rf.predict(X_test)
r2_score_rf_test = r2_score(y_test, y_pred_rf_test)

# RMSE on the test set (same units as production_tonnes)
rmse_rf = np.sqrt(mean_squared_error(y_test, y_pred_rf_test))

# Print metrics
print('CV: ', crossval_rf.mean())
print('R2_score (train): ', r2_score_rf_train)
print('R2_score (test): ', r2_score_rf_test)
print("RMSE: ", rmse_rf)

CV:  0.9848972813960162

R2_score (train):  0.9983194181313884

R2_score (test):  0.9892629713056141

RMSE:  120.25995340520304



In [None]:
# Show the in-memory model's predictions on the training set before it is
# serialised.
print("Values before saving the model:",y_pred_rf_train)

Save the model

In [None]:
# Serialise the fitted pipeline to 'fruits_rf.pkl' in the working directory.
joblib.dump(regressor_rf, 'fruits_rf.pkl')

Load the model

In [None]:
# Reload the pipeline from disk. joblib files are pickle-based: only load
# files that this notebook (or another trusted source) produced.
model_rf = joblib.load('fruits_rf.pkl') 

In [None]:
# Predict on the training set with the reloaded model so the output is
# directly comparable with the "before saving" values, which are the
# training-set predictions (predicting on X_test here compared two different
# sets of rows).
print("Values after saving the model:", model_rf.predict(X_train))

In [None]:
# Score the reloaded pipeline on the test split and print the result.
result = model_rf.score(X_test, y_test)
print(result)

Use the loaded model to make predictions with new data

In [None]:
# Predict on new dataset
pr = pd.read_csv('fruits_ds_unseen.csv')

# Select the feature columns by name instead of relying on the target being
# the last column of the CSV.
pred_cols = [col for col in pr.columns if col != 'production_tonnes']

# Apply the *reloaded* pipeline to the new data, as the section title states.
# This also avoids depending on `regressor_rf`, which a later cell rebinds
# to a dalex Explainer.
pred = pd.Series(model_rf.predict(pr[pred_cols]))
print(pred)

Residuals versus predicted values for the random forest model

In [None]:
import dalex as dx
# Wrap the fitted pipeline in a dalex Explainer built on the full modelling
# data (X, y), i.e. both the training and the test rows.
# NOTE(review): this rebinds `regressor_rf` from the sklearn pipeline to the
# Explainer; re-running earlier cells that use `regressor_rf` after this one
# will operate on the Explainer instead — consider a separate name.
regressor_rf = dx.Explainer(regressor_rf, X, y)

In [None]:
# Compute the explainer's model diagnostics and display the result table.
md_rf = regressor_rf.model_diagnostics()
md_rf.result

In [None]:
# Draw the diagnostics plot with dalex's default variables.
md_rf.plot()

In [None]:
# Recompute the diagnostics and plot the predictions (y_hat) against the
# observed target (y).
md_rf = regressor_rf.model_diagnostics()
md_rf.plot(variable = "y", yvariable = "y_hat")

### Gradient Boosting

In [None]:
# Build a single-step pipeline around a 200-stage gradient boosting regressor
# (fixed seed) and train it on the training split.
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.pipeline import make_pipeline
import joblib

regressor_gbr = make_pipeline(
    GradientBoostingRegressor(n_estimators=200, random_state=0)
)
regressor_gbr.fit(X_train, y_train)


In [None]:
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score

# 10-fold cross-validation score on the training split.
# `Series.ravel()` is deprecated since pandas 2.2 (the data is already 1-D);
# `to_numpy()` is the documented replacement.
crossval_gbr = cross_val_score(estimator=regressor_gbr, X=X_train, y=y_train.to_numpy(), cv=10)

# R2 score on the training set
y_pred_gbr_train = regressor_gbr.predict(X_train)
r2_score_gbr_train = r2_score(y_train, y_pred_gbr_train)

# R2 score on the test set
y_pred_gbr_test = regressor_gbr.predict(X_test)
r2_score_gbr_test = r2_score(y_test, y_pred_gbr_test)

# RMSE on the test set (same units as production_tonnes)
rmse_gbr = np.sqrt(mean_squared_error(y_test, y_pred_gbr_test))

# Print metrics
print('CV: ', crossval_gbr.mean())
print('R2_score (train): ', r2_score_gbr_train)
print('R2_score (test): ', r2_score_gbr_test)
print("RMSE: ", rmse_gbr)

CV:  0.9090644971207531

R2_score (train):  0.9148229836222868

R2_score (test):  0.9104947779673703

RMSE:  347.218743774312


In [None]:
# Show the in-memory model's predictions on the training set before it is
# serialised.
print("Values before saving the model:",y_pred_gbr_train)

Save the model

In [None]:
# Serialise the fitted pipeline to 'fruits_gbr.pkl' in the working directory.
joblib.dump(regressor_gbr, 'fruits_gbr.pkl')

Load the model

In [None]:
# Reload the pipeline from disk. joblib files are pickle-based: only load
# files that this notebook (or another trusted source) produced.
model_gbr = joblib.load('fruits_gbr.pkl') 

In [None]:
# Predict on the training set with the reloaded model so the output is
# directly comparable with the "before saving" values, which are the
# training-set predictions (predicting on X_test here compared two different
# sets of rows).
print("Values after saving the model:", model_gbr.predict(X_train))

In [None]:
# Score the reloaded pipeline on the test split and print the result.
result = model_gbr.score(X_test, y_test)
print(result)

Use the loaded model to make predictions with new data

In [None]:
# Predict on new dataset
pr = pd.read_csv('fruits_ds_unseen.csv')

# Select the feature columns by name instead of relying on the target being
# the last column of the CSV.
pred_cols = [col for col in pr.columns if col != 'production_tonnes']

# Apply the *reloaded* pipeline to the new data, as the section title states.
# This also avoids depending on `regressor_gbr`, which a later cell rebinds
# to a dalex Explainer.
pred = pd.Series(model_gbr.predict(pr[pred_cols]))
print(pred)

In [None]:
import dalex as dx
# Wrap the fitted pipeline in a dalex Explainer built on the full modelling
# data (X, y), i.e. both the training and the test rows.
# NOTE(review): this rebinds `regressor_gbr` from the sklearn pipeline to the
# Explainer; re-running earlier cells that use `regressor_gbr` after this one
# will operate on the Explainer instead — consider a separate name.
regressor_gbr = dx.Explainer(regressor_gbr, X, y)

In [None]:
# Compute the explainer's model diagnostics and display the result table.
md_gbr = regressor_gbr.model_diagnostics()
md_gbr.result

In [None]:
# Draw the diagnostics plot with dalex's default variables.
md_gbr.plot()

In [None]:
# Recompute the diagnostics and plot the predictions (y_hat) against the
# observed target (y).
md_gbr = regressor_gbr.model_diagnostics()
md_gbr.plot(variable = "y", yvariable = "y_hat")

### Decision Tree

In [None]:
# Prepare the pipeline
from sklearn.pipeline import make_pipeline
from sklearn.tree import DecisionTreeRegressor
import joblib

# The seed is fixed so the fitted tree (and every metric derived from it) is
# reproducible across runs, consistent with the random forest and gradient
# boosting cells, which both pass random_state=0.
regressor_dt = make_pipeline(DecisionTreeRegressor(random_state=0))
regressor_dt.fit(X_train, y_train)

In [None]:
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score

# 10-fold cross-validation score on the training split.
# `Series.ravel()` is deprecated since pandas 2.2 (the data is already 1-D);
# `to_numpy()` is the documented replacement.
crossval_dt = cross_val_score(estimator=regressor_dt, X=X_train, y=y_train.to_numpy(), cv=10)

# R2 score on the training set
y_pred_dt_train = regressor_dt.predict(X_train)
r2_score_dt_train = r2_score(y_train, y_pred_dt_train)

# R2 score on the test set
y_pred_dt_test = regressor_dt.predict(X_test)
r2_score_dt_test = r2_score(y_test, y_pred_dt_test)

# RMSE on the test set (same units as production_tonnes)
rmse_dt = np.sqrt(mean_squared_error(y_test, y_pred_dt_test))

# Print metrics
print('CV: ', crossval_dt.mean())
print('R2_score (train): ', r2_score_dt_train)
print('R2_score (test): ', r2_score_dt_test)
print("RMSE: ", rmse_dt)

CV:  0.9795930938488974

R2_score (train):  1.0

R2_score (test):  0.9887775598176309

RMSE:  122.94832712783497




In [None]:
# Show the in-memory model's predictions on the training set before it is
# serialised.
print("Values before saving the model:",y_pred_dt_train)

Save the model

In [None]:
# Serialise the fitted pipeline to 'fruits_dt.pkl' in the working directory.
joblib.dump(regressor_dt, 'fruits_dt.pkl')

Load the model

In [None]:
# Reload the pipeline from disk. joblib files are pickle-based: only load
# files that this notebook (or another trusted source) produced.
model_dt = joblib.load('fruits_dt.pkl') 

In [None]:
# Predict on the training set with the reloaded model so the output is
# directly comparable with the "before saving" values, which are the
# training-set predictions (predicting on X_test here compared two different
# sets of rows).
print("Values after saving the model:", model_dt.predict(X_train))

In [None]:
# Score the reloaded pipeline on the test split and print the result.
result = model_dt.score(X_test, y_test)
print(result)

Use the loaded model to make predictions with new data

In [None]:
# Predict on new dataset
pr = pd.read_csv('fruits_ds_unseen.csv')

# Select the feature columns by name instead of relying on the target being
# the last column of the CSV.
pred_cols = [col for col in pr.columns if col != 'production_tonnes']

# Apply the *reloaded* pipeline to the new data, as the section title states.
# This also avoids depending on `regressor_dt`, which a later cell rebinds
# to a dalex Explainer.
pred = pd.Series(model_dt.predict(pr[pred_cols]))
print(pred)

In [None]:
import dalex as dx
# Wrap the fitted pipeline in a dalex Explainer built on the full modelling
# data (X, y), i.e. both the training and the test rows.
# NOTE(review): this rebinds `regressor_dt` from the sklearn pipeline to the
# Explainer; re-running earlier cells that use `regressor_dt` after this one
# will operate on the Explainer instead — consider a separate name.
regressor_dt = dx.Explainer(regressor_dt, X, y)

In [None]:
# Compute the explainer's model diagnostics and display the result table.
md_dt = regressor_dt.model_diagnostics()
md_dt.result

In [None]:
# Draw the diagnostics plot with dalex's default variables.
md_dt.plot()

In [None]:
# Recompute the diagnostics and plot the predictions (y_hat) against the
# observed target (y).
md_dt = regressor_dt.model_diagnostics()
md_dt.plot(variable = "y", yvariable = "y_hat")

### K Neighbors 

In [None]:
# Build a single-step pipeline around a 5-nearest-neighbours regressor and
# train it on the training split.
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import make_pipeline
import joblib

regressor_knn = make_pipeline(
    KNeighborsRegressor(n_neighbors=5)
)
regressor_knn.fit(X_train, y_train)

In [None]:
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score

# 10-fold cross-validation score on the training split.
# `Series.ravel()` is deprecated since pandas 2.2 (the data is already 1-D);
# `to_numpy()` is the documented replacement.
crossval_knn = cross_val_score(estimator=regressor_knn, X=X_train, y=y_train.to_numpy(), cv=10)

# R2 score on the training set
y_pred_knn_train = regressor_knn.predict(X_train)
r2_score_knn_train = r2_score(y_train, y_pred_knn_train)

# R2 score on the test set
y_pred_knn_test = regressor_knn.predict(X_test)
r2_score_knn_test = r2_score(y_test, y_pred_knn_test)

# RMSE on the test set (same units as production_tonnes)
rmse_knn = np.sqrt(mean_squared_error(y_test, y_pred_knn_test))

# Print metrics
print('CV: ', crossval_knn.mean())
print('R2_score (train): ', r2_score_knn_train)
print('R2_score (test): ', r2_score_knn_test)
print("RMSE: ", rmse_knn)

CV:  0.8755928661656285

R2_score (train):  0.9258680694522883

R2_score (test):  0.8974522657581585

RMSE:  371.6567364306191



In [None]:
# Show the in-memory model's predictions on the training set before it is
# serialised.
print("Values before saving the model:",y_pred_knn_train)

Save the model

In [None]:
# Serialise the fitted pipeline to 'fruits_knn.pkl' in the working directory.
joblib.dump(regressor_knn, 'fruits_knn.pkl')

Load the model

In [None]:
# Reload the pipeline from disk. joblib files are pickle-based: only load
# files that this notebook (or another trusted source) produced.
model_knn = joblib.load('fruits_knn.pkl') 

In [None]:
# Predict on the training set with the reloaded model so the output is
# directly comparable with the "before saving" values, which are the
# training-set predictions (predicting on X_test here compared two different
# sets of rows).
print("Values after saving the model:", model_knn.predict(X_train))

In [None]:
# Score the reloaded pipeline on the test split and print the result.
result = model_knn.score(X_test, y_test)
print(result)

Use the loaded model to make predictions with new data

In [None]:
# Predict on new dataset
pr = pd.read_csv('fruits_ds_unseen.csv')

# Select the feature columns by name instead of relying on the target being
# the last column of the CSV.
pred_cols = [col for col in pr.columns if col != 'production_tonnes']

# Apply the *reloaded* pipeline to the new data, as the section title states.
# This also avoids depending on `regressor_knn`, which a later cell rebinds
# to a dalex Explainer.
pred = pd.Series(model_knn.predict(pr[pred_cols]))
print(pred)

In [None]:
import dalex as dx
# Wrap the fitted pipeline in a dalex Explainer built on the full modelling
# data (X, y), i.e. both the training and the test rows.
# NOTE(review): this rebinds `regressor_knn` from the sklearn pipeline to the
# Explainer; re-running earlier cells that use `regressor_knn` after this one
# will operate on the Explainer instead — consider a separate name.
regressor_knn = dx.Explainer(regressor_knn, X, y)

In [None]:
# Compute the explainer's model diagnostics and display the result table.
md_knn = regressor_knn.model_diagnostics()
md_knn.result

In [None]:
# Draw the diagnostics plot with dalex's default variables.
md_knn.plot()

In [None]:
# Recompute the diagnostics and plot the predictions (y_hat) against the
# observed target (y).
md_knn = regressor_knn.model_diagnostics()
md_knn.plot(variable = "y", yvariable = "y_hat")

# Evaluate models 

In [None]:
# One row per model: (name, test RMSE, train R², test R², mean 10-fold CV).
models = [
    (
        'Gradient Boosting Regressor',
        rmse_gbr,
        r2_score_gbr_train,
        r2_score_gbr_test,
        crossval_gbr.mean(),
    ),
    (
        'Random Forest Regressor',
        rmse_rf,
        r2_score_rf_train,
        r2_score_rf_test,
        crossval_rf.mean(),
    ),
    (
        'K Neighbors Regressor',
        rmse_knn,
        r2_score_knn_train,
        r2_score_knn_test,
        crossval_knn.mean(),
    ),
    (
        'Decision Tree Regressor',
        rmse_dt,
        r2_score_dt_train,
        r2_score_dt_test,
        crossval_dt.mean(),
    ),
]

In [None]:
# Collect the per-model metrics into a comparison table; the column order
# matches the order of the values inside each `models` tuple.
predict = pd.DataFrame(data = models, columns=['Model', 'RMSE', 'R² Score(training)', 'R² Score(test)', 'Cross-Validation'])
predict

In [None]:
# Horizontal bar chart of the mean cross-validation score per model.
f, axe = plt.subplots(1, 1, figsize=(8, 3))

# Sort in place so that the best-scoring model is drawn at the top.
predict.sort_values(by=['Cross-Validation'], ascending=False, inplace=True)

sns.barplot(x='Cross-Validation', y='Model', data=predict, palette='Blues', ax=axe)
# Axis label spelling corrected (was 'Cross-Validaton Score').
axe.set_xlabel('Cross-Validation Score', size=10)
axe.set_ylabel('Model')
axe.set_xlim(0, 1.0)
plt.show()

In [None]:
# Two stacked bar charts: training R² on top, test R² below.
f, axes = plt.subplots(2, 1, figsize=(7, 8))

for score_column, axis_label, panel_ax in (
    ('R² Score(training)', 'R2 Score (Training)', axes[0]),
    ('R² Score(test)', 'R² Score (Test)', axes[1]),
):
    # Re-sort in place before each panel so its bars run from best to worst.
    predict.sort_values(by=[score_column], ascending=False, inplace=True)

    sns.barplot(x=score_column, y='Model', data=predict, palette='Blues', ax=panel_ax)
    panel_ax.set_xlabel(axis_label, size=10)
    panel_ax.set_ylabel('Model')
    panel_ax.set_xlim(0, 1.0)

plt.show()

In [None]:
# Vertical bar chart of the test RMSE per model, largest error first.
predict.sort_values(by='RMSE', ascending=False, inplace=True)

f, axe = plt.subplots(figsize=(7, 5))
sns.barplot(
    x='Model',
    y='RMSE',
    data=predict,
    palette='Blues',
    ax=axe,
)
axe.set_xlabel('Model', size=10)
axe.set_ylabel('RMSE', size=10)

# Tilt the model names so the long labels do not overlap.
plt.xticks(rotation=25)
plt.show()

The Random Forest and Decision Tree regressors perform best: they have the highest cross-validation and test R² scores and the lowest RMSE of the four models. 

# References


http://dati.istat.it

https://maps.princeton.edu/catalog/stanford-mn871sp9778

https://www.crea.gov.it/documents/68457/0/ITACONTA+2020_ENG+DEF+xweb+%281%29.pdf/95c6b30a-1e18-8e94-d4ac-ce884aef76e8?t=1619527317576

https://seaborn.pydata.org/generated/seaborn.relplot.html

https://www.statisticshowto.com/variance-inflation-factor/

https://statisticsbyjim.com/regression/multicollinearity-in-regression-analysis/

https://lost-stats.github.io/Presentation/Figures/heatmap_colored_correlation_matrix.html

https://plotly.com/python/box-plots/

https://numpy.org/doc/stable/reference/generated/numpy.zeros_like.html