# Omdena  - Milan Chapter Agrifoods
## AI for Sustainable agri-food systems: use of Satellite Imagery
### Data Exploration for cereals and legumes in Italy 2006-2021
#### Author: Maria Fisher 


The main objective of this study is to gather information about crop production in Italy for the period 2006-2021. 

The crop dataset used in this study was downloaded from the Italian National Institute of Statistics (Istat).



In [1]:
import warnings 
warnings.filterwarnings("ignore")

import os
import pandas as pd
pd.options.display.float_format = "{:.2f}".format
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor
import seaborn as sns 
import scipy 
import sklearn
import geopandas as gpd
import pgeocode
import folium
import sys
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, iplot, plot

In [None]:
# Read the crop dataset from CSV; skipinitialspace=True skips spaces that
# follow a delimiter. The trailing expression previews the first rows.
cereals = pd.read_csv(
    'cereal_final_dataset_2006-2021.csv',
    skipinitialspace=True,
)
cereals.head()

In [None]:
# Distinct values found in the Type_fertilizer column.
cereals["Type_fertilizer"].unique()

In [None]:
# Keep only the rows whose Year is below 2022.
cereals = cereals.loc[cereals["Year"] < 2022]

In [None]:
# Drop rows whose Type_fertilizer is the string '0'
# (presumably a missing-value marker -- TODO confirm against the data source).
has_fertilizer_type = cereals["Type_fertilizer"] != '0'
cereals = cereals[has_fertilizer_type]


In [None]:
# Distinct values remaining in the Type_fertilizer column.
cereals["Type_fertilizer"].unique()

In [None]:
# Summary statistics of the dataset's columns.
cereals.describe()

## Pre-processing dataset 

In [None]:
def show_info(cereals):
    """Print a structural overview of a DataFrame.

    Prints, in order: the shape, the ``DataFrame.info()`` report, the number
    of unique values per column and the number of null values per column.

    Parameters
    ----------
    cereals : pandas.DataFrame
        The frame to summarise.

    Returns
    -------
    None
    """
    separator = '-' * 50
    print('DATASET SHAPE: ', cereals.shape, '\n')
    print(separator)
    print('FEATURE DATA TYPES:')
    # info() writes its report to stdout itself and returns None, so wrapping
    # it in print() would emit a stray "None" line after the report.
    cereals.info()
    print('\n', separator)
    print('NUMBER OF UNIQUE VALUES PER FEATURE:', '\n')
    print(cereals.nunique())
    print('\n', separator)
    print('NULL VALUES PER FEATURE')
    print(cereals.isnull().sum())
# Print the overview for the cereals dataset.
show_info(cereals)

In [None]:
# Summary statistics of the dataset's columns.
cereals.describe()

In [None]:
# Largest value, smallest value, frequency table and number of distinct
# values of the total_ha column.
print(cereals["total_ha"].max())
print(cereals["total_ha"].min())
print(cereals["total_ha"].value_counts())
print(cereals["total_ha"].nunique())

## Total cereal production in 2006-2021

In [None]:
# Yearly production bar chart.
# sns.barplot aggregates each category with the mean by default; the title and
# y-axis label announce a *total*, so the estimator is set to a sum.
plt.figure(figsize=(10, 5))
sns.barplot(
    data=cereals,
    x='Year',
    y='production_tonnes',
    estimator=np.sum,
    palette='coolwarm',
)
plt.title('Total cereal and legume production 2006-2021')
plt.xlabel('Year')
plt.ylabel('Total production (tonnes)')
plt.show()



## Cereal production by Cities 

In [None]:
# Ten cities with the highest summed production.
# Only the value column is selected after grouping: the original tuple-style
# selection ['production_tonnes', 'City'] is rejected by pandas 2.x, and the
# grouping key does not need to be aggregated. as_index=False keeps 'City' as
# a regular column for the plot that follows.
cereals_region = (
    cereals.groupby('City', as_index=False)['production_tonnes']
    .sum()
    .sort_values(by='production_tonnes', ascending=False)
    .head(10)
)
cereals_region

In [None]:
# Horizontal bar chart of the cities held in cereals_region.
plt.figure(figsize=(10, 5))
sns.barplot(
    data=cereals_region,
    x='production_tonnes',
    y='City',
    orient='h',
    palette='coolwarm',
)
plt.title('Total cereal and legume production 2006-2021 by City')
plt.xlabel('Total production')
plt.ylabel('Cities')
plt.show()

## Cereals highest production 2006-2021

In [None]:
# Maximum value of Type_crop (for strings: the last one in sort order),
# number of rows per crop type, and number of distinct crop types.
print(cereals["Type_crop"].max())
print(cereals["Type_crop"].value_counts())
print(cereals["Type_crop"].nunique())



The dataset shows that 29 different types of cereals are cultivated in Italy. Ten of the crops produced are Common wheat, Durum wheat, Potatoes, Barley, Maize, Beans, Chick-peas, Rye, Rice and Oats. 

In [None]:
# Summary statistics of the dataset's columns.
cereals.describe()

In [None]:
# Shorten the long crop names. Each pair is (original value, short label);
# the replacements are applied one after another, in the listed order, to
# every cell of the frame that equals the original value.
crop_short_names = [
    ('oats and spring cereal mixtures (mixed grain other than maslin)', 'oats mix'),
    ('rye and winter cereal mixtures (maslin)', 'rye mix'),
    ('spring cereal mixtures (mixed grain other than maslin)', 'cereal mix'),
    ('common spring wheat and spelt', 'c-spr-wheat&spelt'),
    ('common winter wheat and spelt', 'c-wint-wheat&spelt'),
    ('winter cereal mixtures (maslin)', 'wint-cereal-mix'),
    ('dried kidney bean', 'dry-k-bean'),
    ('common wheat', 'c-wheat'),
    ('durum wheat', 'd-wheat'),
    ('broad bean', 'bro-bean'),
    ('grain maize', 'maize'),
]
for long_name, short_name in crop_short_names:
    cereals = cereals.replace(long_name, short_name)



 

Check which cereal and legume has the highest production value in the dataset analysed. 

In [None]:
#cereals.groupby(['Type_crop','total_ha'],sort=True)['production_tonnes'].sum().nlargest(10)

In [None]:
# Total production per crop type, largest first.
# The chart is titled and labelled as production in tonnes, so the bars must
# be the summed production_tonnes per crop -- value_counts() would plot the
# number of rows per crop instead.
production_by_crop = (
    cereals.groupby('Type_crop')['production_tonnes']
    .sum()
    .sort_values(ascending=False)
)
plt.figure(figsize=(10, 5))
production_by_crop.plot.bar()
plt.title('Total cereal and legume production 2006-2021 by crop')
plt.ylabel('Total production (tonnes)')
plt.show()


## Subsetting data

In [None]:
# Keep only the rows belonging to the ten selected crops.
# A boolean row mask does this directly; the previous column-wise apply()
# rebuilt the same mask once per column. .copy() makes the subset an
# independent frame so later column assignments do not target a view.
top10_crops = [
    'barley', 'oats',
    'd-wheat', 'c-wheat', 'maize', 'potatoes', 'dry-k-bean',
    'bro-bean', 'chick-peas', 'rye',
]
cereals_top10 = cereals[cereals['Type_crop'].isin(top10_crops)].copy()

cereals_top10.head()

In [None]:
# Distinct total_ha values in the ten-crop subset.
cereals_top10["total_ha"].unique()

In [None]:
# Interquartile range of every numeric column.
# numeric_only=True is required because the frame also holds string columns
# (e.g. Type_crop); pandas 2.x no longer skips them silently and raises.
Q1 = cereals_top10.quantile(0.25, numeric_only=True)
Q3 = cereals_top10.quantile(0.75, numeric_only=True)
IQR = Q3 - Q1
print(IQR)

In [None]:
# Remove outliers
# A row is dropped when any of its values lies more than 1.5 * IQR below Q1 or
# above Q3. Only the columns the IQR was computed for are compared, so string
# columns are never compared against numeric bounds.
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
iqr_values = cereals_top10[IQR.index]
is_outlier = ((iqr_values < lower_bound) | (iqr_values > upper_bound)).any(axis=1)
cereals_top10 = cereals_top10[~is_outlier].copy()
print(cereals_top10.shape)


In [None]:
# Box plot per crop type and histogram (with KDE) of production_tonnes,
# drawn side by side.

plt.figure(figsize=(13, 5))

# Left panel: production spread for each crop type.
plt.subplot(1, 2, 1)
sns.boxplot(
    data=cereals_top10,
    x='production_tonnes',
    y="Type_crop",
    color='darkblue',
    dodge=False,
)
plt.title('Box Plot total cereal and legumes production ')

# Right panel: overall production distribution.
plt.subplot(1, 2, 2)
sns.histplot(
    data=cereals_top10,
    x='production_tonnes',
    bins=80,
    kde=True,
    color='darkblue',
)
plt.title('Distribution total cereal and legumes production')

plt.show()



In [None]:
# Box plot per year and histogram (with KDE) of total_ha, drawn side by side.

plt.figure(figsize=(17, 5))

# Left panel: spread of total_ha for each year.
plt.subplot(1, 2, 1)
sns.boxplot(data=cereals_top10, x='Year', y="total_ha", color='darkblue', dodge=False)
plt.title('Box Plot total area (ha)')

# Right panel: overall distribution of total_ha.
plt.subplot(1, 2, 2)
sns.histplot(data=cereals_top10, x='total_ha', color='darkblue', kde=True, bins=80)
# Title corrected: the word "total" was duplicated.
plt.title('Distribution total area (ha)')

plt.show()



In [None]:
# Largest value, smallest value, frequency table and number of distinct
# values of total_ha in the ten-crop subset.
print(cereals_top10["total_ha"].max())
print(cereals_top10["total_ha"].min())
print(cereals_top10["total_ha"].value_counts())
print(cereals_top10["total_ha"].nunique())

In [None]:
# Largest Fertilizers_tonnes value in the ten-crop subset.
print(cereals_top10["Fertilizers_tonnes"].max())


In [None]:
# Three stacked panels sharing the Year axis: yearly mean of production,
# area and fertilizer quantity.
fig, ax = plt.subplots(3, 1, sharex=True, figsize=(8, 11))
fig.autofmt_xdate()
# Year is cast to string before it is used as the x variable.
cereals_top10['Year'] = cereals_top10['Year'].astype(str)

# Mean crop production per year
crops = cereals_top10.groupby('Year')[['production_tonnes']].mean()
sns.lineplot(data=crops, x="Year", y="production_tonnes", ax=ax[0])

# Mean area per year
crops = cereals_top10.groupby('Year')[['total_ha']].mean()
sns.lineplot(data=crops, x="Year", y="total_ha", ax=ax[1])

# Mean fertilizer quantity per year
fertilizers = cereals_top10.groupby('Year')[['Fertilizers_tonnes']].mean()
sns.lineplot(data=fertilizers, x="Year", y="Fertilizers_tonnes", ax=ax[2])




In [None]:
# Production per year, one line per crop type (colour and dash style both
# encode Type_crop).

plotcrop10 = sns.relplot(
    data=cereals_top10,
    kind='line',
    x='Year',
    y='production_tonnes',
    hue='Type_crop',
    style='Type_crop',
    aspect=1.75,
)


In [None]:
# Fertilizer quantity per year, one line per fertilizer type (colour and dash
# style both encode Type_fertilizer).

plotfertilizers10 = sns.relplot(
    data=cereals_top10,
    kind='line',
    x='Year',
    y='Fertilizers_tonnes',
    hue='Type_fertilizer',
    style='Type_fertilizer',
    aspect=1.75,
)

organic fertilizer = vegetable soil amendment

# Climate data analysis

|Param_Code|Param_Name|Resolution|Depth [m]|Units|
|:--------:|:--------:|:--------:|:-------:|:---:|
|GWETPROF|Profile Soil Moisture|0.5° x 0.625°|1.34 - 8.53|water-free [0], saturated soil[1]| 
|GWETTOP|Surface Soil Wetness|0.5° x 0.625°|0.00 - 0.05| water-free [0], saturated soil[1]|
|GWETROOT|Root Zone Soil Wetness|0.5° x 0.625°|0.10 - 1.00|water-free [0], saturated soil[1]|
|CLOUD_AMT|Cloud Amount|---|---|%|
|TS|Earth Skin Temperature|---|---|°C|
|PS|Surface Pressure|---|---|kPa|
|RH2M|Relative Humidity at 2 Meters|---|---|%|
|QV2M|Specific Humidity at 2 Meters|---|---|g/kg|
|PRECTOTCORR|Precipitation Corrected|---|---|mm/day|
|T2M_MAX|Temperature at 2 Meters Maximum|---|---|°C|
|T2M_MIN|Temperature at 2 Meters Minimum|---|---|°C|
|T2M_RANGE|Temperature at 2 Meters Range|---|---|°C|
|WS2M|Wind Speed at 2 Meters|---|---|m/s|


  

*Precipitation Corrected (PRECTOTCORR)- [The bias corrected average of total precipitation at the surface of the earth in water mass (includes water content in snow)]

*Source: https://power.larc.nasa.gov/#resources

In [None]:

# 3x2 grid of line plots, one climate variable per panel, sharing the Year axis.
fig, ax = plt.subplots(3, 2, sharex=True, figsize=(12, 10))
fig.autofmt_xdate()


#fig.suptitle('Climate in Italy 2006 - 2021')

# Year is cast to string before it is used as the x variable.
cereals['Year'] = cereals['Year'].astype(str)

# Columns in row-major panel order. The labels follow the columns actually
# plotted: the previous "temp_max" / "temp_min" comments were swapped.
climate_columns = [
    "RH2M",         # relative humidity at 2 m
    "PRECTOTCORR",  # corrected precipitation
    "T2M_MIN",      # minimum temperature at 2 m
    "T2M_MAX",      # maximum temperature at 2 m
    "GWETTOP",      # surface soil wetness
    "GWETROOT",     # root zone soil wetness
]
for panel, column in zip(ax.flat, climate_columns):
    sns.lineplot(data=cereals, x="Year", y=column, ax=panel)


In [None]:
# Display the full cereals frame.
cereals

In [None]:
# Build the modelling table by removing the columns listed below from the
# ten-crop subset, then display it.
columns_to_drop = [
    'Year', 'PS', 'TS', 'GWETTOP', 'QV2M', 'WS2M',
    'T2M_RANGE', 'GWETPROF', 'CLOUD_AMT', 'PRECTOTCORR',
    'lat', 'lon',
]
cereals_model = cereals_top10.drop(columns=columns_to_drop)
cereals_model

In [None]:
# Write the modelling table to disk without the index column.
# to_csv() returns None when given a path, so its result must not be assigned
# back to cereals_model (that would replace the DataFrame with None).
cereals_model.to_csv('cereals_model.csv', index=False)

# References


http://dati.istat.it

https://maps.princeton.edu/catalog/stanford-mn871sp9778

https://www.crea.gov.it/documents/68457/0/ITACONTA+2020_ENG+DEF+xweb+%281%29.pdf/95c6b30a-1e18-8e94-d4ac-ce884aef76e8?t=1619527317576

https://seaborn.pydata.org/generated/seaborn.relplot.html

https://www.statisticshowto.com/variance-inflation-factor/

https://statisticsbyjim.com/regression/multicollinearity-in-regression-analysis/

https://lost-stats.github.io/Presentation/Figures/heatmap_colored_correlation_matrix.html

https://plotly.com/python/box-plots/

https://numpy.org/doc/stable/reference/generated/numpy.zeros_like.html

In [None]:
# Notebook shell escape: show pip's metadata for the pycaret package.
!pip show pycaret
