In [1]:
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import math as mt
import seaborn as sns
from datetime import date, time, datetime, timedelta
from IPython.display import clear_output
import scipy.stats as ss
from statistics import mode
import os
import sys
import multiprocessing as mp
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Normalizer
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import IsolationForest

In [2]:
cw_directory = os.getcwd()

growth_data_path = f"{cw_directory}//growth_db.csv"
weather_data_path = f"{cw_directory}//weather_db.csv"
zone_data_path = f"{cw_directory}//zone_db.csv"
    
growth_data = pd.read_csv(growth_data_path)
weather_data = pd.read_csv(weather_data_path)
zone_data = pd.read_csv(zone_data_path)

In [3]:
pd.options.display.max_columns = None
pd.options.display.max_rows = None

In [4]:
linearisation_coef = 0.625
stripping_coef = 0.92
min_grow_temp = 3
max_grow_temp = 27
future_days = 20
season = datetime(2021, 1, 1)

In [5]:
weather_data['date'] = pd.to_datetime(weather_data['date'], format='%d/%m/%Y')
weather_data['time'] = pd.to_datetime(weather_data['time'], format='%H:%M:%S')
weather_data['day'] = weather_data.date.dt.day
weather_data['month'] = weather_data.date.dt.month
weather_data['day_month'] = weather_data['day'].astype(str) + " - " + weather_data['month'].astype(str)

weather_data['heat_units'] = weather_data['avg_temp'] - min_grow_temp
weather_data['heat_units'] = np.where((weather_data['heat_units'] < 0), 0, weather_data['heat_units'])
weather_data['heat_units'] = np.where((weather_data['heat_units'] > 24), 24, weather_data['heat_units']/24)

In [6]:
weather_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 65823 entries, 0 to 65822
Data columns (total 16 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   date             65823 non-null  datetime64[ns]
 1   time             65823 non-null  datetime64[ns]
 2   avg_temp         65823 non-null  float64       
 3   rh               65823 non-null  float64       
 4   rain             65823 non-null  float64       
 5   solar_radiation  65823 non-null  float64       
 6   wind_speed_avg   65823 non-null  float64       
 7   wind_speed_gust  9591 non-null   float64       
 8   soil_temp        9591 non-null   float64       
 9   bat_volt         65823 non-null  float64       
 10  leaf_wet         60962 non-null  float64       
 11  dew_point        60962 non-null  float64       
 12  day              65823 non-null  int64         
 13  month            65823 non-null  int64         
 14  day_month        65823 non-null  objec

In [7]:
weather_data.head()

Unnamed: 0,date,time,avg_temp,rh,rain,solar_radiation,wind_speed_avg,wind_speed_gust,soil_temp,bat_volt,leaf_wet,dew_point,day,month,day_month,heat_units
0,2014-05-14,1900-01-01 14:00:00,15.8,55.9,0.0,457.0,2.7,,,6904.0,0.0,7.2,14,5,14 - 5,0.533333
1,2014-05-14,1900-01-01 15:00:00,17.0,51.2,0.0,1212.0,2.2,,,6952.0,0.0,7.0,14,5,14 - 5,0.583333
2,2014-05-14,1900-01-01 16:00:00,17.3,48.9,0.0,862.0,2.1,,,6922.0,0.0,6.6,14,5,14 - 5,0.595833
3,2014-05-14,1900-01-01 17:00:00,17.5,48.9,0.0,611.0,2.0,,,6904.0,0.0,6.7,14,5,14 - 5,0.604167
4,2014-05-14,1900-01-01 18:00:00,16.8,51.0,0.0,232.0,1.7,,,6894.0,0.0,6.7,14,5,14 - 5,0.575


In [8]:
growth_data['sample_date'] = pd.to_datetime(growth_data['sample_date'], format='%d/%m/%Y')
growth_data['fieldzone'] = growth_data["field"] + " - " + growth_data["zone"].astype(str)
growth_data['stripped_diameter'] = growth_data['diameter'] * stripping_coef

In [9]:
growth_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 131776 entries, 0 to 131775
Data columns (total 6 columns):
 #   Column             Non-Null Count   Dtype         
---  ------             --------------   -----         
 0   sample_date        131776 non-null  datetime64[ns]
 1   field              131776 non-null  object        
 2   zone               131776 non-null  int64         
 3   diameter           131776 non-null  float64       
 4   fieldzone          131776 non-null  object        
 5   stripped_diameter  131776 non-null  float64       
dtypes: datetime64[ns](1), float64(2), int64(1), object(2)
memory usage: 6.0+ MB


In [10]:
growth_data.head()

Unnamed: 0,sample_date,field,zone,diameter,fieldzone,stripped_diameter
0,2019-07-17,Allans 18,1,4.0,Allans 18 - 1,3.68
1,2019-07-17,Allans 18,1,3.0,Allans 18 - 1,2.76
2,2019-07-17,Allans 18,1,5.0,Allans 18 - 1,4.6
3,2019-07-17,Allans 18,1,5.0,Allans 18 - 1,4.6
4,2019-07-17,Allans 18,1,3.0,Allans 18 - 1,2.76


In [11]:
zone_data['planting_date'] = pd.to_datetime(zone_data['planting_date'], format='%d/%m/%Y')
zone_data['harvest_date'] = pd.to_datetime(zone_data['harvest_date'], format='%d/%m/%Y')
zone_data['zone'] = zone_data['zone'].astype(int)
zone_data["fieldzone"] = zone_data["field"] + " - " + zone_data["zone"].astype(str)
zone_data["fieldvariety"] = zone_data["field"] + " - " + zone_data["variety"]

In [12]:
zone_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 755 entries, 0 to 754
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   field           755 non-null    object        
 1   zone            755 non-null    int32         
 2   planting_date   755 non-null    datetime64[ns]
 3   variety         755 non-null    object        
 4   method          755 non-null    object        
 5   inputs          755 non-null    object        
 6   protection      755 non-null    object        
 7   planting_rate   755 non-null    int64         
 8   sand            755 non-null    int64         
 9   silt            755 non-null    int64         
 10  clay            755 non-null    int64         
 11  organic_matter  755 non-null    float64       
 12  harvest_date    565 non-null    datetime64[ns]
 13  fieldzone       755 non-null    object        
 14  fieldvariety    755 non-null    object        
dtypes: dat

In [13]:
zone_data.head()

Unnamed: 0,field,zone,planting_date,variety,method,inputs,protection,planting_rate,sand,silt,clay,organic_matter,harvest_date,fieldzone,fieldvariety
0,Allans 07,1,2019-04-01,Krypton,Drilled,Conventional,Barley,249820,39,37,24,29.6,2019-12-15,Allans 07 - 1,Allans 07 - Krypton
1,Allans 07,2,2019-04-01,Krypton,Drilled,Conventional,Barley,249820,39,37,24,29.6,2019-12-15,Allans 07 - 2,Allans 07 - Krypton
2,Allans 07,3,2019-04-01,Longton,Drilled,Conventional,Barley,256524,39,37,24,29.6,2019-12-15,Allans 07 - 3,Allans 07 - Longton
3,Allans 07,4,2019-04-01,Longton,Drilled,Conventional,Barley,256524,39,37,24,29.6,2019-12-15,Allans 07 - 4,Allans 07 - Longton
4,Allans 07,5,2019-04-01,Batter,Drilled,Conventional,Barley,247048,39,37,24,29.6,2019-12-15,Allans 07 - 5,Allans 07 - Batter


In [14]:
summary_data = growth_data.copy()

summary_data["zone"] = summary_data["zone"].astype(str)
summary_data["fieldzone"] = summary_data["field"] + " - " + summary_data["zone"]
summary_data['fieldzonedate'] = summary_data['fieldzone'] + " - " + summary_data['sample_date'].astype(str)

summary_data = summary_data.set_index('fieldzone')
summary_data = summary_data.join(zone_data.set_index('fieldzone'), rsuffix = '_join')

summary_data['fieldvarietydate'] = summary_data['fieldvariety'] + " - " + summary_data['sample_date'].astype(str)
summary_data['heat_units'] = 0
summary_data['solar_radiation'] = 0

summary_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 131776 entries, Allans 06 - 1 to Wissey O (S) - 3
Data columns (total 23 columns):
 #   Column             Non-Null Count   Dtype         
---  ------             --------------   -----         
 0   sample_date        131776 non-null  datetime64[ns]
 1   field              131776 non-null  object        
 2   zone               131776 non-null  object        
 3   diameter           131776 non-null  float64       
 4   stripped_diameter  131776 non-null  float64       
 5   fieldzonedate      131776 non-null  object        
 6   field_join         131776 non-null  object        
 7   zone_join          131776 non-null  int32         
 8   planting_date      131776 non-null  datetime64[ns]
 9   variety            131776 non-null  object        
 10  method             131776 non-null  object        
 11  inputs             131776 non-null  object        
 12  protection         131776 non-null  object        
 13  planting_rate      131776 n

In [15]:
summary_data.head()

Unnamed: 0_level_0,sample_date,field,zone,diameter,stripped_diameter,fieldzonedate,field_join,zone_join,planting_date,variety,method,inputs,protection,planting_rate,sand,silt,clay,organic_matter,harvest_date,fieldvariety,fieldvarietydate,heat_units,solar_radiation
fieldzone,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
Allans 06 - 1,2021-07-29,Allans 06,1,13.0,11.96,Allans 06 - 1 - 2021-07-29,Allans 06,1,2021-04-10,Krypton,Drilled,Conventional,Barley,262270,40,41,19,24.6,NaT,Allans 06 - Krypton,Allans 06 - Krypton - 2021-07-29,0,0
Allans 06 - 1,2021-07-29,Allans 06,1,14.0,12.88,Allans 06 - 1 - 2021-07-29,Allans 06,1,2021-04-10,Krypton,Drilled,Conventional,Barley,262270,40,41,19,24.6,NaT,Allans 06 - Krypton,Allans 06 - Krypton - 2021-07-29,0,0
Allans 06 - 1,2021-07-29,Allans 06,1,8.0,7.36,Allans 06 - 1 - 2021-07-29,Allans 06,1,2021-04-10,Krypton,Drilled,Conventional,Barley,262270,40,41,19,24.6,NaT,Allans 06 - Krypton,Allans 06 - Krypton - 2021-07-29,0,0
Allans 06 - 1,2021-07-29,Allans 06,1,10.0,9.2,Allans 06 - 1 - 2021-07-29,Allans 06,1,2021-04-10,Krypton,Drilled,Conventional,Barley,262270,40,41,19,24.6,NaT,Allans 06 - Krypton,Allans 06 - Krypton - 2021-07-29,0,0
Allans 06 - 1,2021-07-29,Allans 06,1,7.0,6.44,Allans 06 - 1 - 2021-07-29,Allans 06,1,2021-04-10,Krypton,Drilled,Conventional,Barley,262270,40,41,19,24.6,NaT,Allans 06 - Krypton,Allans 06 - Krypton - 2021-07-29,0,0


In [16]:
summary_data = summary_data.reset_index(inplace=False)
summary_data = summary_data.drop(columns=['field_join', 'zone_join'], inplace=False)
summary_data.head()

Unnamed: 0,fieldzone,sample_date,field,zone,diameter,stripped_diameter,fieldzonedate,planting_date,variety,method,inputs,protection,planting_rate,sand,silt,clay,organic_matter,harvest_date,fieldvariety,fieldvarietydate,heat_units,solar_radiation
0,Allans 06 - 1,2021-07-29,Allans 06,1,13.0,11.96,Allans 06 - 1 - 2021-07-29,2021-04-10,Krypton,Drilled,Conventional,Barley,262270,40,41,19,24.6,NaT,Allans 06 - Krypton,Allans 06 - Krypton - 2021-07-29,0,0
1,Allans 06 - 1,2021-07-29,Allans 06,1,14.0,12.88,Allans 06 - 1 - 2021-07-29,2021-04-10,Krypton,Drilled,Conventional,Barley,262270,40,41,19,24.6,NaT,Allans 06 - Krypton,Allans 06 - Krypton - 2021-07-29,0,0
2,Allans 06 - 1,2021-07-29,Allans 06,1,8.0,7.36,Allans 06 - 1 - 2021-07-29,2021-04-10,Krypton,Drilled,Conventional,Barley,262270,40,41,19,24.6,NaT,Allans 06 - Krypton,Allans 06 - Krypton - 2021-07-29,0,0
3,Allans 06 - 1,2021-07-29,Allans 06,1,10.0,9.2,Allans 06 - 1 - 2021-07-29,2021-04-10,Krypton,Drilled,Conventional,Barley,262270,40,41,19,24.6,NaT,Allans 06 - Krypton,Allans 06 - Krypton - 2021-07-29,0,0
4,Allans 06 - 1,2021-07-29,Allans 06,1,7.0,6.44,Allans 06 - 1 - 2021-07-29,2021-04-10,Krypton,Drilled,Conventional,Barley,262270,40,41,19,24.6,NaT,Allans 06 - Krypton,Allans 06 - Krypton - 2021-07-29,0,0


In [17]:
def skewness(series):
    return ss.skew(series, bias = False)

def kurt(series):
    return ss.kurtosis(series, bias = False)

In [18]:
summary_data_avg = summary_data.copy()

summary_data_avg = summary_data_avg.groupby(['fieldzonedate']).agg({'stripped_diameter' : ['mean', 'std', 'count', skewness, kurt],
                                                                'method' : ['first'],
                                                                'inputs' : ['first'],
                                                                'variety' : ['first'],
                                                                'protection' : ['first'],
                                                                'sand' : ['mean'],
                                                                'silt' : ['mean'],
                                                                'clay' : ['mean'],
                                                                'organic_matter' : ['mean'],
                                                                'planting_date' : ['first'],
                                                                'sample_date' : ['first'],
                                                                'fieldzone' : ['first']}).reset_index()

summary_data_avg.columns = ['fieldzonedate',
                            'mean_diameter',
                            'std_dev_diameter',
                            'pp2m2',
                            'skewness',
                            'kurtosis',
                            'method',
                            'inputs',
                            'variety',
                            'protection',
                            'sand',
                            'silt',
                            'clay',
                            'organic_matter',
                            'planting_date',
                            'sample_date',
                            'fieldzone']

summary_data_avg['d_lin'] = (summary_data_avg['mean_diameter'])**linearisation_coef
summary_data_avg['s_lin'] = (summary_data_avg['std_dev_diameter'])**linearisation_coef
summary_data_avg['heat_units'] = 0
summary_data_avg['solar_radiation'] = 0

summary_data_avg.tail()

Unnamed: 0,fieldzonedate,mean_diameter,std_dev_diameter,pp2m2,skewness,kurtosis,method,inputs,variety,protection,sand,silt,clay,organic_matter,planting_date,sample_date,fieldzone,d_lin,s_lin,heat_units,solar_radiation
3353,Wissey O (S) - 1 - 2019-09-19,34.209474,5.277874,38,-0.366295,-0.436126,Drilled,Conventional,Shafton,Barley,11,64,25,71.2,2019-03-26,2019-09-19,Wissey O (S) - 1,9.095812,2.828374,0,0
3354,Wissey O (S) - 2 - 2019-07-30,18.447179,3.409484,39,0.261426,0.527252,Drilled,Conventional,Shafton,Barley,11,64,25,71.2,2019-03-26,2019-07-30,Wissey O (S) - 2,6.183099,2.152437,0,0
3355,Wissey O (S) - 2 - 2019-09-19,30.631818,4.797228,44,-0.065718,-0.2926,Drilled,Conventional,Shafton,Barley,11,64,25,71.2,2019-03-26,2019-09-19,Wissey O (S) - 2,8.489028,2.66452,0,0
3356,Wissey O (S) - 3 - 2019-07-30,20.079,2.996661,40,0.333063,-0.493385,Drilled,Conventional,Shafton,Barley,11,64,25,71.2,2019-03-26,2019-07-30,Wissey O (S) - 3,6.519493,1.985631,0,0
3357,Wissey O (S) - 3 - 2019-09-19,29.920909,3.82935,44,-0.133634,-0.709054,Drilled,Conventional,Shafton,Barley,11,64,25,71.2,2019-03-26,2019-09-19,Wissey O (S) - 3,8.365352,2.314479,0,0


In [19]:
weather_data_avg = weather_data.copy()

weather_data_avg = weather_data_avg.groupby(['date']).agg({'rain' : ['sum'],
                                                       'heat_units' : ['sum'],
                                                       'solar_radiation' : ['sum'],
                                                       'wind_speed_avg' : ['mean'],
                                                       'rh' : ['mean'],
                                                       'avg_temp' : ['mean']}).reset_index()

weather_data_avg.columns = ['date',
                            'rain',
                            'heat_units',
                            'solar_radiation',
                            'wind_speed_avg',
                            'rh',
                            'avg_temp']

weather_data_avg['day'] = weather_data_avg.date.dt.day
weather_data_avg['month'] = weather_data_avg.date.dt.month
weather_data_avg['day_month'] = weather_data_avg['day'].astype(str) + " - " + weather_data_avg['month'].astype(str)  

weather_data_avg_group = weather_data_avg.copy()

weather_data_avg_group = weather_data_avg_group.groupby(['day_month']).agg({'rain' : ['mean'],
                                                                            'heat_units' : ['mean'],
                                                                            'solar_radiation' : ['mean'],
                                                                            'wind_speed_avg' : ['mean'],
                                                                            'rh' : ['mean'],
                                                                            'avg_temp' : ['mean']}).reset_index()

weather_data_avg_group.columns = ['day_month',
                                  'rain',
                                  'heat_units',
                                  'solar_radiation',
                                  'wind_speed_avg',
                                  'rh',
                                  'avg_temp']

max_date = max(weather_data_avg.date)

for i in range(1, 300):
    
    clear_output(wait=True)
    
    date = max_date + timedelta(days=i)
    weather_data_avg = weather_data_avg.append({'date': date,
                                                'rain': np.nan,
                                                'heat_units':np.nan,
                                                'solar_radiation':np.nan,
                                                'wind_speed_avg':np.nan,
                                                'rh':np.nan,
                                                'avg_temp':np.nan }, ignore_index=True)
    
    print("Current Progress:", np.round(i/300*100,0),"%")
    
weather_data_avg['day'] = weather_data_avg.date.dt.day
weather_data_avg['month'] = weather_data_avg.date.dt.month
weather_data_avg['day_month'] = weather_data_avg['day'].astype(str) + " - " + weather_data_avg['month'].astype(str)  

Current Progress: 100.0 %


In [20]:
###THIS IS A VERY SLOW PROCESS (COULD PARALLEL PROCESSING BE INTRODUCED)

def mean_weather(day_month, variable):
    df = weather_data_avg_group[weather_data_avg_group['day_month']==day_month]
    weather_value = df[variable].sum()
    return weather_value

for variable in ['rain', 'heat_units', 'solar_radiation', 'wind_speed_avg', 'rh', 'avg_temp']:
    for i in weather_data_avg.index:
        
        clear_output(wait=True)
        
        if weather_data_avg[variable][i] == np.nan:
            test = 'test'
            
        else:
            day_month = weather_data_avg['day_month'][i]
            weather_data_avg[variable][i] = mean_weather(day_month, variable)
            
        print(f"{variable} progress:", np.round(i/len(weather_data_avg)*100,0),"%")
    
weather_data_avg.tail()

avg_temp progress: 100.0 %


Unnamed: 0,date,rain,heat_units,solar_radiation,wind_speed_avg,rh,avg_temp,day,month,day_month
3038,2022-09-07,0.425,34.302083,6216.56,2.562083,82.467708,14.3875,7,9,7 - 9
3039,2022-09-08,0.95,29.063021,6372.23,3.309219,83.4625,14.85625,8,9,8 - 9
3040,2022-09-09,0.9125,11.608333,6479.035,3.017396,85.604688,14.608333,9,9,9 - 9
3041,2022-09-10,0.6875,10.983333,6113.86,3.051042,83.671875,13.983333,10,9,10 - 9
3042,2022-09-11,0.7625,11.714063,5075.02,4.284167,85.128646,14.714062,11,9,11 - 9


In [21]:
weather_data_avg

Unnamed: 0,date,rain,heat_units,solar_radiation,wind_speed_avg,rh,avg_temp,day,month,day_month
0,2014-05-14,0.825,6.506771,9217.82,4.122604,75.314271,10.227083,14,5,14 - 5
1,2014-05-15,0.925,8.078125,11512.52,3.593802,80.554687,10.857813,15,5,15 - 5
2,2014-05-16,0.425,9.3125,9943.06,4.079896,78.120833,12.310417,16,5,16 - 5
3,2014-05-17,4.65,9.153125,9964.665,3.792969,81.764583,12.132813,17,5,17 - 5
4,2014-05-18,5.85,9.229167,10198.58,3.074792,82.645833,12.218229,18,5,18 - 5
5,2014-05-19,0.875,9.851562,11232.215,3.850104,81.060417,12.848958,19,5,19 - 5
6,2014-05-20,0.825,10.395312,10693.015,5.16099,81.995313,13.395313,20,5,20 - 5
7,2014-05-21,0.525,10.948438,11017.475,8.278646,77.761979,13.948437,21,5,21 - 5
8,2014-05-22,1.675,10.759375,11270.72,4.674531,78.649479,13.754688,22,5,22 - 5
9,2014-05-23,0.45,9.909375,11287.27,5.625417,80.210938,12.904687,23,5,23 - 5


In [22]:
def cum_heat_units(start, finish, weather_data):
    df = weather_data.loc[(weather_data['date'] > start) & (weather_data['date'] < finish), ['heat_units']]
    total_hu = df['heat_units'].sum()
    return total_hu

def cum_solar_radiation(start, finish, weather_data):
    df = weather_data.loc[(weather_data['date'] > start) & (weather_data['date'] < finish), ['solar_radiation']]
    total_sr = df['solar_radiation'].sum()
    return total_sr

In [23]:
for i in summary_data_avg.index:
    clear_output(wait=True)
    
    planting_date = summary_data_avg['planting_date'][i]
    sample_date = summary_data_avg['sample_date'][i]
    summary_data_avg['solar_radiation'][i] = cum_solar_radiation(planting_date, sample_date, weather_data_avg)
    
    print("Current Progress:", np.round(i/len(summary_data_avg)*100,0),"%")

Current Progress: 100.0 %


In [24]:
for i in summary_data_avg.index:
    clear_output(wait=True)
    
    planting_date = summary_data_avg['planting_date'][i]
    sample_date = summary_data_avg['sample_date'][i]
    summary_data_avg['heat_units'][i] = cum_heat_units(planting_date, sample_date, weather_data_avg)
    
    print("Current Progress:", np.round(i/len(summary_data_avg)*100,0),"%")

Current Progress: 100.0 %


def heat_units_received(summary_data_avg):
    for i in summary_data_avg.index:
        clear_output(wait=True)
        planting_date = summary_data_avg['planting_date'][i]
        sample_date = summary_data_avg['sample_date'][i]
        summary_data_avg['heat_units'][i] = cum_heat_units(planting_date, sample_date, weather_data_avg)
        print("Current Progress:", np.round(i/len(summary_data_avg)*100,0),"%")


def process_chunk(proc_chunk):
    """
    Process the partial array/dataframe `proc_chunk` passed to this process worker.
    Return an array/dataframe of the same length and same indices.
    """

    chunk_res = heat_units_received(proc_chunk)
    chunk_res.index = proc_chunk.index
    
    return chunk_res

# set the number of processes
n_proc = mp.cpu_count()

# this often can't be devided evenly (handle this in the for-loop below)
chunksize = len(summary_data_avg.index) // n_proc

# devide into chunks
proc_chunks = []
for i_proc in range(n_proc):
    chunkstart = i_proc * chunksize
    # make sure to include the division remainder for the last process
    chunkend = (i_proc + 1) * chunksize if i_proc < n_proc - 1 else None

    proc_chunks.append(summary_data_avg.iloc[slice(chunkstart, chunkend)])

assert sum(map(len, proc_chunks)) == len(summary_data_avg.index)   # make sure all data is in the chunks

print('x')

# distribute work to the worker processes
with mp.Pool(processes=n_proc) as pool:
    # starts the sub-processes without blocking
    # pass the chunk to each worker process
    proc_results = [pool.apply_async(process_chunk, args=(chunk,)) for chunk in proc_chunks]

    # blocks until all results are fetched
    result_chunks = [r.get() for r in proc_results]

    # concatenate results from worker processes
    results = pd.concat(result_chunks)
    
print('x')

results = pd.concat((summary_data_avg, results), axis=1)
assert len(results) == len(summary_data_avg.index)   # make sure we got a result for each coordinate pair

In [25]:
summary_data_avg['protection_2'] = summary_data_avg['protection'].copy()
summary_data_avg = pd.get_dummies(summary_data_avg, columns = ['protection_2'], drop_first = False)

In [26]:
summary_data_avg.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3358 entries, 0 to 3357
Data columns (total 25 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   fieldzonedate        3358 non-null   object        
 1   mean_diameter        3358 non-null   float64       
 2   std_dev_diameter     3358 non-null   float64       
 3   pp2m2                3358 non-null   int64         
 4   skewness             3358 non-null   float64       
 5   kurtosis             3358 non-null   float64       
 6   method               3358 non-null   object        
 7   inputs               3358 non-null   object        
 8   variety              3358 non-null   object        
 9   protection           3358 non-null   object        
 10  sand                 3358 non-null   int64         
 11  silt                 3358 non-null   int64         
 12  clay                 3358 non-null   int64         
 13  organic_matter       3358 non-nul

In [27]:
summary_data_avg[summary_data_avg.isna().any(axis=1)]

Unnamed: 0,fieldzonedate,mean_diameter,std_dev_diameter,pp2m2,skewness,kurtosis,method,inputs,variety,protection,sand,silt,clay,organic_matter,planting_date,sample_date,fieldzone,d_lin,s_lin,heat_units,solar_radiation,protection_2_Barley,protection_2_Fleece,protection_2_None,protection_2_Poly


In [28]:
summary_data_avg = summary_data_avg.dropna()

In [29]:
def average_count(fieldzone, df_1 = summary_data_avg):
    
    df_1 = df_1[df_1['fieldzone']==fieldzone]
    average_count = df_1['pp2m2'].mean()
    
    if mt.isnan(average_count):
        average_count = 40
      
    return average_count

average_count('RH33 - 1')

36.333333333333336

In [30]:
def max_sample_date(fieldzone, df_1 = summary_data_avg, df_2 = zone_data):
    
    df_1 = df_1[df_1['fieldzone'] == fieldzone]
    max_sample_date = max(df_1['sample_date'], default = 0)
    if max_sample_date == 0:
        df_2 = df_2[df_2['fieldzone'] == fieldzone]
        max_sample_date = df_2['planting_date'].max()
    
    return max_sample_date

max_sample_date('RH33 - 1')

Timestamp('2021-11-18 00:00:00')

In [31]:
def max_mean_diameter_lin(fieldzone, df_1 = summary_data_avg):
    
    df_1 = df_1[df_1['fieldzone']==fieldzone]
    max_mean_diameter = df_1['mean_diameter'].max()
    max_mean_diameter_lin = max_mean_diameter ** linearisation_coef
    
    if mt.isnan(max_mean_diameter_lin):
        max_mean_diameter_lin = 0
    
    return max_mean_diameter_lin

max_mean_diameter_lin('Allans 07 - 1')

9.26293214352213

In [32]:
def max_std_dev_diameter_lin(fieldzone, df_1 = summary_data_avg):
    
    df_1 = df_1[df_1['fieldzone']==fieldzone]
    max_std_dev_diameter = df_1['std_dev_diameter'].max()
    max_std_dev_diameter_lin = max_std_dev_diameter ** linearisation_coef
    
    if mt.isnan(max_std_dev_diameter_lin):
        max_std_dev_diameter_lin = 0
    
    return max_std_dev_diameter_lin

max_std_dev_diameter_lin('Allans 07 - 1')

4.348294771174176

In [33]:
def max_solar(fieldzone, df_1 = summary_data_avg):
    
    df_1 = df_1[df_1['fieldzone']==fieldzone]
    max_solar = df_1['solar_radiation'].max()
    
    if mt.isnan(max_solar):
        max_solar = 0
    
    return max_solar

max_solar('Allans 07 - 1')

1761357

In [34]:
def max_heat(fieldzone, df_1 = summary_data_avg):
    
    df_1 = df_1[df_1['fieldzone']==fieldzone]
    max_heat = df_1['heat_units'].max()
    
    if mt.isnan(max_heat):
        max_heat = 0
    
    return max_heat

max_heat('Allans 07 - 1')

4079

In [35]:
def filter_data(data, method, inputs, variety):
    
    filtered = data[data['variety'].str.contains(variety)]
    filtered = filtered[filtered['inputs'].str.contains(inputs)]
    filtered = filtered[filtered['method'].str.contains(method)]
    
    return filtered

In [36]:
def predict_weather(start, finish, variable, df_1 = weather_data):
    
    df_1 = df_1.loc[(df_1['date'] > start) & (df_1['date'] < finish), [variable]]
    predicted_weather= df_1[variable].sum()
    
    return predicted_weather

start = datetime(year=2021, month=6, day=2, hour=13, minute=14, second=31)
finish = datetime(year=2022, month=6, day=2, hour=13, minute=14, second=31)

predict_weather(start, finish, 'rain')

363.0

In [37]:
zone_data['mean_pp2m2'] = 0.0
zone_data['max_sample_date'] = 0
zone_data['max_mean_diameter_lin'] = 0.0
zone_data['max_std_dev_diameter_lin'] = 0.0
zone_data['max_heat'] = 0.0
zone_data['max_solar'] = 0.0
zone_data['remaining_heat'] = 0.0
zone_data['remaining_solar'] = 0.0

for i in zone_data.index:
    
    clear_output(wait=True)
    
    fieldzone = zone_data.loc[i, 'fieldzone']
    zone_data.loc[i, 'mean_pp2m2'] = average_count(fieldzone)
    zone_data.loc[i, 'max_sample_date'] = max_sample_date(fieldzone)
    zone_data.loc[i, 'max_mean_diameter_lin'] = max_mean_diameter_lin(fieldzone)
    zone_data.loc[i, 'max_std_dev_diameter_lin'] = max_std_dev_diameter_lin(fieldzone)
    zone_data.loc[i, 'max_heat'] = max_heat(fieldzone)
    zone_data.loc[i, 'max_solar'] = max_solar(fieldzone)
    start = zone_data.loc[i, 'max_sample_date']
    today = datetime.today() + timedelta(days=future_days)
    finish = today.strftime("%d/%m/%Y")
    zone_data.loc[i, 'remaining_heat'] = cum_heat_units(start, finish, weather_data_avg)
    zone_data.loc[i, 'remaining_solar'] = cum_solar_radiation(start, finish, weather_data_avg)   
    
    print("Current Progress:", np.round(i/len(zone_data)*100,0),"%")
    
zone_data['establishment'] = (zone_data['mean_pp2m2']/2*10000)/zone_data['planting_rate']
zone_data['est_mean_diameter_gain'] = 0.0
zone_data['est_std_dev_diameter_gain'] = 0.0
zone_data['est_mean_diameter'] = 0.0
zone_data['est_std_dev_diameter'] = 0.0


Current Progress: 100.0 %


In [39]:
summary_data_avg.sort_values(by='mean_diameter', ascending=False)

Unnamed: 0,fieldzonedate,mean_diameter,std_dev_diameter,pp2m2,skewness,kurtosis,method,inputs,variety,protection,sand,silt,clay,organic_matter,planting_date,sample_date,fieldzone,d_lin,s_lin,heat_units,solar_radiation,protection_2_Barley,protection_2_Fleece,protection_2_None,protection_2_Poly
397,Garretts Decoy 3 - 3 - 2019-11-19,40.725333,7.227684,30,-0.073793,-0.687747,Drilled,Conventional,Linkton,Barley,21,64,15,77.9,2019-03-25,2019-11-19,Garretts Decoy 3 - 3,10.142976,3.442504,3932,1748197,1,0,0,0
2359,Park Farm 50 - 1 - 2019-11-25,38.477647,6.569999,34,-0.166278,0.000815,Drilled,Conventional,Krypton,Barley,27,42,31,48.7,2019-03-22,2019-11-25,Park Farm 50 - 1,9.789383,3.243234,3966,1775818,1,0,0,0
400,Garretts Decoy 3 - 4 - 2019-11-19,37.996,7.0113,30,0.065968,-0.017297,Drilled,Conventional,Linkton,Barley,21,64,15,77.9,2019-03-25,2019-11-19,Garretts Decoy 3 - 4,9.712615,3.377724,3932,1748197,1,0,0,0
394,Garretts Decoy 3 - 2 - 2019-11-19,37.26,6.544521,30,0.662611,0.111289,Drilled,Conventional,Linkton,Barley,21,64,15,77.9,2019-03-25,2019-11-19,Garretts Decoy 3 - 2,9.594598,3.235368,3932,1748197,1,0,0,0
391,Garretts Decoy 3 - 1 - 2019-11-19,37.076,7.20024,30,-0.077724,0.660048,Drilled,Conventional,Linkton,Barley,21,64,15,77.9,2019-03-25,2019-11-19,Garretts Decoy 3 - 1,9.564958,3.434329,3932,1748197,1,0,0,0
37,Allans 07 - 2 - 2020-01-13,36.267368,9.562431,38,-0.226877,-0.893282,Drilled,Conventional,Krypton,Barley,39,37,24,29.6,2019-04-01,2020-01-13,Allans 07 - 2,9.434036,4.100674,4079,1761357,1,0,0,0
962,HH30/31A - 1 - 2020-02-19,35.715714,9.984511,28,-1.035496,0.987714,Drilled,Conventional,Longton,Barley,43,33,24,11.6,2019-05-23,2020-02-19,HH30/31A - 1,9.344092,4.212882,3795,1354035,1,0,0,0
33,Allans 07 - 1 - 2020-01-13,35.220667,10.502927,42,-0.134837,0.28576,Drilled,Conventional,Krypton,Barley,39,37,24,29.6,2019-04-01,2020-01-13,Allans 07 - 1,9.262932,4.348295,4079,1761357,1,0,0,0
2412,Park Farm 55 - 1 - 2020-11-24,35.068235,8.21864,34,-0.669379,0.291129,Drilled,Conventional,Krypton,Barley,61,25,14,30.1,2020-04-14,2020-11-24,Park Farm 55 - 1,9.237856,3.730353,3848,1615695,1,0,0,0
32,Allans 07 - 1 - 2019-10-29,35.047619,7.373478,42,0.340034,0.939739,Drilled,Conventional,Krypton,Barley,39,37,24,29.6,2019-04-01,2019-10-29,Allans 07 - 1,9.234461,3.485742,3781,1666780,1,0,0,0
