# Wild Blueberry Yield Prediction

**Context**: The dataset used for predictive modeling was generated by the Wild Blueberry Pollination Simulation Model, which is an open-source, spatially-explicit computer simulation program that enables exploration of how various factors, including plant spatial arrangement, outcrossing and self-pollination, bee species compositions and weather conditions, in isolation and combination, affect pollination efficiency and yield of the wild blueberry agroecosystem.

**Goal**: Predict the wild blueberry yield from the provided features.

**Feature Description**:
- Clonesize: m2 The average blueberry clone size in the field
- Honeybee: bees/m2/min Honeybee density in the field
- Bumbles: bees/m2/min Bumblebee density in the field
- Andrena: bees/m2/min Andrena bee density in the field
- Osmia: bees/m2/min Osmia bee density in the field
- MaxOfUpperTRange: ℃ The highest record of the upper band daily air temperature during the bloom season
- MinOfUpperTRange: ℃ The lowest record of the upper band daily air temperature
- AverageOfUpperTRange: ℃ The average of the upper band daily air temperature
- MaxOfLowerTRange: ℃ The highest record of the lower band daily air temperature
- MinOfLowerTRange: ℃ The lowest record of the lower band daily air temperature
- AverageOfLowerTRange: ℃ The average of the lower band daily air temperature
- RainingDays: Day The total number of days during the bloom season, each of which has precipitation larger than zero
- AverageRainingDays: Day The average of raining days of the entire bloom season

**Resources**:
- [Kaggle Challenge](https://www.kaggle.com/competitions/playground-series-s3e14/overview)
- [Dataset](https://www.kaggle.com/datasets/shashwatwork/wild-blueberry-yield-prediction-dataset)

In [None]:
# Import Standard Libraries
import os

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from pathlib import Path
from colorama import Style, Fore

In [None]:
# Matplotlib rc overrides applied on top of the Seaborn 'whitegrid' style
background_color = '#E5E8E8'

theme_parameters = {
    # Hide the right and top spines
    'axes.spines.right': False,
    'axes.spines.top': False,
    'grid.alpha': 0.3,
    'figure.figsize': (16, 6),
    'font.family': 'Andale Mono',
    'axes.titlesize': 24,
    # Figure and axes share the same background
    'figure.facecolor': background_color,
    'axes.facecolor': background_color,
}

# Activate the theme for every following plot
deep_palette = sns.color_palette('deep')
sns.set_theme(style='whitegrid', palette=deep_palette, rc=theme_parameters)

In [None]:
# Terminal colours used by the report prints: each one is the bright style
# followed by a foreground colour code
bright = Style.BRIGHT
black, magenta, red, blue = (
    bright + hue
    for hue in (Fore.BLACK, Fore.MAGENTA, Fore.RED, Fore.BLUE)
)

# Sequence that restores the default terminal style
reset_colors = Style.RESET_ALL

# Read Data

In [None]:
# Switch flag for Kaggle Cloud.
# Detected from the presence of the competition input directory (the same
# directory the data-loading cell reads from), so the notebook does not have
# to be edited by hand when moving between a local checkout and Kaggle.
kaggle = Path('/kaggle/input/playground-series-s3e14').is_dir()

In [None]:
# Load the train and test sets from the active environment
if not kaggle:

    # Local run: the CSV files live in <two levels above the working dir>/data/S3E14
    local_data_directory = Path(os.path.abspath('')).parents[1] / 'data' / 'S3E14'
    train_data_file_path = local_data_directory / 'wild_blueberry_yield_train.csv'
    test_data_file_path = local_data_directory / 'wild_blueberry_yield_test.csv'

    train_data, test_data = (
        pd.read_csv(csv_path)
        for csv_path in (train_data_file_path, test_data_file_path)
    )

else:

    # Kaggle run: read straight from the Kaggle file system
    train_data = pd.read_csv('/kaggle/input/playground-series-s3e14/train.csv')
    test_data = pd.read_csv('/kaggle/input/playground-series-s3e14/test.csv')

In [None]:
# Preview the first rows of the training data
train_data.head()

In [None]:
# Show the DataFrame.info() summary of the training data
train_data.info()

# Exploratory Data Analysis (EDA)

## Shapes Information

In [None]:
# Print shapes information.
# The trailing `reset_colors` restores the default terminal style so the
# colour codes do not bleed into whatever is printed next.
print(f'{blue}Data Shapes:'
      f'{blue}\n- Train Data  -> {red}{train_data.shape}'
      f'{blue}\n- Test Data   -> {red}{test_data.shape}\n'
      f'{reset_colors}')

## Null Values Information

In [None]:
# Print null values information.
# `.isnull().sum().sum()` is the total number of null cells in the frame
# (`.isnull().any().sum()` would only count the columns containing a null).
# The trailing `reset_colors` restores the default terminal style.
print(f'{blue}Data Null Values:'
      f'{blue}\n- Train Data  -> {red}{train_data.isnull().sum().sum()}'
      f'{blue}\n- Test Data   -> {red}{test_data.isnull().sum().sum()}\n'
      f'{reset_colors}')

## Train vs Test Feature Distribution

In [None]:
# Plot the train and test KDE of each feature, one subplot per feature.

# Features to compare: every column except the first and the last one
# (the 'id' and 'label' columns)
feature_names = list(train_data.columns[1:-1])

# Size the grid from the number of features instead of assuming exactly 16,
# so a different number of columns neither overflows the grid nor fails
grid_columns = 4
grid_rows = max(1, -(-len(feature_names) // grid_columns))  # ceiling division

# `squeeze=False` always yields a 2D array of axes, whatever the grid size
figure, ax = plt.subplots(grid_rows, grid_columns, figsize=(16, 12), squeeze=False)
ax = ax.flatten()

# Legend entries shared by all subplots (filled in by the loop)
handles, labels = [], []

for index, column_name in enumerate(feature_names):

    # Plot data
    sns.kdeplot(data=train_data[column_name],
                label='Train',
                ax=ax[index])

    sns.kdeplot(data=test_data[column_name],
                label='Test',
                ax=ax[index])

    ax[index].set_title(column_name, fontsize=14)

    ax[index].tick_params(labelrotation=45)

    # Every subplot carries the same 'Train'/'Test' entries; keep them for the
    # single figure-level legend instead of drawing one legend per subplot
    handles, labels = ax[index].get_legend_handles_labels()

# Hide the grid cells that received no feature
for unused_axis in ax[len(feature_names):]:
    unused_axis.set_visible(False)

# Set the legend
figure.legend(handles,
              labels,
              loc='upper center',
              bbox_to_anchor=(0.5, 1.03),
              fontsize=12,
              ncol=3)

figure.suptitle('Train vs Test Feature Distribution',
                fontweight='bold',
                fontsize=24)

plt.tight_layout()

### Honeybee Distribution

In [None]:
# Two stacked axes: train on top, test below
figure, ax = plt.subplots(nrows=2, ncols=1, figsize=(12, 6))
ax = ax.flatten()

# Draw one 'honeybee' boxplot per dataset and title it
honeybee_panels = (
    (train_data, 'Train Honeybee Distribution'),
    (test_data, 'Test Honeybee Distribution'),
)

for axis, (dataset, panel_title) in zip(ax, honeybee_panels):
    sns.boxplot(data=dataset, x='honeybee', ax=axis)
    axis.set_title(panel_title)

plt.tight_layout()

plt.show()

In [None]:
# Two stacked axes: train on top, test below
figure, ax = plt.subplots(nrows=2, ncols=1, figsize=(12, 6))
ax = ax.flatten()

# Keep only the 'honeybee' values below 2.5 so the KDEs are not stretched
# by the extreme values
train_honeybee = train_data.loc[train_data['honeybee'] < 2.5, 'honeybee']
test_honeybee = test_data.loc[test_data['honeybee'] < 2.5, 'honeybee']

# Plot the KDEs of 'honeybee'
sns.kdeplot(data=train_honeybee, label='Train', ax=ax[0])
sns.kdeplot(data=test_honeybee, label='Test', color='orange', ax=ax[1])

figure.suptitle('Honeybee KDE', fontsize=24)

# Single figure-level legend built from the labelled curves
figure.legend()

plt.tight_layout()

plt.show()

### Drop Outliers Honeybee

In [None]:
# Drop outliers in 'honeybee': keep only the rows strictly below the threshold
# NOTE(review): rows are removed from the test set as well - confirm that
# predictions are not needed for the dropped test rows
honeybee_outlier_threshold = 2.5
train_data = train_data.loc[train_data['honeybee'] < honeybee_outlier_threshold]
test_data = test_data.loc[test_data['honeybee'] < honeybee_outlier_threshold]

- Dropped outliers in `honeybee`
- Exclude `honeybee`, `bumbles`, `andrena` and `osmia` from Standard Scaler
- No differences between Train and Test Feature Distributions
- Temperatures have the exact same distribution
- Fruits and seeds information have almost the same distribution

## Label Distribution

In [None]:
# Draw the KDE of the 'yield' label and title the axes it was drawn on
yield_axis = sns.kdeplot(data=train_data['yield'])
yield_axis.set_title('Yield KDE', fontsize=24)

plt.tight_layout()

plt.show()

### Yield vs Fruit Set vs Fruit Mass vs Seeds KDE

In [None]:
# Columns to compare: the 'yield' label and the three fruit/seed features
kde_columns = ['yield', 'fruitset', 'fruitmass', 'seeds']

# Define figure and axes: one stacked axis per column
figure, ax = plt.subplots(len(kde_columns), 1, figsize=(16, 9))
ax = ax.flatten()

# Plot the KDE of each column, labelling every curve with its exact column
# name (the 'fruitset' curve was previously mislabelled as 'fuitset')
for axis, column_name in zip(ax, kde_columns):
    sns.kdeplot(data=train_data[column_name],
                label=column_name,
                ax=axis)

figure.suptitle('Yield vs Fruit Set vs Fruit Mass vs Seeds KDEs', fontsize=24)

plt.tight_layout()

plt.show()

The following features have almost the same distribution as the label:
- `fruitset`    
- `fruitmass`
- `seeds`

## Pearson Correlation

In [None]:
# Correlate every column except the first one
columns_after_first = train_data.iloc[:, 1:]
correlation_matrix = columns_after_first.corr()

In [None]:
# Boolean mask that is True on the upper triangle (diagonal included), so the
# heatmap only shows each pair once
all_true = np.ones_like(correlation_matrix, dtype=bool)
correlation_mask = np.triu(all_true)

In [None]:
# Create the canvas for the heatmap
figure, ax = plt.subplots(figsize=(30, 12))

# Heatmap appearance: symmetric colour scale in [-1, 1] centred on 0,
# annotated square cells, masked upper triangle
heatmap_settings = {
    'mask': correlation_mask,
    'cmap': 'mako',
    'vmax': 1.0,
    'vmin': -1.0,
    'center': 0,
    'square': True,
    'linewidths': .5,
    'annot': True,
    'annot_kws': {'fontsize': 10},
    'cbar_kws': {"shrink": .8, 'orientation': 'vertical'},
}

# Plot the correlation matrix
sns.heatmap(correlation_matrix, **heatmap_settings)

# Set title
ax.set_title('Pearson Correlation', fontsize=24, fontweight='bold')

plt.tight_layout()

plt.show()

- There is an important correlation between `clonesize` and `honeybee`
- There is another confirmation that `fruitset`, `fruitmass` and `seeds` are strongly correlated with `yield`. However, the strong correlation and the almost identical distributions might mean that these three features are measured at the same moment as the yield. They might not be available at an earlier time and would thus not be useful if the prediction has to be made earlier.
- All the temperatures are duplicated features. Just keep `MaxOfUpperTRange`.
- The features `AverageRainingDays` and `RainingDays` are identical. Keep only `AverageRainingDays`

## Pairplots

In [None]:
# Columns shown in the pairplot: the retained features followed by the target
pairplot_columns = [
    'clonesize',
    'honeybee',
    'bumbles',
    'andrena',
    'osmia',
    'MaxOfUpperTRange',
    'AverageRainingDays',
    'fruitset',
    'fruitmass',
    'seeds',
    'yield',
]

# Lower-triangle pairplot with regression fits (red line) and KDE diagonals
sns.pairplot(
    train_data[pairplot_columns],
    kind="reg",
    diag_kind='kde',
    plot_kws={'line_kws': {'color': 'red'}},
    corner=True,
)

# Set title plot
plt.suptitle('Features and Target Pairplots', fontsize=20, fontweight='bold')

plt.tight_layout()

plt.show()

## Conclusions

- No null values
- Dropped outliers in `honeybee`
- No differences between Train and Test Feature Distributions
- The following features have almost the same distribution as the label:
    - `fruitset`    
    - `fruitmass`
    - `seeds`
- There is an important correlation between `clonesize` and `honeybee`
- All the temperatures are duplicated features. Just keep `MaxOfUpperTRange`.
- The features `AverageRainingDays` and `RainingDays` are identical. Keep only `AverageRainingDays`

# Data Preparation

## Feature Engineering

In [None]:
def compute_engineered_features(data: pd.DataFrame,
                                honeybee_threshold: float = 2.5) -> pd.DataFrame:
    """
    Drop the `honeybee` outliers and add a pre-defined set of engineered
    features to the input DataFrame

    Rows whose `honeybee` value is not strictly below `honeybee_threshold`
    are removed. Four columns are then added; despite the "per" in their
    names, each one is the element-wise product of two existing columns:
        - 'fruitset per fruitmass' = fruitset * fruitmass
        - 'fruitset per seeds' = fruitset * seeds
        - 'fruitmass per seeds' = fruitmass * seeds
        - 'clonesize per honeybee' = clonesize * honeybee

    The input DataFrame is not modified.

    Args:
        data Pandas.DataFrame input with the columns `honeybee`, `clonesize`,
            `fruitset`, `fruitmass` and `seeds`
        honeybee_threshold Float exclusive upper bound on `honeybee`
            (defaults to 2.5)

    Returns:
        data Pandas.DataFrame new DataFrame with the filtered rows and the
            additional engineered features
    """

    # Drop outliers in 'honeybee'.
    # The explicit copy makes the result an independent DataFrame, so the
    # column assignments below do not raise a SettingWithCopyWarning
    data = data.loc[data['honeybee'] < honeybee_threshold].copy()

    # Create a feature `fruitset per fruitmass`
    data['fruitset per fruitmass'] = data['fruitset'] * data['fruitmass']

    # Create a feature `fruitset per seeds`
    data['fruitset per seeds'] = data['fruitset'] * data['seeds']

    # Create a feature `fruitmass per seeds`
    data['fruitmass per seeds'] = data['fruitmass'] * data['seeds']

    # Create a feature `clonesize per honeybee`
    data['clonesize per honeybee'] = data['clonesize'] * data['honeybee']

    return data

In [None]:
# Run the feature engineering on a copy of each dataset and rebind the names
train_data, test_data = (
    compute_engineered_features(dataset.copy())
    for dataset in (train_data, test_data)
)

In [None]:
# Show the DataFrame.info() summary of the training data after the feature engineering
train_data.info()

## Features and Labels Definition

In [None]:
# Define features and labels

# Raw numerical columns kept as model inputs
numerical_features = [
    'clonesize',
    'honeybee',
    'bumbles',
    'andrena',
    'osmia',
    'MaxOfUpperTRange',
    'AverageRainingDays',
    'fruitset',
    'fruitmass',
    'seeds',
]

# Columns added by `compute_engineered_features`
numerical_engineered_features = [
    'fruitset per fruitmass',
    'fruitset per seeds',
    'fruitmass per seeds',
    'clonesize per honeybee',
]

# Backward-compatible alias: the list was originally published under this
# misspelled name, which other cells may still reference
numerical_engineered_featuers = numerical_engineered_features

# No categorical columns are used (kept so the feature groups stay uniform)
categorical_features = []

categorical_engineered_features = []

# Full, ordered list of model input columns
features = [
    *numerical_features,
    *numerical_engineered_features,
    *categorical_features,
    *categorical_engineered_features,
]

# Target column
labels = ['yield']