In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# Note: this cell only imports `os`; it does not itself list the files under the input directory

import os

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Import statements

import os
import sys
import pandas as pd
import numpy as np
import random

from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns

import tensorflow as tf
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.applications.efficientnet import preprocess_input
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from tensorflow.keras.applications import EfficientNetB0
from tensorflow.keras.layers import Dense, Input, Concatenate, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam

# Runtime configuration. Keep this before any model is built so the dtype policy
# and GPU settings are already in place when layers are created.
from tensorflow.keras.mixed_precision import set_global_policy
# Set the global Keras dtype policy to 'mixed_float16'.
set_global_policy('mixed_float16')

# Turn on memory growth for every GPU that TensorFlow can see.
for gpu in tf.config.experimental.list_physical_devices('GPU'):
    tf.config.experimental.set_memory_growth(gpu, True)
    
# Silence FutureWarning messages for the rest of the notebook.
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [None]:
def _jpeg_paths(ids, folder):
    """Map a Series of image ids to '<folder>/<id>.jpeg' paths."""
    return ids.apply(lambda x: os.path.join(folder, f"{x}.jpeg"))


# Locations of the image folders for each split.
train_image_folder = '/kaggle/input/planttraits2024/train_images'
test_image_folder = '/kaggle/input/planttraits2024/test_images'

# Training table: drop every column whose name ends in '_sd', then attach the
# path of each row's image.
train = pd.read_csv('/kaggle/input/planttraits2024/train.csv')
sd_columns = list(filter(lambda name: name.endswith('_sd'), train.columns))
train = train.drop(columns=sd_columns)
train['image_path'] = _jpeg_paths(train['id'], train_image_folder)

# Test table, with the path of each row's image.
test = pd.read_csv('/kaggle/input/planttraits2024/test.csv')
test['image_path'] = _jpeg_paths(test['id'], test_image_folder)

# Target columns: these are the values the model predicts and the columns plotted below.
mean_columns = ['X4_mean', 'X11_mean', 'X18_mean', 'X50_mean', 'X26_mean', 'X3112_mean']

In [None]:
# Define image augmentation operations
def augment_image(img):
    """Apply random flips and colour jitter to a single RGB image.

    Args:
        img: Image tensor of shape [height, width, 3] with pixel values on the
            0-255 scale (this is what ``process_image`` passes in after
            ``tf.image.resize``). Height and width must be at least 224.

    Returns:
        A float32 tensor of shape [224, 224, 3] with values clipped to [0, 255].
    """
    # The colour ops are parameterised for float images scaled to [0, 1]:
    # a brightness delta of 0.2 added to a 0-255 image would be imperceptible.
    # Work in [0, 1] here and convert back to 0-255 at the end.
    img = tf.cast(img, tf.float32) / 255.0

    img = tf.image.random_flip_left_right(img)
    img = tf.image.random_flip_up_down(img)
    img = tf.image.random_brightness(img, max_delta=0.2)
    img = tf.image.random_contrast(img, lower=0.5, upper=1.5)
    img = tf.image.random_hue(img, max_delta=0.2)
    img = tf.image.random_saturation(img, lower=0.5, upper=1.5)

    # For an input that is already 224x224 this crop leaves the pixels
    # unchanged; for a larger input it takes a random 224x224 window.
    img = tf.image.random_crop(img, size=[224, 224, 3])

    # Brightness/contrast jitter can push values outside the valid range.
    img = tf.clip_by_value(img, 0.0, 1.0)
    return img * 255.0

# Process image with augmentation
def process_image(file_path):
    """Load one JPEG from disk and turn it into a model-ready image tensor.

    The file is read, decoded as a 3-channel image, resized to 224x224,
    passed through ``augment_image`` (random augmentation on every call) and
    finally through the EfficientNet ``preprocess_input`` function.

    Args:
        file_path: Path of the JPEG file (string tensor).

    Returns:
        The augmented, preprocessed image tensor.
    """
    raw_bytes = tf.io.read_file(file_path)
    pixels = tf.image.decode_jpeg(raw_bytes, channels=3)
    resized = tf.image.resize(pixels, [224, 224])
    return preprocess_input(augment_image(resized))

# Define your dataset processing function
def process_path(file_path, tabular_data, targets):
    """Convert one (path, tabular row, targets) record into a Keras example.

    Returns:
        ``((image, tabular_data), targets)`` where ``image`` is the result of
        ``process_image(file_path)``; the tabular row and targets pass through
        unchanged.
    """
    model_inputs = (process_image(file_path), tabular_data)
    return model_inputs, targets

In [None]:
def plot_data(df):
    """Draw one KDE curve per target column, side by side in a single row.

    Args:
        df: DataFrame containing every column listed in ``mean_columns``.
    """
    # One panel per target column, so the grid always matches mean_columns.
    n_cols = len(mean_columns)
    fig, axes = plt.subplots(1, n_cols, figsize=(15, 3), squeeze=False)

    for ax, col in zip(axes.ravel(), mean_columns):
        sns.kdeplot(df[col], bw_adjust=0.5, fill=False, color='blue', ax=ax)
        ax.set_title(f'Distribution of {col}')
        ax.set_xlabel('Value')
        ax.set_ylabel('Density')

    fig.tight_layout()
    plt.show()


plot_data(train)

In [None]:
# Trim the targets one trait at a time: keep only rows whose value is strictly
# positive and strictly below that trait's 98th percentile. Each percentile is
# computed on the rows that survived the filters of the preceding traits.
for trait in mean_columns:
    cutoff = train[trait].quantile(0.98)
    in_range = (train[trait] < cutoff) & (train[trait] > 0)
    train = train[in_range]
plot_data(train)

In [None]:
# Per-trait statistics, kept so predictions can be mapped back to the original scale later.
original_means, original_stds = {}, {}

for trait in mean_columns:
    values = train[trait]
    mu, sigma = values.mean(), values.std()

    original_means[trait] = mu
    original_stds[trait] = sigma

    # z-score: (value - mean) / std
    train[trait] = (values - mu) / sigma

plot_data(train)

In [None]:
# Tabular model inputs: everything except the id, the image path and the targets.
x = train.drop(columns=['id', 'image_path'] + mean_columns)

# Min-max scale every feature column to [0, 1].
# A column whose max equals its min would divide by zero and fill the column
# with NaN, so a zero range is replaced by 1 (such a column then becomes all 0).
feature_min = x.min()
feature_range = (x.max() - feature_min).replace(0, 1)
x = (x - feature_min) / feature_range

In [None]:
BATCH_SIZE = 32  # number of examples per training/validation step

y = train[mean_columns]        # regression targets
x_paths = train['image_path']  # image path for every row

# Split features, targets and image paths in a single call so the three
# pieces are guaranteed to be partitioned identically.
(train_tabular, val_tabular,
 train_targets, val_targets,
 train_paths, val_paths) = train_test_split(
    x, y, x_paths, test_size=0.2, random_state=42)

# Build datasets of (path, tabular row, targets) records.
train_ds = tf.data.Dataset.from_tensor_slices((
    train_paths.to_numpy(),
    train_tabular.to_numpy(dtype='float32'),
    train_targets.to_numpy(dtype='float32'),
))
val_ds = tf.data.Dataset.from_tensor_slices((
    val_paths.to_numpy(),
    val_tabular.to_numpy(dtype='float32'),
    val_targets.to_numpy(dtype='float32'),
))

# Keras treats every dataset element as one batch, so the examples must be
# batched here; without .batch() the model would receive rank-3 images.
# Shuffling happens on the cheap (path, row, target) records before images are
# loaded, and is redone every epoch.
train_ds = (
    train_ds
    .shuffle(buffer_size=max(len(train_paths), 1))
    .map(process_path, num_parallel_calls=tf.data.AUTOTUNE)
    .batch(BATCH_SIZE)
    .prefetch(tf.data.AUTOTUNE)
)

# NOTE(review): process_path applies random augmentation, so validation images
# are augmented as well — confirm whether that is intended.
val_ds = (
    val_ds
    .map(process_path, num_parallel_calls=tf.data.AUTOTUNE)
    .batch(BATCH_SIZE)
    .prefetch(tf.data.AUTOTUNE)
)

In [None]:
# Image pathway: ImageNet-pretrained EfficientNetB0 without its classification
# head, globally average-pooled into one feature vector per image.
image_input = Input(shape=(224, 224, 3))
effnet_layer = EfficientNetB0(include_top=False, weights='imagenet', input_tensor=image_input, pooling='avg')
effnet_layer.trainable = False  # freeze the backbone; only the layers added below are trained

# Tabular pathway: one hidden layer with dropout for regularization.
tabular_input = Input(shape=(train_tabular.shape[1],))
tabular_dense = Dense(512, activation='relu')(tabular_input)
tabular_dense = Dropout(0.5)(tabular_dense)

# Merge both pathways and mix them with one more regularized hidden layer.
concat = Concatenate()([effnet_layer.output, tabular_dense])
concat_dense = Dense(256, activation='relu')(concat)
concat_dense = Dropout(0.5)(concat_dense)

# Regression head: one linear output per target column.
# The global dtype policy is 'mixed_float16'; the output layer is pinned to
# float32 so predictions and the loss are not computed in half precision.
output = Dense(len(mean_columns), activation='linear', dtype='float32')(concat_dense)

model = Model(inputs=[image_input, tabular_input], outputs=output)

model.compile(optimizer=Adam(learning_rate=0.001), loss='mse', metrics=['mae'])

#model.summary()
model

In [None]:
# Train for 50 epochs, evaluating on the validation dataset after each epoch.
history = model.fit(
    x=train_ds,
    validation_data=val_ds,
    epochs=50,
)

In [None]:
# Persist the trained weights (weights only, not the full model) to the working directory.
model.save_weights(filepath="model.weights.h5")

In [None]:
# Select the same feature columns, in the same order, that the training
# features use (everything except the id, the image path and the targets).
feature_columns = [col for col in train.columns
                   if col not in ['id', 'image_path'] + mean_columns]
test_tabular = test[feature_columns].copy()

# Scale the test features with the TRAINING min/max, so that a given raw value
# maps to the same scaled value at training and at prediction time. Scaling
# with the test set's own min/max would shift the inputs the model sees.
# A zero range (constant training column) is replaced by 1 to avoid NaN.
train_feature_min = train[feature_columns].min()
train_feature_range = (train[feature_columns].max() - train_feature_min).replace(0, 1)
test_tabular = (test_tabular - train_feature_min) / train_feature_range

In [None]:
PREDICT_BATCH_SIZE = 32  # number of test examples per prediction step


def load_test_image(file_path):
    """Read, decode, resize and preprocess one test image.

    Unlike ``process_image`` this applies no random augmentation, so the same
    file always produces the same tensor and predictions are reproducible.
    """
    img = tf.io.read_file(file_path)
    img = tf.image.decode_jpeg(img, channels=3)
    img = tf.image.resize(img, [224, 224])
    return preprocess_input(img)


test_tabular_np = test_tabular.to_numpy(dtype='float32')

# Dataset of preprocessed (non-augmented) test images.
test_images_ds = tf.data.Dataset.from_tensor_slices(test['image_path'].to_numpy())\
    .map(load_test_image, num_parallel_calls=tf.data.AUTOTUNE)

# Dataset of the matching tabular rows.
test_tabular_ds = tf.data.Dataset.from_tensor_slices(test_tabular_np)

# Pair every image with its tabular row.
test_ds = tf.data.Dataset.zip((test_images_ds, test_tabular_ds))

# Wrap the two inputs in the structure the model expects and batch them:
# Keras treats every dataset element as one batch, so .batch() is required.
test_ds_for_prediction = (
    test_ds
    .map(lambda image, tabular: ((image, tabular),), num_parallel_calls=tf.data.AUTOTUNE)
    .batch(PREDICT_BATCH_SIZE)
    .prefetch(tf.data.AUTOTUNE)
)

# Predict, and make sure the results are float32 so that mapping them back to
# the original scale later is not done in half precision.
predictions = model.predict(test_ds_for_prediction)
predictions = np.asarray(predictions, dtype=np.float32)

predictions_df = pd.DataFrame(predictions, columns=mean_columns)

# Attach the predictions to the test table. Prediction columns left over from
# an earlier run of this cell are dropped first so they are not duplicated.
test = pd.concat(
    [test.drop(columns=mean_columns, errors='ignore').reset_index(drop=True), predictions_df],
    axis=1,
)

plot_data(test)

#Verify we didn't predict NaNs..
print("NaN values\n", test[mean_columns].isna().sum())
test[mean_columns]

In [None]:
# Undo the z-scoring of the targets: value * std + mean, using the statistics
# stored in original_stds / original_means.
for trait in mean_columns:
    test[trait] = test[trait] * original_stds[trait] + original_means[trait]

plot_data(test)
test[mean_columns]

In [None]:
# Keep only the id and the predicted trait columns.
submission_columns = ['id'] + mean_columns
test = test[submission_columns]

# Strip '_mean' from the column names to fit the expectations of the competition.
test.columns = [name.replace('_mean', '') for name in test.columns]

# Write the submission file.
test.to_csv('submission.csv', index=False)

test