In [1]:
# Imports

import os
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, models, transforms
from torch.utils.data import DataLoader, Dataset, random_split
from PIL import Image
from tqdm import tqdm
import numpy as np
from sklearn.metrics import r2_score
import numpy as np

In [2]:
# Directories

train_dir = 'data/train_images'
test_dir = 'data/test_images'
train_csv = 'data/train.csv'
test_csv = 'data/test.csv'
output_csv = 'data/output.csv'

In [3]:
# Variables 

batch_size = 32
num_epochs = 25
learning_rate = 0.001
num_features = 6

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
train_df = pd.read_csv(train_csv)
test_df = pd.read_csv(test_csv)

In [11]:
# Normalize Train and Test's ancillary information
feature_columns = train_df.columns[1:-6]

# Calculate mean and std for each feature column
feature_means = train_df[feature_columns].mean()
feature_stds = train_df[feature_columns].std()

# Normalize the features
train_df[feature_columns] = (train_df[feature_columns] - feature_means) / feature_stds
test_df[feature_columns] = (test_df[feature_columns] - feature_means) / feature_stds

In [12]:
# Normalize Y label
output_features = train_df.iloc[:, -6:].values

output_mean = np.mean(output_features, axis=0)
output_std = np.std(output_features, axis=0)

print(output_mean)
print(output_std)


[1.03624107e+00 1.48317376e+02 1.97016450e+04 3.48191181e+03
 1.51120666e+01 3.99120598e+05]
[1.37329381e-01 6.91740145e+00 4.31037489e+00 6.70979751e+01
 5.93192463e-01 2.25494269e+03]


In [13]:
# Dataset, returns image, labels, and ID

class PlantDataset(Dataset):
    def __init__(self, dataframe, image_dir, transform=None):
        self.dataframe = dataframe
        self.image_dir = image_dir
        self.transform = transform

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        # Get the image path from the image directory and DataFrame
        img_name = os.path.join(self.image_dir, str(self.dataframe.iloc[idx, 0]) + '.jpeg')
        
        # Load the image
        image = Image.open(img_name).convert('RGB')
        
        # Extract the labels (last 6 columns) from the DataFrame
        labels = self.dataframe.iloc[idx, -6:].values.astype('float')
        
        # Apply transformations, if any
        if self.transform:
            image = self.transform(image)
        
        # Return the image and its corresponding labels
        return image, torch.tensor(labels), str(self.dataframe.iloc[idx, 0])

In [14]:
# Transformation
# Transforms without normalizing first

# unnormalized_transform = transforms.Compose([
#     transforms.Resize((224, 224)),
#     transforms.ToTensor()
# ])

# train_unnormalized_dataset = PlantDataset(train_df, train_dir, transform=unnormalized_transform)
# train_unnormalized_loader = DataLoader(train_unnormalized_dataset, batch_size=1, shuffle=False)

# test_unnormalize_dataset = PlantDataset(test_df, test_dir, transform=unnormalized_transform)
# test_unnormalized_loader = DataLoader(test_df, batch_size=1, shuffle=False)

# mean_sum = torch.zeros(3)
# std_sum = torch.zeros(3)
# n_images = 0

# print(len(train_unnormalized_loader))

# Calculate mean and std, commented out since only needs to calculate once

# for images in tqdm(train_unnormalized_loader):
#     image = images[0][0]

#     R_channel = image[0]
#     G_channel = image[1]
#     B_channel = image[2]

#     R_array = np.array(R_channel, dtype=np.float32)
#     G_array = np.array(G_channel, dtype=np.float32)
#     B_array = np.array(B_channel, dtype=np.float32)

#     # Stack to form a 3D array (H, W, C)
#     image_array = np.stack((R_array, G_array, B_array), axis=-1)

#     # Update accumulators
#     mean_sum += image_array.mean(axis=(0, 1))
#     std_sum += image_array.std(axis=(0, 1))

# for images in tqdm(test_unnormalized_loader):
#     image = images[0][0]

#     R_channel = image[0]
#     G_channel = image[1]
#     B_channel = image[2]

#     R_array = np.array(R_channel, dtype=np.float32) / 255.0
#     G_array = np.array(G_channel, dtype=np.float32) / 255.0
#     B_array = np.array(B_channel, dtype=np.float32) / 255.0

#     # Stack to form a 3D array (H, W, C)
#     image_array = np.stack((R_array, G_array, B_array), axis=-1)

#     # Update accumulators
#     mean_sum += image_array.mean(axis=(0, 1))
#     std_sum += image_array.std(axis=(0, 1))

# mean_sum /= len(train_unnormalized_loader)
# std_sum /= len(train_unnormalized_loader)

# print(mean_sum)
# print(std_sum)



# Tranfroms Dataset with mean and std

transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.4467, 0.4507, 0.3363], std=[0.1756, 0.1715, 0.1699])
])

full_dataset = PlantDataset(train_df, train_dir, transform=transform)
train_size = int(0.9 * len(full_dataset))
val_size = len(full_dataset) - train_size

train_dataset, val_dataset = random_split(full_dataset, [train_size, val_size])
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=True)

test_dataset = PlantDataset(test_df, test_dir, transform=transform)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Train with Resnet50

model = models.resnet50(pretrained=True)
num_ftrs = model.fc.in_features
model.fc = nn.Linear(num_ftrs, num_features)
model = model.to(device)

# Using MSELoss and Adam

criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)




In [15]:
# Evaluate model using training set with mock R2

def evaluate_model(model, dataloader, device):
    model.eval()
    true_labels = []
    predictions = []

    with torch.no_grad():
        for inputs, labels, _ in dataloader:
            labels = (labels - output_mean) / output_std
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            predictions.append(outputs.cpu().numpy())
            true_labels.append(labels.cpu().numpy())

    predictions = np.concatenate(predictions, axis=0)
    true_labels = np.concatenate(true_labels, axis=0)
    return r2_score(true_labels, predictions)

# Save results to csv

def predict_and_save(model, dataloader, device, output_file):
    model.eval()
    predictions = []
    
    with torch.no_grad():
        for inputs, labels, id in dataloader:
            inputs = inputs.to(device)
            outputs = model(inputs)
            outputs = outputs.cpu().numpy()
            outputs = outputs * output_std + output_mean  
                      
            for id_, output in zip(id, outputs):
                predictions.append([id_, *output])  # Combine ID with predicted features

    # Convert predictions to a DataFrame
    prediction_df = pd.DataFrame(predictions, columns=['id', 'X4', 'X11', 'X18' ,'X26', 'X50', 'X3112'])

    # Save to CSV
    prediction_df.to_csv(output_file, index=False)
    print(f"Predictions saved to {output_file}")

In [17]:
maxR2 = -100

# Training loop
num_epochs = 15
best_model= model

for epoch in range(num_epochs):

    # Train model

    model.train()
    running_loss = 0.0
    for inputs, labels, _ in tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}"):

        # Update
        labels = (labels - output_mean) / output_std
        inputs, labels = inputs.to(device).float(), labels.to(device).float()
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item() * inputs.size(0)

    currentR2 = evaluate_model(model, val_loader, device)
    print(f"Current Estimate R2 is {currentR2}")
    if (currentR2 > maxR2):
        maxR2 = currentR2
        best_model = model
        predict_and_save(model, test_loader, device, output_csv)

    epoch_loss = running_loss / len(train_loader.dataset)
    print(f'Epoch {epoch}/{num_epochs - 1}, Loss: {epoch_loss:.4f}')

Epoch 1/15: 100%|██████████| 1220/1220 [03:05<00:00,  6.56it/s]


Current Estimate R2 is 0.11404338730214976
Predictions saved to data/output.csv
Epoch 0/14, Loss: 0.8913


Epoch 2/15: 100%|██████████| 1220/1220 [03:09<00:00,  6.43it/s]


Current Estimate R2 is 0.10234350194368765
Epoch 1/14, Loss: 0.8847


Epoch 3/15: 100%|██████████| 1220/1220 [03:05<00:00,  6.57it/s]


Current Estimate R2 is -0.04922298019206104
Epoch 2/14, Loss: 0.8764


Epoch 4/15: 100%|██████████| 1220/1220 [03:05<00:00,  6.57it/s]


Current Estimate R2 is 0.11487680455972843
Predictions saved to data/output.csv
Epoch 3/14, Loss: 0.8696


Epoch 5/15: 100%|██████████| 1220/1220 [03:04<00:00,  6.61it/s]


Current Estimate R2 is 0.10283938475519148
Epoch 4/14, Loss: 0.8640


Epoch 6/15: 100%|██████████| 1220/1220 [03:02<00:00,  6.68it/s]


Current Estimate R2 is 0.12339798639367626
Predictions saved to data/output.csv
Epoch 5/14, Loss: 0.8553


Epoch 7/15: 100%|██████████| 1220/1220 [03:10<00:00,  6.41it/s]


Current Estimate R2 is 0.12052827802039728
Epoch 6/14, Loss: 0.8453


Epoch 8/15: 100%|██████████| 1220/1220 [03:15<00:00,  6.25it/s]


Current Estimate R2 is 0.12275472525131954
Epoch 7/14, Loss: 0.8317


Epoch 9/15: 100%|██████████| 1220/1220 [03:08<00:00,  6.46it/s]


Current Estimate R2 is 0.08379643566022522
Epoch 8/14, Loss: 0.8170


Epoch 10/15: 100%|██████████| 1220/1220 [03:06<00:00,  6.52it/s]


Current Estimate R2 is 0.09289280137562789
Epoch 9/14, Loss: 0.7978


Epoch 11/15: 100%|██████████| 1220/1220 [03:13<00:00,  6.29it/s]


Current Estimate R2 is 0.07652468423822358
Epoch 10/14, Loss: 0.7680


Epoch 12/15: 100%|██████████| 1220/1220 [03:31<00:00,  5.76it/s]


Current Estimate R2 is 0.06364450042964752
Epoch 11/14, Loss: 0.7357


Epoch 13/15: 100%|██████████| 1220/1220 [03:11<00:00,  6.38it/s]


Current Estimate R2 is 0.04430664819387481
Epoch 12/14, Loss: 0.6920


Epoch 14/15: 100%|██████████| 1220/1220 [03:03<00:00,  6.65it/s]


Current Estimate R2 is -0.009229122996713432
Epoch 13/14, Loss: 0.6472


Epoch 15/15: 100%|██████████| 1220/1220 [03:04<00:00,  6.62it/s]


Current Estimate R2 is 0.015463282769075585
Epoch 14/14, Loss: 0.6021


In [18]:
resnet_train_dataset = PlantDataset(train_df, train_dir, transform=transform)
resnet_train_dataloader = DataLoader(resnet_train_dataset, batch_size=32, shuffle=False)

resnet_test_dataset = PlantDataset(test_df, test_dir, transform=transform)
resnet_test_dataloader = DataLoader(resnet_test_dataset, batch_size=32, shuffle=False)

model.eval()
resnet_train_outputs = []

with torch.no_grad():
    for images, labels, img_ids in tqdm(resnet_train_dataloader):
        images = images.to(device)
        outputs = model(images)  # Assume model outputs a feature vector
        resnet_train_outputs.append(outputs.cpu().numpy())

resnet_train_outputs = np.concatenate(resnet_train_outputs, axis=0)

resnet_train_df = pd.DataFrame(resnet_train_outputs, columns=[f'ResNet_{i+1}' for i in range(resnet_train_outputs.shape[1])])

new_train_df = pd.concat([train_df.iloc[:, 0], resnet_train_df, train_df.iloc[:, 1:]], axis=1)

model.eval()
resnet_test_outputs = []

with torch.no_grad():
    for images, labels, img_ids in tqdm(resnet_test_dataloader):
        images = images.to(device)
        outputs = model(images)  # Assume model outputs a feature vector
        resnet_test_outputs.append(outputs.cpu().numpy())

resnet_test_outputs = np.concatenate(resnet_test_outputs, axis=0)

resnet_test_df = pd.DataFrame(resnet_test_outputs, columns=[f'ResNet_{i+1}' for i in range(resnet_test_outputs.shape[1])])

new_test_df = pd.concat([test_df.iloc[:, 0], resnet_test_df, test_df.iloc[:, 1:]], axis=1)

new_train_df.to_csv('data/enhanced_train.csv', index=False)
new_test_df.to_csv('data/enhanced_test.csv', index=False)

100%|██████████| 1356/1356 [02:00<00:00, 11.29it/s]
100%|██████████| 200/200 [00:20<00:00,  9.88it/s]


In [19]:
print(new_train_df)

              id  ResNet_1  ResNet_2  ResNet_3  ResNet_4  ResNet_5  ResNet_6  \
0      101801795  0.041336  0.138568 -0.299940 -0.194096 -0.077347 -0.142380   
1      115813315 -0.625196 -0.145219 -0.824124 -0.301115 -0.263433 -0.242254   
2      173551949  0.080681 -0.285931 -0.130485 -0.262000  0.202304 -0.114180   
3      148811120  0.089529  0.676764  0.946536  0.532315 -0.614573  1.715887   
4      195108876  0.350251  0.206010  0.893665  0.153691 -0.321190  1.471846   
...          ...       ...       ...       ...       ...       ...       ...   
43358  172502909 -0.199004  0.663847 -0.050354 -0.218364 -0.638626  0.646785   
43359  183294324  0.094806 -0.064742  1.128746  1.620218  0.048013  2.225168   
43360  108577580 -0.292752 -0.073270 -0.489194 -0.256287  0.066474 -0.228707   
43361  139067673 -0.218086  1.067038 -0.337774 -0.165488 -0.712103  0.300434   
43362  195383621 -0.778621  0.939716 -0.456877 -0.284214 -0.872787 -0.224540   

       WORLDCLIM_BIO1_annual_mean_tempe