In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from collections import defaultdict
from tqdm import tqdm
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import os
import sys
from preprocess import *

In [2]:
# Locate the notebook's working directory and the dataset folder that sits next to it.
base_directory = os.getcwd()
data_directory = f"{base_directory}/../Data"

# Make the sibling "Utility" folder importable before anything is pulled from it.
util_dir = os.path.join(base_directory, '..', 'Utility')
sys.path.append(util_dir)

print("Base Directory:", base_directory)
print("Data Directory:", data_directory)

# Names of every entry found directly under the data directory.
data_dirs = os.listdir(data_directory)

from helper import disp_image   # confirms the sibling-directory import works

Base Directory: /Users/brad/Desktop/Plant_Pal/Modeling
Data Directory: /Users/brad/Desktop/Plant_Pal/Modeling/../Data


In [3]:
# Build one row per image: [species, condition, path relative to data_directory],
# plus a mapping from class-folder name to a contiguous integer label.
data = []
label_dict = {}

# Keep only real class folders. Hidden entries such as ".DS_Store" and stray files are
# dropped BEFORE enumerating, so label ids are contiguous and start at 0.
class_dirs = [
    name for name in sorted(data_dirs)
    if not name.startswith('.') and os.path.isdir(os.path.join(data_directory, name))
]

for ind, data_dir in enumerate(class_dirs):
    label_dict[data_dir] = ind
    species, label = data_dir.split('___')  # how species and label are delineated in filenames
    # Sort so the row order (and therefore the seeded splits downstream) does not depend on
    # the arbitrary order returned by os.listdir; skip hidden files inside the class folder.
    image_paths = sorted(os.listdir(os.path.join(data_directory, data_dir)))
    for image in image_paths:
        if image.startswith('.'):
            continue
        data.append([species, label, os.path.join(data_dir, image)])

data_df = pd.DataFrame(data, columns=['Species', 'Condition', 'Image'])

In [4]:
# First cut: 80% for training, 20% held out, stratified on (Species, Condition).
train_df, temp_df = train_test_split(
    data_df,
    test_size=0.2,
    stratify=data_df[['Species', 'Condition']],
    random_state=2,
)

# Second cut: halve the held-out part into validation (10%) and test (10%).
val_df, test_df = train_test_split(
    temp_df,
    test_size=0.5,
    stratify=temp_df[['Species', 'Condition']],
    random_state=2,
)

print("Training set size:", len(train_df))
print("Validation set size:", len(val_df))
print("Test set size:", len(test_df))

# Cap the training data at 300 randomly drawn rows per condition.
train_df = (
    train_df
    .groupby('Condition')
    .apply(lambda group: group.sample(n=min(len(group), 300), random_state=2))
    .reset_index(drop=True)
)

print("Adjusted Training set size:", len(train_df))

# Wrap the image-path column of each split in the project's dataset class.
train_set = CustomDataset(train_df['Image'])
val_set = CustomDataset(val_df['Image'])
test_set = CustomDataset(test_df['Image'])

print(train_set[1])

# Only the training loader shuffles; evaluation loaders keep a fixed order.
training_loader = DataLoader(train_set, batch_size=16, shuffle=True)
validation_loader = DataLoader(val_set, batch_size=32, shuffle=False)
test_loader = DataLoader(test_set, batch_size=32, shuffle=False)

Training set size: 77836
Validation set size: 9730
Test set size: 9730
Adjusted Training set size: 11052
(tensor([[[ 0.1939,  0.2967, -0.0972,  ...,  0.5707,  0.5022,  0.2796],
         [-0.3369,  0.0398,  0.5707,  ...,  0.5364,  0.5707,  0.4166],
         [-0.3883,  0.0569, -0.3027,  ...,  0.4508,  0.5878,  0.5536],
         ...,
         [-0.2856, -0.3541, -0.1828,  ...,  0.8447,  0.7933,  0.7248],
         [-0.0116,  0.3481,  0.0056,  ...,  0.7419,  0.6563,  0.6049],
         [-0.0629,  0.4337,  0.0912,  ...,  0.7591,  0.6392,  0.5364]],

        [[ 0.2577,  0.3627, -0.0399,  ...,  0.6779,  0.6078,  0.3803],
         [-0.2850,  0.1001,  0.6429,  ...,  0.6429,  0.6779,  0.5203],
         [-0.3375,  0.1176, -0.2500,  ...,  0.5553,  0.6954,  0.6604],
         ...,
         [-0.2325, -0.3025, -0.1275,  ...,  0.9580,  0.9055,  0.8354],
         [ 0.0476,  0.4153,  0.0651,  ...,  0.8529,  0.7654,  0.7129],
         [-0.0049,  0.5028,  0.1527,  ...,  0.8704,  0.7479,  0.6429]],

        [[

In [5]:
def train(model, num_epochs=10, learning_rate=0.001):
    """Train ``model`` with Adam and cross-entropy loss, validating after every epoch.

    Reads the notebook-level names ``training_loader``, ``validation_loader`` and
    ``device``. After each epoch the average losses and accuracies are passed to a
    ``Historian`` object (defined outside this function); training stops as soon as its
    ``record`` call returns a falsy value.

    Args:
        model: ``torch.nn.Module`` to optimise. It is moved to ``device`` in place.
        num_epochs: Maximum number of epochs to run.
        learning_rate: Learning rate passed to Adam.

    Returns:
        Average training loss per batch over the last epoch that ran, or NaN when
        ``num_epochs`` is 0 and no epoch ran.
    """
    early_stopper = Historian(early_stopping=3)
    criterion = torch.nn.CrossEntropyLoss()
    # Inputs and labels are moved to `device` below, so the parameters must live there too.
    model.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    avg_tl = float('nan')  # returned unchanged if the loop body never executes
    for epoch in range(num_epochs):
        print(f'----- Epoch {epoch + 1}/{num_epochs} -----')
        train_preds, val_preds = [0, 0], [0, 0] # (correct, total)
        running_tl, running_vl = 0.0, 0.0

        # --- Training ---
        # Re-enable training mode every epoch: the validation pass below switches the
        # model to eval mode, which would otherwise persist into all later epochs.
        model.train()
        for data in tqdm(training_loader, desc='Training Batches'):
            inputs = data[0].to(device)
            labels = data[1].to(device)
            optimizer.zero_grad()
            outputs = model(inputs)
            train_loss = criterion(outputs, labels)
            train_loss.backward()
            optimizer.step()

            running_tl += train_loss.item()
            predicted = outputs.argmax(dim=1)
            train_preds[0] += (predicted == labels).sum().item()
            train_preds[1] += labels.size(0)
        # ------------------

        # === Validation ===
        model.eval()
        with torch.no_grad():
            for data in tqdm(validation_loader, desc='Validation Batches'):
                inputs = data[0].to(device)
                labels = data[1].to(device)
                outputs = model(inputs)
                val_loss = criterion(outputs, labels)

                running_vl += val_loss.item()
                predicted = outputs.argmax(dim=1)
                val_preds[0] += (predicted == labels).sum().item()
                val_preds[1] += labels.size(0)
        # ==================

        avg_tl, avg_ta = running_tl / len(training_loader), train_preds[0] / train_preds[1]
        avg_vl, avg_va = running_vl / len(validation_loader), val_preds[0] / val_preds[1]
        if not early_stopper.record(avg_tl, avg_ta, avg_vl, avg_va):
            break
        early_stopper.save_model()
        print()
    early_stopper.final_performance(verbose=True)
    return avg_tl

def cross_validation(rates=(0.1, 0.000001), model_factory=None, num_epochs=10):
    """Sweep learning rates between two bounds and report the one with the lowest loss.

    Candidate rates are spaced one per decade between ``rates[0]`` and ``rates[1]``
    (for the default bounds: 1e-1, 1e-2, ..., 1e-6). Each candidate is trained on a
    freshly built model so that runs do not continue from each other's weights.

    Note: candidates are ranked by the value ``train`` returns, which is the average
    *training* loss of the last epoch, not a validation metric.

    Args:
        rates: Two positive numbers ``(start, stop)`` bounding the sweep (inclusive).
        model_factory: Zero-argument callable returning a new model for each run.
            Defaults to an ImageNet-pretrained torchvision ResNet-50.
        num_epochs: Maximum epochs per run, forwarded to ``train``.

    Returns:
        Tuple ``(best_rate, best_loss)``.

    Raises:
        ValueError: If either bound is not strictly positive.
    """
    if model_factory is None:
        def model_factory():
            return torchvision.models.resnet50(weights="IMAGENET1K_V2")

    start, stop = float(rates[0]), float(rates[1])
    if start <= 0 or stop <= 0:
        raise ValueError("learning rates must be positive")

    # A linear step of -0.1 from 0.1 produces a single value; learning rates span orders
    # of magnitude, so sample geometrically with one candidate per decade instead.
    decades = int(round(abs(np.log10(start / stop))))
    candidates = np.geomspace(start, stop, num=decades + 1)

    best_rate = None
    best_loss = float('inf')
    for rate in candidates:
        rate = float(rate)
        loss = train(model_factory(), num_epochs=num_epochs, learning_rate=rate)
        print(f'Learning rate: {rate}, Loss: {loss}')
        if loss < best_loss:
            best_loss = loss
            best_rate = rate
    print(f'Best learning rate: {best_rate}, Best loss: {best_loss}')
    return best_rate, best_loss


In [None]:
# Pick the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f'Using {device} for inference')

# ResNet-50 initialised with torchvision's "IMAGENET1K_V2" pretrained weights.
# NOTE(review): the classification head is left as loaded — confirm whether it should be
# resized to the number of plant classes before fine-tuning.
model = torchvision.models.resnet50(weights="IMAGENET1K_V2")
# Batches are moved to `device` during training, so the model's parameters must be there too;
# without this a CUDA run fails with a device-mismatch error.
model = model.to(device)

train(model, num_epochs=10, learning_rate=0.001)

Using cpu for inference
----- Epoch 1/10 -----


Training Batches:  19%|█▉        | 134/691 [11:29<1:01:52,  6.66s/it]