In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from collections import defaultdict
from tqdm import tqdm
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import os
from preprocess import *

In [2]:
# Locate the dataset folder relative to the notebook's working directory.
base_directory = os.getcwd()
data_directory = base_directory + "/../Data"
print("Base Directory:", base_directory)
print("Data Directory:", data_directory)

# os.listdir returns entries in arbitrary, filesystem-dependent order. Sort the
# listing so anything built from it is ordered the same way on every machine/run.
data_dirs = sorted(os.listdir(data_directory))

Base Directory: /Users/brad/Desktop/Plant_Pal/Modeling
Data Directory: /Users/brad/Desktop/Plant_Pal/Modeling/../Data


In [3]:
# Build one [species, condition, relative image path] record per image file.
# Class folders are expected to be named "<Species>___<Condition>".
data = []

# Iterate in sorted order so the resulting row order does not depend on the
# arbitrary order in which the filesystem listed the folders.
for data_dir in sorted(data_dirs):
    class_path = os.path.join(data_directory, data_dir)
    # Skip hidden entries (e.g. macOS ".DS_Store") and stray files: only real
    # class folders can be listed and split into species/condition.
    if data_dir.startswith('.') or not os.path.isdir(class_path):
        continue
    species, label = data_dir.split('___')  # how species and label are delineated in folder names
    # Hidden files can also appear inside class folders; they are not images.
    image_paths = sorted(
        name for name in os.listdir(class_path) if not name.startswith('.')
    )
    for image in image_paths:
        # The stored path is relative to data_directory.
        entry = [species, label, os.path.join(data_dir, image)]
        data.append(entry)

data_df = pd.DataFrame(data, columns=['Species', 'Condition', 'Image'])

In [None]:
# First cut: keep 80% for training and hold out 20%, stratified on the
# (Species, Condition) pair.
train_df, temp_df = train_test_split(
    data_df,
    stratify=data_df[['Species', 'Condition']],
    test_size=0.2,
    random_state=2,
)

# Second cut: halve the held-out rows into validation and test (10% of the data each).
val_df, test_df = train_test_split(
    temp_df,
    stratify=temp_df[['Species', 'Condition']],
    test_size=0.5,
    random_state=2,
)

print("Training set size:", len(train_df))
print("Validation set size:", len(val_df))
print("Test set size:", len(test_df))

# Cap every condition at 300 training rows; conditions with fewer rows keep all of them.
train_df = (
    train_df
    .groupby('Condition')
    .apply(lambda group: group.sample(n=min(len(group), 300), random_state=2))
    .reset_index(drop=True)
)

print("Adjusted Training set size:", len(train_df))

# Wrap the image-path column of each split in the project's dataset class.
train_set, val_set, test_set = (
    CustomDataset(frame['Image']) for frame in (train_df, val_df, test_df)
)

# Sanity check: show the first training item.
print(train_set[0])

# Only the training loader shuffles; evaluation loaders keep a fixed order.
training_loader = torch.utils.data.DataLoader(train_set, shuffle=True, batch_size=8)
validation_loader = torch.utils.data.DataLoader(val_set, shuffle=False, batch_size=32)
test_loader = torch.utils.data.DataLoader(test_set, shuffle=False, batch_size=32)

Training set size: 77836
Validation set size: 9730
Test set size: 9730
Adjusted Training set size: 11052
tensor([[[-0.9363, -1.1075, -1.1418,  ...,  0.2453,  0.1083,  0.5878],
         [-0.7650, -1.0562, -1.2617,  ...,  0.2282,  0.2967,  0.2111],
         [-0.9020, -1.0733, -1.1075,  ...,  0.6906,  0.9474,  0.0912],
         ...,
         [ 1.3755,  1.4269,  1.2214,  ...,  0.8961,  0.5193,  0.7077],
         [ 1.2385,  1.3242,  1.6667,  ...,  0.7248,  0.6221,  0.7077],
         [ 1.2728,  1.1187,  1.3070,  ...,  0.7248,  0.8618,  0.8104]],

        [[-1.0028, -1.1779, -1.2129,  ...,  0.1702,  0.0301,  0.5203],
         [-0.8277, -1.1253, -1.3354,  ...,  0.1527,  0.2227,  0.1352],
         [-0.9678, -1.1429, -1.1779,  ...,  0.6254,  0.8880,  0.0126],
         ...,
         [ 1.5007,  1.5532,  1.3431,  ...,  0.9755,  0.5903,  0.7829],
         [ 1.3606,  1.4482,  1.7983,  ...,  0.8004,  0.6954,  0.7829],
         [ 1.3957,  1.2381,  1.4307,  ...,  0.8004,  0.9405,  0.8880]],

        [[-

In [None]:
# Prefer the GPU when one is available; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f'Using {device} for inference')
# Load the ImageNet-pretrained ResNet-50 and place it on the selected device.
# Batches are moved to `device` during training, so leaving the model on the CPU
# would raise a device-mismatch error as soon as CUDA is available.
model = torchvision.models.resnet50(weights="IMAGENET1K_V2").to(device)

Using cpu for inference




In [None]:
def train(learning_rate=0.001):
    """Run one pass over ``training_loader`` and return the mean batch loss.

    Relies on the notebook-level ``model``, ``training_loader`` and ``device``.
    A fresh Adam optimizer is created on every call, and the model is updated
    in place.

    Args:
        learning_rate: Step size passed to ``torch.optim.Adam``.

    Returns:
        float: Sum of the per-batch cross-entropy losses divided by the number
        of batches.

    Raises:
        ValueError: If ``training_loader`` yields no batches.
    """
    num_batches = len(training_loader)
    if num_batches == 0:
        # Fail with a clear message instead of a ZeroDivisionError at the end.
        raise ValueError("training_loader is empty; nothing to train on")

    # NOTE(review): constructed but never consulted in this function. Kept because
    # Historian's constructor is not visible here — confirm whether it is needed.
    early_stopper = Historian(early_stopping=3)
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    model.train()

    # These accumulators must exist before the loop's "+=" updates; previously
    # only an unrelated `running_vl` was initialised, so the first batch raised
    # UnboundLocalError.
    running_tl = 0.0
    correct_predictions = 0
    total_predictions = 0

    for batch in training_loader:
        # NOTE(review): assumes each batch is an (inputs, labels) pair. The dataset
        # is built from image paths only and its first item prints as a bare
        # tensor — confirm that CustomDataset actually yields labels.
        inputs = batch[0].to(device)
        labels = batch[1].to(device)

        optimizer.zero_grad()
        outputs = model(inputs)
        train_loss = criterion(outputs, labels)
        train_loss.backward()
        optimizer.step()
        running_tl += train_loss.item()

        # Predicted class = index of the largest logit along the class dimension.
        _, predicted = torch.max(outputs, 1)
        correct_predictions += (predicted == labels).sum().item()
        total_predictions += labels.size(0)

    if total_predictions:
        # Surface the accuracy that is tracked above; the return value stays the
        # mean loss so existing callers are unaffected.
        print(f'Training accuracy: {correct_predictions / total_predictions:.4f}')

    return running_tl / num_batches

def cross_validation(rates=(0.1, 0.000001), num_rates=None):
    """Sweep learning rates and report the one with the lowest training loss.

    Candidates are spaced logarithmically from ``rates[0]`` to ``rates[1]``
    (both included). The previous linear step of -0.1 produced a single
    candidate for the default range, so nothing was actually compared.

    The notebook-level ``model`` is reset to the weights it had on entry before
    each candidate is trained, so every rate starts from the same point. After
    the sweep the model holds the weights from the last candidate trained.

    Args:
        rates: Two-element sequence ``(start, stop)`` of positive learning rates.
        num_rates: Number of candidates to try. Defaults to one per decade
            between the two bounds (6 for the default range).

    Returns:
        tuple: ``(best_rate, best_loss)``. ``best_rate`` is ``None`` when no
        candidate produced a loss below infinity.

    Raises:
        ValueError: If either bound is not strictly positive.
    """
    import copy  # local import: only needed to snapshot the starting weights

    start, stop = float(rates[0]), float(rates[1])
    if start <= 0 or stop <= 0:
        raise ValueError("learning-rate bounds must be strictly positive")

    if num_rates is None:
        # One candidate per power of ten spanned by the bounds, endpoints included.
        num_rates = int(round(abs(np.log10(start / stop)))) + 1

    # Snapshot the weights so each candidate trains from the same starting point
    # instead of continuing from whatever the previous candidate left behind.
    initial_state = copy.deepcopy(model.state_dict())

    best_rate = None
    best_loss = float('inf')
    for rate in np.geomspace(start, stop, num=num_rates):
        rate = float(rate)
        model.load_state_dict(initial_state)
        loss = train(learning_rate=rate)
        print(f'Learning rate: {rate}, Loss: {loss}')
        if loss < best_loss:
            best_loss = loss
            best_rate = rate
    print(f'Best learning rate: {best_rate}, Best loss: {best_loss}')
    return best_rate, best_loss
