In [8]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from collections import defaultdict
from tqdm import tqdm
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import os
from preprocess import *

In [7]:
# Locate the dataset relative to the notebook's current working directory.
base_directory = os.getcwd()
data_directory = base_directory + "/../Data"
print("Base Directory:", base_directory)
print("Data Directory:", data_directory)

# os.listdir returns entries in arbitrary, filesystem-dependent order. Sorting
# keeps anything built from this listing in a stable order from run to run.
data_dirs = sorted(os.listdir(data_directory))

Base Directory: /Users/brad/Desktop/Plant_Pal/Modeling
Data Directory: /Users/brad/Desktop/Plant_Pal/Modeling/../Data


In [9]:
# Build one [species, condition, image filename] record per image. Every class
# folder under data_directory must be named "<species>___<condition>".
data = []

for data_dir in data_dirs:
    class_path = os.path.join(data_directory, data_dir)
    # Skip hidden entries and stray files (e.g. ".DS_Store"); only regular
    # directories are treated as class folders.
    if data_dir.startswith('.') or not os.path.isdir(class_path):
        continue
    name_parts = data_dir.split('___')  # how species and label are delineated in filenames
    if len(name_parts) != 2:
        # Fail with the offending name instead of an opaque unpacking error.
        raise ValueError(
            f"Expected a '<species>___<condition>' directory name, got {data_dir!r}"
        )
    species, label = name_parts
    # Sorted so row order does not depend on the filesystem's listing order.
    image_files = sorted(os.listdir(class_path))
    for image in image_files:
        # Hidden files inside a class folder (e.g. ".DS_Store") are not images.
        if image.startswith('.'):
            continue
        entry = [species, label, image]
        data.append(entry)

data_df = pd.DataFrame(data, columns=['Species', 'Condition', 'Image'])

In [8]:
RANDOM_STATE = 2                 # one seed shared by both splits and the sampling below
MAX_SAMPLES_PER_CONDITION = 300  # cap applied to the training set only

# Split the data into train (80%) and temp (20%), stratifying on the
# (Species, Condition) pair.
train_df, temp_df = train_test_split(
    data_df,
    test_size=0.2,
    stratify=data_df[['Species', 'Condition']],
    random_state=RANDOM_STATE,
)

# Split the temp data into validation (10%) and test (10%)
val_df, test_df = train_test_split(
    temp_df,
    test_size=0.5,
    stratify=temp_df[['Species', 'Condition']],
    random_state=RANDOM_STATE,
)

print("Training set size:", len(train_df))
print("Validation set size:", len(val_df))
print("Test set size:", len(test_df))

# Adjust the training dataframe to ensure no more than
# MAX_SAMPLES_PER_CONDITION samples of any particular condition.
# Each group is sampled explicitly rather than through groupby(...).apply(...):
# apply() handing the grouping column to the function is deprecated in recent
# pandas, and without it the 'Condition' column would be lost after
# reset_index(drop=True).
# NOTE(review): the cap is per 'Condition' alone, so a condition name shared by
# several species shares one cap -- confirm this is intended rather than a cap
# per (Species, Condition) pair.
capped_groups = [
    group.sample(n=min(len(group), MAX_SAMPLES_PER_CONDITION), random_state=RANDOM_STATE)
    for _, group in train_df.groupby('Condition')
]
train_df = pd.concat(capped_groups).reset_index(drop=True)

print("Adjusted Training set size:", len(train_df))

Training set size: 77836
Validation set size: 9730
Test set size: 9730
Adjusted Training set size: 11052


In [9]:
# Use the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f'Using {device} for inference')

# Load NVIDIA's pretrained ResNet-50 through torch.hub.
resnet50 = torch.hub.load(
    'NVIDIA/DeepLearningExamples:torchhub',
    'nvidia_resnet50',
    pretrained=True,
)

Using cpu for inference


Downloading: "https://github.com/NVIDIA/DeepLearningExamples/zipball/torchhub" to /Users/brad/.cache/torch/hub/torchhub.zip
Downloading: "https://api.ngc.nvidia.com/v2/models/nvidia/resnet50_pyt_amp/versions/20.06.0/files/nvidia_resnet50_200821.pth.tar" to /Users/brad/.cache/torch/hub/checkpoints/nvidia_resnet50_200821.pth.tar
100%|██████████| 97.7M/97.7M [00:02<00:00, 42.5MB/s]
