In [1]:
%pip install torch torchvision

[33mDEPRECATION: amazon-textract-overlayer 0.0.11 has a non-standard dependency specifier Pillow>=9.2.*. pip 24.0 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of amazon-textract-overlayer or contact the author to suggest that they release a version with a conforming dependency specifiers. Discussion can be found at https://github.com/pypa/pip/issues/12063[0m[33m
[0m[33mDEPRECATION: amazon-textract-overlayer 0.0.11 has a non-standard dependency specifier pypdf>=2.5.*. pip 24.0 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of amazon-textract-overlayer or contact the author to suggest that they release a version with a conforming dependency specifiers. Discussion can be found at https://github.com/pypa/pip/issues/12063[0m[33m
[0mNote: you may need to restart the kernel to use updated packages.


In [2]:
import torchvision.models as models

In [3]:
# Load ResNet-18 with its ImageNet-trained weights.
# `pretrained=True` is deprecated (torchvision >= 0.13) in favour of the explicit
# `weights=` enum; IMAGENET1K_V1 is the same checkpoint `pretrained=True` selected.
resnet18 = models.resnet18(weights=models.ResNet18_Weights.IMAGENET1K_V1)



### Prepare images and "Train" 

In [4]:
# Write a custom class to add image identifiers to vector representations for better analysis
import os
from PIL import Image
from torch.utils.data import Dataset

class UnderstandImages(Dataset):
    """Dataset yielding ``(image, classname)`` pairs from one or more image directories.

    Args:
        img_dir: Either a mapping ``{class_name: directory}`` or a single
            directory path (``str`` / ``os.PathLike``). A single path is
            treated as one class labelled with the directory's base name.
        transform: Optional callable applied to every loaded PIL image
            (for example a ``torchvision.transforms.Compose`` pipeline).

    Attributes:
        img_dir: The ``{class_name: directory}`` mapping actually used.
        transform: The transform passed in (may be ``None``).
        img_names: List of ``(image_path, class_name)`` tuples, one per image,
            sorted by file name within each class directory.
    """

    def __init__(self, img_dir, transform=None):
        # Accept a bare directory path as well as a mapping; without this a
        # string argument fails on `.items()` below.
        if isinstance(img_dir, (str, os.PathLike)):
            directory = os.fspath(img_dir)
            # normpath drops a trailing separator so basename is never empty
            img_dir = {os.path.basename(os.path.normpath(directory)): directory}

        self.img_dir = img_dir
        self.transform = transform
        self.img_names = []

        for classname, directoryname in self.img_dir.items():
            # os.listdir returns entries in arbitrary order; sort so that an
            # index always refers to the same image across runs.
            for img_name in sorted(os.listdir(directoryname)):
                img_path = os.path.join(directoryname, img_name)
                # Skip hidden files (e.g. macOS .DS_Store) and sub-directories,
                # neither of which can be opened as an image.
                if img_name.startswith('.') or not os.path.isfile(img_path):
                    continue
                self.img_names.append((img_path, classname))

    def __len__(self):
        """Return the number of images found across all class directories."""
        return len(self.img_names)

    def __getitem__(self, idx):
        """Load the image at ``idx`` and return ``(image, classname)``.

        The image is converted to 3-channel RGB before ``transform`` is
        applied, so grayscale / RGBA / palette files work with a 3-channel
        normalisation step.
        """
        img_path, classname = self.img_names[idx]
        # The context manager closes the underlying file; convert() returns an
        # independent in-memory image, so it remains usable afterwards.
        with Image.open(img_path) as img:
            image = img.convert('RGB')
        if self.transform:
            image = self.transform(image)

        return image, classname

In [5]:
from torchvision import transforms

# Per-channel statistics used to normalize inputs the same way the ResNet was
# normalized when it was originally trained.
channel_mean = [0.485, 0.456, 0.406]
channel_std = [0.229, 0.224, 0.225]

# Preprocessing applied to every image before it is fed to the network.
preprocessing_steps = [
    transforms.Resize(256),      # scale the image down/up first
    transforms.CenterCrop(224),  # keep the centre, where we assume the important features are
    transforms.ToTensor(),       # PIL image (or ndarray) -> PyTorch tensor
    transforms.Normalize(mean=channel_mean, std=channel_std),
]
transform = transforms.Compose(preprocessing_steps)

In [6]:
import torch
# Swap the final classification layer for a pass-through (Identity) module so
# that calling the model returns the feature vector that layer would have
# received, instead of class scores.
resnet18.fc = torch.nn.Identity()

In [7]:
from torch.utils.data import DataLoader
import numpy as np # need a way to persist data so use np array to save items to .npy files too
# Location of the training images.
# NOTE: this is an absolute path on the author's machine; change it to run elsewhere.
TRAIN_DIR = '/Users/zaynpatel/vision/visionLLM/CompRobo-VisionLLM/featureExtraction/train'

# UnderstandImages iterates `img_dir.items()`, so it must receive a
# {class name: directory} mapping; a bare path string raises AttributeError.
# Add one entry per class; the key is the label written to filenames.txt.
dataset = UnderstandImages(img_dir={'train': TRAIN_DIR}, transform=transform)
# batch_size: images per forward pass; shuffle=False keeps rows in dataset order
# so row i of the saved array matches line i of filenames.txt;
# num_workers=0 loads the data in the main process.
dataloader = DataLoader(dataset, batch_size=9, shuffle=False, num_workers=0)

resnet18.eval()  # inference mode (no dropout, batch-norm uses its stored statistics)

featuresList = []   # one 1-D feature vector per image
filenamesList = []  # label returned by the dataset for the same image
with torch.no_grad():  # no gradients needed for feature extraction
    for inputs, filenames in dataloader:  # the dataset yields (image, label) pairs
        features = resnet18(inputs)
        # pair every output vector with its label so they can be compared later
        for feature, filename in zip(features, filenames):
            features_np = feature.detach().cpu().numpy().flatten()
            featuresList.append(features_np)
            filenamesList.append(filename)

# Stack the per-image vectors into one 2-D array (rows = images) so it can be
# written to disk in a single call and handed to scikit-learn directly.
featuresArray = np.array(featuresList)
# Saved as a single-array .npy file; np.load('data.npy') returns the array as-is.
np.save('data.npy', featuresArray)

# One label per line, in the same order as the rows of data.npy.
with open('filenames.txt', 'w') as f:
    for filename in filenamesList:
        f.write(f"{filename}\n")

In [14]:
from sklearn.neighbors import NearestNeighbors

# Load the feature matrix written by the extraction cell. It is a plain numeric
# array, so pickle support is not needed; keeping it off also prevents a
# tampered .npy file from executing code on load.
data = np.load('data.npy', allow_pickle=False)
with open('filenames.txt', 'r') as f:
    # splitlines() drops the trailing "\n" that readlines() would leave on every label
    filename = f.read().splitlines()

# n_neighbors=2 because the query set is the training set itself: the first
# neighbour of every point is the point itself (distance 0), the second is its
# closest *other* image.
# ball_tree eliminates datapoints that are too far from the query point, making search faster
nbrs = NearestNeighbors(n_neighbors=2, algorithm='ball_tree').fit(data)
distances, indices = nbrs.kneighbors(data)
print(f'these are distances {distances}, and indices {indices}')

these are distances [[ 0.          8.96715998]
 [ 0.         10.63652766]
 [ 0.         11.08275723]
 [ 0.         10.48828647]
 [ 0.         10.48828647]
 [ 0.          9.67516517]
 [ 0.          9.38490528]
 [ 0.          8.96715998]
 [ 0.         14.08209505]
 [ 0.          9.38490528]
 [ 0.         10.61641669]], and indices [[ 0  7]
 [ 1  3]
 [ 2 10]
 [ 3  4]
 [ 4  3]
 [ 5  6]
 [ 6  9]
 [ 7  0]
 [ 8  9]
 [ 9  6]
 [10  9]]
