## Use Case: Pruning Tile2Vec ResNet-18 Neurons
We use DnD to identify low-contributing neurons in Tile2Vec ResNet-18 models pre-trained on the NAIP dataset. The resulting tile embeddings are used to train a Random Forest classifier that predicts the corresponding Cropland Data Layer (CDL) labels.

In [None]:
import numpy as np
import os
import torch
from time import time
import random
from torch.autograd import Variable

from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import pandas as pd

import sys
sys.path.append('../')
from src.resnet import ResNet18

sys.path.append('sandbox-DnD')
import data_utils

### Loading pre-trained TileNet model

In [None]:
# Model configuration.
# NOTE(review): in_channels is defined here but not passed to ResNet18();
# presumably the class defaults already match 4-channel NAIP input - confirm.
in_channels = 4
z_dim = 512  # embedding width; used to size the embedding matrix in embed_tiles

# Build the network and move it to the GPU when one is available
cuda = torch.cuda.is_available()
tilenet = ResNet18()
if cuda:
    tilenet.cuda()

In [None]:
# Load parameters
model_fn = 'sandbox-DnD/tile2vec/models/naip_trained.ckpt'
# A checkpoint saved from a GPU run cannot be deserialized on a CPU-only
# machine unless its tensors are remapped, so force CPU storage when CUDA is
# unavailable. With CUDA present the default placement is kept (None).
checkpoint = torch.load(model_fn, map_location=None if cuda else 'cpu')
tilenet.load_state_dict(checkpoint)
# Switch to evaluation mode before embedding tiles
tilenet.eval()

### Data Processing

In [None]:
# Embeds NAIP tiles using ResNet
def embed_tiles(tilenet, n_tiles = 1000):
    """Embed the first ``n_tiles`` NAIP tiles with ``tilenet``.

    Tiles are read from ``<tile_dir>/<i>tile.npy`` for ``i`` in
    ``1..n_tiles``. The function relies on the notebook-level globals
    ``tile_dir`` (tile directory), ``z_dim`` (embedding width) and ``cuda``
    (whether to run on the GPU); they must be defined before calling.

    Args:
        tilenet: model exposing an ``encode(tensor)`` method.
        n_tiles: number of tiles to embed.

    Returns:
        NumPy array of shape ``(n_tiles, z_dim)``, one embedding per row.
    """
    t0 = time()
    X = np.zeros((n_tiles, z_dim))

    # Inference only: disable autograd so no graph is built per tile.
    with torch.no_grad():
        for idx in range(n_tiles):
            # Tile files are numbered from 1
            tile = np.load(os.path.join(tile_dir, '{}tile.npy'.format(idx+1)))
            tile = tile[:,:,:4] # Get first 4 NAIP channels (5th is CDL mask)
            tile = np.moveaxis(tile, -1, 0)      # channels-last -> channels-first
            tile = np.expand_dims(tile, axis=0)  # add a leading batch axis
            tile = tile / 255
            tile = torch.from_numpy(tile).float()
            if cuda: tile = tile.cuda()
            z = tilenet.encode(tile)
            # assumes encode() returns a (1, z_dim) tensor, which is
            # assigned into row idx - TODO confirm
            X[idx,:] = z.detach().cpu().numpy()

    t1 = time()
    print('Embedded {} tiles: {:0.3f}s'.format(n_tiles, t1-t0))
    return X

# Load CDL Classes
def load_cdl_classes(cdl_tiles_path, cdl_labels_path):
    """Load per-tile CDL class codes and re-index them as consecutive ints.

    Args:
        cdl_tiles_path: directory containing ``y.npy`` (the CDL code of
            each tile).
        cdl_labels_path: path to the CSV listing the CDL codes.

    Returns:
        Array of class indices produced by ``LabelEncoder`` from the codes
        in ``y.npy``.

    Raises:
        IndexError: if a code found in ``y.npy`` appears in no row of the
            CSV.
    """
    # Read from the directory that was passed in rather than the
    # notebook-level ``tile_dir`` global, so the argument is honoured.
    y = np.load(os.path.join(cdl_tiles_path, 'y.npy'))
    cdl_df = pd.read_csv(cdl_labels_path)

    # Look up the second CSV column for every distinct code.
    # NOTE(review): the collected values are not returned; the loop is kept
    # so an unknown code still fails loudly and the return value callers
    # rely on is unchanged.
    cdl_labels = []
    for code in set(y):
        matches = cdl_df.loc[(cdl_df == code).any(axis=1)]
        # Explicit positional access (first matching row, second column)
        # instead of the deprecated integer lookup on a Series.
        cdl_labels.append(matches.iloc[0, 1])

    y = LabelEncoder().fit_transform(y) # Reindex CDL classes

    return y

### 1. Classifier Accuracy (No Pruning)
We first embed the 1,000 NAIP tiles provided in `tile2vec/data/tiles`, then train a Random Forest classifier on the embeddings and evaluate it on a held-out test split.

In [None]:
# Input locations and number of tiles to embed
tile_dir = 'sandbox-DnD/tile2vec/data/tiles'
labels_dir = 'sandbox-DnD/tile2vec/data/CDL_labels.csv'  # a CSV file, despite the name
n_tiles = 1000

# Number of neurons in each ResNet layer, keyed by layer name
layer_dims = {
    "layer1": 64,
    "layer2": 128,
    "layer3": 256,
    "layer4": 512,
    "layer5": 512,
}

# Embeddings of the unpruned model
X = embed_tiles(tilenet, n_tiles=n_tiles)

# Re-indexed CDL class of each tile
y = load_cdl_classes(tile_dir, labels_dir)

In [None]:
# Train Classifier

# Draw one seed and report it below so the run can be reproduced. It drives
# both the train/test split and the forest itself; an unseeded forest would
# give a different score on every run even with the same split.
random_state = random.randint(0, 999999)

# Splitting data and training RF classifier
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, random_state = random_state)
rf = RandomForestClassifier(random_state=random_state)
rf.fit(X_tr, y_tr)
accs = rf.score(X_te, y_te)

print("Random State: {}\nClassifier Accuracy - {}".format(random_state, accs))

### 2. Fixed Classifier (With Pruning)
We prune all ungrouped neurons identified by DnD, then evaluate the pruned embeddings with the original (fixed) classifier trained above.

In [None]:
# NOTE: You may need to retrain the classifier without pruning (above)
#       if you have evaluated on the 3. Retrained Classifier Experiment (below)

target_layer = "layer5"

# Insert list of ungrouped DnD neurons
ids_to_prune = []

# Use for Random Pruning (uncomment the two lines below)
# num_neurons = 100
# ids_to_prune = random.sample(range(0, layer_dims[target_layer]), num_neurons)

ids_to_prune.sort()

# Mask pruned neurons
print("Number of pruned neurons {}/{}".format(len(ids_to_prune), (layer_dims[target_layer])))
to_prune = (target_layer, ids_to_prune)
tilenet, handle = data_utils.prune(tilenet, to_prune)

try:
    # Embed tiles with the mask in place
    X = embed_tiles(tilenet, n_tiles)
finally:
    # Remove pruning mask even if embedding fails, so the model is not left
    # pruned for later cells
    handle.remove()

# Load CDL classes (independent of the model, so no mask is needed)
y = load_cdl_classes(tile_dir, labels_dir)

In [None]:
# Re-create the same train/test split (same random_state) over the pruned
# embeddings and score the already-trained classifier on the test part.
# No retraining happens here.
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, random_state = random_state)
accs = rf.score(X_te, y_te)

print("Random State: {}\nFixed Classifier Accuracy - {}".format(random_state, accs))

### 3. Retrained Classifier (With Pruning)
We prune all ungrouped neurons identified by DnD, then retrain a new Random Forest classifier on the pruned embeddings and evaluate it.

In [None]:
print("Number of pruned neurons {}/{}".format(len(ids_to_prune), (layer_dims[target_layer])))

# Mask pruned neurons
to_prune = (target_layer, ids_to_prune)
tilenet, handle = data_utils.prune(tilenet, to_prune)

try:
    # Embed tiles with the mask in place
    X = embed_tiles(tilenet, n_tiles)
finally:
    # Remove pruning mask even if embedding fails, so the model is not left
    # pruned for later cells
    handle.remove()

# Load CDL classes (independent of the model, so no mask is needed)
y = load_cdl_classes(tile_dir, labels_dir)

In [None]:
# Split the pruned embeddings with the same random_state, then train and
# evaluate a fresh RF classifier on them.
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, random_state = random_state)
# Seed the forest too, so the reported random state reproduces the score
rf = RandomForestClassifier(random_state=random_state)
rf.fit(X_tr, y_tr)

accs = rf.score(X_te, y_te)
print("Random State: {}\nRetrained Classifier Accuracy - {}".format(random_state, accs))