In [None]:
import os
import shutil
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
import pickle

import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import torchvision
from torchvision import datasets, models, transforms
import matplotlib.pyplot as plt
import time
import copy

In [None]:
# Root of the project on the mounted Google Drive; every other path hangs off it.
BASE_DIR1 = '/content/drive/My Drive/Mali'

# Sub-directories directly under the project root.
PROCESSED_DIR, RESULTS_DIR, CNN_TRAIN_IMAGE_DIR = (
    os.path.join(BASE_DIR1, sub_dir)
    for sub_dir in ('processed', 'results', 'cnn_images')
)

# Despite the name, this is the path of the saved model *file*, not a directory.
CNN_DIR = os.path.join(
    BASE_DIR1,
    'models',
    'mali_trained_model',
    'mali_trained_model1.pt',
)

In [None]:
# Mount Google Drive at /content/drive so the '/content/drive/...' paths used by
# this notebook resolve. This only works inside a Google Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Per-image metadata table (one row per image file name), read from the processed-data directory.
df_images = pd.read_csv(os.path.join(PROCESSED_DIR, 'image_download_actual.csv'))

In [None]:
# Preview the first rows of the image metadata.
df_images.head()

Unnamed: 0,image_name,image_lat,image_lon,cluster_lat,cluster_lon,cons_pc,nightlights,country,nightlights_bin,is_train
0,10.270931292294023_-6.135291430593985_10.31584...,10.270931,-6.135291,10.315847,-6.105348,2.159553,0.0,mli,0,True
1,10.285903213696015_-6.135291430593985_10.31584...,10.285903,-6.135291,10.315847,-6.105348,2.159553,0.0,mli,0,True
2,10.345790899303983_-6.120319509191993_10.31584...,10.345791,-6.12032,10.315847,-6.105348,2.159553,0.0,mli,0,True
3,10.300875135098007_-6.105347587790001_10.31584...,10.300875,-6.105348,10.315847,-6.105348,2.159553,0.0,mli,0,True
4,10.330818977901991_-6.105347587790001_10.31584...,10.330819,-6.105348,10.315847,-6.105348,2.159553,0.0,mli,0,True


In [None]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f'Using {device} as backend')

# NOTE: torch.load unpickles the file at CNN_DIR, so it must come from a trusted source.
# map_location places the stored tensors on the device selected above.
model = torch.load(CNN_DIR, map_location=device)

Using cuda as backend


In [None]:
# Show the classifier head as loaded from disk.
model.classifier

Sequential(
  (0): Linear(in_features=25088, out_features=4096, bias=True)
  (1): ReLU(inplace=True)
  (2): Dropout(p=0.5, inplace=False)
  (3): Linear(in_features=4096, out_features=4096, bias=True)
  (4): ReLU(inplace=True)
  (5): Dropout(p=0.5, inplace=False)
  (6): Linear(in_features=4096, out_features=3, bias=True)
)

In [None]:
# Keep only the first four classifier modules so the network stops at the second
# Linear layer and emits its 4096 outputs instead of the final class scores.
model.classifier = nn.Sequential(*list(model.classifier.children())[:4])

In [None]:
# Show the classifier head again to confirm the truncation.
model.classifier

Sequential(
  (0): Linear(in_features=25088, out_features=4096, bias=True)
  (1): ReLU(inplace=True)
  (2): Dropout(p=0.5, inplace=False)
  (3): Linear(in_features=4096, out_features=4096, bias=True)
)

In [None]:
# Preprocessing applied to every image before the forward pass: convert the
# image array to a tensor, then normalise each of the three channels with the
# mean/std constants below (the usual ImageNet statistics for torchvision models).
transformer = transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
        ])


class ForwardPassDataset(torch.utils.data.Dataset):
    """Dataset over every file found directly inside one image directory.

    This does not follow the usual "one sub-folder per class" layout: all
    entries of ``image_dir`` are treated as images. Labels are irrelevant for
    feature extraction, so every item carries a dummy label of ``-1``.

    Args:
        image_dir: directory to read; every entry is loaded, so a non-image
            file in it makes the image reader raise instead of being skipped.
        transformer: callable applied to each loaded image array.

    Attributes:
        image_list: file names in the order items are served (index ``k`` of
            the dataset is ``image_list[k]``).
    """

    def __init__(self, image_dir, transformer):
        self.image_dir = image_dir
        # os.listdir returns entries in arbitrary order; sorting makes the
        # image -> index mapping reproducible between runs.
        self.image_list = sorted(os.listdir(self.image_dir))
        self.transformer = transformer

    def __len__(self):
        # Must equal len(image_list): callers pair image_list with the model
        # outputs by position, so no file may be silently dropped here.
        return len(self.image_list)

    def __getitem__(self, index):
        """Return ``(image_tensor, -1)`` for the file at position ``index``."""
        image_name = self.image_list[index]

        # Load image
        X = self.filename_to_im_tensor(os.path.join(self.image_dir, image_name))

        # dataloaders need to return a label, but for the forward pass we don't really care
        return X, -1

    def filename_to_im_tensor(self, file):
        """Read ``file``, keep its first three channels and apply the transformer."""
        im = plt.imread(file)[:,:,:3]  # drops an alpha channel if one is present
        im = self.transformer(im)
        return im


model.eval()  # evaluation mode, so the Dropout layer in the classifier is inactive
classes = [0, 1, 2]

# One dataset per class folder: <CNN_TRAIN_IMAGE_DIR>/valid/<class>
class_datasets = [
    ForwardPassDataset(os.path.join(CNN_TRAIN_IMAGE_DIR, 'valid', str(c)), transformer)
    for c in classes
]

# image_order[k] is the file name whose features are stored in feats[k]
image_order = []
for dataset in class_datasets:
    image_order += dataset.image_list

# One row per image actually present on disk: (num_validation_images, 4096).
# Sizing from image_order keeps feats and image_order aligned by construction.
feats = np.zeros((len(image_order), 4096))
i = 0
with torch.no_grad():  # inference only: do not build the autograd graph
    for dataset in class_datasets:
        # shuffle=False so rows are written in image_list order
        dataloader = torch.utils.data.DataLoader(dataset, batch_size=6, shuffle=False, num_workers=4)
        # forward pass for this class
        for inputs, _ in tqdm(dataloader):
            inputs = inputs.to(device)
            outputs = model(inputs)
            feats[i:i+len(inputs),:] = outputs.cpu().numpy()
            i += len(inputs)

assert i == len(image_order), f'wrote {i} feature rows for {len(image_order)} images'

2232


HBox(children=(FloatProgress(value=0.0, max=372.0), HTML(value='')))

2232

643


HBox(children=(FloatProgress(value=0.0, max=108.0), HTML(value='')))

643

72


HBox(children=(FloatProgress(value=0.0, max=12.0), HTML(value='')))

72



In [None]:
# Peek at the last feature row. A negative index is always in bounds, whereas a
# hard-coded row number raises IndexError whenever it is >= the number of rows.
feats[-1]

In [None]:
# Pair every file name with its position in image_order; that position is later
# used as the row index into feats.
forward_pass_df = pd.DataFrame({
    'image_name': image_order,
    'feat_index': np.arange(len(image_order)),
})
forward_pass_df.head()

Unnamed: 0,image_name,feat_index
0,13.958338091398007_-5.532840151156016_13.97331...,0
1,14.003253855603983_-5.532840151156016_13.97331...,1
2,13.959934289298007_-4.762762650695977_13.97490...,2
3,14.019821974905975_-4.672931122284024_13.97490...,3
4,14.023840172205976_-4.747768482890001_13.97892...,4


In [None]:
# Inner join on the file name: only images present in both the metadata table
# and the forward pass are kept.
df_consumption = df_images.merge(forward_pass_df, how='inner', on='image_name')

In [None]:
# Did every validation image survive the merge?
# First line printed: validation images in the metadata; second line: merged rows.
print((~df_images['is_train']).sum(), len(df_consumption), sep='\n')
# NOTE: the two counts currently differ, so asserting equality here would fail.

2952
2948


In [None]:
# Preview the merged table; feat_index is the column added by the merge.
df_consumption.head()

Unnamed: 0,image_name,image_lat,image_lon,cluster_lat,cluster_lon,cons_pc,nightlights,country,nightlights_bin,is_train,feat_index
0,10.330818977901991_-6.0754037449860165_10.3158...,10.330819,-6.075404,10.315847,-6.105348,2.159553,0.0,mli,0,False,2000
1,10.300875135098007_-6.060431823584024_10.31584...,10.300875,-6.060432,10.315847,-6.105348,2.159553,0.0,mli,0,False,2001
2,10.359734092694024_-5.89172523952_10.404649856...,10.359734,-5.891725,10.40465,-5.891725,0.778903,0.0,mli,0,False,2002
3,10.449565621105977_-5.89172523952_10.404649856...,10.449566,-5.891725,10.40465,-5.891725,0.778903,0.0,mli,0,False,2003
4,10.436986876796016_-7.968415206853984_10.46693...,10.436987,-7.968415,10.466931,-7.938471,2.292946,0.0,mli,0,False,2004


In [None]:
# Country code as it appears in df_consumption['country'], paired with the
# results sub-directory that country's output is written to.
country_abbrv = ['mli']
country_dir = ['mali_2015']

for ca, cd in zip(country_abbrv, country_dir):
    # Restrict to this country, then bucket its images by cluster coordinates.
    df_c = df_consumption[df_consumption['country'] == ca]
    group = df_c.groupby(['cluster_lat', 'cluster_lon'])

    # x[i] is the mean feature vector of cluster_list[i]
    x = np.zeros((len(group), feats.shape[1]))
    cluster_list = [] # the corresponding clusters (lat, lon) to the x aggregate feature array
    for i, ((lat, lon), im_sub) in enumerate(group):
        feat_rows = im_sub['feat_index'].to_numpy()
        # Ignore indices that have no row in feats instead of averaging in
        # all-zero placeholder rows, which would drag the mean toward zero.
        feat_rows = feat_rows[feat_rows < len(feats)]
        if len(feat_rows) > 0:
            # averages the features across all images in the cluster
            x[i,:] = feats[feat_rows].mean(axis=0)
        # A cluster with no usable image keeps its all-zero row so that x and
        # cluster_list stay the same length.
        cluster_list.append([lat, lon])

    # save to the correct directory
    save_dir = os.path.join(RESULTS_DIR, cd, 'cnn')
    os.makedirs(save_dir, exist_ok=True)
    np.save(os.path.join(save_dir, 'cluster_feats.npy'), x)
    # context manager guarantees the pickle file is flushed and closed
    with open(os.path.join(save_dir, 'cluster_order.pkl'), 'wb') as cluster_file:
        pickle.dump(cluster_list, cluster_file)
    