# Part 2: CAVs and scenicness

In [177]:
# Standard library
import collections
import glob
import math
import os
import pickle
import random
import re
from statistics import mean, stdev

# Third-party
import cv2
import matplotlib.pyplot as plt
import nbimporter  # presumably required so the p1_/p2_ notebooks below can be imported as modules; keep it above them
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
import tqdm
import tqdm.notebook  # `tqdm.notebook.tqdm` is used below; a bare `import tqdm` does not load the submodule
from PIL import Image
from scipy import stats
from skimage import io, transform
from sklearn import linear_model, metrics
from sklearn.model_selection import train_test_split
from torch.autograd import Variable
from torch.utils.data import DataLoader, Dataset
from torchvision import utils, transforms, models

#for maps
import folium  # used by PlotDot and the map cell further down
import geopandas
# NOTE: the recorded run of this cell failed with "No module named 'geoplot'"; install it before running
import geoplot

# Notebooks from part 1
from p1_CreateTrainingDataframe import CreateTrainingDataframe
from p1_CreateTestDataframe import CreateTestDataframe
from p1_BrodenDataSet import BrodenDataset
from p1_RescaleImage import Rescale
from p1_TransformToTensor import ToTensor
from p1_GetVectorFromImage import GetVector
from p1_GetCosineSimilarityDistance import GetCosineSimilarityDistance
from p1_MakeVectorDictionary import MakeVectorDictionary
from p1_SubsetConceptImages import SubsetConceptImages

# Notebooks from part 2
from p2_SoN_Dataset import SonDataset

ModuleNotFoundError: No module named 'geoplot'

Import necessary elements from part 1

In [2]:
# ResNet-50 with pretrained weights; used further down as the network the SoN images are run through.
# NOTE(review): the model is not switched to eval mode in this notebook — confirm the p1 helpers do so.
basenet = models.resnet50(pretrained=True, progress=True)
# Handle to the 'avgpool' submodule (None if the model has no module of that name);
# it is passed to MakeVectorDictionary together with the model.
out_layer = basenet._modules.get('avgpool')

In [3]:
# Root folder of the Broden dataset. Pick the assignment that matches the machine.
## my own laptop:
# broden_dataset_path = '../data/broden1_384/'
## on guanabana:
broden_dataset_path = '/raid/data/datasets/broden1_384'

# The index and label tables live directly in the dataset root
index_file_path, label_file_path = (
    os.path.join(broden_dataset_path, table_name)
    for table_name in ('index.csv', 'label.csv')
)

In [4]:
# CSV files with the training / test tables, all located in the shared data folder
data_dir = '../data'
training_data_path = f'{data_dir}/training_data.csv'
filtered_training_data_path = f'{data_dir}/filtered_training_data.csv'
test_data_path = f'{data_dir}/test_data.csv'

In [5]:
# Load the three tables; a comma is already the default separator of read_csv
training_data = pd.read_csv(training_data_path)
filtered_training_data = pd.read_csv(filtered_training_data_path)
test_data = pd.read_csv(test_data_path)

In [6]:
# Preprocessing applied to every training image: rescale to 224, convert to a tensor and
# normalise each channel with the fixed mean / std below.
train_broden_transform = transforms.Compose([
    Rescale(224),
    ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225]),
])

train_broden_dataset = BrodenDataset(csv_file=training_data_path,
                                     data_path=broden_dataset_path,
                                     transform=train_broden_transform)

In [7]:
# Preprocessing applied to every test image: rescale to 224, convert to a tensor and
# normalise each channel with the fixed mean / std below.
test_broden_transform = transforms.Compose([
    Rescale(224),
    ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225]),
])

test_broden_dataset = BrodenDataset(csv_file=test_data_path,
                                    data_path=broden_dataset_path,
                                    transform=test_broden_transform)

In [8]:
# Positional indices of the rows in the training and test tables
training_idxs = [*range(len(training_data))]
test_idxs = [*range(len(test_data))]

training_tensors_filename = 'training_tensors.pickle'
# NOTE(review): the leading space in this file name looks accidental — confirm against the file written in part 1
test_tensors_filename = ' test_tensors.pickle'


def _load_pickle_from_data_dir(file_name):
    '''Unpickle and return the object stored in ../data/<file_name>.'''
    # Only load pickles that were produced by this project: unpickling can execute arbitrary code.
    with open(os.path.join('../data/', file_name), 'rb') as handle:
        return pickle.load(handle)


training_tensors = _load_pickle_from_data_dir(training_tensors_filename)
test_tensors = _load_pickle_from_data_dir(test_tensors_filename)

---

## Torch CAV class for Scenic-Or-Not images

In [6]:
class SonTorchCAV(object):
    '''
    Concept Activation Vector (CAV) for one scenicness class of the Scenic-Or-Not (SoN) images.

    The votes table is split 70/30 into a training and a test part. A linear classifier is
    trained to separate images of the requested scenicness class from all other images, using
    the activation tensors in `son_tensors`. The coefficient vector of that classifier is the CAV.

    Plotting is done with matplotlib; in a notebook the inline backend has to be enabled at
    notebook level (IPython magics are not valid inside a method body).
    '''

    # Class boundaries applied to the `Average` column:
    #   low: Average < 40, medium: 40 <= Average < 80, high: Average >= 80
    # NOTE(review): the original docstring described the classes on a 1-10 scale
    # (low 1 - 4, medium 4 - 7.5, high 7.5 - 10) — confirm the scale of `Average` in votes_df.
    MEDIUM_THRESHOLD = 40
    HIGH_THRESHOLD = 80

    # Folder that is searched (recursively) for '<ID>.jpg' when images are displayed
    SON_IMAGE_DIR = '/raid/data/datasets/SoN/images'

    def __init__(self, scenicness, num_son_imgs, num_counter_imgs, son_tensors, votes_df):
        '''
        scenicness (string): 'low', 'medium' or 'high' (see the class thresholds)
        num_son_imgs (int): number of images of the scenicness class used for training
        num_counter_imgs (int): number of images outside the class used for training
        son_tensors (dict): activation tensors, looked up with str(<index label of votes_df>)
        votes_df (DataFrame): votes table; the columns 'Average' and 'ID' are used

        Raises ValueError if scenicness is not one of the three class names.
        '''

        # validate first, so a wrong argument fails before any work is done
        if scenicness not in ('low', 'medium', 'high'):
            raise ValueError ('scenicness is wrongly defined. Got "%s" but expected "low", "medium" or "high"' % scenicness)

        self.scenicness = scenicness
        self.num_son_imgs = num_son_imgs
        self.num_counter_imgs = num_counter_imgs
        self.son_tensors = son_tensors
        self.votes_df = votes_df

        # random split; not seeded, so every instance gets a different split
        self.train_df, self.test_df = train_test_split(votes_df, test_size = 0.3, train_size = 0.7)

        self.lm = None
        self.cav = None
        self.X_test = None
        self.y_test = None
        self.y_pred = None
        self.y_predict = None   # kept for backward compatibility; mirrors y_pred
        self.accuracy = None

        # defined up front so the view_* methods can report "nothing selected" instead of failing
        self.son_idxs = []
        self.random_son_idxs = []
        self.random_counter_idxs = []
        self.test_idxs = []

    def _scenicness_mask(self, df):
        '''Return a boolean Series marking the rows of df that belong to the scenicness class.'''

        average = df.Average
        if self.scenicness == 'high':
            return average >= self.HIGH_THRESHOLD
        if self.scenicness == 'medium':
            # element-wise `&` is required here; `and` raises on a pandas Series
            return (average >= self.MEDIUM_THRESHOLD) & (average < self.HIGH_THRESHOLD)
        return average < self.MEDIUM_THRESHOLD

    def _stack_tensors(self, idxs):
        '''Stack the activation tensors of the given index labels into one tensor (one row per image).'''

        return torch.stack([self.son_tensors[str(idx)] for idx in idxs], 0)

    def get_son_indices(self):
        '''Store the training index labels that belong to the scenicness class in self.son_idxs.'''

        self.son_idxs = self.train_df.loc[self._scenicness_mask(self.train_df)].index.tolist()

    def get_random_son_images(self):
        '''Randomly pick num_son_imgs class images (fewer if not enough are available).'''

        self.get_son_indices()

        if self.num_son_imgs > len(self.son_idxs):
            self.num_son_imgs = len(self.son_idxs)

        self.random_son_idxs = random.sample(self.son_idxs, self.num_son_imgs)

    def get_random_counter_images(self):
        '''Randomly pick num_counter_imgs training images outside the class (fewer if not enough are available).'''

        self.get_son_indices()

        # a set makes the membership test constant time instead of a scan of the whole list
        class_idxs = set(self.son_idxs)
        candidates = [idx for idx in self.train_df.index.tolist() if idx not in class_idxs]

        if len(candidates) < self.num_counter_imgs:
            self.num_counter_imgs = len(candidates)

        self.random_counter_idxs = random.sample(candidates, self.num_counter_imgs)

    def train_lm(self):
        '''
        Train a linear classifier between the concept images and the counter images.

        Raises ValueError if either group of images is empty.
        '''

        self.get_random_son_images()
        self.get_random_counter_images()

        if self.num_son_imgs == 0 or self.num_counter_imgs == 0:
            raise ValueError ('Cannot train: %d "%s" images and %d counter images were selected'
                              % (self.num_son_imgs, self.scenicness, self.num_counter_imgs))

        # one matrix per group, one row per selected image
        self.train_son_tensors = self._stack_tensors(self.random_son_idxs)
        self.counter_tensors = self._stack_tensors(self.random_counter_idxs)

        # concatenate all tensors to the same array
        self.X = torch.cat((self.train_son_tensors, self.counter_tensors), 0).numpy()

        # create labels for the tensors
        # 1 = concepts, 0 = not concept
        self.y = np.concatenate((np.ones(self.num_son_imgs), np.zeros(self.num_counter_imgs)))

        # fit a linear classifier
        self.lm = linear_model.SGDClassifier()
        self.lm.fit(self.X, self.y)

        # the vector of coefficients is orthogonal to the decision hyperplane, thus this vector is the CAV
        self.cav = self.lm.coef_

    def create_test_data(self):
        '''
        Creates the test data from the held-out split (self.test_df).

        X_test holds the activation tensors of all test images, y_test is 1 for images in the
        scenicness class and 0 otherwise. Nothing is cached on disk: the split is random per
        instance, so a stored matrix would not match the next instance.

        Raises ValueError if the test split is empty.
        '''

        self.test_idxs = self.test_df.index.tolist()
        if len(self.test_idxs) == 0:
            raise ValueError ('The test split is empty')

        self.X_test = self._stack_tensors(self.test_idxs).numpy()
        self.y_test = self._scenicness_mask(self.test_df).to_numpy().astype('int')

    def predict(self):
        '''
        Uses the linear classifier to predict the test data, also calculates the average accuracy.
        Calls the function to create test data.

        Raises RuntimeError if the classifier has not been trained yet.
        '''

        if self.lm is None:
            raise RuntimeError ('The classifier is not trained yet; call train_lm() first')

        self.create_test_data()
        self.y_pred = self.lm.predict(self.X_test)
        self.y_predict = self.y_pred

        # calculate true negatives and true positives for average accuracy
        is_negative = self.y_test == 0
        is_positive = self.y_test == 1

        self.total_negatives = int(is_negative.sum())
        self.true_neg = int(np.sum(is_negative & (self.y_pred == 0)))

        self.total_positives = int(is_positive.sum())
        self.true_pos = int(np.sum(is_positive & (self.y_pred == 1)))

        self.accuracy = metrics.accuracy_score(self.y_test, self.y_pred)
        self.score = self.lm.score(self.X_test, self.y_test)

        # mean of the per-class accuracies; falls back to the plain accuracy if one class is absent
        if self.total_negatives != 0 and self.total_positives != 0:
            self.average_accuracy = ((self.true_neg/self.total_negatives) + (self.true_pos/self.total_positives)) / 2
        else:
            self.average_accuracy = self.accuracy

    def _find_son_image(self, img_id):
        '''
        Return the path of '<img_id>.jpg' somewhere below SON_IMAGE_DIR.

        Raises FileNotFoundError if no such file exists.
        '''

        file_name = str(img_id) + '.jpg'
        for directory, _ , _ in os.walk(self.SON_IMAGE_DIR):
            candidate = os.path.join(directory, file_name)
            if os.path.isfile(candidate):
                return candidate

        raise FileNotFoundError ('No image "%s" found below %s' % (file_name, self.SON_IMAGE_DIR))

    def _show_son_images(self, df, idxs):
        '''
        Show the SoN images with index labels idxs (looked up in df) in a square grid.
        Only the first floor(sqrt(n))**2 images are shown so the grid is always filled.
        '''

        dim = math.floor(math.sqrt(len(idxs)))

        fig = plt.figure(figsize=(12,12))
        for position in range(dim**2):
            ax = fig.add_subplot(dim, dim, position + 1)
            img = plt.imread(self._find_son_image(df.loc[idxs[position], 'ID']))
            ax.axis('off')
            ax.imshow(img)

        fig.subplots_adjust(wspace=0, hspace=0)
        plt.show()

    def _misclassified_positions(self, predicted_label):
        '''Return the positions in the test data that were wrongly predicted as predicted_label.'''

        if self.y_pred is None or self.y_test is None:
            raise ValueError ('No predictions available; call predict() first')

        wrong = (self.y_pred == predicted_label) & (self.y_test != self.y_pred)
        return np.flatnonzero(wrong).tolist()

    def view_son_images(self):
        '''
        View the concept images used to train the linear classifier
        '''

        if len(self.random_son_idxs) == 0:
            raise ValueError ('No images have been selected yet')

        self._show_son_images(self.train_df, self.random_son_idxs)

    def view_counter_images(self):
        '''
        View the counter images used to train the linear classifier.
        The counter images are SoN images from the training split, so they are looked up there.
        '''

        if len(self.random_counter_idxs) == 0:
            raise ValueError ('No counter images have been selected')

        self._show_son_images(self.train_df, self.random_counter_idxs)

    def view_FN(self):
        '''
        View the false negative images of the linear classifier
        '''

        # positions in the test data, then the matching index labels of the test split
        self.FN = self._misclassified_positions(0)
        self.FN_idxs = [self.test_idxs[position] for position in self.FN]

        if len(self.FN_idxs) == 0:
            print('There are no false negatives')
            return

        self._show_son_images(self.test_df, self.FN_idxs)

    def view_FP(self):
        '''
        View false positive images of the linear classifier
        '''

        # positions in the test data, then the matching index labels of the test split
        self.FP = self._misclassified_positions(1)
        self.FP_idxs = [self.test_idxs[position] for position in self.FP]

        if len(self.FP_idxs) == 0:
            print('There are no false positives')
            return

        self._show_son_images(self.test_df, self.FP_idxs)
        
            
        
        

In [7]:
# Results per Broden concept from part 1, keyed by concept name.
# Only load pickles produced by this project: unpickling can execute arbitrary code.
broden_cavs_path = os.path.join('../data/', 'broden_concepts_cavs.pickle')
with open(broden_cavs_path, 'rb') as handle:
    broden_concept_accuracy = pickle.load(handle)

# iterating a dict yields its keys
broden_concepts = [*broden_concept_accuracy]

In [8]:
# Keep only the concepts whose classifier accuracy is not below 0.75
broden_concept_accuracy = {
    concept_name: concept_result
    for concept_name, concept_result in broden_concept_accuracy.items()
    if not (concept_result['accuracy'] < 0.75)
}

broden_concepts = [*broden_concept_accuracy]

## Preprocess the Scenic-Or-Not images

In [134]:
# Scenic-Or-Not dataset: root folder, votes table and image folder
son_path = '/raid/data/datasets/SoN/'
csv_path = '../data/votes.tsv'
# son_path already ends with a slash
image_path = f'{son_path}images'

Read the tab-separated votes file. The columns that matter here are 'ID' and 'Average' (the average scenicness score). The image file names are equal to the ID + .jpg

In [135]:
# votes.tsv is tab-separated
data_info = pd.read_csv(csv_path, sep='\t', encoding='utf-8')

Certain images are to be deleted from the dataset as they are not correctly downloaded. These images are stored in the 'no exist' folder. These images are listed and their IDs are extracted. The image IDs are then linked to the dataframe indices, which are then removed. The index is reset afterwards and the dataframe is stored as .csv

In [136]:
# Votes table without the images that were not downloaded correctly.
# The same path is used for the existence check, the read and the write, so the cached file is
# really used on a re-run (the check used to look for 'updated_votes1.csv').
updated_votes_path = '../data/updated_votes.csv'

if os.path.exists(updated_votes_path):
    updated_votes = pd.read_csv(updated_votes_path, index_col = 0)

else:
    # images that were not downloaded correctly are stored in the 'no_exist' folder
    deleted_img_paths = glob.glob(os.path.join('/raid/data/datasets/SoN/images/no_exist/', '*.jpg'))

    # the image ID is the number in the file name; only the base name is parsed so that digits in a
    # folder name can never be mistaken for the ID, and files without a number are skipped
    id_matches = (re.search(r'\d+', os.path.basename(path)) for path in deleted_img_paths)
    deleted_imgs = [int(match.group()) for match in id_matches if match]

    # index labels of the rows that belong to the deleted images (IDs missing from the table are ignored)
    remove_indices = data_info.loc[data_info.ID.isin(deleted_imgs)].index

    updated_votes = data_info.drop(index=remove_indices).reset_index(drop=True)

    updated_votes.to_csv(updated_votes_path)

  result = getitem(key)


Create a Dataset for the Scenic-Or-Not images

In [12]:
# Preprocessing applied to every SoN image: resize to 224, convert to a tensor and
# normalise each channel with the fixed mean / std below.
son_transform = transforms.Compose([
    transforms.Resize(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225]),
])

son_dataset = SonDataset('../data/updated_votes.csv', image_path, transform=son_transform)

Run all the SoN images through the model and store the activations in a dictionary, in which the image indices in the *updated_votes* dataframe are the keys and the activation tensors are the values.

In [13]:
# Activation tensors of the SoN images: computed once, afterwards read back from the pickle.
if not os.path.exists('../data/son_tensors.pickle'):
    son_idxs = [*range(len(updated_votes))]
    son_tensors = MakeVectorDictionary(basenet,
                                       out_layer,
                                       son_dataset,
                                       son_idxs,
                                       file_name = 'son_tensors.pickle')

else:
    # Only load pickles produced by this project: unpickling can execute arbitrary code.
    with open('../data/son_tensors.pickle', 'rb') as handle:
        son_tensors = pickle.load(handle)

2 images are not able to get through the model and thus need to be removed from the *updated_votes_df*. The indices of the images are:
 - 52642
 - 201047

In [137]:
# Index labels of the two images that could not be run through the model.
failed_son_idxs = [52642, 201047]
# errors='ignore' makes the cell safe to re-run: labels that are already gone are skipped
# instead of raising a KeyError. A new frame is assigned rather than mutating in place.
updated_votes = updated_votes.drop(index=failed_son_idxs, errors='ignore')

For each SoN image a concept score is calculated for each concept from the Broden dataset. The score is the value of the linear decision function of that concept's classifier (it is not a probability): <br>
$score = CAV_{Broden} \cdot tensor_{image} + bias_{Broden}$

The result is stored in a dataframe and written to a .csv file

In [34]:
# Keys of the activation dictionary, in its stored order (iterating a dict yields its keys)
son_idxs = [*son_tensors]

In [15]:
# One row per SoN image, one column per Broden concept; filled in the next cell
num_son_images, num_broden_concepts = len(son_tensors), len(broden_concepts)
concept_score_matrix = np.zeros(shape=(num_son_images, num_broden_concepts))

In [16]:
# Concept scores of every SoN image: computed once, afterwards read back from the csv.
# NOTE: a csv written by an earlier version of this cell has a positional index (0..n-1);
# delete it to regenerate the file with the image index labels.
if os.path.exists('../data/concept_scores.csv'):
    concept_score_df = pd.read_csv('../data/concept_scores.csv', index_col = 0)
else:
    # stack the CAVs and biases once, so each image needs a single matrix-vector product
    cav_matrix = np.stack([np.ravel(broden_concept_accuracy[concept]['cav'])
                           for concept in broden_concepts])
    bias_vector = np.array([np.ravel(broden_concept_accuracy[concept]['bias'])[0]
                            for concept in broden_concepts])

    for i in tqdm.notebook.tqdm(range(len(son_idxs))):
        son_img_activation = np.ravel(son_tensors[son_idxs[i]].numpy())

        # row i = CAV . activation + bias for all concepts at once
        concept_score_matrix[i] = cav_matrix @ son_img_activation + bias_vector

    # Use the image index labels (the dictionary keys) as row index, so rows can be matched
    # to `updated_votes` by label even though some labels were removed from that table.
    concept_score_df = pd.DataFrame(concept_score_matrix,
                                    columns = broden_concepts,
                                    index = [int(key) for key in son_idxs])
    concept_score_df.to_csv('../data/concept_scores.csv')
        

A Kendall's Tau test is applied to check which concepts correlate with an increase in scenicness. "Kendall’s tau is a measure of the correspondence between two rankings"

In [67]:
# Average scenicness score per image as a plain array, in the row order of updated_votes
scenic_score = updated_votes['Average'].to_numpy()

In [87]:
# Kendall's tau (and its p-value) between each concept's scores and the scenicness scores.
# The two arrays are paired by position, so both must follow the same image order.
all_concepts = concept_score_df.columns.tolist()
kendall_tau_score = {'tau': [], 'p_value': []}

for concept in tqdm.notebook.tqdm(all_concepts):
    concept_score = concept_score_df[concept].to_numpy()
    tau, p_value = stats.kendalltau(concept_score, scenic_score)
    for key, value in (('tau', tau), ('p_value', p_value)):
        kendall_tau_score[key].append(value)

HBox(children=(IntProgress(value=0, max=722), HTML(value='')))




In [97]:
# One row per concept, with the columns 'tau' and 'p_value'
kendall_tau_df = pd.DataFrame(kendall_tau_score)

In [98]:
# Attach the concept names; they are in the same order as the tau / p-value rows
kendall_tau_df = kendall_tau_df.assign(concept=all_concepts)

In [105]:
# Most negative tau first: the concepts that go together with low scenicness
kendall_tau_df = kendall_tau_df.sort_values(by='tau')
kendall_tau_df.iloc[:20]

Unnamed: 0,tau,p_value,concept
3,-0.388663,0.0,building
26,-0.373547,0.0,street-s
18,-0.366121,0.0,sidewalk
141,-0.360514,0.0,crosswalk
346,-0.34522,0.0,parking_lot-s
225,-0.325832,0.0,windows
443,-0.315826,0.0,parking_garage-indoor-s
548,-0.309767,0.0,bleachers-outdoor-s
148,-0.298709,0.0,platform
10,-0.297742,0.0,road


Unnamed: 0,ID,Lat,Lon,Average,Variance,Votes,Geograph URI
133,136,53.3110,-2.532870,2.3000,0.6100,1222432322,http://www.geograph.org.uk/photo/1363
441,453,50.7721,-0.798666,2.1429,1.8367,5321211,http://www.geograph.org.uk/photo/5058
977,996,51.2340,-1.296650,2.0000,1.2500,11311324,http://www.geograph.org.uk/photo/8856
2077,2114,54.9949,-1.561670,2.0000,1.5000,4112,http://www.geograph.org.uk/photo/17043
3256,3319,51.4877,-0.533629,1.7500,0.6875,1123,http://www.geograph.org.uk/photo/25757
...,...,...,...,...,...,...,...
210801,216341,51.7588,-4.651930,5.7500,3.6875,85957534,http://www.geograph.org.uk/photo/1152924
210855,216398,51.4733,-0.175220,2.1250,1.1094,31143212,http://www.geograph.org.uk/photo/1153170
211551,217104,53.8167,-3.014740,3.0000,2.0000,32154,http://www.geograph.org.uk/photo/1155767
211643,217198,52.8725,-1.314420,5.0000,2.6667,656724,http://www.geograph.org.uk/photo/1156076


In [None]:
# Look up the image of the row with index label 5 and display it.
# The image is stored as '<ID>.jpg' in one of the folders below the SoN image directory.
img_name = str(updated_votes.loc[5, 'ID'])
img_file = [
    match
    for directory, _ , _ in os.walk('/raid/data/datasets/SoN/images')
    for match in glob.glob(os.path.join(directory, img_name + '.jpg'))
]

Image.open(img_file[0])

In [79]:
# Index labels of the images with an average score above 80
is_scenic = updated_votes['Average'] > 80
scenic_imgs = updated_votes.loc[is_scenic].index.to_numpy()
len(scenic_imgs)

3022

In [173]:
# Rows of the votes table for the images with a 'coast-s' concept score above 1000.
# The index of concept_score_df is used as index labels of updated_votes.
has_concept = concept_score_df['coast-s'] > 1000
concept_imgs = concept_score_df.loc[has_concept].index.tolist()
concept_df = updated_votes.loc[concept_imgs]
len(concept_df)

27

In [174]:
def PlotDot(df, fmap=None):
    '''
    Add a marker for one image to a folium map.

    df: one row of the votes table; its 'Lat', 'Lon' and 'ID' values are used
        (location and popup text of the marker).
    fmap: map to add the marker to. If None, the notebook-level map `m` is used, which
        therefore has to exist before this function is called without `fmap`.
    '''

    target_map = m if fmap is None else fmap

    folium.Marker(location=[df.Lat, df.Lon],
                  popup = df.ID,
                  radius=2,
                  weight=0).add_to(target_map)

In [175]:
# Map with one marker per image in concept_df; PlotDot adds its markers to the global `m`
map_center = [52.9133, -1.6089]
m = folium.Map(location=map_center, zoom_start=6)

for _, votes_row in concept_df.iterrows():
    PlotDot(votes_row)
m

In [132]:
# Unmodified votes table, read again from the tab-separated file
new_votes = pd.read_csv('../data/votes.tsv', sep='\t', encoding='utf-8')

In [133]:
# First ten rows of the unmodified votes table
new_votes.iloc[:10]

Unnamed: 0,ID,Lat,Lon,Average,Variance,Votes,Geograph URI
0,1,51.7026,-2.20985,4.1111,1.8765,453514456,http://www.geograph.org.uk/photo/7
1,2,51.7026,-2.19538,4.0,0.5,44354354,http://www.geograph.org.uk/photo/8
2,3,51.7116,-2.18094,4.2222,2.1728,546534146,http://www.geograph.org.uk/photo/11
3,4,53.311,-2.51786,3.8,7.76,24193,http://www.geograph.org.uk/photo/20
4,5,53.3021,-2.50274,4.1667,3.4722,842434,http://www.geograph.org.uk/photo/22
5,6,54.1311,-4.50266,7.5,2.75,5799,http://www.geograph.org.uk/photo/30
6,7,54.1491,-4.50374,5.125,3.8594,47578532,http://www.geograph.org.uk/photo/31
7,8,54.2764,-4.43464,8.5,3.0,610101010679,http://www.geograph.org.uk/photo/35
8,9,54.2857,-4.41982,6.5714,3.3878,8385868,http://www.geograph.org.uk/photo/36
9,10,54.286,-4.40447,7.2857,8.4898,9104105310,http://www.geograph.org.uk/photo/38
