In [None]:
import pandas as pd
import geopy.distance
import numpy as np

## Data processing

In [None]:
# read in city csv file
# 'Rome.csv' is a relative path, so it is resolved against the current working directory.
# Later cells use the columns V1, V2, V3, V4 and FileName of this table.
city = pd.read_csv('Rome.csv')

In [None]:
# preview the first rows of the table as loaded (head() defaults to 5 rows)
city.head()

In [None]:
# discard the columns that no other cell of this notebook reads
for unused_col in ('V1', 'V2', 'V3'):
    del city[unused_col]

In [None]:
# deleting artefacts
# Rows whose V4 value contains the text below are dropped. The text is a
# placeholder: replace it with whatever stray strings show up in the file.
# Note that str.contains treats the pattern as a regular expression by default.
is_artefact = city.V4.str.contains("type_whatever_weird_strings_might_appear_when_sampling")

# .copy() detaches the filtered frame from the original so that the column
# assignments made on `city` afterwards do not raise SettingWithCopyWarning.
city = city[~is_artefact].copy()

In [None]:
# V4 is underscore-separated; its first piece is the latitude and its second
# the longitude. Split once and reuse the result for both columns.
# Both columns stay as text (no numeric conversion happens here).
v4_parts = city.V4.str.split("_", expand=True)
city['lat'] = v4_parts[0]
city['lon'] = v4_parts[1]

# folder name: the first "<digits>_<digits>" run found in FileName
city['folder'] = city.FileName.str.extract(r'(\d+_\d+)', expand=True)

In [None]:
# the raw columns are no longer needed once lat, lon and folder exist
for spent_col in ('FileName', 'V4'):
    del city[spent_col]

In [None]:
# preview the processed table: V1-V4 and FileName are gone, lat, lon and folder were added
city.head()

## Sampling

In [None]:
# city centre as (latitude, longitude); the sampling cell measures each
# candidate's distance from this point
center_lat, center_lon = 41.89, 12.48
center_pt = (center_lat, center_lon)

# maximum distance from the centre, compared against a distance in kilometres
radius = 8.7451556

# number of points to draw
sample_size = 300

In [None]:
# Draw `sample_size` rows of `city` such that every sampled point lies within
# `radius` km of `center_pt` and is more than 100 m away from every other
# sampled point. The result is stored in `df`.
min_spacing_km = 0.1

# lat/lon are stored as text; convert them once instead of on every comparison
coords = list(zip(city.lat.astype(float), city.lon.astype(float)))

# Visit the rows in random order and accept greedily. A candidate that is
# rejected can never become valid later (the accepted set only grows), so one
# pass over a shuffled order is enough and the loop always terminates.
# Every accepted point, including the first one, is checked against the radius.
picked = []  # positional indices into `city`
for pos in np.random.permutation(len(coords)):
    if len(picked) >= sample_size:
        break
    candidate = coords[pos]
    # condition 1: inside the city boundary
    if geopy.distance.distance(center_pt, candidate).km >= radius:
        continue
    # condition 2: over 100 meters from every point accepted so far
    if all(geopy.distance.distance(coords[kept], candidate).km > min_spacing_km
           for kept in picked):
        picked.append(pos)

# Fail loudly rather than loop forever when the data cannot supply enough points
if len(picked) < sample_size:
    raise ValueError(
        "only %d of the requested %d points satisfy the spacing/radius constraints"
        % (len(picked), sample_size)
    )

df = city.iloc[picked]

In [None]:
# preview the first rows of the sampled points
df.head()

## Print list of folders to download from Mediaflux

In [None]:
# distinct folder names among the sampled rows, in sorted order
download = df['folder'].unique().tolist()
download.sort()

# write the list to a text file, one folder name per line
np.savetxt('rome_folders.txt', download, fmt="%s", delimiter="\n")

## Creating list of image names to extract 

In [None]:
# work on a copy so the sampled frame itself is left untouched
extr = df.copy()

# every image path starts with "<folder>/<lat>_<lon>"; only the suffix differs
path_stem = extr.folder + '/' + extr.lat.astype(str) + '_' + extr.lon.astype(str)

# one column per image suffix, appended in rot1..rot4 order
rot_suffixes = (
    ('rot1', '_0_0.jpg'),
    ('rot2', '_90_0.jpg'),
    ('rot3', '_180_0.jpg'),
    ('rot4', '_270_0.jpg'),
)
for rot_col, suffix in rot_suffixes:
    extr[rot_col] = path_stem + suffix

In [None]:
# export df of sampled images to csv
# index=False keeps the row index out of the file; the same file name is read
# back with pd.read_csv later in this notebook.
extr.to_csv('rome_sampled_images.csv', index=False)

## Kelly: copying sampled images to a specific folder

In [None]:
import os
from os import path
import shutil

In [None]:
# read in list of sampled images
# Same file name that the sampling section writes with to_csv. The next cell
# takes the last four columns of this table as the image paths.
extr = pd.read_csv('rome_sampled_images.csv')

In [None]:
# gather every value of the last four columns into one flat, sorted list
# (the list is sorted afterwards, so the flattening order does not matter)
rot_paths = extr.iloc[:, -4:].values
files = rot_paths.ravel().tolist()
files.sort()

In [None]:
# add your source/destination paths
# Both values are placeholders and must be replaced before running the copy cell.
# source      - directory that each relative path in `files` is joined onto
# destination - directory that the images are copied into
source = "/your/source/dir/"
destination = "/your/destination/dir/"

In [None]:
# copy images to destination folder, print error messages when cannot find image

# Create the destination directory first. Without it, shutil.copy would either
# treat `destination` as a file name (every image overwriting the previous one)
# or fail with an error that gets reported as a missing source image.
os.makedirs(destination, exist_ok=True)

for f in files:
    try:
        # `f` is a path relative to `source`; the image lands in `destination`
        # under its base name
        shutil.copy(path.join(source, f), destination)
    except FileNotFoundError as not_found:
        # best effort: report the missing image and carry on with the rest
        print('CANNOT FIND: ', not_found.filename)