In [1]:
import pandas as pd
import geopy.distance
import numpy as np

## Data processing

In [2]:
# read in city csv file
# ('Rome.csv' is resolved relative to the current working directory; the cells
# below use its FileName, V1, V2, V3 and V4 columns)
city = pd.read_csv('Rome.csv')

In [3]:
# preview the first rows to check the columns that were read
city.head()

Unnamed: 0,FileName,V1,V2,V3,V4
0,/Users/lucamartial/Desktop/Working Directory/R...,26132,04-07-2018,13:27,42.05370903024502_12.32225280795683_180_0.jpg
1,/Users/lucamartial/Desktop/Working Directory/R...,30284,04-07-2018,13:27,42.05441054171563_12.322332510055844_0_0.jpg
2,/Users/lucamartial/Desktop/Working Directory/R...,33890,04-07-2018,13:27,42.05483504313715_12.322384931898227_270_0.jpg
3,/Users/lucamartial/Desktop/Working Directory/R...,34095,04-07-2018,13:27,42.05474925410415_12.322374516320565_270_0.jpg
4,/Users/lucamartial/Desktop/Working Directory/R...,25437,04-07-2018,13:27,42.05525654570083_12.32243452370517_0_90.jpg


In [4]:
# drop the columns that are not used by any later cell
city = city.drop(columns=['V1', 'V2', 'V3'])

In [None]:
# deleting artefacts
# Replace the placeholder with whatever stray string shows up in the V4 file names.
# regex=False matches the string literally (characters such as '.' or '(' need no
# escaping), and na=False treats a missing V4 value as "no match", so the boolean
# mask never contains NaN and can safely be inverted with ~.
artefact = "type_whatever_weird_strings_might_appear_when_sampling"
city = city[~city.V4.str.contains(artefact, regex=False, na=False)]

In [5]:
# extract folder name: the first "<digits>_<digits>" run found in the file path
folder_pattern = r'(\d+_\d+)'
city['folder'] = city.FileName.str.extract(folder_pattern, expand=False)

In [6]:
# extract lat and lon: the first two "_"-separated fields of the V4 file name.
# Both stay strings, so the digits can later be reused verbatim in file paths.
name_parts = city.V4.str.split("_", expand=True)
city['lat'] = name_parts[0]
city['lon'] = name_parts[1]

# collapse to one row per (lat, lon) pair, keeping the first value of each column
by_location = city.groupby(['lat', 'lon'])
city = by_location.first().reset_index()

In [7]:
# drop the raw path/file-name columns now that folder, lat and lon are extracted
city = city.drop(columns=['FileName', 'V4'])

In [8]:
# preview the de-duplicated table (lat and lon are still strings at this point)
city.head()

Unnamed: 0,lat,lon,folder
0,41.77701518306645,12.53624271569447,70100_48764
1,41.77702388687008,12.432588921336333,70062_48764
2,41.77703315390035,12.537899665233567,70100_48764
3,41.77703597494189,12.536126539880115,70100_48764
4,41.77705298968281,12.53801969028143,70100_48764


In [10]:
# creating copy for sampling purposes
# (the sampling cells below read from and rebind citycop, leaving city untouched)
citycop = city.copy()

## Sampling

In [30]:
# enter city info
# center_pt is a (lat, lon) pair; radius is in kilometres (the sampling cells
# compare it against geopy's .km distance from center_pt)
center_pt = (41.89, 12.48)
radius = 8.7451556

# enter required sample size
# (total number of sampled locations, including the first one drawn below)
sample_size = 2000

In [31]:
# create sample 1 within boundaries
# Walk through one random permutation of the candidates and keep the first one
# that lies within `radius` km of center_pt. This picks uniformly among the
# in-range candidates, exactly like redrawing with sample(n=1) until one fits,
# but it stops with an error instead of looping forever when none is in range.
# Coordinates are converted from scalar values, because calling float() on a
# one-row Series is deprecated in recent pandas versions.
shuffled = citycop.sample(frac=1)
df = None
for pos, (lat, lon) in enumerate(zip(shuffled.lat, shuffled.lon)):
    if geopy.distance.distance(center_pt, (float(lat), float(lon))).km < radius:
        # one-row DataFrame; the original index label is preserved
        df = shuffled.iloc[[pos]]
        break

if df is None:
    raise ValueError(
        "no location in citycop lies within {} km of {}".format(radius, center_pt))

In [32]:
# sampling loop
# Greedy pass over ONE random permutation of the candidates. A candidate that is
# rejected once can never become acceptable later (the set of accepted points only
# grows), so this draws from the same distribution as redrawing with sample(n=1)
# until a point is accepted. Each candidate is checked at most once, so the loop
# always terminates. Pass random_state=... to sample() for a reproducible draw.
min_spacing_km = 0.1  # accepted points must be more than 100 meters apart

# coordinates of the points accepted so far, as floats for geopy
accepted_pts = [(float(lat), float(lon)) for lat, lon in zip(df.lat, df.lon)]
# positions (within `shuffled`) of the rows accepted in this cell
accepted_pos = []
# df already holds the first sample, so only the remainder is drawn here
still_needed = sample_size - len(df)

shuffled = citycop.sample(frac=1)
for pos, (lat, lon) in enumerate(zip(shuffled.lat, shuffled.lon)):
    if len(accepted_pos) >= still_needed:
        break
    cand = (float(lat), float(lon))
    # cheap test first: one distance to the centre instead of one per accepted point
    if not geopy.distance.distance(center_pt, cand).km < radius:
        continue
    # condition is for over 100 meters from every point accepted so far
    if all(geopy.distance.distance(prev, cand).km > min_spacing_km for prev in accepted_pts):
        accepted_pts.append(cand)
        accepted_pos.append(pos)

if len(accepted_pos) < still_needed:
    raise ValueError(
        "only {} of the {} additional locations could be placed; "
        "reduce sample_size or the minimum spacing".format(len(accepted_pos), still_needed))

# append all accepted rows in one go (keeps their original index labels)
if accepted_pos:
    df = pd.concat([df, shuffled.iloc[accepted_pos]])

In [33]:
# preview the sampled locations (the index shows each row's label in citycop)
df.head()

Unnamed: 0,lat,lon,folder
426616,41.91223556641864,12.468976841845006,70075_48699
456842,41.91966668532343,12.471516234480076,70076_48695
525249,41.93958789524161,12.480835347570064,70080_48685
349020,41.89232346980918,12.55414365388117,70106_48708
439074,41.91525745061887,12.45454890110841,70070_48697


## Extracting list of lat/long pairs for plotting

In [34]:
# export df of sampled locations to csv
# (index=False: the row labels are not written, only lat, lon and folder)
df.to_csv('rome_2000sampled_locations.csv', index=False)

## Save list of folders to download from Mediaflux

In [35]:
# sorted list of the distinct folders to download
distinct_folders = df['folder'].drop_duplicates()
download = sorted(distinct_folders.tolist())

# write the list to a txt file, one folder per line
np.savetxt('rome_2000folders.txt', download, fmt="%s", delimiter="\n")

## Creating list of image names to extract 

In [36]:
# creating copy for extraction purposes
extr = df.copy()

# creating file paths for extraction: "<folder>/<lat>_<lon><suffix>",
# one column per rotation suffix
stem = extr.folder + '/' + extr.lat.astype(str) + '_' + extr.lon.astype(str)
rotation_suffixes = (
    ('rot1', '_0_0.jpg'),
    ('rot2', '_90_0.jpg'),
    ('rot3', '_180_0.jpg'),
    ('rot4', '_270_0.jpg'),
)
for column, suffix in rotation_suffixes:
    extr[column] = stem + suffix

In [37]:
# export df of sampled images to csv
# (index=False: the row labels are not written; rot1..rot4 are the last four columns)
extr.to_csv('rome_2000sampled_images.csv', index=False)

## Kelly: copying sampled images to specific folder

In [None]:
import os
from os import path
import shutil

In [None]:
# read in list of sampled images
# (the name must match the file written by the "export df of sampled images" cell,
# i.e. 'rome_2000sampled_images.csv'; adjust both if the sample size changes)
extr = pd.read_csv('rome_2000sampled_images.csv')

In [None]:
# converting filenames to list: every value of the last four columns
# (expected to be rot1..rot4), flattened column by column and sorted
image_cols = extr.iloc[:, -4:]
flattened = image_cols.to_numpy().T.ravel()
files = sorted(flattened.tolist())

In [None]:
# add your source/destination paths
# source: directory the relative image paths in `files` are joined onto
# destination: directory the images are copied into
source = "/your/source/dir/"
destination = "/your/destination/dir/"

In [None]:
# copy every listed image into the destination folder; a missing file is
# reported and skipped instead of stopping the whole run
for rel_path in files:
    src_path = path.join(source, rel_path)
    try:
        shutil.copy(src_path, destination)
    except FileNotFoundError as missing:
        print('CANNOT FIND: ', missing.filename)