In [1]:
import pandas as pd
import geopy.distance
import os
from os import path
import shutil

## Data processing

In [2]:
# read in city csv file
city = pd.read_csv('Paris.csv')

In [3]:
# drop first columns
city.drop(city.iloc[:, 0:2], inplace=True, axis=1)

# drop date/time
del city['V2']
del city['V3']

In [4]:
# add lat and lon columns to df
city['lat'] = city.V4.str.split("_", expand=True)[0]
city['lon'] = city.V4.str.split("_", expand=True)[1]

In [5]:
city.head()

Unnamed: 0,V1,V4,lat,lon
0,28526,48.9503188807609_2.0679242344190243_180_0.jpg,48.9503188807609,2.0679242344190243
1,32843,48.95115437952288_2.0663042176388444_270_0.jpg,48.95115437952288,2.0663042176388444
2,13724,48.95138626484626_2.065878108753509_0_90.jpg,48.95138626484626,2.065878108753509
3,32109,48.95036067749351_2.0654597799770045_0_270.jpg,48.95036067749351,2.0654597799770045
4,11731,48.95066072080098_2.0682293214905485_0_90.jpg,48.95066072080098,2.0682293214905485


## Sampling

Add remove out of bounds directly in sampling code (too long of a preprocessing step)

In [6]:
# enter city info
center_pt = (48.85, 2.35)
radius = 13.78261445

# enter required sample size
sample_size = 10

In [7]:
# create samples 1 and 2
df = city.sample(n = 1)
sam = city.sample(n = 1)

for i in range(sample_size-1): 
    # condition is for over 100 meters and within city boundaries
    while not ((df.apply(lambda row: geopy.distance.distance((float(row.lat), float(row.lon)), (float(sam.lat), float(sam.lon))).km > 0.1, axis=1).all()) and (geopy.distance.distance(center_pt, (float(sam.lat), float(sam.lon))).km < radius)):
        sam = city.sample(n = 1)
    else:
        df = pd.concat([df, sam])

In [8]:
df.head()

Unnamed: 0,V1,V4,lat,lon
426616,34660,48.89899618234978_2.1107108530319465_180_0.jpg,48.89899618234978,2.1107108530319465
3790588,63656,48.79417908749144_2.506929716870104_180_0.jpg,48.79417908749144,2.506929716870104
2220000,50861,48.87056259587693_2.4436010617713464_180_0.jpg,48.87056259587693,2.4436010617713464
1490411,42408,48.8452246498811_2.1845395320407306_270_0.jpg,48.8452246498811,2.1845395320407306
3724587,39347,48.79493063380667_2.5022425997951814_180_0.jpg,48.79493063380667,2.5022425997951814


## Print list of folders to download from Mediaflux

In [9]:
# print sorted list of folders to download
download = sorted(df.V1.unique().tolist())
download

[26115, 32730, 34660, 35452, 39347, 42408, 45007, 45302, 50861, 63656]

## Creating list of image names to extract 

In [10]:
# creating copy for extraction purposes
extr = df.copy()

# creating file paths for extraction
extr['rot1'] = extr.V1.astype(str) + '/' + extr.lat.astype(str) + '_' + extr.lon.astype(str) + '_0_0.jpg'
extr['rot2'] = extr.V1.astype(str) + '/' + extr.lat.astype(str) + '_' + extr.lon.astype(str) + '_90_0.jpg'
extr['rot3'] = extr.V1.astype(str) + '/' + extr.lat.astype(str) + '_' + extr.lon.astype(str) + '_180_0.jpg'
extr['rot4'] = extr.V1.astype(str) + '/' + extr.lat.astype(str) + '_' + extr.lon.astype(str) + '_270_0.jpg'

In [11]:
# converting filenames to list
files = sorted(extr.iloc[:, -4:].values.T.ravel().tolist())

## Copying sampled images to specific folder

In [None]:
source = "/User/GSV/Paris/"
destination = "/User/GSV/model_test_images/"

In [None]:
for f in files:
    shutil.copy(path.join(source, f), destination)