# Data Aggregation

**Input:** streets_lat_long_curated.csv (contains the latitude and longitude points of the images downloaded in the crawler step)

**Output:** geo.csv (contains the census sector corresponding to each lat-long point)


In [1]:
import pandas as pd
import numpy as np
import geopandas as gpd
import matplotlib.pyplot as plt
from shapely.geometry import Point, Polygon
import itertools

from tqdm.notebook import tqdm 

In [2]:
# Load the curated latitude/longitude table (one row per downloaded image).
CURATED_POINTS_CSV = 'streets_lat_long_curated.csv'
df = pd.read_csv(CURATED_POINTS_CSV)

In [3]:
# Display the frame as loaded to check its columns and row count.
df

Unnamed: 0,city,direction,filename,latitude,longitude,name,pano_id,uf
0,APIAÍ,0,amYzj4DUMQbgHzB9M3a_WA-0.jpg,-24.514697,-48.846783,unknown,amYzj4DUMQbgHzB9M3a_WA,unknown
1,APIAÍ,90,amYzj4DUMQbgHzB9M3a_WA-90.jpg,-24.514697,-48.846783,unknown,amYzj4DUMQbgHzB9M3a_WA,unknown
2,APIAÍ,180,amYzj4DUMQbgHzB9M3a_WA-180.jpg,-24.514697,-48.846783,unknown,amYzj4DUMQbgHzB9M3a_WA,unknown
3,APIAÍ,270,amYzj4DUMQbgHzB9M3a_WA-270.jpg,-24.514697,-48.846783,unknown,amYzj4DUMQbgHzB9M3a_WA,unknown
4,APIAÍ,0,LoaIOwDck-H26nB1MZdD7Q-0.jpg,-24.514405,-48.846691,unknown,LoaIOwDck-H26nB1MZdD7Q,unknown
...,...,...,...,...,...,...,...,...
112363,DOUTOR ULYSSES,270,oSXOgE7j3mV9lfiPJMwySA-270.jpg,-24.646414,-49.425382,unknown,oSXOgE7j3mV9lfiPJMwySA,unknown
112364,DOUTOR ULYSSES,0,VknK6sWJIKFoVDOPr94viA-0.jpg,-24.625669,-49.418806,unknown,VknK6sWJIKFoVDOPr94viA,unknown
112365,DOUTOR ULYSSES,90,VknK6sWJIKFoVDOPr94viA-90.jpg,-24.625669,-49.418806,unknown,VknK6sWJIKFoVDOPr94viA,unknown
112366,DOUTOR ULYSSES,180,VknK6sWJIKFoVDOPr94viA-180.jpg,-24.625669,-49.418806,unknown,VknK6sWJIKFoVDOPr94viA,unknown


In [4]:
# Drop rows whose pano_id is the literal string 'pano_id' (presumably header
# lines repeated inside the CSV -- TODO confirm) and make sure both
# coordinate columns are floats.
# Filtering and casting in a single chain yields a new frame, so no column is
# assigned onto a filtered slice (the pattern that raises pandas'
# SettingWithCopyWarning).
df = (
    df[df['pano_id'] != 'pano_id']
    .astype({'latitude': float, 'longitude': float})
)

In [5]:
# Number of rows recorded for each panorama id.
pano_counts = df['pano_id'].value_counts()
# Panorama ids that appear in more than 16 rows.
repeated_pano = pano_counts.loc[pano_counts.gt(16)].index
# Show the counts for those ids (an empty Series means there are none).
pano_counts.loc[repeated_pano]

Series([], Name: pano_id, dtype: int64)

In [6]:
# Distribution of the `name` values stored for one sample panorama.
sample_pano_rows = df['pano_id'] == 'amYzj4DUMQbgHzB9M3a_WA'
df.loc[sample_pano_rows, 'name'].value_counts()

unknown    4
Name: name, dtype: int64

In [7]:
# Keep a single row per (panorama, direction) pair.
df = df.drop_duplicates(subset=['pano_id', 'direction'])
# Coordinate window: latitude below 10, longitude strictly between -75 and -25.
lat_in_window = df['latitude'] < 10
lon_in_window = (df['longitude'] > -75) & (df['longitude'] < -25)
df = df[lat_in_window & lon_in_window]

In [8]:
# Display the frame again after de-duplication and the coordinate filter.
df

Unnamed: 0,city,direction,filename,latitude,longitude,name,pano_id,uf
0,APIAÍ,0,amYzj4DUMQbgHzB9M3a_WA-0.jpg,-24.514697,-48.846783,unknown,amYzj4DUMQbgHzB9M3a_WA,unknown
1,APIAÍ,90,amYzj4DUMQbgHzB9M3a_WA-90.jpg,-24.514697,-48.846783,unknown,amYzj4DUMQbgHzB9M3a_WA,unknown
2,APIAÍ,180,amYzj4DUMQbgHzB9M3a_WA-180.jpg,-24.514697,-48.846783,unknown,amYzj4DUMQbgHzB9M3a_WA,unknown
3,APIAÍ,270,amYzj4DUMQbgHzB9M3a_WA-270.jpg,-24.514697,-48.846783,unknown,amYzj4DUMQbgHzB9M3a_WA,unknown
4,APIAÍ,0,LoaIOwDck-H26nB1MZdD7Q-0.jpg,-24.514405,-48.846691,unknown,LoaIOwDck-H26nB1MZdD7Q,unknown
...,...,...,...,...,...,...,...,...
112363,DOUTOR ULYSSES,270,oSXOgE7j3mV9lfiPJMwySA-270.jpg,-24.646414,-49.425382,unknown,oSXOgE7j3mV9lfiPJMwySA,unknown
112364,DOUTOR ULYSSES,0,VknK6sWJIKFoVDOPr94viA-0.jpg,-24.625669,-49.418806,unknown,VknK6sWJIKFoVDOPr94viA,unknown
112365,DOUTOR ULYSSES,90,VknK6sWJIKFoVDOPr94viA-90.jpg,-24.625669,-49.418806,unknown,VknK6sWJIKFoVDOPr94viA,unknown
112366,DOUTOR ULYSSES,180,VknK6sWJIKFoVDOPr94viA-180.jpg,-24.625669,-49.418806,unknown,VknK6sWJIKFoVDOPr94viA,unknown


In [9]:
# One shapely Point per row, built as (x, y) = (longitude, latitude).
lon_values = df['longitude'].astype(float)
lat_values = df['latitude'].astype(float)
geometry = list(map(Point, zip(lon_values, lat_values)))

In [10]:
# Country polygons from the 'naturalearth_lowres' sample shipped with
# geopandas, then the rows whose ISO alpha-3 code is 'BRA'.
# NOTE(review): `gpd.datasets` is deprecated in geopandas 0.14 and removed in
# 1.0 -- confirm the geopandas version this notebook is pinned to.
natural_earth_path = gpd.datasets.get_path('naturalearth_lowres')
world = gpd.read_file(natural_earth_path)
is_brazil = world['iso_a3'] == 'BRA'
brasil = world.loc[is_brazil]

In [13]:
# Combine the tabular data with the point list into a GeoDataFrame.
# No CRS is passed here, so the resulting frame carries none.
geo = gpd.GeoDataFrame(data=df, geometry=geometry)

In [25]:
def find_setor(row, setores=None):
    """Return the census-sector code of the first polygon containing a point.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must expose the point under the ``'geometry'`` key.
    setores : DataFrame-like, optional
        Table of sectors with ``'geometry'`` and ``'CD_GEOCODI'`` columns.
        When omitted, the notebook-level ``vale`` frame is used, so existing
        ``apply(find_setor, axis=1)`` calls keep working unchanged.

    Returns
    -------
    The ``'CD_GEOCODI'`` value of the first sector whose geometry contains
    the point, or ``np.nan`` when no sector does.
    """
    if setores is None:
        # NOTE(review): `vale` is not created in any visible cell; it has to
        # be loaded before this function is called without `setores`.
        setores = vale

    point = row['geometry']
    # Linear scan: sectors are tested in order until one contains the point.
    for _, setor in setores.iterrows():
        if point.within(setor['geometry']):
            return setor['CD_GEOCODI']
    # Plain fall-through instead of a loop `else`: the loop never breaks, so
    # the `else` branch ran whenever no sector matched anyway.
    return np.nan
    
# Register tqdm's `progress_apply` on pandas objects; the method only exists
# after this call, and no other cell in the notebook makes it.
tqdm.pandas()
# Assign a census-sector code (or NaN) to every point, one row at a time.
geo['setor'] = geo.progress_apply(find_setor, axis=1)
# Timing of a previous full run: 120772 rows in 3:09:21 (10.63 it/s).

HBox(children=(FloatProgress(value=0.0, max=112368.0), HTML(value='')))




## Saving data

In [26]:
# Write the points with their assigned sector to geo.csv, without the index.
# `index` is documented as a boolean, so pass False rather than None.
geo.to_csv('geo.csv', index=False)


In [27]:
# Per-sector count of rows that have a non-null `uf` value.
count_setor = geo['uf'].groupby(geo['setor']).count()

In [28]:
# Display the final frame, now including the `geometry` and `setor` columns.
geo

Unnamed: 0,city,direction,filename,latitude,longitude,name,pano_id,uf,geometry,setor
0,APIAÍ,0,amYzj4DUMQbgHzB9M3a_WA-0.jpg,-24.514697,-48.846783,unknown,amYzj4DUMQbgHzB9M3a_WA,unknown,POINT (-48.84678 -24.51470),3.502705e+14
1,APIAÍ,90,amYzj4DUMQbgHzB9M3a_WA-90.jpg,-24.514697,-48.846783,unknown,amYzj4DUMQbgHzB9M3a_WA,unknown,POINT (-48.84678 -24.51470),3.502705e+14
2,APIAÍ,180,amYzj4DUMQbgHzB9M3a_WA-180.jpg,-24.514697,-48.846783,unknown,amYzj4DUMQbgHzB9M3a_WA,unknown,POINT (-48.84678 -24.51470),3.502705e+14
3,APIAÍ,270,amYzj4DUMQbgHzB9M3a_WA-270.jpg,-24.514697,-48.846783,unknown,amYzj4DUMQbgHzB9M3a_WA,unknown,POINT (-48.84678 -24.51470),3.502705e+14
4,APIAÍ,0,LoaIOwDck-H26nB1MZdD7Q-0.jpg,-24.514405,-48.846691,unknown,LoaIOwDck-H26nB1MZdD7Q,unknown,POINT (-48.84669 -24.51441),3.502705e+14
...,...,...,...,...,...,...,...,...,...,...
112363,DOUTOR ULYSSES,270,oSXOgE7j3mV9lfiPJMwySA-270.jpg,-24.646414,-49.425382,unknown,oSXOgE7j3mV9lfiPJMwySA,unknown,POINT (-49.42538 -24.64641),4.128633e+14
112364,DOUTOR ULYSSES,0,VknK6sWJIKFoVDOPr94viA-0.jpg,-24.625669,-49.418806,unknown,VknK6sWJIKFoVDOPr94viA,unknown,POINT (-49.41881 -24.62567),4.128633e+14
112365,DOUTOR ULYSSES,90,VknK6sWJIKFoVDOPr94viA-90.jpg,-24.625669,-49.418806,unknown,VknK6sWJIKFoVDOPr94viA,unknown,POINT (-49.41881 -24.62567),4.128633e+14
112366,DOUTOR ULYSSES,180,VknK6sWJIKFoVDOPr94viA-180.jpg,-24.625669,-49.418806,unknown,VknK6sWJIKFoVDOPr94viA,unknown,POINT (-49.41881 -24.62567),4.128633e+14
