In [1]:
import pandas as pd
import numpy as np
import matplotlib as plt 

%matplotlib inline 

from urllib.request import urlopen
from bs4 import BeautifulSoup
from googlemaps import Client as GoogleMaps
import re

In [2]:
url = "https://es.wikipedia.org/wiki/Anexo:Municipios_de_Brasil"
html = urlopen(url)

In [3]:
soup = BeautifulSoup(html)

In [4]:
data = []
all_rows = soup.find_all('tr')
for row in all_rows :
    row_list = row.find_all('td')
    DataRow = []
    for cell in row_list : 
        DataRow.append(cell.text)
    data.append(DataRow)
data = data[1:]
print(data[-1:])

[['310', 'Leme', '\xa0São Paulo', '101\xa0184\n']]


In [5]:
df = pd.DataFrame(data)
df.head()

Unnamed: 0,0,1,2,3
0,1,São Paulo,São Paulo,12 106 920\n
1,2,Río de Janeiro,Río de Janeiro,6 520 266\n
2,3,Brasilia,Distrito Federal,3 039 444\n
3,4,Salvador de Bahía,Bahía,2 953 986\n
4,5,Fortaleza,Ceará,2 627 482\n


In [8]:
header_list = []
col_header = soup.find_all('th')
for col in col_header : 
    header_list.append(col.text)
print(header_list)

['Posición\n', 'Municipio\n', 'Unidad federada\n', 'Población[4]\u200b\n']


In [9]:
df.columns = header_list
df

Unnamed: 0,Posición\n,Municipio\n,Unidad federada\n,Población[4]​\n
0,1,São Paulo,São Paulo,12 106 920\n
1,2,Río de Janeiro,Río de Janeiro,6 520 266\n
2,3,Brasilia,Distrito Federal,3 039 444\n
3,4,Salvador de Bahía,Bahía,2 953 986\n
4,5,Fortaleza,Ceará,2 627 482\n
...,...,...,...,...
305,306,Catalão,Goiás,102 393\n
306,307,Lavras,Minas Gerais,102 124\n
307,308,São Gonçalo do Amarante,Río Grande del Norte,101 492\n
308,309,Japeri,Río de Janeiro,101 237\n


In [10]:
df.shape

(310, 4)

In [11]:
df = df[df['Población[4]\u200b\n']!= '<NA>']
df.shape

(310, 4)

In [12]:
df['Población[4]\u200b\n'] = df['Población[4]\u200b\n'].astype('string')
df['Municipio\n'] = df['Municipio\n'].astype('string')
df['Población[4]\u200b\n'] = [float(str(val).replace(' ','').replace('\xa0','').replace('\n','').replace('.','').replace('[5]','').replace('\u200b','')) for val in df['Población[4]\u200b\n'].values]
df['Municipio\n'] = [str(str(val).replace('\n','')) for val in df['Municipio\n'].values]
df['Municipio\n'] = df['Municipio\n'].astype('string')
df.dtypes

Posición\n            object
Municipio\n           string
Unidad federada\n     object
Población[4]​\n      float64
dtype: object

In [14]:
df = df.loc[ df['Población[4]\u200b\n'] >= 200000]
df.head()

Unnamed: 0,Posición\n,Municipio\n,Unidad federada\n,Población[4]​\n
0,1,São Paulo,São Paulo,12106920.0
1,2,Río de Janeiro,Río de Janeiro,6520266.0
2,3,Brasilia,Distrito Federal,3039444.0
3,4,Salvador de Bahía,Bahía,2953986.0
4,5,Fortaleza,Ceará,2627482.0


In [15]:
df = df.set_index('Posición\n')
df.head()

Unnamed: 0_level_0,Municipio\n,Unidad federada\n,Población[4]​\n
Posición,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,São Paulo,São Paulo,12106920.0
2,Río de Janeiro,Río de Janeiro,6520266.0
3,Brasilia,Distrito Federal,3039444.0
4,Salvador de Bahía,Bahía,2953986.0
5,Fortaleza,Ceará,2627482.0


In [16]:
gmaps = GoogleMaps('AIzaSyBmPY1vTvXaWtekpNOtb4AOVzH27NxA_BA')

In [17]:
df['long'] = ""
df['lat'] = ""

In [18]:
for x in range(len(df)):
    try:
        geocode_result = gmaps.geocode(df['Municipio\n'][x])
        df['lat'][x] = geocode_result[0]['geometry']['location'] ['lat']
        df['long'][x] = geocode_result[0]['geometry']['location']['lng']
    except IndexError:
        print("Address was wrong...")
    except Exception as e:
        print("Unexpected error occurred.", e )
df.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['lat'][x] = geocode_result[0]['geometry']['location'] ['lat']
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['long'][x] = geocode_result[0]['geometry']['location']['lng']


Unnamed: 0_level_0,Municipio\n,Unidad federada\n,Población[4]​\n,long,lat
Posición,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,São Paulo,São Paulo,12106920.0,-46.6333,-23.5505
2,Río de Janeiro,Río de Janeiro,6520266.0,-43.1729,-22.9068
3,Brasilia,Distrito Federal,3039444.0,-47.9218,-15.8267
4,Salvador de Bahía,Bahía,2953986.0,-38.5016,-12.9777
5,Fortaleza,Ceará,2627482.0,-38.527,-3.73271


In [19]:
df.to_excel('Brasil.xlsx')