In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np

# Rio de Janeiro neighbourhoods by Human Development Index (IDH)

In [2]:
# Portuguese-Wikipedia page that is downloaded and scraped below
# (URL slug: list of Rio de Janeiro neighbourhoods by IDH).
url_idh = 'https://pt.wikipedia.org/wiki/Lista_de_bairros_do_Rio_de_Janeiro_por_IDH'

In [3]:
# Download the page. A timeout keeps the cell from hanging forever on a
# stalled connection, and raise_for_status() stops the notebook on an HTTP
# error instead of silently parsing an error page.
response_2 = requests.get(url_idh, timeout=30)
response_2.raise_for_status()
source_2 = response_2.text

# Parse the HTML with the lxml backend.
soup_2 = BeautifulSoup(source_2, 'lxml')

In [4]:
# Find table from wiki: take the first <table> carrying the 'wikitable' class.
table_2 = soup_2.find('table', class_='wikitable')
if table_2 is None:
    # find() returns None when nothing matches; fail with a clear message
    # rather than an AttributeError on the next line.
    raise ValueError("No table with class 'wikitable' found at " + url_idh)

# All rows of that table, header rows included.
trs_2 = table_2.find_all('tr')

In [5]:
# TABLE TO LIST of Lists: one inner list per <tr>, one entry per <td>.
# Rows without any <td> (e.g. header rows) become empty lists.
idh = []
for tr in trs_2:
    row = []
    for td in tr.find_all('td'):
        cell_text = td.text
        if cell_text == '':
            # Empty cell -> missing value.
            row.append(None)
        elif cell_text[0] in '0123456789':
            # Cell starting with a digit: trim any of the characters
            # '\n', '[', 'a', 'b', 'c', ']' from both ends
            # (strip() takes a character set, not a literal suffix).
            row.append(cell_text.strip('\n[abc]'))
        else:
            # Any other cell is kept verbatim.
            row.append(cell_text)
    idh.append(row)

## Table idhs to Dataframe 

In [6]:
# DATA FRAME FROM LIST
# No column names are given, so the columns are labelled 0, 1, 2, ...
idhs = pd.DataFrame(idh)
# Preview the first rows.
idhs.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,,,,,,,,,,
1,,,,,,,,,,
2,1.0,Gávea,8045.0,9808.0,11813.0,213956.0,924.0,987.0,1000.0,970.0
3,2.0,Leblon,7947.0,9901.0,10518.0,244128.0,908.0,993.0,1000.0,967.0
4,3.0,Jardim Guanabara,8047.0,9892.0,11115.0,131686.0,924.0,993.0,972.0,963.0


# Format Data

In [7]:
# Remove rows that contain at least one missing cell
# (this includes the all-empty rows at the top of the frame).
idhs = idhs.dropna()

In [8]:
# Give the positional columns their names, then discard ALFA and FREQ.
column_names = {
    0: 'index',
    1: 'BAIRRO',
    2: 'ESPER',
    3: 'ALFA',
    4: 'FREQ',
    5: 'RENDA',
    6: 'IDHL',
    7: 'IDHE',
    8: 'IDHR',
    9: 'IDH',
}
idhs = (
    idhs
    .rename(columns=column_names)
    .drop(columns=['ALFA', 'FREQ'])
)
idhs.head(10)

Unnamed: 0,index,BAIRRO,ESPER,RENDA,IDHL,IDHE,IDHR,IDH
2,1,Gávea,8045,213956,924,987,1000,970
3,2,Leblon,7947,244128,908,993,1000,967
4,3,Jardim Guanabara,8047,131686,924,993,972,963
5,4,Ipanema,7868,246545,895,992,1000,962
6,5,Lagoa,7791,295529,882,996,1000,959
7,6,Flamengo,7791,178171,882,995,1000,959
8,7,Humaitá,7791,183065,882,995,1000,959
9,8,"Barra da Tijuca, Joá",7784,248847,881,996,1000,959
10,9,Laranjeiras,7784,167922,881,992,1000,957
11,10,Jardim Botânico,7784,195277,881,991,1000,957


###  "Split (explode) pandas dataframe string entry to separate rows"
https://stackoverflow.com/questions/12680754/split-explode-pandas-dataframe-string-entry-to-separate-rows

In [9]:
# Useful helper for exploding delimited strings into rows (from StackOverflow)
def tidy_split(df, column, sep='|', keep=False):
    """Split the strings in `column` on `sep` and return one row per piece.

    Rows whose `column` value is missing are dropped first. The remaining
    values are cast to str and split; every piece gets its own copy of the
    source row, so the other columns and the index label are repeated.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    column : str
        Name of the column holding the delimited strings.
    sep : str, default '|'
        Delimiter passed to str.split.
    keep : bool, default False
        When True, a value that splits into more than one piece is also
        kept unsplit, as an extra row placed before its pieces.

    Returns
    -------
    pandas.DataFrame
        A new frame with `column` replaced by the individual pieces.
    """
    df = df.dropna(subset=[column])

    row_positions = []
    pieces = []
    for position, text in enumerate(df[column].astype(str)):
        parts = text.split(sep)
        if keep and len(parts) > 1:
            row_positions.append(position)
            pieces.append(text)
        # One output row per piece, all pointing at the same source row.
        row_positions.extend([position] * len(parts))
        pieces.extend(parts)

    exploded = df.iloc[row_positions, :].copy()
    exploded[column] = pieces
    return exploded

In [10]:
# Rows that list several neighbourhoods separated by commas become one row
# per neighbourhood; the split rows keep the same index label and values.
idhs = tidy_split(idhs, 'BAIRRO', sep=',')
idhs.head(20)

Unnamed: 0,index,BAIRRO,ESPER,RENDA,IDHL,IDHE,IDHR,IDH
2,1,Gávea,8045,213956,924,987,1000,970
3,2,Leblon,7947,244128,908,993,1000,967
4,3,Jardim Guanabara,8047,131686,924,993,972,963
5,4,Ipanema,7868,246545,895,992,1000,962
6,5,Lagoa,7791,295529,882,996,1000,959
7,6,Flamengo,7791,178171,882,995,1000,959
8,7,Humaitá,7791,183065,882,995,1000,959
9,8,Barra da Tijuca,7784,248847,881,996,1000,959
9,8,Joá,7784,248847,881,996,1000,959
10,9,Laranjeiras,7784,167922,881,992,1000,957


In [11]:
# Strip leading/trailing whitespace from BAIRRO (inner spaces are kept),
# e.g. the space that follows the comma in a split entry.
idhs['BAIRRO'] = idhs['BAIRRO'].str.strip()

**ADD MISSING DATA**

In [12]:
# Set the neighbourhood name of the row labelled 62 to 'Freguesia de Jacarepaguá'.
# A single .loc[row, column] assignment is required here: the chained form
# idhs.loc[62]['BAIRRO'] = ... assigns into an intermediate object and is not
# guaranteed to modify idhs (SettingWithCopy / copy-on-write).
# NOTE(review): 62 is an index label, not a position. If several rows share
# that label after the comma split, all of them are renamed - confirm.
idhs.loc[62, 'BAIRRO'] = 'Freguesia de Jacarepaguá'

In [13]:
# Insert Grajaú and Quintino Bocaiuva by hand.
# Numbers are written as comma-decimal strings, the same format as the
# scraped cells, so the later string -> float conversion handles them too.
# ALFA and FREQ are left out because those columns were already dropped.
miss = pd.DataFrame({
    'index': [16, 55],
    'BAIRRO': ['Grajaú', 'Quintino Bocaiuva'],
    'ESPER': ['77,84', '73,91'],
    'RENDA': ['1134,93', '424,67'],
    'IDHL': ['0,881', '0,815'],
    'IDHE': ['0,986', '0,952'],
    'IDHR': ['0,947', '0,783'],
    'IDH': ['0,938', '0,850'],
})

# DataFrame.append was removed in pandas 2.0; pd.concat is the supported
# equivalent. ignore_index=True renumbers the rows 0..n-1.
idhs = pd.concat([idhs, miss], ignore_index=True)

In [14]:
# The scraped 'index' column is not needed any more.
idhs = idhs.drop(columns=['index'])

In [15]:
idhs.head()

Unnamed: 0,BAIRRO,ESPER,RENDA,IDHL,IDHE,IDHR,IDH
0,Gávea,8045,213956,924,987,1000,970
1,Leblon,7947,244128,908,993,1000,967
2,Jardim Guanabara,8047,131686,924,993,972,963
3,Ipanema,7868,246545,895,992,1000,962
4,Lagoa,7791,295529,882,996,1000,959


### UPPER CASE 

In [16]:
# Upper-case the names; the accent replacements that follow only list
# upper-case letters, so this must run first.
idhs['BAIRRO'] = idhs['BAIRRO'].str.upper()

In [17]:
# Replace the accented upper-case letters Á É Í Ó Ú Â Ê Ã and Ç with their
# plain ASCII counterparts in a single pass (other characters are untouched).
accent_table = str.maketrans('ÁÉÍÓÚÂÊÃÇ', 'AEIOUAEAC')
idhs['BAIRRO'] = idhs['BAIRRO'].str.translate(accent_table)

### String -> FLOAT

In [18]:
# Convert the comma-decimal strings (e.g. '77,84') to floats, one column at a time.
for numeric_col in ['ESPER', 'RENDA', 'IDHL', 'IDHE', 'IDHR', 'IDH']:
    idhs[numeric_col] = idhs[numeric_col].apply(
        lambda raw: float(raw.replace(',', '.'))
    )

In [19]:
idhs.head()

Unnamed: 0,BAIRRO,ESPER,RENDA,IDHL,IDHE,IDHR,IDH
0,GAVEA,80.45,2139.56,0.924,0.987,1.0,0.97
1,LEBLON,79.47,2441.28,0.908,0.993,1.0,0.967
2,JARDIM GUANABARA,80.47,1316.86,0.924,0.993,0.972,0.963
3,IPANEMA,78.68,2465.45,0.895,0.992,1.0,0.962
4,LAGOA,77.91,2955.29,0.882,0.996,1.0,0.959


# Join DataFrame with Bairros.csv

In [20]:
# Load the neighbourhood reference table from the local file 'bairros.csv';
# its first column is used as the row index.
bair = pd.read_csv('bairros.csv',index_col=0)

In [21]:
bair.head()

Unnamed: 0,BAIRRO,SUB,ZONA,LAT,LON
0,SAO CRISTOVAO,CENTRO HISTÓRICO E ZONA PORTUÁRIA,CENTRAL,-22.9002,-43.23024
1,BENFICA,CENTRO HISTÓRICO E ZONA PORTUÁRIA,CENTRAL,-22.88989,-43.24562
2,CAJU,CENTRO HISTÓRICO E ZONA PORTUÁRIA,CENTRAL,-22.87992,-43.22218
3,CATUMBI,CENTRO HISTÓRICO E ZONA PORTUÁRIA,CENTRAL,-22.91759,-43.19704
4,CENTRO,CENTRO HISTÓRICO E ZONA PORTUÁRIA,CENTRAL,-22.908076,-43.182324


In [22]:
# Inner join on the neighbourhood name: only names present in BOTH tables
# survive, so any BAIRRO spelled differently in one of them is dropped silently.
result = bair.merge(idhs, on='BAIRRO', how='inner')

In [23]:
result.head()

Unnamed: 0,BAIRRO,SUB,ZONA,LAT,LON,ESPER,RENDA,IDHL,IDHE,IDHR,IDH
0,SAO CRISTOVAO,CENTRO HISTÓRICO E ZONA PORTUÁRIA,CENTRAL,-22.9002,-43.23024,72.27,412.39,0.788,0.933,0.778,0.833
1,BENFICA,CENTRO HISTÓRICO E ZONA PORTUÁRIA,CENTRAL,-22.88989,-43.24562,73.59,376.65,0.81,0.901,0.763,0.825
2,CAJU,CENTRO HISTÓRICO E ZONA PORTUÁRIA,CENTRAL,-22.87992,-43.22218,68.9,236.59,0.732,0.843,0.685,0.753
3,CATUMBI,CENTRO HISTÓRICO E ZONA PORTUÁRIA,CENTRAL,-22.91759,-43.19704,69.6,324.83,0.743,0.923,0.738,0.802
4,CENTRO,CENTRO HISTÓRICO E ZONA PORTUÁRIA,CENTRAL,-22.908076,-43.182324,76.12,633.36,0.852,0.981,0.85,0.894


# Visualizing IDHS DATA

In [24]:
import folium
from branca.colormap import linear

In [25]:
# Centre point (decimal degrees) passed as `location` to every folium.Map below.
latitude = -22.917421
longitude = -43.425574

# IDH -  RENDA MAP

In [26]:
# COLOR MAP IDHR
# Value range of IDHR in the merged table.
mini = result['IDHR'].min()
maxi = result['IDHR'].max()
# Midpoint and the two quarter points of [mini, maxi]: together with the ends
# they give five boundaries, i.e. four equal-width colour bins.
z = (mini+maxi)/2
za = (z + mini)/2
zb = (z + maxi)/2
# Green scale spanning the range, then turned into steps at those boundaries.
colormap = linear.Greens_09.scale(mini,maxi)
colormap = colormap.to_step(index=[mini,za,z,zb,maxi])
# Show the colour scale as the cell output.
colormap

In [27]:
# Create the map centred on [latitude, longitude] defined earlier
# (the Rio de Janeiro data set - not New York, as this comment used to say).
map_rj = folium.Map(location=[latitude, longitude], zoom_start=11)

# Add one circle marker per row of `result`; the popup shows "name, IDHR"
# and the fill colour is the IDHR value looked up in `colormap`.
for lat, lng, bairro, idh in zip(result['LAT'], result['LON'], result['BAIRRO'], result['IDHR']):
    label = '{}, {}'.format(bairro, idh)
    label = folium.Popup(label, parse_html=True)
        
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='black',
        fill=True,
        fill_color= colormap(idh),
        fill_opacity=0.7,
        # NOTE(review): parse_html is used above as a folium.Popup argument;
        # confirm that CircleMarker accepts (or ignores) it.
        parse_html=False).add_to(map_rj)  


# Caption the colour scale and add it to the map.
colormap.caption = 'IDHR NEIGHBOURHOOD RJ'
colormap.add_to(map_rj)

    
# Display the map as the cell output.
map_rj

# IDH

In [28]:
# COLOR MAP IDH
# Value range of IDH in the merged table.
mini = result['IDH'].min()
maxi = result['IDH'].max()
# Midpoint and the two quarter points of [mini, maxi]: together with the ends
# they give five boundaries, i.e. four equal-width colour bins.
z = (mini+maxi)/2
za = (z + mini)/2
zb = (z + maxi)/2
# Blue scale spanning the range, then turned into steps at those boundaries.
colormap = linear.Blues_09.scale(mini,maxi)
colormap = colormap.to_step(index=[mini,za,z,zb,maxi])
# Show the colour scale as the cell output.
colormap

In [29]:
# Create the map centred on [latitude, longitude] defined earlier
# (the Rio de Janeiro data set - not New York, as this comment used to say).
# NOTE(review): the other maps use zoom_start=11; confirm 10.5 is intentional.
map_rj = folium.Map(location=[latitude, longitude], zoom_start=10.5)

# Add one circle marker per row of `result`; the popup shows "name, IDH"
# and the fill colour is the IDH value looked up in `colormap`.
for lat, lng, bairro, idh in zip(result['LAT'], result['LON'], result['BAIRRO'], result['IDH']):
    label = '{}, {}'.format(bairro, idh)
    label = folium.Popup(label, parse_html=True)
        
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='black',
        fill=True,
        fill_color= colormap(idh),
        fill_opacity=0.7,
        # NOTE(review): parse_html is used above as a folium.Popup argument;
        # confirm that CircleMarker accepts (or ignores) it.
        parse_html=False).add_to(map_rj)  


# Caption the colour scale and add it to the map.
colormap.caption = 'IDH NEIGHBOURHOOD RJ'
colormap.add_to(map_rj)

    
# Display the map as the cell output.
map_rj

# Life expectancy MAP

In [30]:
# COLOR MAP Life Expectancy
# Value range of ESPER in the merged table.
mini = result['ESPER'].min()
maxi = result['ESPER'].max()
# Midpoint and the two quarter points of [mini, maxi]: together with the ends
# they give five boundaries, i.e. four equal-width colour bins.
z = (mini+maxi)/2
za = (z + mini)/2
zb = (z + maxi)/2
# Red scale spanning the range, then turned into steps at those boundaries.
colormap = linear.Reds_09.scale(mini,maxi)
colormap = colormap.to_step(index=[mini,za,z,zb,maxi])
# Show the colour scale as the cell output.
colormap

In [31]:
# Create the map centred on [latitude, longitude] defined earlier
# (the Rio de Janeiro data set - not New York, as this comment used to say).
map_rj = folium.Map(location=[latitude, longitude], zoom_start=11)

# Add one circle marker per row of `result`; the popup shows "name, ESPER"
# and the fill colour is the ESPER value looked up in `colormap`.
for lat, lng, bairro, life in zip(result['LAT'], result['LON'], result['BAIRRO'], result['ESPER']):
    label = '{}, {}'.format(bairro, life)
    label = folium.Popup(label, parse_html=True)
        
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='black',
        fill=True,
        fill_color= colormap(life),
        fill_opacity=0.7,
        # NOTE(review): parse_html is used above as a folium.Popup argument;
        # confirm that CircleMarker accepts (or ignores) it.
        parse_html=False).add_to(map_rj)  


# Caption the colour scale and add it to the map.
colormap.caption = 'Life Expectancy NEIGHBOURHOOD RJ'
colormap.add_to(map_rj)

    
# Display the map as the cell output.
map_rj

# Average Income MAP

In [32]:
# COLOR MAP INCOME
# Value range of RENDA in the merged table.
mini = result['RENDA'].min()
maxi = result['RENDA'].max()
# Midpoint and the two quarter points of [mini, maxi]: together with the ends
# they give five boundaries, i.e. four equal-width colour bins.
z = (mini+maxi)/2
za = (z + mini)/2
zb = (z + maxi)/2
# Red scale spanning the range, then turned into steps at those boundaries.
# NOTE(review): same palette (Reds_09) as the life-expectancy map - confirm intended.
colormap = linear.Reds_09.scale(mini,maxi)
colormap = colormap.to_step(index=[mini,za,z,zb,maxi])
# Show the colour scale as the cell output.
colormap

In [33]:
# Create the map centred on [latitude, longitude] defined earlier
# (the Rio de Janeiro data set - not New York, as this comment used to say).
map_rj = folium.Map(location=[latitude, longitude], zoom_start=11)

# Add one circle marker per row of `result`; the popup shows "name, RENDA"
# and the fill colour is the RENDA value looked up in `colormap`.
for lat, lng, bairro, income in zip(result['LAT'], result['LON'], result['BAIRRO'], result['RENDA']):
    label = '{}, {}'.format(bairro, income)
    label = folium.Popup(label, parse_html=True)
        
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='black',
        fill=True,
        fill_color= colormap(income),
        fill_opacity=0.7,
        # NOTE(review): parse_html is used above as a folium.Popup argument;
        # confirm that CircleMarker accepts (or ignores) it.
        parse_html=False).add_to(map_rj)  


# Caption the colour scale and add it to the map.
colormap.caption = 'AVG INCOME NEIGHBOURHOOD RJ'
colormap.add_to(map_rj)

    
# Display the map as the cell output.
map_rj

# EXPORT DATASET

In [34]:
# Keep only the neighbourhood name and the indicator columns for the export.
result = result.drop(columns=['ZONA', 'SUB', 'LAT', 'LON'])

In [35]:
result.head()

Unnamed: 0,BAIRRO,ESPER,RENDA,IDHL,IDHE,IDHR,IDH
0,SAO CRISTOVAO,72.27,412.39,0.788,0.933,0.778,0.833
1,BENFICA,73.59,376.65,0.81,0.901,0.763,0.825
2,CAJU,68.9,236.59,0.732,0.843,0.685,0.753
3,CATUMBI,69.6,324.83,0.743,0.923,0.738,0.802
4,CENTRO,76.12,633.36,0.852,0.981,0.85,0.894


In [36]:
# Write the final table to 'idhs.csv' in the working directory. No index=
# argument is given, so pandas' default applies and the row index is written
# as the first column.
result.to_csv('idhs.csv')