<style>
      h1, h2, h3, h4 {
        color: #bbb;
      }
      ol {
        counter-reset: item;
        font-size: 1.2rem;
        font-weight: 200;
      }
      li {
        display: block;
        color: #aaa;
      }
      li:before {
        content: counters(item, ".") " ";
        counter-increment: item;
      }
    </style>

<h1>Equalização dos CEP com Endereços</h1>
<ol>
    <li>importar as depedências</li>
    <li>configurar as variáveis de ambiente</li>
    <li>fazer download da base de dados
      <ol>
          <li>fazer a leitura da base com pandas</li>
      </ol>
    </li>
    <li>fazer a exploração da base
      <ol>
          <li>implementar lógica de exclusão de dados nulos</li>
          <li>fazer atualização da base de cep</li>
      </ol>
    </li>
    <li>criar a rotina de tratamento dos dados</li>
</ol>


<h3 style="color: #ccc">1. Importar as depedências de desenvolvimento</h3>

In [None]:
import os
import sys
import pandas as pd
import numpy as np
import json
import csv
import math

from unicodedata import normalize
from urllib import request
from datetime import datetime

sys.path.append(os.path.abspath(os.path.join('..')))

In [None]:
from services.version_service import VersionService
from keys.generate_key import GenerateKey as key
ver_service = VersionService()

In [None]:
STORAGE = os.path.join('..', 'storage')

In [None]:
FILENAME_TB_ESTABELECIMENTO = ver_service.get_last_version(key.FILENAME_TB_ESTABELECIMENTO, key.EXT_CSV, STORAGE)
FILENAME_CEP = ver_service.get_last_version(key.FILENAME_CEP, key.EXT_CSV, STORAGE)
FILENAME_JSON = ver_service.get_last_version(key.FILENAME_CEP,key.EXT_JSON, STORAGE)

<h3 style="color: #ccc">2. Configurar as variáveis de ambiente</h3>

In [None]:
environment = {}

with open(os.path.join('../', '.env'), encoding='utf-8') as env:
    files = env.readlines()
    
    for file in files:
        environment[str(file.split('=')[0])] = str(file.split('=')[1]).strip()
    env.close()

environment

In [108]:
# build url
def get_url(path: str) -> str:
    return '{}:{}{}'.format(environment.get('URL'), environment.get('PORT'), path)

# normalize unicode
def normalize_unicode(text: str) -> str:
    return normalize('NFKD', text).encode('ASCII','ignore').decode('ASCII')

# jon column and remove duplicated
def join_columns(text: str) -> str:
    return ' '.join(dict.fromkeys(normalize_unicode(text).lower().split()))

# combine column
def combine_columns(df: pd.DataFrame, col_source: str, col_target: str, delimiter:str = ' '):
    join = lambda s1, s2: join_columns(str(s1 + delimiter + s2))
    df[col_source] = df[col_source].combine(df[col_target], join)

# fillna columns
def filla_columns(df: pd.DataFrame, cols: list, value = ''):
    for col in cols:
        df[col].fillna(value=value, inplace=True)

# to lower case
def to_lower(df: pd.DataFrame, col: str):
    df[col] = df[col].str.lower()

# calculate distance euclidian
def distance_euclidian(coord_origin: tuple, coord_dest:tuple) -> float:
    return math.sqrt(((coord_origin[1] - coord_origin[0]) **2) + ((coord_dest[1] - coord_dest[0]) **2))

# calculate distance euclidian
def distance_euclidian2(coord_origin: tuple, coord_dest:tuple) -> float:
    return math.sqrt(math.pow((coord_origin[1] - coord_origin[0]), 2) + math.pow(coord_dest[1] - coord_dest[0], 2))

# calculate distance manhattan
def distance_manhatran(coord_origin: list, coord_dest:list) -> float:
    return  np.abs(coord_origin[1] - coord_origin[0]) + np.abs(coord_dest[1] - coord_dest[0])

def k_nearst(coord_origins, coord_dests, point):

    for i in range(len(coord_origins)):
        print(coord_origins[i])

<h3 style="color: #ccc">3. Fazer download da base de dados</h3>

In [None]:
# WARNING: run if not exist file
response = request.urlopen(get_url('/export-csv-cep'))
raw_data = response.read()
encoding = response.info().get_content_charset('utf8')
filename = ver_service.get_next_version(key.FILENAME_CEP, key.EXT_CSV, STORAGE)

with open(os.path.join(STORAGE, filename), 'w', encoding='utf-8') as csv:
    csv.writelines(raw_data.decode(encoding))
    csv.close()

<h4 style="color: #ccc">3.1 fazer a leitura da base com pandas</h4>

In [None]:
df_esta = pd.read_csv(os.path.join(STORAGE, FILENAME_TB_ESTABELECIMENTO), sep=';', encoding='utf-8', low_memory=False, dtype=str)
df_cep = pd.read_csv(os.path.join(STORAGE, FILENAME_CEP), sep=',', encoding='utf-8', low_memory=False, dtype=str)

<h3 style="color: #ccc">4. fazer a exploração da base</h3>

In [None]:
# total de registro e colunas
print('estabelecimento: {}'.format(df_esta.shape))
print(80*'#')
print('cep: {}'.format(df_cep.shape))

In [None]:
print(df_cep.info())
print(80*'#')
print(df_esta.info())

In [None]:
df_address = df_esta[['ID','CEP', 'BAIRRO', 'COMPLEMENTO', 'ESTADO', 'LOGRADOURO', 'NUMERO', 'TIPO_LOGRADOURO']]

<h3 style="color: #ccc">4.1 implementar lógica de exclusão de dados nulos</h3>

In [None]:
display(df_cep.isnull().sum())
print(80*"#")
display(df_address.isnull().sum())

In [None]:
# count occurence
display(df_address.value_counts())
print(80*'#')
display(df_cep.value_counts())

In [None]:
display(df_address['CEP'].isin(df_cep['cep']).value_counts())
print(80*'#')
display(df_cep['cep'].isin(df_address['CEP']).value_counts())

In [None]:
cep_left = df_esta[~df_esta['CEP'].isin(df_cep['cep'])]
cep_left = cep_left[['CEP', 'BAIRRO', 'COMPLEMENTO', 'ESTADO', 'LOGRADOURO', 'NUMERO']]
columns={'CEP':'cep','BAIRRO':'bairro','COMPLEMENTO':'complemento','LOGRADOURO':'lougradouro','NUMERO':'numero','ESTADO':'estado'}
cep_left.rename(columns=columns, inplace=True)
cep_left['cidade_estado'] = cep_left['bairro']

In [None]:
filla_columns(cep_left, ['estado', 'bairro', 'complemento', 'cidade_estado'])
filla_columns(cep_left, ['cep'], '00000000')

In [None]:
combine_columns(cep_left, 'cidade_estado', 'estado', '/')
combine_columns(cep_left, 'lougradouro', 'numero', ', ')

In [None]:
cep_left = cep_left[cep_left.columns.difference(['estado', 'numero'])]

In [None]:
df_cep_updated = pd.concat([df_cep, cep_left])

In [None]:
display(df_cep_updated)

In [None]:
print('check merge: {}'.format(df_cep_updated.shape[0] - df_cep.shape[0] == cep_left.shape[0]))

<h3 style="color: #ccc">4.2 Fazer atualização da base de cep</h3>

In [None]:
latest_cep_version = ver_service.get_next_version(key.FILENAME_CEP, key.EXT_CSV, os.path.join(STORAGE))
latest_cep_version

In [None]:
json.dumps(df_cep_updated.iloc[0].to_dict(), ensure_ascii=False)

In [None]:
df_cep_updated.to_csv(os.path.join(STORAGE, latest_cep_version), index=False)

In [None]:
FILENAME_CEP = ver_service.get_last_version(key.FILENAME_CEP, key.EXT_CSV, STORAGE)
df_cep = pd.read_csv(os.path.join(STORAGE, FILENAME_CEP), sep=',', encoding='utf-8', low_memory=False, dtype=str)

In [None]:
df_cep

In [None]:
distance_euclidian((0.2, 0.1), (0.3, -0.4))

In [None]:
distance_euclidian2((0.2, 0.1), (0.3, -0.4))

In [None]:
distance_manhatran((0.2, 0.1), (0.3, -0.4))

In [110]:
users = 1000
 # Gera os pontos de base, aleatoriamente
lat_origin, long_origin = np.random.rand(2, 1)

# Gera o ponto k, também aleatoriamente
lat_dest, long_dest = np.random.rand(2, 10)

In [119]:
long_dest

array([0.29434753, 0.86545192, 0.40304582, 0.92200864, 0.42387968,
       0.58343485, 0.50499207, 0.3630296 , 0.15657585, 0.24866706])

In [140]:
distances = {}
for i in range(len(lat_dest)):
    dist = np.sqrt(np.power(long_origin[0] - lat_origin[0], 2) + np.power(long_dest[i] - lat_dest[i], 2))
    distances[dist] = [long_dest[i], lat_dest[i]]

distances_sorted = sorted(distances)

pX = []
pY = []
n = np.min([10, len(lat_dest)])

for i in range(n):
    pX.append(distances[distances_sorted[i]][0])
    pY.append(distances[distances_sorted[i]][1])

pX, pY

([0.922008637271035,
  0.2943475278109503,
  0.1565758459534905,
  0.2486670601368588,
  0.4238796824745579,
  0.5049920702348204,
  0.5834348477648943,
  0.363029597763903,
  0.4030458161764987,
  0.8654519169742924],
 [0.917387516098454,
  0.21677063326327017,
  0.026811348143088942,
  0.11159770166475524,
  0.15901443706579366,
  0.22591422144514262,
  0.20576162237286288,
  0.8222203635132157,
  0.876568848197626,
  0.06840906189440088])

In [146]:
# Número total de pontos (base)
tam = 100
# Número desejado de pontos mais próximos
n = 10

 # Gera os pontos de base, aleatoriamente
bx, by = np.random.rand(2, tam)

# Gera o ponto k, também aleatoriamente
kx, ky = np.random.rand(2, 1)

distancias = {}
for i in range(len(bx)):
    distancia, = np.sqrt(np.power(bx[i] - kx, 2) + np.power(by[i] - ky, 2))
    distancias[distancia] = [bx[i], by[i]]

# Organiza as distâncias em ordem crescente
ordenadas = sorted(distancias)

# Então, devolve os pontos nas n primeiras entradas
# da lista ordenada (pegando as coordenadas do dicionário)
pX = []
pY = []
n = np.min([n, len(bx)])
for i in range(n):
    coords = distancias[ordenadas[i]]
    pX.append(coords[0])
    pY.append(coords[1])

pX, pY


([0.009814393018307443,
  0.031230690090121138,
  0.0863289236282837,
  0.2266767130402677,
  0.006802558010464588,
  0.25451019494356186,
  0.16747246531219973,
  0.03806380525801756,
  0.25731887333953873,
  0.07750635718386989],
 [0.01826728634599739,
  0.17394330445316697,
  0.1742162285479777,
  0.10109142953346328,
  0.27631628401214614,
  0.09545924569210595,
  0.2639432262953745,
  0.31688102792749584,
  0.22557616446280782,
  0.34464384916968593])