In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
import os
import sys

from matplotlib import pyplot as plt
from dotenv import load_dotenv

load_dotenv()
sns.color_palette('colorblind')
plt.style.use('Solarize_Light2')

# Setting default DPI, pulling it from dotenv if it exists, setting it on 100 if not

try:
    pc_dpi = int(os.getenv('DPI'))
except TypeError:
    pc_dpi = 100
if pc_dpi is None:
    pc_dpi = 100

sys.path.append("../")


In [2]:
data_path = "../data"


In [3]:
df_gps = pd.read_csv("../data/olist_geolocation_dataset.csv")

df_gps.head()


Unnamed: 0,geolocation_zip_code_prefix,geolocation_lat,geolocation_lng,geolocation_city,geolocation_state
0,1037,-23.545621,-46.639292,sao paulo,SP
1,1046,-23.546081,-46.64482,sao paulo,SP
2,1046,-23.546129,-46.642951,sao paulo,SP
3,1041,-23.544392,-46.639499,sao paulo,SP
4,1035,-23.541578,-46.641607,sao paulo,SP


In [4]:
df_sellers = pd.read_csv("../data/optimized/olist_sellers.csv")
df_cx = pd.read_csv("../data/optimized/olist_customers.csv")


In [5]:
df_gps = df_gps.drop(columns=["geolocation_city", "geolocation_state"], errors="ignore")  # Already in datasets cx and sellers


In [6]:
df_gps.set_index("geolocation_zip_code_prefix")


Unnamed: 0_level_0,geolocation_lat,geolocation_lng
geolocation_zip_code_prefix,Unnamed: 1_level_1,Unnamed: 2_level_1
1037,-23.545621,-46.639292
1046,-23.546081,-46.644820
1046,-23.546129,-46.642951
1041,-23.544392,-46.639499
1035,-23.541578,-46.641607
...,...,...
99950,-28.068639,-52.010705
99900,-27.877125,-52.224882
99950,-28.071855,-52.014716
99980,-28.388932,-51.846871


In [7]:
def geoloc(row):
    return (row["geolocation_lat"], row["geolocation_lng"])


def get_geo(row, target):
    gps_row = df_gps.iloc[row[target]]
    return gps_row["geoloc"]


In [8]:
df_gps["geoloc"] = df_gps.apply(geoloc, axis=1)


In [9]:
df_gps.head()

Unnamed: 0,geolocation_zip_code_prefix,geolocation_lat,geolocation_lng,geoloc
0,1037,-23.545621,-46.639292,"(-23.54562128115268, -46.63929204800168)"
1,1046,-23.546081,-46.64482,"(-23.54608112703553, -46.64482029837157)"
2,1046,-23.546129,-46.642951,"(-23.54612896641469, -46.64295148361138)"
3,1041,-23.544392,-46.639499,"(-23.5443921648681, -46.63949930627844)"
4,1035,-23.541578,-46.641607,"(-23.541577961711493, -46.64160722329613)"


In [10]:
df_cx["geoloc"] = df_cx.apply(get_geo, axis=1, args=["customer_zip_code_prefix"])


In [11]:
df_cx.head()


Unnamed: 0,customer_id,customer_unique_id,customer_zip_code_prefix,customer_city,customer_state,geoloc
0,1,1,14409,franca,SP,"(-23.56077124285875, -46.65925008909916)"
1,2,2,9790,sao bernardo do campo,SP,"(-23.5562759686364, -46.64091314501704)"
2,3,3,1151,sao paulo,SP,"(-23.5523910910324, -46.62960278559065)"
3,4,4,8775,mogi das cruzes,SP,"(-23.56299985873388, -46.643790530671104)"
4,5,5,13056,campinas,SP,"(-23.56454189785911, -46.649343488487474)"


In [12]:
df_sellers["geoloc"] = df_sellers.apply(get_geo, axis=1, args=["seller_zip_code_prefix"])


In [13]:
df_sellers.head()


Unnamed: 0,seller_id,seller_zip_code_prefix,seller_city,seller_state,geoloc
0,1,13023,campinas,SP,"(-23.57598075131287, -46.68753613737096)"
1,2,13844,mogi guacu,SP,"(-23.56312173510022, -46.656566708353616)"
2,3,20031,rio de janeiro,RJ,"(-23.48844541993188, -46.62237317069649)"
3,4,4195,sao paulo,SP,"(-23.545575, -46.64796351832185)"
4,5,12914,braganca paulista,SP,"(-23.564036525939255, -46.66987204775221)"
