In [1]:
import pandas as pd
import pgeocode

DATA_DIR = '../data/dataset/'
DATA_SUFFIX = '_dataset.csv'

In [2]:
def load_data_as_df(file: str, data_dir = DATA_DIR, data_suffix = DATA_SUFFIX):
    return pd.read_csv(f'{data_dir}{file}{data_suffix}')

customers_df = load_data_as_df('customers')
geolocation_df = load_data_as_df('geolocation')
order_items = load_data_as_df('order_items')
orders_df = load_data_as_df('orders')
products_df = load_data_as_df('products')
sellers_df = load_data_as_df('sellers')

# Customers

Let's start with the basic stats about the customers.


In [3]:
assert customers_df.customer_id.nunique() == len(customers_df)
assert customers_df.customer_unique_id.nunique() <= len(customers_df)


In [4]:
customers_df.head()

Unnamed: 0,customer_id,customer_unique_id,customer_zip_code_prefix,customer_city,customer_state
0,06b8999e2fba1a1fbc88172c00ba8bc7,861eff4711a542e4b93843c6dd7febb0,14409,franca,SP
1,18955e83d337fd6b2def6b18a428ac77,290c77bc529b7ac935b93aa66c333dc3,9790,sao bernardo do campo,SP
2,4e7b3e00288586ebd08712fdd0374a03,060e732b5b29e8181a18229c7b0b2b5e,1151,sao paulo,SP
3,b2b6027bc5c5109e529d4dc6358b12c3,259dac757896d24d7702b9acbbff3f3c,8775,mogi das cruzes,SP
4,4f2d8ab171c80ec8364f7c12e35b23ad,345ecd01c38d18a9036ed96c73b8d066,13056,campinas,SP


In [5]:
# First question: what makes the customer_unique_id different from the customer_id? When we tally customers should we use customer_unique_id or customer_id?

In [6]:
total_customers = customers_df.customer_id.nunique()
total_uq_customers = customers_df.customer_unique_id.nunique()

print(f'Total customers: {total_customers}')
print(f'Total unique customers: {total_uq_customers}')

Total customers: 99441
Total unique customers: 96096


In [7]:
# How many customers are there in each state?
customer_states = pd.DataFrame(customers_df.customer_state.value_counts(normalize=True))
# add to customer_states a cumsum column so that we can see how many states are required for good coverage
customer_states['cumsum'] = customer_states.cumsum()

In [8]:
# How many customers are there in each state?
customer_cities = pd.DataFrame(customers_df.customer_city.value_counts(normalize=True))
# add to customer_states a cumsum column so that we can see how many states are required for good coverage
customer_cities['cumsum'] = customer_cities.cumsum()

In [9]:
customer_cities.head(355)

Unnamed: 0_level_0,proportion,cumsum
customer_city,Unnamed: 1_level_1,Unnamed: 2_level_1
sao paulo,0.156274,0.156274
rio de janeiro,0.069207,0.225480
belo horizonte,0.027886,0.253366
brasilia,0.021430,0.274796
curitiba,0.015296,0.290092
...,...,...
caldas novas,0.000362,0.798866
pedreira,0.000362,0.799228
alegrete,0.000362,0.799590
campo bom,0.000362,0.799952


# Geolocation

We could merge the dataframes and visualize the customers on a map.

In [10]:
geolocation_df = load_data_as_df('geolocation')
geolocation_df.rename(columns={'geolocation_zip_code_prefix': 'customer_zip_code_prefix'}, inplace=True)
geolocation_df.sort_values(
    by=['customer_zip_code_prefix', 'geolocation_lat', 'geolocation_lng'],
    ascending=False).head(4)

Unnamed: 0,customer_zip_code_prefix,geolocation_lat,geolocation_lng,geolocation_city,geolocation_state
999864,99990,-28.329472,-51.769109,muliterno,RS
999758,99990,-28.329718,-51.769615,muliterno,RS
999764,99980,-28.386239,-51.847741,david canabarro,RS
999897,99980,-28.386408,-51.844876,david canabarro,RS


In [11]:
geolocation_df.geolocation_city.value_counts().head(5)

geolocation_city
sao paulo         135800
rio de janeiro     62151
belo horizonte     27805
são paulo          24918
curitiba           16593
Name: count, dtype: int64

## Some Issues

Before merging with the customer data, there are at least 2 issues to address:
1. The data is very large, and each postal code translated to different lat, lon locations.
2. City names are not standardized, e.g.:
    - "São Paulo" vs "Sao Paulo" 
    - "sao joao do pau d%26apos%3balho" vs "sao joao do pau dbalho"

In [12]:
# Define a mapping for special characters
special_characters = {
    'ã': 'a',
    'á': 'a',
    'â': 'a',
    'à': 'a',
    'ç': 'c',
    'é': 'e',
    'ê': 'e',
    'í': 'i',
    'ó': 'o',
    'ô': 'o',
    'ú': 'u',
    'ñ': 'n',
    'ü': 'u',
    'õ': 'o',
    'ß': 'ss',
    'Á': 'A',
    'À': 'A',
    'Â': 'A',
    'Ã': 'A',
    'É': 'E',
    'Ê': 'E',
    'Í': 'I',
    'Ó': 'O',
    'Ô': 'O',
    'Ú': 'U',
}

# Create a function to replace special characters
def replace_special_characters(city_name):
    for special_char, replacement in special_characters.items():
        city_name = city_name.replace(special_char, replacement)
    return city_name

# Apply the function to the 'geolocation_city' column
geolocation_df['geolocation_city'] = geolocation_df['geolocation_city'].str.replace(r'[^a-zA-Z\s]', '', regex=True)
geolocation_df['geolocation_city'] = geolocation_df['geolocation_city'].apply(replace_special_characters)

# Now you can group by the standardized city names
geolocation_clean_df = geolocation_df.groupby(['customer_zip_code_prefix', 'geolocation_city']).agg(
    geolocation_lat=('geolocation_lat', 'mean'),
    geolocation_lng=('geolocation_lng', 'mean')
).reset_index()

In [13]:
geolocation_clean_df[geolocation_clean_df['customer_zip_code_prefix'] == 17970]

Unnamed: 0,customer_zip_code_prefix,geolocation_city,geolocation_lat,geolocation_lng
10156,17970,sao joao do pau d alho,-21.270781,-51.66438
10157,17970,sao joao do pau dalho,-21.270836,-51.666499
10158,17970,so joo do pau dalho,-21.255882,-51.669572
10159,17970,so joo do pau daposbalho,-21.269165,-51.668758


We finish by removing duplicates for each `customer_zip_code_prefix` .
As can be seen above, it is due to the fact that the city name is not standardized.


In [14]:
original_len = len(geolocation_clean_df)
geolocation_clean_df = geolocation_clean_df.drop_duplicates(subset=['customer_zip_code_prefix'])
print(f'Removed {original_len - len(geolocation_clean_df)} duplicates')

Removed 8853 duplicates


In [15]:
enriched_df = pd.merge(left=customers_df, right=geolocation_clean_df, on='customer_zip_code_prefix', how='left')

In [16]:
assert len(enriched_df) == len(customers_df)

In [17]:
enriched_df.customer_zip_code_prefix.nunique()

14994

In [19]:
enriched_df

Unnamed: 0,customer_id,customer_unique_id,customer_zip_code_prefix,customer_city,customer_state,geolocation_city,geolocation_lat,geolocation_lng
0,06b8999e2fba1a1fbc88172c00ba8bc7,861eff4711a542e4b93843c6dd7febb0,14409,franca,SP,franca,-20.498489,-47.396929
1,18955e83d337fd6b2def6b18a428ac77,290c77bc529b7ac935b93aa66c333dc3,9790,sao bernardo do campo,SP,sao bernardo do campo,-23.728035,-46.542818
2,4e7b3e00288586ebd08712fdd0374a03,060e732b5b29e8181a18229c7b0b2b5e,1151,sao paulo,SP,sao paulo,-23.531705,-46.656207
3,b2b6027bc5c5109e529d4dc6358b12c3,259dac757896d24d7702b9acbbff3f3c,8775,mogi das cruzes,SP,mogi das cruzes,-23.499702,-46.185233
4,4f2d8ab171c80ec8364f7c12e35b23ad,345ecd01c38d18a9036ed96c73b8d066,13056,campinas,SP,campinas,-22.975100,-47.142925
...,...,...,...,...,...,...,...,...
99436,17ddf5dd5d51696bb3d7c6291687be6f,1a29b476fee25c95fbafc67c5ac95cf8,3937,sao paulo,SP,sao paulo,-23.585979,-46.499703
99437,e7b71a9017aa05c9a7fd292d714858e8,d52a67c98be1cf6a5c84435bd38d095d,6764,taboao da serra,SP,taboao da serra,-23.615877,-46.768522
99438,5e28dfe12db7fb50a4b2f691faecea5e,e9f50caf99f032f0bf3c55141f019d99,60115,fortaleza,CE,fortaleza,-3.734569,-38.510534
99439,56b18e2166679b8a959d72dd06da27f9,73c2643a0a458b49f58cea58833b192e,92120,canoas,RS,canoas,-29.949839,-51.168494


# Map Visualization

We will now visualize the different locations on a map.  
Since there are 15K data points, we will use a marker cluster and only a random sample of 10%.


In [25]:
import folium
from folium.plugins import MarkerCluster


random_sample = enriched_df.sample(frac=0.1)

# Create a base map
map = folium.Map(location=[-23.5505, -46.6333], zoom_start=12)

# Add a marker cluster to the map
marker_cluster = MarkerCluster().add_to(map)
errors = []
error_count = 0
# Add markers to the map based on the random_sample
for index, row in random_sample.iterrows():
    try:
        folium.Marker(
            location=[row['geolocation_lat'], row['geolocation_lng']],
        ).add_to(marker_cluster)
    except:
        errors.append(index)
        error_count += 1

In [31]:
# How many errors did we get?
error_pct = round(error_count / len(random_sample) * 100, 4)
print(f'Error rate: {error_pct}%')

Error rate: 0.2313%


In [32]:
map