In [None]:
import pandas as pd
import seaborn as sns
import numpy as np
import os
import sys
import warnings

import time

from matplotlib import pyplot as plt
from dotenv import load_dotenv

# Load variables from a local .env file (if there is one) into the environment.
load_dotenv()

# Apply the matplotlib style first, then the seaborn palette.
# `sns.color_palette(...)` only *returns* a palette (the value was discarded),
# whereas `sns.set_palette(...)` installs it as the default colour cycle.
# It is set after `plt.style.use` because a style sheet resets the colour cycle.
plt.style.use('Solarize_Light2')
sns.set_palette('colorblind')

# Default figure DPI: read from the `DPI` environment variable when it holds a
# positive integer, otherwise fall back to 100.
DEFAULT_DPI = 100

try:
    pc_dpi = int(os.getenv('DPI', DEFAULT_DPI))
except (TypeError, ValueError):
    # Empty or non-numeric value: use the default instead of crashing the cell.
    pc_dpi = DEFAULT_DPI
if pc_dpi <= 0:
    # A zero or negative DPI is meaningless for a figure.
    pc_dpi = DEFAULT_DPI

# Make the parent directory importable. The guard keeps the cell idempotent:
# re-running it does not append the same entry to sys.path again.
if "../" not in sys.path:
    sys.path.append("../")


# The only goal of this notebook is to associate a lat/lon with each customer and seller so that they can be represented on a map

As it contains more code than commentary and is mostly of no interest from a marketing point of view, it is kept separate from the rest of the notebooks.

<hr>

Prerequisites:

- Optimizations on sellers have been run
- The RFM step has been executed beforehand (this notebook reads `cx_rfm.csv`)

In [None]:
# Geolocation table, read straight from ../data/ (not from the optimized folder).
data_gps = "../data/olist_geolocation_dataset.csv"

# Input tables located under ../data/optimized/.
data_cx = "../data/optimized/olist_customers.csv"
data_sellers = "../data/optimized/olist_sellers.csv"
data_orders = "../data/optimized/olist_orders.csv"
data_rfm = "../data/optimized/cx_rfm.csv"


In [None]:
# Start the timer whose elapsed value is printed by the last cell.
tzero = time.perf_counter()

# Read every input CSV into its own DataFrame (same order as the path tuple).
df_sellers, df_cx, df_rfm, df_gps, df_orders = (
    pd.read_csv(csv_path)
    for csv_path in (data_sellers, data_cx, data_rfm, data_gps, data_orders)
)


In [None]:
# Preview the first rows of the geolocation table.
df_gps.head()


In [None]:
# Keep only the zip code prefix and the coordinates, then expose the prefix
# under the key name used by each target table so both can be merged on it.
gps_cols = ["geolocation_zip_code_prefix", "geolocation_lat", "geolocation_lng"]

df_gps_for_cx = df_gps[gps_cols].rename(
    columns={"geolocation_zip_code_prefix": "customer_zip_code_prefix"}
)
df_gps_for_sellers = df_gps[gps_cols].rename(
    columns={"geolocation_zip_code_prefix": "seller_zip_code_prefix"}
)


In [None]:
# Attach one GPS point to every customer / seller through the zip code prefix.
#
# The geolocation table may list several points for the same prefix. Merging it
# as-is repeats each customer/seller row once per point, and an outer join also
# adds rows for prefixes that match no customer/seller (which turns the id
# columns into floats because of the NaN it introduces). Keeping a single point
# per prefix (the first listed) and left-joining avoids building that large
# intermediate frame; the de-duplication cell below then has nothing to remove.
# NOTE: rows now come out in the order of df_cx / df_sellers instead of being
# sorted by zip code prefix as the outer join did.
gps_cx_first = df_gps_for_cx.drop_duplicates(subset="customer_zip_code_prefix")
gps_sellers_first = df_gps_for_sellers.drop_duplicates(subset="seller_zip_code_prefix")

new_df_cx = pd.merge(
    left=df_cx,
    right=gps_cx_first,
    on="customer_zip_code_prefix",
    how="left",
    validate="many_to_one",  # fail loudly if a prefix is still duplicated
)
new_df_sellers = pd.merge(
    left=df_sellers,
    right=gps_sellers_first,
    on="seller_zip_code_prefix",
    how="left",
    validate="many_to_one",
)


In [None]:
# Keep exactly one row per customer / seller.
# Rows without an id are dropped *before* the index is reset: resetting first
# (as was done previously) left gaps in the final index.
df_cx = (
    new_df_cx
    .dropna(subset=["customer_id"])
    .drop_duplicates(subset="customer_id")
    .reset_index(drop=True)
)
df_sellers = (
    new_df_sellers
    .dropna(subset=["seller_id"])
    .drop_duplicates(subset="seller_id")
    .reset_index(drop=True)
)


In [None]:
# Shorten the coordinate column names to lat / lon in both tables.
coord_names = {"geolocation_lat": "lat", "geolocation_lng": "lon"}

df_cx = df_cx.rename(columns=coord_names)
df_sellers = df_sellers.rename(columns=coord_names)


In [None]:
# Preview the customers table now that it carries the lat / lon columns.
df_cx.head()


In [None]:
# List the column dtypes of the customers table (ids and zip code prefix are cast to uint32 in the next cell).
df_cx.dtypes


In [None]:
# Store the customer identifiers and the zip code prefix as unsigned 32-bit integers.
df_cx = df_cx.astype(
    {
        "customer_id": np.uint32,
        "customer_unique_id": np.uint32,
        "customer_zip_code_prefix": np.uint32,
    }
)


In [None]:
# List the column dtypes of the sellers table (id and zip code prefix are cast to uint32 in the next cell).
df_sellers.dtypes


In [None]:
# Store the seller identifier and the zip code prefix as unsigned 32-bit integers.
df_sellers = df_sellers.astype(
    {
        "seller_id": np.uint32,
        "seller_zip_code_prefix": np.uint32,
    }
)


In [None]:
# Preview the first rows of the RFM table.
df_rfm.head()


In [None]:
# Give every RFM customer the coordinates of the alias with the highest
# customer_id among the df_cx rows sharing its unique id (this is the alias the
# previous row-by-row loop picked as "most recent").
#
# Done with one sort + de-duplication + left merge instead of scanning df_cx
# twice per RFM row. Customers absent from df_cx now get NaN coordinates instead
# of crashing on `float()` of an empty array.
latest_alias_coords = (
    df_cx
    .sort_values("customer_id")
    .drop_duplicates(subset="customer_unique_id", keep="last")
    .loc[:, ["customer_unique_id", "lat", "lon"]]
    .rename(columns={"customer_unique_id": "customer_uid"})
)

df_rfm = (
    df_rfm
    # Makes the cell re-runnable: discard coordinates written by a previous run.
    .drop(columns=["lat", "lon"], errors="ignore")
    .merge(latest_alias_coords, on="customer_uid", how="left", validate="many_to_one")
)


In [None]:
# Exports :

cx_csv = "../data/optimized/geoloc_applied/olist_customers.csv"
sellers_csv = "../data/optimized/geoloc_applied/olist_sellers.csv"
rfm_csv = "../data/optimized/geoloc_applied/cx_rfm.csv"
rfm_pickle = "../pickles/geolocd/cx_rfm.pkl"

# pandas does not create missing folders: make sure every target directory
# exists so the exports do not fail on a fresh checkout.
for target in (cx_csv, sellers_csv, rfm_csv, rfm_pickle):
    os.makedirs(os.path.dirname(target), exist_ok=True)

df_cx.to_csv(path_or_buf=cx_csv, index=False)
df_sellers.to_csv(path_or_buf=sellers_csv, index=False)
df_rfm.to_csv(path_or_buf=rfm_csv, index=False)

df_rfm.to_pickle(path=rfm_pickle)


In [None]:
# Stop the timer started in the loading cell and report the elapsed time.
tfinal = time.perf_counter()

print(f"Executed in {tfinal - tzero:.2f}s")
