In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
import os
import sys

from matplotlib import pyplot as plt
from dotenv import load_dotenv

sys.path.append("../")

from models.order import Order  # Errors but sys.path.append("../") makes it ok

# Relative path of the folder that is scanned for CSV files further down
data_path = "../data"

# Load variables from a .env file (if any) into the environment; DPI is read below
load_dotenv()
# NOTE(review): color_palette() returns a palette; as a bare statement it presumably
# does not change the active palette (sns.set_palette may be intended) — confirm
sns.color_palette('colorblind')
# Matplotlib style sheet used for the figures
plt.style.use('Solarize_Light2')

# Default figure DPI: taken from the DPI environment variable when it holds an
# integer, 100 otherwise.

try:
    pc_dpi = int(os.getenv('DPI'))
except (TypeError, ValueError):
    # TypeError: DPI is not set (os.getenv returned None)
    # ValueError: DPI is set but is not an integer literal (e.g. "" or "high")
    pc_dpi = 100


# The datasets, as they stand, have a few flaws that will impede any machine learning algorithm, but that are inherent to any database dump.

These flaws are:

- A repetition of data, inherent to any db_dump : primary keys and foreign keys are present across multiple datasets
- Primary keys use a heavy format: a 32-character hexadecimal string, which weighs a nice big 81 bytes as a Python object. Let's tune that down to a nice unsigned int

Max values are expected to be in the high five to low six figures, so we cannot use a 16-bit unsigned int (max = 65 535). We need the heavier 32-bit unsigned int (max value = 4 294 967 295). Approximately 4.3 billion is nice and comfy, granting us lots of scalability potential.

Let's check the size of our new format against the heavy 32char hexa

In [2]:
# Compare the in-memory size of a 32-char hexadecimal key with the largest uint32.
# NOTE(review): sys.getsizeof measures the whole Python object; inside a
# NumPy/pandas column a uint32 presumably takes 4 bytes per value — confirm
key_example = "25e8ea4e93396b6fa0d3dd708e76c1bd"
new_key = np.uint32(np.iinfo(np.uint32).max)

for size_label, candidate in (
    ("hexa 32 chars : ", key_example),
    ("largest uint32 : ", new_key),
):
    print(size_label, sys.getsizeof(candidate), "bytes")


hexa 32 chars :  81 bytes
largest uint32 :  28 bytes


Ok so we will use uint32 whenever we need a key, we will also eliminate one col entirely (we will use the index of the dataframe instead)

Let's load all of the boyos and see what's what and join those fellas

In [3]:
# Discover every CSV under data_path (sub-folders included) and load each one
# into df_dict, keyed by the file name without its ".csv" extension.
csv_list = []
df_dict = {}

for dirpath, _subdirs, files in os.walk(data_path):
    for file in files:
        if file.endswith(".csv"):
            csv_list.append(file)
            # Build the path from dirpath: os.walk also yields files located in
            # sub-folders, which a fixed "../data/<name>" path would not find.
            df_dict[file[:-4]] = pd.read_csv(os.path.join(dirpath, file))


In [4]:
# Names of the loaded dataframes (one per CSV file)

for dataset_name in df_dict.keys():
    print(dataset_name)


olist_sellers_dataset
product_category_name_translation
olist_orders_dataset
olist_order_items_dataset
olist_customers_dataset
olist_order_payments_dataset
olist_order_reviews_dataset
olist_products_dataset


### Let's begin using RFM (Recency, Frequency, Monetary) to attempt a first classification.

- Recency : Time elapsed since last order (Last order known used as reference)
- Frequency : Average time between orders; customers (Cx) with only one order will have a nice 0
- Monetary : Sum total of all orders

### Because otherwise it would be too logical, a unique customer id is NOT the customer id in order.

Instead, it seems that an alias is created. Hence, one customer (Cx) can have multiple customer ids (aliases), while being identified by a single unique id.
So let's keep track of all aliases, because it's super fun.

In [5]:
# Customers table, loaded from olist_customers_dataset.csv
df_cx = df_dict["olist_customers_dataset"]

# Preview the first rows to see the available columns
df_cx.head()


Unnamed: 0,customer_id,customer_unique_id,customer_zip_code_prefix,customer_city,customer_state
0,06b8999e2fba1a1fbc88172c00ba8bc7,861eff4711a542e4b93843c6dd7febb0,14409,franca,SP
1,18955e83d337fd6b2def6b18a428ac77,290c77bc529b7ac935b93aa66c333dc3,9790,sao bernardo do campo,SP
2,4e7b3e00288586ebd08712fdd0374a03,060e732b5b29e8181a18229c7b0b2b5e,1151,sao paulo,SP
3,b2b6027bc5c5109e529d4dc6358b12c3,259dac757896d24d7702b9acbbff3f3c,8775,mogi das cruzes,SP
4,4f2d8ab171c80ec8364f7c12e35b23ad,345ecd01c38d18a9036ed96c73b8d066,13056,campinas,SP


In [6]:
# customer_unique_id -> list of the customer_id aliases attached to it
id_to_alias = {}
# customer_id (alias) -> customer_unique_id
alias_to_id = {}

In [7]:
# Fill both lookup tables in a single pass over the customers table.
# The two columns are iterated directly instead of using iterrows(), which
# builds a full Series object for every row.
for cx_alias_id, cx_unique_id in zip(df_cx["customer_id"], df_cx["customer_unique_id"]):
    known_aliases = id_to_alias.setdefault(cx_unique_id, [])
    # Skip aliases already recorded so re-running the cell does not grow the lists
    if cx_alias_id not in known_aliases:
        known_aliases.append(cx_alias_id)
    alias_to_id[cx_alias_id] = cx_unique_id


In [8]:
## ml --> max length: largest number of aliases held by a single customer
## (starts from 1, so the result is never below 1)

ml = max([1, *(len(aliases) for aliases in id_to_alias.values())])

ml


17

In [9]:
# Number of rows in the customers table
df_cx.shape[0]


99441

In [10]:
# Orders table, loaded from olist_orders_dataset.csv
df_orders = df_dict["olist_orders_dataset"]

# Preview the first rows to see the available columns
df_orders.head()


Unnamed: 0,order_id,customer_id,order_status,order_purchase_timestamp,order_approved_at,order_delivered_carrier_date,order_delivered_customer_date,order_estimated_delivery_date
0,e481f51cbdc54678b7cc49136f2d6af7,9ef432eb6251297304e76186b10a928d,delivered,2017-10-02 10:56:33,2017-10-02 11:07:15,2017-10-04 19:55:00,2017-10-10 21:25:13,2017-10-18 00:00:00
1,53cdb2fc8bc7dce0b6741e2150273451,b0830fb4747a6c6d20dea0b8c802d7ef,delivered,2018-07-24 20:41:37,2018-07-26 03:24:27,2018-07-26 14:31:00,2018-08-07 15:27:45,2018-08-13 00:00:00
2,47770eb9100c2d0c44946d9cf07ec65d,41ce2a54c0b03bf3443c3d931a367089,delivered,2018-08-08 08:38:49,2018-08-08 08:55:23,2018-08-08 13:50:00,2018-08-17 18:06:29,2018-09-04 00:00:00
3,949d5b44dbf5de918fe9c16f97b45f8a,f88197465ea7920adcdbec7375364d82,delivered,2017-11-18 19:28:06,2017-11-18 19:45:59,2017-11-22 13:39:59,2017-12-02 00:28:42,2017-12-15 00:00:00
4,ad21c59c0840e6cb83a9ceb5573f8159,8ab97904e6daea8866dbdbc4fb7aad2c,delivered,2018-02-13 21:18:39,2018-02-13 22:20:29,2018-02-14 19:46:34,2018-02-16 18:17:02,2018-02-26 00:00:00


In [11]:
# Number of rows in the orders table
df_orders.shape[0]


99441

In [12]:
# Number of distinct order ids (missing values, if any, count as one value)
df_orders["order_id"].nunique(dropna=False)


99441

In [13]:
# Payments table, loaded from olist_order_payments_dataset.csv
df_payment = df_dict["olist_order_payments_dataset"]


In [14]:
# Preview the first payment rows
df_payment.head()


Unnamed: 0,order_id,payment_sequential,payment_type,payment_installments,payment_value
0,b81ef226f3fe1789b1e8b2acac839d17,1,credit_card,8,99.33
1,a9810da82917af2d9aefd1278f1dcfa0,1,credit_card,1,24.39
2,25e8ea4e93396b6fa0d3dd708e76c1bd,1,credit_card,1,65.71
3,ba78997921bbcdc1373bb41e913ab953,1,credit_card,8,107.78
4,42fdf880ba16b47b59251dd489d4441a,1,credit_card,2,128.45


In [15]:
# Number of distinct order ids that have at least one payment row
# (missing values, if any, count as one value)
df_payment["order_id"].nunique(dropna=False)


99440

In [16]:
# Payment rows whose order_id already appeared in an earlier row
# (i.e. second and later payments of the same order)
df_payment.loc[df_payment.duplicated(subset="order_id")]


Unnamed: 0,order_id,payment_sequential,payment_type,payment_installments,payment_value
1456,683bf306149bb869980b68d48a1bd6ab,1,credit_card,1,8.58
2324,e6a66a8350bb88497954d37688ab123e,2,voucher,1,10.51
2393,8e5148bee82a7e42c5f9ba76161dc51a,1,credit_card,1,0.67
2414,816ccd9d21435796e8ffa9802b2a782f,1,credit_card,1,5.65
2497,2cbcb371aee438c59b722a21d83597e0,2,voucher,1,7.80
...,...,...,...,...,...
103778,fd86c80924b4be8fb7f58c4ecc680dae,1,credit_card,1,76.10
103817,6d4616de4341417e17978fe57aec1c46,1,credit_card,1,19.18
103860,31bc09fdbd701a7a4f9b55b5955b8687,6,voucher,1,77.99
103869,c9b01bef18eb84888f0fd071b8413b38,1,credit_card,6,238.16


In [17]:
# Display the payments table (the rendered output shows only the first and last rows)
df_payment


Unnamed: 0,order_id,payment_sequential,payment_type,payment_installments,payment_value
0,b81ef226f3fe1789b1e8b2acac839d17,1,credit_card,8,99.33
1,a9810da82917af2d9aefd1278f1dcfa0,1,credit_card,1,24.39
2,25e8ea4e93396b6fa0d3dd708e76c1bd,1,credit_card,1,65.71
3,ba78997921bbcdc1373bb41e913ab953,1,credit_card,8,107.78
4,42fdf880ba16b47b59251dd489d4441a,1,credit_card,2,128.45
...,...,...,...,...,...
103881,0406037ad97740d563a178ecc7a2075c,1,boleto,1,363.31
103882,7b905861d7c825891d6347454ea7863f,1,credit_card,2,96.80
103883,32609bbb3dd69b3c066a6860554a77bf,1,credit_card,1,47.77
103884,b8b61059626efa996a60be9bb9320e10,1,credit_card,5,369.54


In [18]:
# Total amount paid per order: an order can have several payment rows, so
# their payment_value entries are summed per order_id.
# The two columns are iterated directly instead of using iterrows(), which
# builds a full Series object for every row.
order_total = {}

for paid_order_id, paid_value in zip(df_payment["order_id"], df_payment["payment_value"]):
    if paid_order_id in order_total:
        order_total[paid_order_id] += paid_value
    else:
        order_total[paid_order_id] = paid_value


In [19]:
# Inspect the per-order totals
# NOTE(review): this displays the whole dict (one entry per order); consider showing only a few items
order_total


{'b81ef226f3fe1789b1e8b2acac839d17': 99.33,
 'a9810da82917af2d9aefd1278f1dcfa0': 24.39,
 '25e8ea4e93396b6fa0d3dd708e76c1bd': 65.71,
 'ba78997921bbcdc1373bb41e913ab953': 107.78,
 '42fdf880ba16b47b59251dd489d4441a': 128.45,
 '298fcdf1f73eb413e4d26d01b25bc1cd': 96.12,
 '771ee386b001f06208a7419e4fc1bbd7': 81.16,
 '3d7239c394a212faae122962df514ac7': 51.84,
 '1f78449c87a54faf9e96e88ba1491fa9': 341.09,
 '0573b5e23cbd798006520e1d5b4c6714': 51.95,
 'd88e0d5fa41661ce03cf6cf336527646': 188.73,
 '2480f727e869fdeb397244a21b721b67': 141.9,
 '616105c9352a9668c38303ad44e056cd': 75.78,
 'cf95215a722f3ebf29e6bbab87a29e61': 102.66,
 '769214176682788a92801d8907fa1b40': 105.28,
 '12e5cfe0e4716b59afb0e0f4a3bd6570': 157.45,
 '61059985a6fc0ad64e95d9944caacdad': 132.04,
 '79da3f5fe31ad1e454f06f95dc032ad5': 98.94,
 '8ac09207f415d55acff302df7d6a895c': 244.15,
 'b2349a3f20dfbeef62e7b31baa22f84b': 136.71,
 '5a1f6d22f7dfb061ef29216b9af687a1': 47.69,
 '4214cda550ece8ee66441f459dc33a8c': 170.57,
 'adfbf6c2a620196f9a3

In [20]:
# Display the orders table (the rendered output shows only the first and last rows)
df_orders


Unnamed: 0,order_id,customer_id,order_status,order_purchase_timestamp,order_approved_at,order_delivered_carrier_date,order_delivered_customer_date,order_estimated_delivery_date
0,e481f51cbdc54678b7cc49136f2d6af7,9ef432eb6251297304e76186b10a928d,delivered,2017-10-02 10:56:33,2017-10-02 11:07:15,2017-10-04 19:55:00,2017-10-10 21:25:13,2017-10-18 00:00:00
1,53cdb2fc8bc7dce0b6741e2150273451,b0830fb4747a6c6d20dea0b8c802d7ef,delivered,2018-07-24 20:41:37,2018-07-26 03:24:27,2018-07-26 14:31:00,2018-08-07 15:27:45,2018-08-13 00:00:00
2,47770eb9100c2d0c44946d9cf07ec65d,41ce2a54c0b03bf3443c3d931a367089,delivered,2018-08-08 08:38:49,2018-08-08 08:55:23,2018-08-08 13:50:00,2018-08-17 18:06:29,2018-09-04 00:00:00
3,949d5b44dbf5de918fe9c16f97b45f8a,f88197465ea7920adcdbec7375364d82,delivered,2017-11-18 19:28:06,2017-11-18 19:45:59,2017-11-22 13:39:59,2017-12-02 00:28:42,2017-12-15 00:00:00
4,ad21c59c0840e6cb83a9ceb5573f8159,8ab97904e6daea8866dbdbc4fb7aad2c,delivered,2018-02-13 21:18:39,2018-02-13 22:20:29,2018-02-14 19:46:34,2018-02-16 18:17:02,2018-02-26 00:00:00
...,...,...,...,...,...,...,...,...
99436,9c5dedf39a927c1b2549525ed64a053c,39bd1228ee8140590ac3aca26f2dfe00,delivered,2017-03-09 09:54:05,2017-03-09 09:54:05,2017-03-10 11:18:03,2017-03-17 15:08:01,2017-03-28 00:00:00
99437,63943bddc261676b46f01ca7ac2f7bd8,1fca14ff2861355f6e5f14306ff977a7,delivered,2018-02-06 12:58:58,2018-02-06 13:10:37,2018-02-07 23:22:42,2018-02-28 17:37:56,2018-03-02 00:00:00
99438,83c1379a015df1e13d02aae0204711ab,1aa71eb042121263aafbe80c1b562c9c,delivered,2017-08-27 14:46:43,2017-08-27 15:04:16,2017-08-28 20:52:26,2017-09-21 11:24:17,2017-09-27 00:00:00
99439,11c177c8e97725db2631073c19f07b62,b331b74b18dc79bcdf6532d51e1637c1,delivered,2018-01-08 21:28:27,2018-01-08 21:36:21,2018-01-12 15:35:03,2018-01-25 23:32:54,2018-02-15 00:00:00


In [21]:
# Convert the purchase timestamp column to datetime64[ns] so it can be
# compared and aggregated as dates; the other columns are left as they are.
purchase_ts = df_orders["order_purchase_timestamp"].astype("datetime64[ns]")
df_orders = df_orders.assign(order_purchase_timestamp=purchase_ts)


In [22]:
# Latest purchase timestamp found in the orders table
# (the RFM notes above use the last known order as the Recency reference)
most_recent = df_orders["order_purchase_timestamp"].max()

print(most_recent)


2018-10-17 17:30:18


In [23]:
# Reminder of the dataset names available in df_dict
df_dict.keys()


dict_keys(['olist_sellers_dataset', 'product_category_name_translation', 'olist_orders_dataset', 'olist_order_items_dataset', 'olist_customers_dataset', 'olist_order_payments_dataset', 'olist_order_reviews_dataset', 'olist_products_dataset'])

In [24]:
# Skeleton of the per-order table: one row per order with its total paid value
order_data = dict(
    order_id=order_total.keys(),
    order_value=order_total.values(),
)
condensed_order_df = pd.DataFrame(data=order_data)

# Placeholder columns, created empty (NaN) at this stage
for placeholder_col in ("order_date", "cx_uid", "cx_alias"):
    condensed_order_df[placeholder_col] = np.nan

condensed_order_df


Unnamed: 0,order_id,order_value,order_date,cx_uid,cx_alias
0,b81ef226f3fe1789b1e8b2acac839d17,99.33,,,
1,a9810da82917af2d9aefd1278f1dcfa0,24.39,,,
2,25e8ea4e93396b6fa0d3dd708e76c1bd,65.71,,,
3,ba78997921bbcdc1373bb41e913ab953,107.78,,,
4,42fdf880ba16b47b59251dd489d4441a,128.45,,,
...,...,...,...,...,...
99435,0406037ad97740d563a178ecc7a2075c,363.31,,,
99436,7b905861d7c825891d6347454ea7863f,96.80,,,
99437,32609bbb3dd69b3c066a6860554a77bf,47.77,,,
99438,b8b61059626efa996a60be9bb9320e10,369.54,,,


In [25]:
# Compact 1-based integer key for each order, standing in for the 32-char hex order_id.
# Stored as uint32, the key dtype chosen in the notes at the top of this notebook.
condensed_order_df["order_id_int"] = np.arange(
    1, len(condensed_order_df) + 1, dtype=np.uint32
)


In [26]:
# Lookup table: original hexadecimal order_id -> compact integer order_id_int.
# Zipping the two columns avoids iterrows(), which builds a Series per row.
order_simplified_id_dict = dict(
    zip(condensed_order_df["order_id"], condensed_order_df["order_id_int"])
)


In [27]:
# Attach each order's purchase date and customer alias to condensed_order_df.
#
# This replaces a row-by-row loop that had two problems:
# - rows yielded by iterrows() are copies, so assigning to them never reached
#   condensed_order_df (the columns stayed NaN);
# - df_orders was filtered once per row, which is quadratic ("takes ages").
#
# order_id is unique in df_orders (row count and unique count match in the
# cells above), so it can be used as the lookup index for Series.map.
orders_by_id = df_orders.set_index("order_id")

condensed_order_df["order_date"] = condensed_order_df["order_id"].map(
    orders_by_id["order_purchase_timestamp"]
)
condensed_order_df["cx_alias"] = condensed_order_df["order_id"].map(
    orders_by_id["customer_id"]
)
