In [16]:
import os
import numpy as np
import pandas as pd
import gc
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# Debug switch: `num_rows` is handed to `pd.read_csv(..., nrows=num_rows)` in
# the loading cell, so enabling `debug` caps every CSV at 10000 rows, while
# `None` reads the files in full.
debug = False
num_rows = 10000 if debug else None
    
def str_to_datetime(strg):
    """Parse a 'YYYY-MM-DD HH:MM:SS' string into a `datetime` object.

    Raises:
        ValueError: if `strg` does not match that exact layout.
    """
    layout = '%Y-%m-%d %H:%M:%S'
    parsed = datetime.strptime(strg, layout)
    return parsed
    

# Présentation du jeu de données

**Ce diagramme présente les relations entre les données:**

![image](https://i.imgur.com/HRhd2Y0.png)

L'objectif est de segmenter les clients. Cependant, la base de données "customers" contient non pas une ligne par client mais une ligne par commande. On va donc créer notre propre dataframe à partir du jeu de données fourni.

# Analyse exploratoire & Feature engineering

In [26]:
# Build `app`: one row per customer_unique_id with the number of orders and
# the mean shipping / payment delays computed from delivered orders.

# Layout of every timestamp column in the orders file.
TIMESTAMP_FORMAT = '%Y-%m-%d %H:%M:%S'


def build_customer_features(customers_path='./data/olist_customers_dataset.csv',
                            orders_path='./data/olist_orders_dataset.csv',
                            nrows=None):
    """Aggregate the customers and orders files into per-customer features.

    Parameters
    ----------
    customers_path : str
        CSV with at least `customer_id` and `customer_unique_id` columns.
    orders_path : str
        CSV with `customer_id`, `order_status` and the three timestamp
        columns `order_purchase_timestamp`, `order_approved_at` and
        `order_delivered_customer_date`.
    nrows : int or None
        Forwarded to `pd.read_csv`; `None` reads the whole file.

    Returns
    -------
    pandas.DataFrame
        Indexed by `customer_unique_id`, with columns `orders_count`,
        `shipping_delay_mean` (days) and `payment_delay_mean` (minutes).
        The delay columns are NaN for customers without any delivered order
        carrying all three timestamps.

    Raises
    ------
    ValueError
        If a non-null timestamp does not match `TIMESTAMP_FORMAT`.
    """
    # --- customers -------------------------------------------------------
    customers = pd.read_csv(customers_path, nrows=nrows)
    # Count the (non-null) customer_id rows attached to each unique customer.
    features = (
        customers.groupby('customer_unique_id')['customer_id']
        .count()
        .to_frame('orders_count')
    )

    # Lookup table: customer_id => customer_unique_id
    cid_to_ucid = (
        customers[['customer_id', 'customer_unique_id']].set_index('customer_id')
    )

    # --- orders ----------------------------------------------------------
    orders_raw = pd.read_csv(orders_path, nrows=nrows)
    orders_raw = orders_raw.join(cid_to_ucid, how='left', on='customer_id')

    # Only keep delivered orders whose three timestamps are all present.
    timestamp_cols = [
        'order_delivered_customer_date',
        'order_approved_at',
        'order_purchase_timestamp',
    ]
    keep = (
        (orders_raw['order_status'] == 'delivered')
        & orders_raw[timestamp_cols].notna().all(axis=1)
    )
    # Explicit copy: columns are assigned below, and writing into a filtered
    # view of `orders_raw` is what triggers SettingWithCopyWarning.
    delivered = orders_raw.loc[keep, ['customer_unique_id'] + timestamp_cols].copy()

    # Vectorised parsing instead of a Python-level strptime per row.
    for col in timestamp_cols:
        delivered[col] = pd.to_datetime(delivered[col], format=TIMESTAMP_FORMAT)

    # New features, converted to floats.
    delivered['shipping_delay'] = (
        delivered['order_delivered_customer_date'] - delivered['order_approved_at']
    ).dt.total_seconds() / 86400  # days
    delivered['payment_delay'] = (
        delivered['order_approved_at'] - delivered['order_purchase_timestamp']
    ).dt.total_seconds() / 60  # minutes

    # Mean delay per unique customer; rows with a missing customer_unique_id
    # are dropped by groupby.
    delay_means = (
        delivered.groupby('customer_unique_id')[['shipping_delay', 'payment_delay']]
        .mean()
        .add_suffix('_mean')
    )

    # Both frames are indexed by customer_unique_id, so join index-on-index.
    return features.join(delay_means, how='left')


app = build_customer_features(nrows=num_rows)

# The intermediate frames were local to the function; reclaim their memory.
gc.collect()

app.head()

Unnamed: 0_level_0,orders_count,shipping_delay_mean,payment_delay_mean
customer_unique_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0000366f3b9a7992bf8c76cfdf3221e2,1,6.400914,14.85
0000b849f77a49e4a4ce2b2a4ca5be3f,1,2.984005,434.283333
0000f46a3911fa3c0805444483337064,1,25.731759,0.0
0000f6ccb0745a6a4b88665a16c9f078,1,20.023472,19.6
0004aac84e0df4da2b147fca70cf8255,1,13.126435,21.166667


# Réductions de dimensionnalité 

# Classification non supervisée

# Résultats