## Base de données juillet 2018
Nous allons exécuter, les mêmes commandes sur notre dataset de juillet 2018 pour avoir une base comparable

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import calendar

In [2]:
df = pd.read_csv('bdd_base.csv')
df_2 = pd.read_csv('bdd_juillet.csv')
df_2 = df_2.drop(columns=['Unnamed: 0'])

In [3]:
datetime = ['order_purchase_timestamp', 'order_approved_at',
            'order_delivered_customer_date', 'order_delivered_carrier_date',
            'order_estimated_delivery_date']
for col in datetime:
    df[col] = df[col].astype('datetime64')

df[['order_purchase_timestamp', 'order_delivered_customer_date']] = df[[
    'order_purchase_timestamp', 'order_delivered_customer_date']].apply(pd.to_datetime)

In [4]:
datetime = ['order_purchase_timestamp', 'order_approved_at',
            'order_delivered_customer_date', 'order_delivered_carrier_date',
            'order_estimated_delivery_date']
for col in datetime:
    df_2[col] = df_2[col].astype('datetime64')

df_2[['order_purchase_timestamp', 'order_delivered_customer_date']] = df_2[[
    'order_purchase_timestamp', 'order_delivered_customer_date']].apply(pd.to_datetime)

In [5]:
def categorisation(x):
    if x < 3:
        return 0
    return 1


df_2['satisfaction'] = df_2['review_score'].map(lambda x: categorisation(x))

In [6]:
df_2['temps_livraisons'] = (df_2.order_delivered_customer_date - 
                            df_2.order_purchase_timestamp).dt.round('1d').dt.days

df_2['retard_livraisons'] = (df_2.order_delivered_customer_date -
                             df_2.order_estimated_delivery_date).dt.round('1d').dt.days


def retard(x):
    if x < 0:
        return 0
    return x


df_2['retard_livraisons'] = df_2['retard_livraisons'].map(lambda x: retard(x))

In [7]:
def noms(x):
    if x in ['bed_bath_table', 'health_beauty', 'sports_leisure', 'furniture_decor',
             'computers_accessories', 'housewares', 'watches_gifts', 'telephony',
             'garden_tools', 'auto', 'toys', 'cool_stuff', 'perfumery', 'baby',
             'electronics']:
        return x
    return 'autres'


df_2['product_category_name'] = df_2['product_category_name_english'].map(
    lambda x: noms(x))

df_2.drop("product_category_name_english", axis=1, inplace=True)

In [8]:
df_2['mois_vente'] = pd.Series(pd.Categorical(
    df['order_purchase_timestamp'].dt.month))

In [9]:
categories_client = df_2.groupby(["customer_unique_id",
                                  "product_category_name"]).agg({
                                      'order_item_id': 'count'}).unstack()
categories_client.fillna(0, inplace=True)
categories_client.columns = categories_client.columns.droplevel(0)
categories_client["total_items"] = categories_client.sum(axis=1)

In [10]:
products_per_order = df_2.groupby(["customer_unique_id", 'order_id']).agg({
    "order_item_id": "count"})

products_per_order = products_per_order.rename(
    columns={"order_item_id": "products_per_order_mean"})

products_per_order = products_per_order.groupby("customer_unique_id")\
    .agg({"products_per_order_mean": "mean"})

In [11]:
def noms(x):
    if x in ['SP', 'RJ', 'MG', 'RS',
             'PR', 'SC']:
        return x
    return 'autres'


df_2['customer_state'] = df_2['customer_state'].map(lambda x: noms(x))

df_f = df_2.reindex(columns=["customer_unique_id",
                    "customer_state", 'mois_vente'])
df_f.head()

Unnamed: 0,customer_unique_id,customer_state,mois_vente
0,2ecddf370925aff428e58b4f6021fe38,autres,4
1,d0ad8d078320fd639fe1ef1bcbe0e4af,SP,6
2,52cf00bf99b259287bcd5b5a766adf2a,SP,12
3,5b7f468205743c98e58a379722d3ae79,SP,12
4,5b7f468205743c98e58a379722d3ae79,SP,5


In [12]:
recurencies = df_2.groupby("customer_unique_id")\
    .agg({"order_purchase_timestamp": ["min", "max"]})
recurencies.columns = recurencies.columns.droplevel(0)

recurencies['delay_e_commande'] = (
    recurencies["max"] - recurencies["min"]).round('1d')
recurencies['max_comma'] = df_2['order_purchase_timestamp'].max()
recurencies['order_total_delais'] = (
    recurencies['max_comma'] - recurencies['max']).round('1d')

recurencies.drop(["min", "max", 'max_comma'], axis=1, inplace=True)

In [13]:
recurencies['delay_e_commande'] = recurencies['delay_e_commande'].dt.days.astype(
    'int64')
recurencies['order_total_delais'] = recurencies['order_total_delais'].dt.days.astype(
    'int64')

recurencies.drop(['delay_e_commande'], axis=1, inplace=True)

In [14]:
df_2 = df_2.rename(columns={"order_id": "nb_commandes",
                            "retard_livraisons": "retard_livraisons_mean",
                            "temps_livraisons": "temps_livraisons_mean",
                            "satisfaction": "satisfaction_mean",
                            "review_score": "review_score_mean",
                            "freight_value": "fdp_total",
                            "price": "price_total",
                            "payment_sequential": "moyen_payment_mean",
                            "payment_installments": "versement_payment_mean"
                            })

In [15]:
df_2 = df_2.groupby('customer_unique_id')\
    .agg({'nb_commandes': 'nunique',
          'retard_livraisons_mean': 'mean',
          'temps_livraisons_mean': 'mean',
          'satisfaction_mean': 'mean',
          'review_score_mean': 'mean',
          'fdp_total': 'sum',
          'price_total': 'sum',
          'moyen_payment_mean': 'mean',
          'versement_payment_mean': 'mean',
          'Year': lambda x: x.value_counts().index[0]})

df_2 = pd.merge(df_2, categories_client,
                how="left",
                on="customer_unique_id")

df_2 = pd.merge(df_2, products_per_order,
                how='left',
                on='customer_unique_id')

df_2 = pd.merge(df_2, df_f,
                how='left',
                on='customer_unique_id')

df_2 = pd.merge(df_2, recurencies,
                how='left',
                on='customer_unique_id')

In [16]:
df_2.drop_duplicates(subset="customer_unique_id", keep="first", inplace=True)

In [17]:
borne = round(np.percentile(df_2["price_total"].values, 99), 2)
df_2 = df_2.loc[df_2["price_total"] < borne]

borne = round(np.percentile(df_2["fdp_total"].values, 99), 2)
df_2 = df_2.loc[df_2["fdp_total"] < borne]

In [18]:
df_2.head(10)

Unnamed: 0,customer_unique_id,nb_commandes,retard_livraisons_mean,temps_livraisons_mean,satisfaction_mean,review_score_mean,fdp_total,price_total,moyen_payment_mean,versement_payment_mean,...,perfumery,sports_leisure,telephony,toys,watches_gifts,total_items,products_per_order_mean,customer_state,mois_vente,order_total_delais
0,000fbf0473c10fc1ab6f8d2d286ce20c,1,0.0,8.0,1.0,5.0,76.56,285.8,1.0,1.0,...,0.0,0.0,0.0,2.0,0.0,4.0,4.0,SP,7,6
4,00172711b30d52eea8b313a7f2cced02,1,0.0,11.0,1.0,4.0,47.57,74.5,1.0,2.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,autres,7,4
5,00196c4c9a3af7dd2ad10eade69c926f,1,0.0,17.0,1.0,5.0,15.26,22.32,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,SP,6,4
6,001a34eb30ecb8e3aacb07c475ca4dd1,1,0.0,2.0,1.0,5.0,9.81,151.9,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,SP,2,20
7,0025795df7a7d077c4c90162fa820085,1,0.0,3.0,0.0,1.0,36.18,177.9,1.0,3.0,...,0.0,0.0,0.0,0.0,0.0,2.0,2.0,SP,7,8
9,003162bf0351f47a1a247992b8b9b42d,1,0.0,9.0,1.0,5.0,21.7,514.9,1.0,10.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,autres,1,17
10,0032ca69771a7758ce643258bb9e4ac7,1,0.0,7.0,1.0,5.0,17.45,45.0,1.0,2.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,SP,12,15
11,00344274804f3b8003de1b0562ae01df,1,0.0,14.0,1.0,3.0,38.23,188.0,1.0,5.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,autres,6,22
12,003a5571a07dcf09bf117d13d2980ba3,1,0.0,13.0,1.0,5.0,31.92,149.9,1.0,2.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,autres,5,7
13,003fb95e849e71e732629d94bb92762f,1,0.0,5.0,1.0,5.0,19.6,57.99,1.0,4.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,SP,4,2


In [19]:
print("Nombre de lignes {} et {} variables".format(
    df_2.shape[0], df_2.shape[1]))

Nombre de lignes 5930 et 32 variables


# Exportation dataset final 

In [20]:
df_2.to_csv('customers-segmentation-juillet.csv')