# 1. Objet

Ce script retraite les données brutes enregistrées par le RawDataProcessor afin de :
- contrôler la conformité à certaines règles de gestion métier
- nettoyer les avoirs lorsque c'est possible
- agréger les informations à une maille commande
- calculer le canal majoritaire en poids
- effectuer d'autres agrégations si nécessaire (ex : types de tarif...)

# 2. Imports et setup technique

In [1]:
# Technical setup for the notebook: imports, import path, data locations and
# display width.
from pathlib import Path
import sys
import datetime
import pandas as pd
from importlib import reload
# Parent directory of sys.path[0] (presumably the repository root — confirm),
# appended to sys.path so that the `scripts` package below can be imported.
project_root = str(Path(sys.path[0]).parents[0].absolute())
project_root
if project_root not in sys.path:
    sys.path.append(project_root)
    
from scripts.utils import process_df  # credit-note ("avoirs") processing
    
# Locations used by the cells below, relative to the working directory.
data_path = Path('..') / 'data'
persist_path = Path('..') / 'persist'
from IPython.display import display, HTML
# Widen the notebook container to 90% of the page.
display(HTML("<style>.container { width:90%; }</style>"))

In [2]:
# Orgacom codes to process; the cells below read one raw pickle per code
# (persist/rawbyoc/data_<code>.pkl).
# Deliberately excluded for now: '1PLU', and '2IFC' (Cash Européenne Food? —
# to be confirmed).
orgacom_list = [
    '1ALO', '1BFC', '1CAP', '1CTR', '1EXP', '1LRO', '1LXF',
    '1NCH', '1OUE', '1PAC', '1PNO', '1PSU', '1RAA', '1SOU',
    '2BRE', '2CAE', '2CTR', '2EST', '2IDF', '2MPY', '2NOR',
    '2RAA', '2SES', '2SOU',
]

# Contrôle des données

On définit les fonctions qui vont permettre d'effectuer les contrôles.

### Contrôle des types de documents

In [3]:
# Document types considered to be orders.
valid_doctypes = [
    'ZC01',
    'ZC02',
    'ZC10',
]
# Every other document type expected in the raw data.
invalid_doctypes = [
    'ZR01',
    'ZR02',
    'ZA01',
    'ZA02',
    'ZA03',
    'ZA04',
    'ZC20',
]
# Complete list of known document types (orders first).
doctypes = [*valid_doctypes, *invalid_doctypes]

On s'assure qu'on n'a pas de type de document non prévu.

In [4]:
def ctrle_doctypes(data):
    """Fail when `data` holds a document type absent from `doctypes`.

    Args:
        data: DataFrame exposing a `doctype` column.

    Raises:
        RuntimeError: if at least one row has an unknown doctype; the message
            quotes the first offending value.
    """
    unexpected = data.loc[~data.doctype.isin(doctypes), "doctype"]
    if len(unexpected) > 0:
        raise RuntimeError(f'An unexpected doctype has been encountered: '
                           f'{unexpected.iloc[0]}!')

### Contrôle des CA bruts non nuls alors que le poids est nul

D'un point de vue métier, il n'est pas possible qu'une ligne de **commande** (= valid_doctype) avec un poids nul ait un CA brut, sauf pour certains articles de service

In [5]:
def ctrle_no_weight_revenue(data, material_with_no_weight = dict(), identifier=None):
    """Record the materials sold with zero weight but non-zero gross revenue.

    Only order lines (doctype in `valid_doctypes`) are inspected. A warning is
    printed when at least one such line exists.

    Args:
        data: DataFrame with `doctype`, `weight`, `brutrevenue` and `material`
            columns (plus `orgacom` when `identifier` is not given).
        material_with_no_weight: dict updated in place and returned.
            NOTE: the default dict is created once at definition time and is
            shared by every call that omits this argument, so results
            accumulate from one call to the next.
        identifier: key under which the result is stored. When falsy, the
            `orgacom` value of the first row is used (this assumes a single
            orgacom per DataFrame).

    Returns:
        `material_with_no_weight`, where `identifier` maps to the list of
        unique materials found on the offending lines.
    """
    if not identifier:
        identifier = data['orgacom'].iloc[0]
    suspicious = (
        data.doctype.isin(valid_doctypes)
        & (data.weight == 0)
        & (data.brutrevenue != 0)
    )
    if suspicious.any():
        print('Some sales have no weight but have brut revenue!')
    offending_materials = data.loc[suspicious, 'material'].unique()
    material_with_no_weight[identifier] = list(offending_materials)
    return material_with_no_weight

In [6]:
# Run the zero-weight / non-zero-revenue check on every orgacom.
# The results are collected in an explicit dict instead of the function's
# shared default argument, so re-running this cell starts from a clean state
# and no special case is needed for the last orgacom.
material_with_no_weight = {}
for orgacom in orgacom_list:
    print(orgacom)
    ctrle_no_weight_revenue(
        pd.read_pickle(persist_path / 'rawbyoc' / f'data_{orgacom}.pkl'),
        material_with_no_weight,
    )
material_with_no_weight  # last expression of the cell: displayed as its output

1ALO
Some sales have no weight but have brut revenue!
1BFC
Some sales have no weight but have brut revenue!
1CAP
Some sales have no weight but have brut revenue!
1CTR
Some sales have no weight but have brut revenue!
1EXP
1LRO
Some sales have no weight but have brut revenue!
1LXF
Some sales have no weight but have brut revenue!
1NCH
Some sales have no weight but have brut revenue!
1OUE
Some sales have no weight but have brut revenue!
1PAC
Some sales have no weight but have brut revenue!
1PNO
Some sales have no weight but have brut revenue!
1PSU
Some sales have no weight but have brut revenue!
1RAA
Some sales have no weight but have brut revenue!
1SOU
Some sales have no weight but have brut revenue!
2BRE
Some sales have no weight but have brut revenue!
2CAE
2CTR
Some sales have no weight but have brut revenue!
2EST
Some sales have no weight but have brut revenue!
2IDF
Some sales have no weight but have brut revenue!
2MPY
Some sales have no weight but have brut revenue!
2NOR
Some sales ha

{'1ALO': ['FL ACCOR', 'FL39860'],
 '1BFC': ['FL ACCOR', 'FL39860', 'LIV ECHANTILLON'],
 '1CAP': ['000000000000189057'],
 '1CTR': ['FL39860', 'FL ACCOR'],
 '1EXP': [],
 '1LRO': ['FL ACCOR', 'FL39860'],
 '1LXF': ['FL39860'],
 '1NCH': ['FL ACCOR', 'FL39860', 'FC21264'],
 '1OUE': ['FL39860',
  'FL ACCOR',
  'FC18404',
  'FC38700',
  'FL ACCOR PRES',
  'FC21264',
  'FL706801'],
 '1PAC': ['FL ACCOR', 'FL39860', '000000000000162186', 'FC18404', 'FC21264'],
 '1PNO': ['FL ACCOR', 'FL39860'],
 '1PSU': ['FL39860', 'FL ACCOR'],
 '1RAA': ['FL ACCOR', 'FL39860', '000000000000052565', 'FC21254', 'FC57752'],
 '1SOU': ['FL ACCOR', 'FL39860', 'FL66446'],
 '2BRE': ['FC42378', '000000000000156486', 'FC34504', 'FC34502', 'FC34503'],
 '2CAE': [],
 '2CTR': ['FC42378', 'INTERTECH', '000000000000156486'],
 '2EST': ['FC42378', '000000000000156486', 'FC34502', 'FC34503'],
 '2IDF': ['FC42378', 'FC34503', 'FC34502'],
 '2MPY': ['FC42378', '000000000000156486', '000000000000028084'],
 '2NOR': ['FC42378',
  'FC34502'

L'essentiel des articles concernés sont des articles de service (forfaits livraison). Il faudrait regarder ce qui s'est passé sur le 156486, il remonte régulièrement côté ES.

### Contrôle des origines de commande

In [7]:
# Order origins accepted by the origin control and by the final filter.
valid_origins = ['TV', 'VR', 'WEB', 'EDI']


def ctrle_origin(data):
    """Report order lines whose origin is not listed in `valid_origins`.

    Only lines with a non-zero gross revenue and an order doctype
    (`valid_doctypes`) are taken into account.

    Args:
        data: DataFrame with `origin`, `brutrevenue` and `doctype` columns.

    Returns:
        DataFrame indexed by origin with two columns: `counts`, the number of
        offending lines, and `percentage`, their share (as a fraction) of all
        lines taken into account. Origins with no offending line are dropped.
    """
    in_scope = (data.brutrevenue != 0) & data.doctype.isin(valid_doctypes)
    report = (
        data.loc[~data.origin.isin(valid_origins) & in_scope]
        .value_counts('origin')
        .rename('counts')
        .to_frame()
    )
    # Series.sum() counts the True values in vectorised code; the builtin
    # sum() would iterate over every row in Python.
    report['percentage'] = report['counts'] / in_scope.sum()
    return report.loc[report['counts'] > 0]

In [8]:
# Display, for each orgacom, the report of unexpected order origins with
# floats rendered as percentages.
for orgacom in orgacom_list:
    print(orgacom)
    raw_file = persist_path / 'rawbyoc' / f'data_{orgacom}.pkl'
    with pd.option_context('display.float_format', '{:.2%}'.format):
        display(ctrle_origin(pd.read_pickle(raw_file)))

1ALO


Unnamed: 0_level_0,counts,percentage
origin,Unnamed: 1_level_1,Unnamed: 2_level_1
#,21312,0.15%
TELE,5,0.00%
DFUE,3,0.00%


1BFC


Unnamed: 0_level_0,counts,percentage
origin,Unnamed: 1_level_1,Unnamed: 2_level_1
#,4146,0.05%


1CAP


Unnamed: 0_level_0,counts,percentage
origin,Unnamed: 1_level_1,Unnamed: 2_level_1
#,2310,0.77%
SCHR,98,0.03%


1CTR


Unnamed: 0_level_0,counts,percentage
origin,Unnamed: 1_level_1,Unnamed: 2_level_1
#,5590,0.08%
DFUE,39,0.00%


1EXP


Unnamed: 0_level_0,counts,percentage
origin,Unnamed: 1_level_1,Unnamed: 2_level_1
#,123,0.24%


1LRO


Unnamed: 0_level_0,counts,percentage
origin,Unnamed: 1_level_1,Unnamed: 2_level_1
#,4545,0.06%


1LXF


Unnamed: 0_level_0,counts,percentage
origin,Unnamed: 1_level_1,Unnamed: 2_level_1
#,608,0.04%


1NCH


Unnamed: 0_level_0,counts,percentage
origin,Unnamed: 1_level_1,Unnamed: 2_level_1
#,6780,0.06%
SCHR,85,0.00%


1OUE


Unnamed: 0_level_0,counts,percentage
origin,Unnamed: 1_level_1,Unnamed: 2_level_1
#,10846,0.07%


1PAC


Unnamed: 0_level_0,counts,percentage
origin,Unnamed: 1_level_1,Unnamed: 2_level_1
#,12707,0.09%


1PNO


Unnamed: 0_level_0,counts,percentage
origin,Unnamed: 1_level_1,Unnamed: 2_level_1
#,8317,0.07%


1PSU


Unnamed: 0_level_0,counts,percentage
origin,Unnamed: 1_level_1,Unnamed: 2_level_1
#,11975,0.08%
DFUE,8,0.00%


1RAA


Unnamed: 0_level_0,counts,percentage
origin,Unnamed: 1_level_1,Unnamed: 2_level_1
#,10319,0.06%
TELE,73,0.00%
DFUE,30,0.00%
SCHR,21,0.00%
MUEN,1,0.00%


1SOU


Unnamed: 0_level_0,counts,percentage
origin,Unnamed: 1_level_1,Unnamed: 2_level_1
#,11134,0.07%


2BRE


Unnamed: 0_level_0,counts,percentage
origin,Unnamed: 1_level_1,Unnamed: 2_level_1
#,5559,0.07%
FRN,4289,0.05%


2CAE


Unnamed: 0_level_0,counts,percentage
origin,Unnamed: 1_level_1,Unnamed: 2_level_1
#,17,0.09%


2CTR


Unnamed: 0_level_0,counts,percentage
origin,Unnamed: 1_level_1,Unnamed: 2_level_1
#,2328,0.03%
FRN,1367,0.02%


2EST


Unnamed: 0_level_0,counts,percentage
origin,Unnamed: 1_level_1,Unnamed: 2_level_1
#,3187,0.06%
FRN,2251,0.04%


2IDF


Unnamed: 0_level_0,counts,percentage
origin,Unnamed: 1_level_1,Unnamed: 2_level_1
#,3885,0.04%
FRN,646,0.01%


2MPY


Unnamed: 0_level_0,counts,percentage
origin,Unnamed: 1_level_1,Unnamed: 2_level_1
#,1299,0.03%
FRN,760,0.02%


2NOR


Unnamed: 0_level_0,counts,percentage
origin,Unnamed: 1_level_1,Unnamed: 2_level_1
#,4924,0.04%
FRN,3010,0.03%


2RAA


Unnamed: 0_level_0,counts,percentage
origin,Unnamed: 1_level_1,Unnamed: 2_level_1
#,1941,0.03%
FRN,1699,0.03%


2SES


Unnamed: 0_level_0,counts,percentage
origin,Unnamed: 1_level_1,Unnamed: 2_level_1
#,3415,0.04%
FRN,1331,0.01%


2SOU


Unnamed: 0_level_0,counts,percentage
origin,Unnamed: 1_level_1,Unnamed: 2_level_1
#,1914,0.03%
FRN,1218,0.02%
MUEN,1,0.00%


### Retraitement des avoirs

In [9]:
def credit_processing(data):
    """Run `process_df` on `data` and print credit-note statistics around it.

    The number of ZA01/ZA02 lines and the gross revenue they carry are printed
    before and after the call to `process_df` (project helper imported from
    `scripts.utils`).

    Args:
        data: DataFrame with `doctype` and `brutrevenue` columns.

    Returns:
        The result of `process_df(data)` without its `_duplicated` column.
    """
    credit_doctypes = ['ZA01', 'ZA02']

    def _print_summary(frame, title):
        # Figures are computed before anything is printed.
        is_credit = frame.doctype.isin(credit_doctypes)
        line_count = len(frame.loc[is_credit])
        revenue = frame.loc[is_credit, 'brutrevenue'].sum()
        print(title)
        print(f"Nombre d'avoirs ZA01 et ZA02 : {line_count}")
        print(f"CA représenté par ces avoirs : {revenue: .2f} €")

    _print_summary(data, 'Avant nettoyage')
    cleaned = process_df(data).drop(columns='_duplicated')
    _print_summary(cleaned, 'Après nettoyage')
    return cleaned

### Application du nettoyage, des filtres, et agrégation

On va boucler sur chacun des fichiers, procéder au nettoyage des avoirs, appliquer les filtres, puis calculer l'agrégation. À la fin, on concatène les dataframes agrégés dans un unique dataframe qu'on persistera.

In [16]:
# Ad-hoc inspection: load and display the raw extract of orgacom 1SOU.
pd.read_pickle(persist_path / 'rawbyoc' / 'data_1SOU.pkl')

Unnamed: 0,orgacom,month,week,date,pricetype_init,pricetype_applied,mercu_init,mercu_applied,client,doctype,...,material,brutrevenue,brutrevcur,netrevenue,netrevcur,weight,weightunit,margin,margincur,marginperkg
300946,1SOU,201707,201727,2017-07-03,ZTCE,ZTCE,H,M,0000066026,ZC10,...,000000000000008422,4.75,EUR,4.75,EUR,6.000,KG,0.77,EUR,0.13
300947,1SOU,201707,201727,2017-07-03,ZTCE,ZTCE,H,M,0000066026,ZC10,...,000000000000008744,5.38,EUR,5.38,EUR,6.000,KG,1.16,EUR,0.19
300948,1SOU,201707,201727,2017-07-03,ZTCE,ZTCE,H,M,0000066026,ZC10,...,000000000000067069,8.11,EUR,8.11,EUR,6.000,KG,1.68,EUR,0.28
300949,1SOU,201707,201727,2017-07-03,ZTCE,ZTCE,H,M,0000066026,ZC10,...,000000000000065629,14.55,EUR,14.55,EUR,1.050,KG,1.31,EUR,1.25
300950,1SOU,201707,201727,2017-07-03,ZTCE,ZTCE,H,M,0000066026,ZC10,...,000000000000007446,12.42,EUR,12.42,EUR,6.000,KG,2.55,EUR,0.43
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
468087,1SOU,202009,202036,2020-09-04,ZTS,ZTS,S,M,0000270132,ZC10,...,000000000000213206,38.19,EUR,38.19,EUR,24.120,KG,0.99,EUR,0.04
468088,1SOU,202009,202036,2020-09-04,ZTS,ZTS,S,M,0000270132,ZC10,...,000000000000215124,121.50,EUR,121.50,EUR,18.000,KG,6.69,EUR,0.37
468089,1SOU,202009,202036,2020-09-04,ZTS,ZTS,S,M,0000280091,ZR02,...,000000000000221103,-445.55,EUR,-443.28,EUR,-46.200,KG,88.23,EUR,-1.91
468090,1SOU,202009,202036,2020-09-05,ZTCR,ZTM,H,H,P000429,ZC10,...,000000000000160040,7.36,EUR,7.36,EUR,2.080,KG,-2.68,EUR,-1.29


In [21]:
# Share of rows carrying a duplicated index label: 4192726 is the count of
# duplicated labels and 19083945 the row count, as printed by the
# neighbouring cells (about 22%).
4192726/ 19083945

0.21969912405427705

In [20]:
# Number of rows in the working DataFrame `df`.
len(df)

19083945

In [19]:
# Number of rows whose index label has already been seen earlier in `df`.
# The array's own .sum() is vectorised; the builtin sum() would loop in Python
# over every row.
df.index.duplicated().sum()

4192726

In [17]:
# Scratch cell: step-by-step netting of credit notes (ZA01/ZA02) against
# order lines (ZC10) sharing the same (orgacom, date, client, material) key.
# NOTE(review): this looks like an inlined copy of the logic behind
# `process_df` used for debugging — confirm against scripts/utils.
df = data  # same object as `data`, no copy is made here
grouper_fields = ['orgacom', 'date', 'client', 'material']
indicators = ['margin', 'brutrevenue', 'weight']
# NOTE(review): `orders_doctypes` is never used below; the masks repeat the
# literals instead.
orders_doctypes=['ZC10']
avoirs_doctypes=['ZA01', 'ZA02']


mask_ZC = df.doctype.isin(['ZC10'])
mask_ZA = df.doctype.isin(['ZA01', 'ZA02'])
# Credit-note indicators summed per key.
raw_avoirs = df.loc[mask_ZA, grouper_fields + indicators]
avoirs = raw_avoirs.groupby(grouper_fields, observed=True).sum()
# Flag every ZC10 line whose key occurs more than once among the ZC10 lines
# (keep=False marks all occurrences).
mask_dup_ZC = (df.loc[mask_ZC]
             .duplicated(grouper_fields, keep=False)
             .rename('_duplicated'))
# Attach the flag to `df` by index label; rows that are not ZC10 get NaN,
# replaced by False just below.
# NOTE(review): an index-aligned merge multiplies rows when index labels are
# not unique — confirm that the index of `data` is unique before relying on it.
df = df.merge(
    mask_dup_ZC,
    how='left',
    left_index=True,
    right_index=True)
df['_duplicated'] = df['_duplicated'].fillna(False)
print('ici')  # debug marker ("ici" = "here")
# Non-duplicated ZC10 lines having matching credit notes. The indicator
# columns exist on both sides, so the merge suffixes them with `_x` (order
# line) and `_y` (credit-note sum); validate='1:1' makes pandas check that the
# merge keys are unique on both sides.
to_update = (
    df.loc[~df._duplicated & mask_ZC, grouper_fields + indicators]
    .merge(avoirs,
           how='inner',
           left_on=grouper_fields,
           right_index=True,
           validate='1:1')
)
# Net value = order value + credit-note value.
for indicator in indicators:
    to_update[indicator] = (to_update[indicator + '_x'] +
                            to_update[indicator + '_y'])
# Keep only the keys whose netted weight and gross revenue are not negative.
to_update = to_update.loc[(to_update.weight >= 0) &
                          (to_update.brutrevenue >= 0)]
to_update.drop(columns=[indicator + '_x' for indicator in indicators] +
                       [indicator + '_y' for indicator in indicators],
               inplace=True)
# Rows of `df` whose key is one of the netted keys.
mask_to_del = (
    df.set_index(grouper_fields)
      .index.isin(to_update.set_index(grouper_fields).index)
)
# Drop the credit-note rows that were netted. The netted values held in
# `to_update` are not written back into `df` within this cell.
df = df.loc[~mask_to_del | ~df.doctype.isin(avoirs_doctypes)]

ici


KeyboardInterrupt: 

In [11]:
# Aggregations applied to each group. Counting `origin` with 'size' gives the
# number of lines per group; the column is renamed to `linecount` below.
aggfuncs = dict(margin='sum', brutrevenue='sum', weight='sum', origin='size')

# Orders are grouped on these keys.
order_groupers = ['orgacom', 'date', 'client', 'origin']
order_df_list = []  # one frame per orgacom, aggregated by date, client, origin
# Price-type aggregation: the keys are defined here, but the computation is
# currently commented out at the end of the loop.
pricetype_groupers = order_groupers + ['pricetype_init', 'pricetype_applied']
pricetype_df_list = []

for orgacom in orgacom_list:
    print('----------------------------------------------------------------')
    print(f'{datetime.datetime.now()} - Début du traitement pour {orgacom}')
    print('----------------------------------------------------------------')

    print(f'{datetime.datetime.now()} - Lecture du fichier')
    data = pd.read_pickle(persist_path / 'rawbyoc' / f'data_{orgacom}.pkl')

    print(f'{datetime.datetime.now()} - Traitement des avoirs')
    data = credit_processing(data)

    print(f'{datetime.datetime.now()} - Application des filtres: CA > 0, origine de commande ok, poids > 0')
    is_valid_line = (
        data.origin.isin(valid_origins)
        & (data.brutrevenue > 0)
        & (data.weight > 0)
    )
    data = data.loc[is_valid_line]

    print(f"{datetime.datetime.now()} - Calcul de l'aggrégation 'orders'")
    order_df = data.groupby(order_groupers, observed=True).agg(aggfuncs)
    order_df = order_df.rename(columns={'origin': 'linecount'})
    order_df_list.append(order_df)
#     print(f"{datetime.datetime.now()} - Calcul de l'aggrégation 'pricetype'")
#     pricetype_df = data.groupby(pricetype_groupers, observed=True).agg(aggfuncs).rename({'origin': 'linecount'}, axis=1)
#     pricetype_df_list.append(pricetype_df)

----------------------------------------------------------------
2021-02-24 15:41:22.029687 - Début du traitement pour 1ALO
----------------------------------------------------------------
2021-02-24 15:41:22.029741 - Lecture du fichier
2021-02-24 15:41:29.368116 - Traitement des avoirs
Avant nettoyage
Nombre d'avoirs ZA01 et ZA02 : 45812
CA représenté par ces avoirs : -1509646.69 €


KeyboardInterrupt: 

In [None]:
# Stack the per-orgacom order aggregates row-wise and persist the result.
pd.concat(order_df_list, axis=0).to_pickle(persist_path / 'orders_all_SV.pkl')

# Calcul du canal majoritaire en poids

In [None]:
# Reload the persisted order aggregates (one row per orgacom, date, client
# and origin).
raw_orders = pd.read_pickle(persist_path / 'orders_all_SV.pkl')

In [None]:
# Compare the current row count with the number of distinct
# (orgacom, client, date) combinations, i.e. the row count expected once the
# origins are collapsed.
print(f'Nb orders in initial dataset: {len(raw_orders)}')
order_keys = raw_orders.reset_index()[['orgacom', 'client', 'date']]
target_len = len(order_keys.drop_duplicates())
print(f'Target order count in order dataset: {target_len}')

In [None]:
%%time
# Main origin of each order: spread the weights into one column per origin
# (0 where an origin is absent), then keep the label of the heaviest column.
main_origin = (
    raw_orders.loc[:, 'weight']
    .unstack(level='origin', fill_value=0)
    .idxmax(axis='columns')
    .rename('main_origin')
)

In [None]:
# Collapse the `origin` level: spread every indicator into one column per
# origin (0 where absent), sum those columns back per indicator, then attach
# the main origin computed earlier.
# NOTE(review): `groupby(..., axis=1)` is deprecated in recent pandas releases
# (2.1+) — confirm the pinned pandas version before upgrading.
orders_with_main_origin = (
raw_orders
    .unstack('origin', fill_value=0)
    .rename_axis(('indicator', 'origin'), axis=1)
    .groupby('indicator', axis=1).sum()
).join(main_origin)

On vérifie que le calcul du canal majoritaire n'a pas modifié les indicateurs au total :

In [None]:
# Sanity check: collapsing the origins must leave every indicator unchanged
# for each (orgacom, date, client) order. The last column (`main_origin`) is
# left out of the comparison.
check = orders_with_main_origin.iloc[:, :-1].join(
    raw_orders.groupby(['orgacom', 'date', 'client'], observed=True).sum(),
    rsuffix='_r',
)
# Both sides add the same floats in a different order, so exact equality is
# too strict; allow a tiny absolute difference per order.
tolerance = 1e-6
for indicator in ['margin', 'brutrevenue', 'weight', 'linecount']:
    delta = (check[indicator] - check[indicator + '_r']).abs()
    check[indicator + '_delta'] = delta
    # A missing delta means an order found no match in the join: also a failure.
    if delta.isna().any() or delta.max() > tolerance:
        raise AssertionError(
            f'{indicator} differs between orders_with_main_origin and '
            f'raw_orders (max abs delta: {delta.max()})'
        )
del check

In [None]:
# Persist the order-level table together with its main origin.
orders_with_main_origin.to_pickle(persist_path / 'orders_all_SV_with_main_origin.pkl')