In [1]:
import pickle

import numpy as np
import pandas as pd
from slugify import slugify

# Load data

In [2]:
# Reference tables, all '|'-separated. ROME and OGR are read with every
# column as text; the NAF table keeps the dtypes inferred by pandas.
ROME_df = pd.read_csv(
    '../referentiels/referentiel_ROME/20150921_arboprincipale28427_ROME.csv',
    sep='|',
    index_col=0,
    dtype=str,
)
OGR_df = pd.read_csv(
    '../referentiels/referentiel_OGR/20150921_arboprincipale28427_OGR.csv',
    sep='|',
    dtype=str,
).set_index('OGR')
NAF_df = pd.read_csv(
    '../referentiels/referentiel_NAF/naf2008_liste_n5_nouveau_header.csv',
    sep='|',
    encoding="utf-8",
).set_index(['NAF'])

In [3]:
# Split definition: maps a ROME code to its list of groups; each group is a
# dict with the keys 'name', 'label' and 'OGRs' (as used further down).
with open('../decoupage_ROME.pickle', 'rb') as pickle_file:
    decoupage_ROME = pickle.load(pickle_file)

# OGR <-> ROMEbis

In [4]:
# Start from the plain ROME code; OGRs belonging to a split group are
# overridden afterwards.
OGR_df['ROMEbis'] = OGR_df['ROME']

In [5]:
# Give every OGR that belongs to a split group the name of that group.
#
# The assignment goes through a single `.loc[rows, column]` call: the chained
# form `OGR_df.loc[OGR]['ROMEbis'] = ...` writes to a temporary row object and
# is not guaranteed to modify `OGR_df` itself.
for ROME_groups in decoupage_ROME.values():
    for group in ROME_groups:
        OGRs = list(group['OGRs'])

        # Fail loudly on unknown codes instead of silently skipping or
        # enlarging the frame.
        unknown = [OGR for OGR in OGRs if OGR not in OGR_df.index]
        if unknown:
            raise KeyError('OGR codes missing from OGR_df: {}'.format(unknown))

        OGR_df.loc[OGRs, 'ROMEbis'] = group['name']

In [6]:
# Spot check: OGR rows attached to ROME D1106.
OGR_df.loc[OGR_df['ROME'] == 'D1106']

Unnamed: 0_level_0,ROME1,ROME2,ROME3,label,ROME,slugs,ROMEbis
OGR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
14863,D,11,6,Epicier / Epicière,D1106,epicier-epiciere,14863
16388,D,11,6,Marchand / Marchande de fruits et légumes,D1106,marchand-marchande-de-fruits-et-legumes,16388
20525,D,11,6,Vendeur / Vendeuse de fruits et légumes,D1106,vendeur-vendeuse-de-fruits-et-legumes,20525
20530,D,11,6,Vendeur / Vendeuse de primeurs,D1106,vendeur-vendeuse-de-primeurs,20530
20540,D,11,6,Vendeur / Vendeuse en alimentation générale,D1106,vendeur-vendeuse-en-alimentation-generale,20540
20558,D,11,6,Vendeur / Vendeuse en boucherie,D1106,vendeur-vendeuse-en-boucherie,20558
20559,D,11,6,Vendeur / Vendeuse en boucherie-charcuterie,D1106,vendeur-vendeuse-en-boucherie-charcuterie,20559
20560,D,11,6,Vendeur / Vendeuse en boulangerie-pâtisserie,D1106,vendeur-vendeuse-en-boulangerie-patisserie,20560
20564,D,11,6,Vendeur / Vendeuse en charcuterie,D1106,vendeur-vendeuse-en-charcuterie,20564
20567,D,11,6,Vendeur / Vendeuse en chocolaterie,D1106,vendeur-vendeuse-en-chocolaterie,20567


In [7]:
# Export the OGR -> ROMEbis mapping.
# NOTE(review): whether a header line is written depends on the default of
# `Series.to_csv(header=...)` in the installed pandas -- confirm what the
# consumers of this file expect.
OGR_df['ROMEbis'].to_csv('../ogr_rome_codes.csv')

# ROMEbis dataframe

In [8]:
# Work on an independent copy so that ROME_df stays available unchanged.
ROMEbis_df = ROME_df.copy(deep=True)

In [9]:
# The index now identifies ROMEbis codes; keep the original ROME code and
# label in dedicated columns before rows get split.
ROMEbis_df.index.name = 'ROMEbis'
ROMEbis_df['ROME'] = ROMEbis_df.index
ROMEbis_df['ROME_label'] = ROMEbis_df['label']

In [10]:
# Replace every split ROME row by one row per group: same columns as the
# original row, but indexed by the group name and carrying the group label.
#
# The new rows are collected first and concatenated once at the end;
# `DataFrame.append` is no longer available in recent pandas and copied the
# whole frame on every call.
split_rows = []
for ROME, ROME_groups in decoupage_ROME.items():
    ROME_info = ROMEbis_df.loc[[ROME]]

    for group in ROME_groups:
        ROMEbis_info = ROME_info.copy()
        ROMEbis_info.index = [group['name']]
        ROMEbis_info['label'] = [group['label']]
        split_rows.append(ROMEbis_info)

# Unsplit rows keep their order; group rows follow in iteration order.
# This consumes the split rows, so the cell is meant to run once per fresh
# ROMEbis_df.
ROMEbis_df = pd.concat([ROMEbis_df.drop(list(decoupage_ROME))] + split_rows)

In [11]:
# Slug of each label, then fix the column order and restore the index name.
ROMEbis_df['slugs'] = list(map(slugify, ROMEbis_df['label']))

column_order = [
    'famille', 'domaine', 'metier', 'ROME', 'label', 'slugs',
    'famille_label', 'domaine_label', 'ROME_label',
]
ROMEbis_df = ROMEbis_df[column_order]
ROMEbis_df.index.name = 'ROMEbis'

In [12]:
# Spot check: ROMEbis rows derived from ROME D1106.
ROMEbis_df.loc[ROMEbis_df['ROME'] == 'D1106']

Unnamed: 0_level_0,famille,domaine,metier,ROME,label,slugs,famille_label,domaine_label,ROME_label
ROMEbis,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
14863,D,11,6,D1106,Epicier / Epicière,epicier-epiciere,"COMMERCE, VENTE ET GRANDE DISTRIBUTION",Commerce alimentaire et métiers de bouche,Vente en alimentation
16388,D,11,6,D1106,Marchand / Marchande de fruits et légumes,marchand-marchande-de-fruits-et-legumes,"COMMERCE, VENTE ET GRANDE DISTRIBUTION",Commerce alimentaire et métiers de bouche,Vente en alimentation
20525,D,11,6,D1106,Vendeur / Vendeuse de fruits et légumes,vendeur-vendeuse-de-fruits-et-legumes,"COMMERCE, VENTE ET GRANDE DISTRIBUTION",Commerce alimentaire et métiers de bouche,Vente en alimentation
20530,D,11,6,D1106,Vendeur / Vendeuse de primeurs,vendeur-vendeuse-de-primeurs,"COMMERCE, VENTE ET GRANDE DISTRIBUTION",Commerce alimentaire et métiers de bouche,Vente en alimentation
20540,D,11,6,D1106,Vendeur / Vendeuse en alimentation générale,vendeur-vendeuse-en-alimentation-generale,"COMMERCE, VENTE ET GRANDE DISTRIBUTION",Commerce alimentaire et métiers de bouche,Vente en alimentation
20558,D,11,6,D1106,Vendeur / Vendeuse en boucherie,vendeur-vendeuse-en-boucherie,"COMMERCE, VENTE ET GRANDE DISTRIBUTION",Commerce alimentaire et métiers de bouche,Vente en alimentation
20559,D,11,6,D1106,Vendeur / Vendeuse en boucherie-charcuterie,vendeur-vendeuse-en-boucherie-charcuterie,"COMMERCE, VENTE ET GRANDE DISTRIBUTION",Commerce alimentaire et métiers de bouche,Vente en alimentation
20560,D,11,6,D1106,Vendeur / Vendeuse en boulangerie-pâtisserie,vendeur-vendeuse-en-boulangerie-patisserie,"COMMERCE, VENTE ET GRANDE DISTRIBUTION",Commerce alimentaire et métiers de bouche,Vente en alimentation
20564,D,11,6,D1106,Vendeur / Vendeuse en charcuterie,vendeur-vendeuse-en-charcuterie,"COMMERCE, VENTE ET GRANDE DISTRIBUTION",Commerce alimentaire et métiers de bouche,Vente en alimentation
20567,D,11,6,D1106,Vendeur / Vendeuse en chocolaterie,vendeur-vendeuse-en-chocolaterie,"COMMERCE, VENTE ET GRANDE DISTRIBUTION",Commerce alimentaire et métiers de bouche,Vente en alimentation


In [13]:
# Persist the ROMEbis table as a '|'-separated file.
ROMEbis_df.to_csv(
    '../ROMEbis_df.csv',
    sep='|',
)

In [14]:
# Reload the table from disk, every column as text.
ROMEbis_df = pd.read_csv(
    '../ROMEbis_df.csv',
    sep='|',
    index_col=0,
    dtype=str,
)

# ROME descriptions

In [15]:
# Label of each ROMEbis code (Series indexed like ROMEbis_df).
ROME_DESCRIPTIONS = ROMEbis_df['label']

In [16]:
# Export the ROMEbis -> label mapping.
# NOTE(review): whether a header line is written depends on the default of
# `Series.to_csv(header=...)` in the installed pandas -- confirm.
labels_path = '../romebis_labels.csv'
ROME_DESCRIPTIONS.to_csv(labels_path)

In [17]:
# Reload the labels; the result is a DataFrame indexed by the first CSV column.
# NOTE(review): the first line of the file is taken as the header here. If
# the file was written without a header row, the first label is consumed as
# column names -- confirm against the writer.
ROME_DESCRIPTIONS = pd.read_csv(
    '../romebis_labels.csv',
    index_col=0,
)

In [18]:
#ROME_DESCRIPTIONS.loc['D1106a']
#ROME_DESCRIPTIONS.loc['20646']


# ROMEbis x NAF

In [19]:
# Load the pickled array that is reduced to ROMExNAF below.
with open('../array_ROME1_fusion.pickle', 'rb') as pickle_file:
    ROME1_fusion = pickle.load(pickle_file)

In [20]:
# Collapse the third axis; the result is indexed as [NAF row, ROME column]
# further down.
ROMExNAF = ROME1_fusion.sum(axis=2)

In [21]:
# Load the pickled job-offer array (columns are looked up by OGR position).
with open('../array_offres_OGR.pickle', 'rb') as pickle_file:
    offres_OGR = pickle.load(pickle_file)

In [22]:
# Further down, axis 0 is added to vectors of length len(NAF_df) and axis 1
# is indexed by the position of an OGR code in OGR_df.
offres_OGR.shape

(732, 10877)

In [23]:
# One row per NAF code, one column per ROMEbis code.
n_naf, n_romebis = len(NAF_df), len(ROMEbis_df)
ROMEbis_NAF = np.zeros((n_naf, n_romebis))

# ROME codes that were not modified keep their ROMExNAF column as is.
for column, code in enumerate(ROMEbis_df.index):
    if code not in ROME_df.index:
        continue  # code created by the split; filled in separately
    ROMEbis_NAF[:, column] = ROMExNAF[:, ROME_df.index.get_loc(code)]


In [24]:
def _sum_offres(OGRs):
    """Return, per NAF row, the sum of the `offres_OGR` columns of `OGRs`."""
    total = np.zeros(len(NAF_df))
    for OGR in OGRs:
        total += offres_OGR[:, OGR_df.index.get_loc(OGR)]
    return total


# Distribute the ROMExNAF column of every split ROME over its ROMEbis groups.
for ROME, ROMEbis_data_list in decoupage_ROME.items():
    ROME_index = ROME_df.index.get_loc(ROME)
    if not ROMEbis_data_list:
        continue  # nothing to distribute to

    # Quantities shared by all groups of this ROME.
    DPAE_for_ROME = ROMExNAF[:, ROME_index]
    DPAE_mean = DPAE_for_ROME / len(ROMEbis_data_list)

    # Job offers of the whole ROME: every OGR of every group, in order.
    ROME_offres = _sum_offres(
        [OGR for ROMEbis_data in ROMEbis_data_list for OGR in ROMEbis_data['OGRs']]
    )
    mask_prorata = ROME_offres > 0

    for ROMEbis_data in ROMEbis_data_list:
        ROMEbis_index = ROMEbis_df.index.get_loc(ROMEbis_data['name'])
        ROMEbis_offres = _sum_offres(ROMEbis_data['OGRs'])

        # Rule of three (pro-rata of job offers). The division is only
        # evaluated where the denominator is positive; the zero-denominator
        # case is handled separately with an even split between groups.
        share = np.divide(
            ROMEbis_offres,
            ROME_offres,
            out=np.zeros_like(ROME_offres),
            where=mask_prorata,
        )
        ROMEbis_NAF[:, ROMEbis_index] = np.where(
            mask_prorata, share * DPAE_for_ROME, DPAE_mean
        )





In [25]:
# Sanity check: splitting must preserve the total of each NAF row, so the
# summed absolute difference should be close to zero.
np.abs(ROMEbis_NAF.sum(axis=1) - ROMExNAF.sum(axis=1)).sum()

2.4784507779429532e-11

In [26]:
# Round every value up to the next whole number.
# NOTE(review): a pro-rata value that should be a whole number but carries
# a tiny positive floating-point error would be rounded up by one -- confirm
# this is acceptable.
ROMEbis_NAF = np.ceil(ROMEbis_NAF)

In [27]:
# Persist the final NAF x ROMEbis array.
with open('../array_ROMEbis.pickle', 'wb') as pickle_file:
    pickle.dump(ROMEbis_NAF, pickle_file)