# Corrections des données
Dans ce notebook, nous appliquons les corrections manuelles aux données brutes.

In [1]:
import pandas as pd
import os
import csv
from collections import Counter
from ast import literal_eval

from constants import *

%load_ext autoreload
%autoreload 2

In [2]:
# Columns whose cells are parsed with literal_eval; the CSV is expected to store
# them as the text representation of a Python list.
list_columns = ["chef_vocation", "enfants_profession", "pensionnaires_condition", "profession"]

df_raw = pd.read_csv(ORGANIZED_RAW_DATA)

# Parse column by column with Series.map rather than DataFrame.applymap:
# applymap is deprecated since pandas 2.1, while Series.map behaves the same
# element-wise on every pandas version.
for column in list_columns:
    df_raw[column] = df_raw[column].map(literal_eval)
df_raw

Unnamed: 0,chef_vocation,enfants_profession,pensionnaires_condition,annee,profession
0,[rocureur],[],[pensre friere se],1835,"[rocureur, pensre friere se]"
1,[pintier],[],[pensre],1835,"[pintier, pensre]"
2,[boulanger],[],[ouvrier],1835,"[boulanger, ouvrier]"
3,[voiturier],[],[penvre],1835,"[voiturier, penvre]"
4,[marechat],[],"[ouvrier, ouvrier]",1835,"[marechat, ouvrier, ouvrier]"
...,...,...,...,...,...
378721,[chay l],[],[],1898,[chay l]
378722,[meme maison],[],[],1898,[meme maison]
378723,[monimier],[],[],1898,[monimier]
378724,[ontumineur],[],[],1898,[ontumineur]


In [3]:
def load_correction_mapping():
    """
    Load and prepare the manual corrections of the top terms.

    Reads the CSV at CORRECTION_TOP_TERMS, discards the rows whose "corrected"
    value is the "NSP" marker, then discards every row that still contains a
    missing value in any column.

    Returns:
        dict: maps each remaining "original" value to its "corrected" value.
    """
    corrections = pd.read_csv(CORRECTION_TOP_TERMS)
    is_resolved = corrections["corrected"] != "NSP"
    usable = corrections.loc[is_resolved].dropna()
    return dict(zip(usable["original"], usable["corrected"]))

def apply_correction(terms, correction_mapping):
    """
    Apply correction_mapping to each term in terms.

    A term that is absent from the mapping is kept unchanged. Each resulting
    value is split on "|", so one term may yield several terms. Missing values
    and the markers "NA" and "NSP" are left out of the result.

    Args:
        terms: iterable of terms to correct.
        correction_mapping: dict mapping an original term to its correction.

    Returns:
        list: the corrected terms, in iteration order.
    """
    corrected_terms = []
    for term in terms:
        replacement = correction_mapping.get(term, term)
        if pd.isna(replacement):
            continue
        for piece in replacement.split("|"):
            if piece not in ("NA", "NSP"):
                corrected_terms.append(piece)
    return corrected_terms

def replace_idem_by_previous_term(terms):
    """
    Replace every "idem" by the closest preceding term that is not "idem".

    An "idem" with no such preceding term (for example at the very start of
    the list) is kept as "idem".

    Args:
        terms: iterable of terms.

    Returns:
        list: a new list with as many items as terms.
    """
    resolved = []
    last_seen = "idem"  # value carried forward; stays "idem" until a real term shows up
    for term in terms:
        last_seen = last_seen if term == "idem" else term
        resolved.append(last_seen)
    return resolved


# Build the original -> corrected mapping once; later cells pass it to apply_correction.
correction_mapping = load_correction_mapping()

In [4]:
# The three columns of df_raw that hold a list of terms per row.
term_columns = ["chef_vocation", "enfants_profession", "pensionnaires_condition"]

df_corrected = df_raw.copy()

# For each column: apply the manual corrections first, then resolve "idem"
# inside the corrected list.
for column in term_columns:
    corrected = df_raw[column].apply(lambda terms: apply_correction(terms, correction_mapping))
    df_corrected[column] = corrected.apply(replace_idem_by_previous_term)

# "profession" is the concatenation of the three corrected lists, minus empty strings.
all_terms = df_corrected["chef_vocation"] + df_corrected["enfants_profession"] + df_corrected["pensionnaires_condition"]
df_corrected["profession"] = all_terms.apply(lambda terms: [term for term in terms if len(term) != 0])

df_corrected.to_csv(CORRECTED_RAW_DATA, index=False)

df_corrected

Unnamed: 0,chef_vocation,enfants_profession,pensionnaires_condition,annee,profession
0,[rocureur],[],[pensre friere se],1835,"[rocureur, pensre friere se]"
1,[pintier],[],[pensionnaire],1835,"[pintier, pensionnaire]"
2,[boulanger],[],[ouvrier],1835,"[boulanger, ouvrier]"
3,[voiturier],[],[penvre],1835,"[voiturier, penvre]"
4,[marechat],[],"[ouvrier, ouvrier]",1835,"[marechat, ouvrier, ouvrier]"
...,...,...,...,...,...
378721,[chay l],[],[],1898,[chay l]
378722,[meme maison],[],[],1898,[meme maison]
378723,[monimier],[],[],1898,[monimier]
378724,[ontumineur],[],[],1898,[ontumineur]


## Restructuration des données 
Chaque entrée correspond à une occurrence de métier, associée à son année.

In [5]:
# One row per (profession term, year): explode the per-row list of terms.
exploded = df_corrected[["profession", "annee"]].explode("profession")

# Discard the placeholder terms, then the rows with a missing value
# (explode yields NaN for rows whose list is empty).
is_placeholder = exploded["profession"].isin({"·", ""})
df_job_year = exploded[~is_placeholder].dropna()
df_job_year

Unnamed: 0,profession,annee
0,rocureur,1835
0,pensre friere se,1835
1,pintier,1835
1,pensionnaire,1835
2,boulanger,1835
...,...,...
378721,chay l,1898
378722,meme maison,1898
378723,monimier,1898
378724,ontumineur,1898


In [6]:
# Terms listed in the "0" column of the ALREADY_CORRECT_TERMS_TOP_100 file.
already_correct_terms = set(pd.read_csv(ALREADY_CORRECT_TERMS_TOP_100)["0"])

# Called with an empty mapping, apply_correction only splits the corrected values
# on "|" and drops the NA / NSP / missing entries; "idem" is excluded as well.
correct_terms_after_cleaning = {t for t in apply_correction(set(correction_mapping.values()), {}) if t != "idem"}

corrects_terms = already_correct_terms.union(correct_terms_after_cleaning)

# Sort before writing: set iteration order for strings changes between Python
# runs (hash randomization), which made the CSV differ on every execution.
# key=str keeps the sort well-defined should a non-string value (e.g. NaN read
# from the CSV) be present in the set.
pd.DataFrame(sorted(corrects_terms, key=str)).to_csv(CORRECT_JOBS, index=False)

In [7]:
# Reload the accepted terms from the file written by the previous cell.
corrects_terms = set(pd.read_csv(CORRECT_JOBS)["0"])

# Keep only the rows whose term is accepted; "occ" is a constant 1 per row
# so that occurrences can be counted afterwards.
is_correct_term = df_job_year["profession"].isin(corrects_terms)
df_correct_job_year = df_job_year.loc[is_correct_term].assign(occ=1)
df_correct_job_year

Unnamed: 0,profession,annee,occ
1,pintier,1835,1
1,pensionnaire,1835,1
2,boulanger,1835,1
2,ouvrier,1835,1
3,voiturier,1835,1
...,...,...,...
378677,menagere,1898,1
378678,domestique,1898,1
378678,domestique,1898,1
378679,charrelier,1898,1


In [8]:
# Number of rows per (profession, year) pair, stored in the "occ" column.
rows_by_job_and_year = df_correct_job_year.groupby(["profession", "annee"])
count_occ_job = rows_by_job_and_year.count().reset_index()
count_occ_job

Unnamed: 0,profession,annee,occ
0,agent,1845,1
1,agent,1848,1
2,agent,1858,12
3,agent,1859,1
4,agent,1863,1
...,...,...,...
10976,volontaire,1894,15
10977,volontaire,1895,40
10978,volontaire,1896,23
10979,volontaire,1897,15


In [10]:
# Report how many distinct profession terms remain after cleaning.
n_unique_jobs = count_occ_job["profession"].nunique()
print(f'Il y a {n_unique_jobs} dénominations uniques après nettoyage.')

Il y a 255 dénominations uniques après nettoyage.


In [11]:
# Write the per-(profession, year) counts to SELECTED_JOB_COUNT.
# NOTE(review): unlike the other to_csv calls in this notebook, index=False is
# not passed here, so the row index is written as an extra first column —
# confirm that the consumers of this file expect it.
count_occ_job.to_csv(SELECTED_JOB_COUNT)