In [84]:
import os
from openai import OpenAI
import pandas as pd
from os import path
import seaborn as sns
import matplotlib.pyplot as plt

In [47]:
OPENAI_API_KEY = 'your-openai-api-key'

client = OpenAI(
    api_key=OPENAI_API_KEY,
)

In [8]:
DATA_ROOT_PATH = '../data'

# Data Normalization and Transformation
## Getting Unique Normalized Occupations from Accidents Data


In [9]:
new_achs_accidents_data = path.join(DATA_ROOT_PATH, 'masked','new_achs_accidents_data.csv')

In [10]:
new_achs_accidents_data = pd.read_csv(new_achs_accidents_data)
new_occupations = new_achs_accidents_data['ocupacion'].str.lower().unique()

new_occupations.shape

(44375,)

### Normalization Util

In [30]:
import re
from unidecode import unidecode

def normalize(s):
    if not isinstance(s, str):
        return s

    # Lowercase and remove accents (including ñ → n, ü → u)
    s = unidecode(s.lower())

    # Replace punctuation and normalize whitespace
    s = re.sub(r'[.,]', '', s)        # Remove dots and commas
    s = re.sub(r'[-]', ' ', s)        # Replace hyphens with space
    s = re.sub(r'\s+', ' ', s)        # Normalize multiple spaces

    return s.strip()


assert normalize("ÁÉÍÓÚÜÑ") == "aeiouun"
assert normalize("HE-LLO") == "he llo"
assert normalize("   HéLLo   ") == "hello"

## Normalize Dataframe

In [None]:
new_occupations_df = pd.DataFrame(new_occupations, columns=['ocupacion'])
new_occupations_df['ocupacion'] = new_occupations_df['ocupacion'].apply(normalize)
new_occupations_df['ocupacion'] = new_occupations_df['ocupacion'].str.strip()
new_occupations_df = new_occupations_df.drop_duplicates()

print(f"{'='*10}NORMALIZED OCCUPATIONS{'='*10}\n")
print("Number of unique occupations: ", new_occupations_df.shape[0])
print(new_occupations_df.head())


Number of unique occupations:  42845
    ocupacion
0    operario
1  supervisor
2  mantencion
3    mecanico
4      cajera


In [39]:
new_achs_accidents_data['ocupacion'] = new_achs_accidents_data['ocupacion'].str.lower()
new_achs_accidents_data['ocupacion'] = new_achs_accidents_data['ocupacion'].apply(normalize)
new_achs_accidents_data['ocupacion'] = new_achs_accidents_data['ocupacion'].str.strip()

print(new_achs_accidents_data['ocupacion'])

0                        operario
1                      supervisor
2                      mantencion
3                        mecanico
4                          cajera
                   ...           
342356    seleccionadora de fruta
342357                  bodeguero
342358           operador de grua
342359            aux de limpieza
342360        tecnico veterinario
Name: ocupacion, Length: 342361, dtype: object


# Categorization

In [100]:
def run_llm(occupation, system_instructions, taxonomies=None):
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": f"{system_instructions}"},
            {"role": "system", "content": f"{taxonomies}"},
            {"role": "user", "content": f"What is the classification of the occupation {occupation}?"}
        ]
    )
    return response.choices[0].message.content

def write_result(index, occupation, response, file_path):
    with open(file_path, 'a', encoding='utf-8') as f:
        f.write(f'{index},{occupation},{response}\n')

In [40]:
system_instructions = """
    You will receive some occupation
    titles from the user. Your task is to classify them according
    to the International Labour Organization occupational
    classification system. For instance, Professor, Teacher, and
    Preschool Teacher should be categorized under the same class.
    Do not assign occupations into managerial classes unless the
    occupation title explicitly states so, like ’Company manager’.
    Your answer should consist of only the number of the class; for
    example, the occupation ’doctor’ should be answered with only '22'.
"""

In [63]:
principal_subgroups_file_path = path.join(DATA_ROOT_PATH, 'classification', 'principal_subgroups.csv')
principal_subgroups = pd.read_csv(principal_subgroups_file_path, sep=';')

print(principal_subgroups.head())
print(principal_subgroups.shape)
print(principal_subgroups.dtypes)

   code                                        description
0    11  Miembros del Poder Ejecutivo y Legislativo, pe...
1    12  Directores y gerentes administrativos y de ser...
2    13  Directores, gerentes y administradores de prod...
3    14  Directores, gerentes y administradores de hote...
4    21   Profesionales de las ciencias y de la ingeniería
(41, 2)
code            int64
description    object
dtype: object


In [51]:
def build_categorization_prompt(subgroups_dataframe):
    content = "code;description\n"
    for index, row in subgroups_dataframe.iterrows():
        content += f"{row['code']};{row['description']}\n"
    return content

categorization_prompt = build_categorization_prompt(principal_subgroups)

In [65]:
input_occupation = 'aux de aseo'

output_occupation_code = int(run_llm(input_occupation, system_instructions, categorization_prompt))
output_occupation_description = principal_subgroups.loc[principal_subgroups['code'] == output_occupation_code, 'description'].values[0]

print("Input occupation: ", input_occupation)
print("Output:", output_occupation_code, "-", output_occupation_description)

Input occupation:  aux de aseo
Output: 91 - Auxiliares de aseo y trabajadores de casa particular


In [66]:
llm_classification_file_path = path.join(DATA_ROOT_PATH, 'outputs', 'llm_classification.csv')

In [101]:
def run_classification(output_path, dataframe, system_instructions, categorization_prompt, start_index=0):
    for index, occupation in enumerate(dataframe['ocupacion'][start_index:]):
        output_occupation_code = int(run_llm(occupation, system_instructions, categorization_prompt))
        write_result(index, occupation, output_occupation_code, output_path)

    print(f"Finished processing {len(dataframe)} occupations")

In [None]:
run_classification(llm_classification_file_path, new_occupations_df, system_instructions, categorization_prompt, start_index=0)

In [67]:
classified_occupations = pd.read_csv(
    llm_classification_file_path,
    names=['index', 'ocupacion', 'clasificacion-oit'],
    on_bad_lines='skip',
    header=0
)

print(classified_occupations.head())
print(classified_occupations.shape)
print(classified_occupations.dtypes)

  index   ocupacion clasificacion-oit
0     0    operario                93
1     1  supervisor                12
2     2  mantencion                93
3     3    mecanico                72
4     4      cajera                42
(42738, 3)
index                object
ocupacion            object
clasificacion-oit    object
dtype: object


In [69]:
merged_occupations = pd.merge(
    new_achs_accidents_data,
    classified_occupations,
    on='ocupacion',
    how='inner'
)

print(merged_occupations.head())
print(merged_occupations.shape)
print(merged_occupations.dtypes)

   id_siniestro  tipo_siniestro  numero_paciente  \
0       7242104               2       1007054764   
1       7242107               2       1007001523   
2       7242197               1       1007055069   
3       7242333               1       1003302110   
4       7243063               1       1003972578   

                                               texto      poblacion  \
0  \nSTP\nCONTINUO INDICACION DE MANEJO AMBULATOR...        CHONCHI   
1  LEY \nREPOSO\nTRASLADO A CEM PARA INSTALAR CAN...      QUILICURA   
2  \n03.01.2022 10:37 RX PIE AP-OBL\n03.01.2022 1...  ISLA DE MAIPO   
3                                            ingreso     TALCAHUANO   
4  \nPARACETAMOL 500 MG (A), ORAL, 500 MG, C/8 HR...   PUERTO MONTT   

    ocupacion  edad       sexo  \
0    operario  22.0  masculino   
1  supervisor  29.0  masculino   
2  mantencion  46.0  masculino   
3    mecanico  42.0  masculino   
4      cajera  49.0   femenino   

                                     relato_admision  m

### Notes

576 Data Points are Lost after the Merge
`342017 -> 341441 `

In [70]:
classification_codes = ['11', '12', '13', '14',
                        '21', '22', '23', '24', '25', '26',
                        '31', '32', '33', '34', '35', '36',
                        '41', '42', '43', '44',
                        '51', '52', '53', '54',
                        '61', '62', '63', 
                        '71', '72', '73', '75',
                        '81', '82', '83', 
                        '91', '92', '93', '94', '95', '96']

In [77]:
not_classified_occupations = merged_occupations[(merged_occupations['clasificacion-oit'] == '00') | (~merged_occupations['clasificacion-oit'].isin(classification_codes))]

print(not_classified_occupations.shape)

(698, 14)


In [78]:
not_classified_occupations[['ocupacion', 'clasificacion-oit']].drop_duplicates().to_csv('not_classified_occupations.csv', index=False)
print(not_classified_occupations['clasificacion-oit'].head)

<bound method NDFrame.head of 523       no puedo clasificar la ocupación "no se lo sab...
1292                                                     74
1936      no puedo clasificar "programa especial empleo"...
2927      no corresponde a una clasificación específica ...
5537      no puedo determinar la clasificación porque "m...
                                ...                        
339975                                                   00
340127                                                   00
340164                                                   00
340249                                                   00
340263                                                   00
Name: clasificacion-oit, Length: 698, dtype: object>


In [80]:
merged_occupations = merged_occupations[(merged_occupations['clasificacion-oit'].isin(classification_codes))]
print(merged_occupations.shape)

(340066, 14)


We now can get the Unique Classifications in the Merged Occupations

In [83]:
distinct_classifications = merged_occupations['clasificacion-oit'].nunique()
print(distinct_classifications)

40


## Zoom on Occupations Classified as Mining, Construction, Manufacturing and Transport Workers

In [90]:
# First we will Count how many occupations are classified in 93
mining_construction_manufacturing_transport_code = '93'

classified_occupations_93 = classified_occupations[classified_occupations['clasificacion-oit'] == mining_construction_manufacturing_transport_code]

print(classified_occupations_93.shape)

(708, 3)


In [91]:
classified_occupations_93['ocupacion'].unique()

array(['operario', 'mantencion', 'operario de produccion',
       'ayudante soldado r', 'operarios', 'pioneta', 'operario de bodega',
       'oepraria', 'operaria', 'ayud trefilado', 'asistente de obra',
       'ayudante albañil', 'pintor', 'ayudante avanzado producc',
       'ayudante de taller', 'descarga', 'operaria de produccion',
       'operario produccion', 'ayudante de carga', 'ayudante soldador',
       'ayudante de produccion', 'ayudante en obras', 'filetera',
       'peoneta', 'ayudante de electrico', 'logistica pesaje camiones',
       'ayudante de planta', 'capataz', 'cargador', 'operario huincha',
       'operario logistico', 'ayudante electromecanico', 'packing',
       'asistente de reparto', 'operador terreno', 'ayudante de operador',
       'tractorista', 'descargador de camion', 'ayudante de maquina',
       'ayud soldador', 'maestro terminaciones', 'recibidor de huincha',
       'trabajador', 'colocador de royos', 'rigger', 'maestro minero',
       'operario junior'

In [93]:
instruction_93_classifications = """
You will receive some occupation
titles from the user and a table for the International Labour
Organization (ILO) occupational classification system. Your task
is to classify the occupations into one of the ILO’s classes.
For instance, Operator, Production Operator, and Operations
Assistant should be under the same class. It is important that
you reply only with the class number. Provide the most probable
class; for example, for ’dam cleaning’, answer with ’933’.
"""

In [92]:
system_prompt = """
code;description
931;Obreros de la minería y la construcción
932;Empacadores manuales y obreros de la industria manufacturera
933;Obreros del transporte y almacenamiento
"""

In [96]:
occupation = 'ayudante avanzado producc'

occupation_code = run_llm(occupation, instruction_93_classifications, system_prompt)
print(occupation_code)

932


In [109]:
class_93_output_path = path.join(DATA_ROOT_PATH, 'outputs', 'llm_classification_93.csv')

In [None]:
run_classification(class_93_output_path, classified_occupations_93, instruction_93_classifications, system_prompt)

In [111]:
classified_occupations_93 = pd.read_csv(class_93_output_path, names=['index', 'ocupacion', 'clasificacion-oit'], on_bad_lines='skip', dtype={'index': int, 'ocupacion': str, 'clasificacion-oit': str})


In [112]:
classified_occupations_93

Unnamed: 0,index,ocupacion,clasificacion-oit
0,0,operario,932
1,1,mantencion,931
2,2,operario de produccion,932
3,3,ayudante soldado r,931
4,4,operarios,932
...,...,...,...
705,705,descargador pioneta,933
706,706,ornero,931
707,707,operario de productividad,932
708,0,operario,931
