In [15]:
import json
import pandas as pd
import os

# File locations used by main() in this cell.
# NOTE(review): the paths start with "../", so they resolve against the
# current working directory -- confirm where this is meant to be run from.
# Raw prospects JSON, loaded and passed to flatten_prospects().
PROSPECTS_PATH = "../data/raw/prospects.json"
# Raw vacancies JSON, loaded and passed to flatten_vagas().
VAGAS_PATH = "../data/raw/vagas.json"
# Destination of the merged prospects/vacancies records written by main().
OUTPUT_PATH = "../data/processed/prospects_vagas_merged.json"

def flatten_prospects(prospects_data: dict) -> pd.DataFrame:
    """
    Flattens the nested prospects.json data into a pandas DataFrame.
    Each row represents a single prospect application.

    The returned frame always has the same columns, in the same order, even
    when there are no prospects at all, so it can be merged on 'vaga_id'
    without raising a KeyError on empty input.

    Args:
        prospects_data: The loaded dictionary from prospects.json. Its keys
            are vacancy IDs; each value is a dictionary whose "prospects"
            entry is a list of prospect dictionaries.

    Returns:
        A DataFrame with one row per prospect and the columns 'vaga_id',
        'prospect_nome', 'prospect_codigo', 'situacao_candidado',
        'data_candidatura', 'ultima_atualizacao', 'comentario' and
        'recrutador'. Fields missing from a prospect are None.
    """
    # Output column name -> key read from each prospect entry.
    field_map = {
        "prospect_nome": "nome",
        "prospect_codigo": "codigo",
        "situacao_candidado": "situacao_candidado",
        "data_candidatura": "data_candidatura",
        "ultima_atualizacao": "ultima_atualizacao",
        "comentario": "comentario",
        "recrutador": "recrutador",
    }

    rows = []
    # The keys of the dictionary are the vacancy IDs
    for vaga_id, vaga_info in prospects_data.items():
        # `or []` also covers an explicit null ("prospects": null) entry,
        # which .get(..., []) alone would pass through as None.
        for prospect in vaga_info.get("prospects") or []:
            row = {"vaga_id": vaga_id}
            for column, source_key in field_map.items():
                row[column] = prospect.get(source_key)
            rows.append(row)

    # Pinning the columns keeps the schema stable when `rows` is empty.
    return pd.DataFrame(rows, columns=["vaga_id", *field_map])

def flatten_vagas(vagas_data: dict) -> pd.DataFrame:
    """
    Flattens the vagas.json data into a pandas DataFrame.
    It combines the 'informacoes_basicas' and 'perfil_vaga' sections of each
    vacancy into a single flat record.

    Args:
        vagas_data: The loaded dictionary from vagas.json. Its keys are the
            vacancy IDs; each value may contain the 'informacoes_basicas'
            and 'perfil_vaga' dictionaries (missing or null sections are
            treated as empty).

    Returns:
        A DataFrame with one row per vacancy. The 'vaga_id' column is always
        present (even for empty input) and always holds the dictionary key,
        so the result can safely be used as the right side of a merge on
        'vaga_id'. If both sections define the same key, the value from
        'perfil_vaga' wins.
    """
    records = []
    # The keys of the dictionary are the vacancy IDs
    for vaga_id, vaga_details in vagas_data.items():
        # `or {}` guards against a section that is present but null, which
        # would otherwise make the ** unpacking raise a TypeError.
        record = {
            "vaga_id": vaga_id,
            **(vaga_details.get("informacoes_basicas") or {}),
            **(vaga_details.get("perfil_vaga") or {}),
        }
        # A nested key that happens to be called 'vaga_id' must not replace
        # the merge key; re-assigning keeps the column in first position.
        record["vaga_id"] = vaga_id
        records.append(record)

    if not records:
        # Without this, an empty input would produce a frame with no
        # columns and a later merge on 'vaga_id' would fail.
        return pd.DataFrame(columns=["vaga_id"])

    return pd.DataFrame(records)

def main():
    """
    Run the merge pipeline end to end.

    Loads the raw prospects and vacancies JSON files, flattens each into a
    DataFrame, left-joins the prospects onto the vacancies by 'vaga_id',
    writes the result to OUTPUT_PATH as a JSON list of records, and prints a
    short summary.
    """
    print("Starting data merging process...")

    # Make sure the destination folder exists before anything is written.
    output_dir = os.path.dirname(OUTPUT_PATH)
    os.makedirs(output_dir, exist_ok=True)

    # Step 1: read both raw inputs.
    print("-> Loading raw data files...")
    with open(PROSPECTS_PATH, 'r', encoding='utf-8') as prospects_file:
        raw_prospects = json.load(prospects_file)
    with open(VAGAS_PATH, 'r', encoding='utf-8') as vagas_file:
        raw_vagas = json.load(vagas_file)

    # Step 2: turn the nested JSON into flat tables.
    print("-> Flattening JSON data into tables...")
    applications = flatten_prospects(raw_prospects)
    vacancies = flatten_vagas(raw_vagas)

    # Step 3: left join, so every application row is kept even when its
    # vacancy has no entry in the vacancies table.
    print("-> Merging prospects and vagas data...")
    merged_df = applications.merge(vacancies, on="vaga_id", how="left")

    # Step 4: persist as a list of JSON objects (one per application),
    # keeping non-ASCII characters unescaped.
    print(f"-> Saving merged data to {OUTPUT_PATH}")
    merged_df.to_json(
        OUTPUT_PATH,
        orient='records',
        indent=4,
        force_ascii=False,
    )

    # Summary for the notebook output.
    total_rows = len(merged_df)
    print("\nMerge complete!")
    print(f"Total applications processed: {total_rows}")
    print("First 5 rows of the merged data:")
    print(merged_df.head())

# Run the merge pipeline only when this code is executed directly
# (not when it is imported as a module).
if __name__ == "__main__":
    main()



Starting data merging process...
-> Loading raw data files...
-> Flattening JSON data into tables...
-> Merging prospects and vagas data...
-> Saving merged data to ../data/processed/prospects_vagas_merged.json

Merge complete!
Total applications processed: 53759
First 5 rows of the merged data:
  vaga_id             prospect_nome prospect_codigo  \
0    4530               José Vieira           25632   
1    4530  Srta. Isabela Cavalcante           25529   
2    4531     Sra. Yasmin Fernandes           25364   
3    4531            Alexia Barbosa           25360   
4    4533            Arthur Almeida           26338   

            situacao_candidado data_candidatura ultima_atualizacao  \
0  Encaminhado ao Requisitante       25-03-2021         25-03-2021   
1  Encaminhado ao Requisitante       22-03-2021         23-03-2021   
2     Contratado pela Decision       17-03-2021         12-04-2021   
3  Encaminhado ao Requisitante       17-03-2021         17-03-2021   
4     Contratado pela 

In [None]:
import json
import os

# File locations used by main() in this cell.
# NOTE(review): the paths start with "../", so they resolve against the
# current working directory rather than a fixed project root -- confirm
# where this is meant to be run from.
# Merged prospects/vacancies records; read to collect the candidate codes.
MERGED_DATA_PATH = "../data/processed/prospects_vagas_merged.json"
# Full applicants dictionary (keys are the applicant IDs).
ALL_APPLICANTS_PATH = "../data/raw/applicants.json"
# Destination of the filtered applicants dictionary written by main().
OUTPUT_PATH = "../data/processed/applicants_for_processing.json"

def main():
    """
    Filters the main applicants list to include only those who appear
    in the prospects data, preparing them for LLM processing.

    Reads the merged prospects/vacancies records from MERGED_DATA_PATH,
    collects the distinct 'prospect_codigo' values, keeps the entries of
    ALL_APPLICANTS_PATH whose key is one of those codes, and writes the
    resulting dictionary to OUTPUT_PATH.
    """
    print("Starting applicant filtering process...")

    # Create the processed data directory if it doesn't exist
    os.makedirs(os.path.dirname(OUTPUT_PATH), exist_ok=True)

    # 1. Load the merged prospects/vagas data to get active candidate IDs
    print(f"-> Loading merged data from {MERGED_DATA_PATH}...")
    with open(MERGED_DATA_PATH, 'r', encoding='utf-8') as f:
        merged_data = json.load(f)

    # Collect the unique candidate codes. They are normalised to strings
    # because the keys of applicants.json are strings. Records without a
    # code (null in the merged file) are skipped; str(None) would otherwise
    # add a bogus "None" ID and inflate the count printed below.
    active_candidate_ids = set()
    for record in merged_data:
        code = record.get('prospect_codigo')
        if code is not None:
            active_candidate_ids.add(str(code))
    print(f"-> Found {len(active_candidate_ids)} unique active candidates.")

    # 2. Load the main applicants data
    print(f"-> Loading all applicants from {ALL_APPLICANTS_PATH}...")
    with open(ALL_APPLICANTS_PATH, 'r', encoding='utf-8') as f:
        # applicants.json is a dictionary where keys are the applicant IDs
        all_applicants_data = json.load(f)

    # 3. Keep only the applicants whose ID is in the set of active IDs
    print("-> Filtering applicants...")
    filtered_applicants = {
        applicant_id: applicant_details
        for applicant_id, applicant_details in all_applicants_data.items()
        if applicant_id in active_candidate_ids
    }

    print(f"-> Found {len(filtered_applicants)} matching applicants to process.")

    # 4. Save the filtered applicants to a new JSON file
    print(f"-> Saving filtered applicants to {OUTPUT_PATH}...")
    with open(OUTPUT_PATH, 'w', encoding='utf-8') as f:
        json.dump(filtered_applicants, f, indent=4, ensure_ascii=False)

    print("\nFiltering process complete!")
    print(f"The file '{OUTPUT_PATH}' is now ready for your LLM processing.")


# Run the applicant filtering only when this code is executed directly
# (not when it is imported as a module).
if __name__ == "__main__":
    main()


Starting applicant filtering process...
-> Loading merged data from ../data/processed/prospects_vagas_merged.json...
-> Found 29405 unique active candidates.
-> Loading all applicants from ../data/raw/applicants.json...
-> Filtering applicants...
-> Found 23463 matching applicants to process.
-> Saving filtered applicants to ../data/processed/applicants_for_processing.json...

Filtering process complete!
The file '../data/processed/applicants_for_processing.json' is now ready for your LLM processing.


: 