In [1]:
#libraries
import re
import pandas as pd
import numpy as np
from scipy.stats import chisquare
import seaborn as sns
import matplotlib.pyplot as plt

# Helper Function

In [2]:
def get_mapped_values(input_keys, mappings, value_descriptions):
    """Trace keys through two lookup tables and report whether they agree.

    Every entry of ``input_keys`` is converted with ``str`` and looked up in
    ``mappings``. The value found there is then looked up in
    ``value_descriptions``. One line is printed per key, followed by a single
    summary line.

    Parameters
    ----------
    input_keys : iterable
        Keys to look up; each one is converted to ``str`` before the lookup.
    mappings : dict
        First lookup table (key -> intermediate value).
    value_descriptions : dict
        Second lookup table (intermediate value -> final value).

    Returns
    -------
    None
        Results are only printed.

    Notes
    -----
    An intermediate value missing from ``value_descriptions`` is reported with
    the placeholder "Value not found in second dictionary", and that
    placeholder takes part in the same/different comparison like any other
    final value.
    """
    # Convert input_keys to strings
    input_keys = [str(key) for key in input_keys]

    # Distinct final values seen across all keys that could be resolved
    unique_values = set()

    for key in input_keys:
        # Guard clause: keys unknown to the first dictionary are reported and skipped
        if key not in mappings:
            print(f"Key: {key} not found in first dictionary.")
            continue

        intermediate_value = mappings[key]
        final_value = value_descriptions.get(intermediate_value, "Value not found in second dictionary")
        unique_values.add(final_value)

        print(f"Key: {key}, Intermediate Value: {intermediate_value}, Final Value: {final_value}")

    # Summarise. With no resolved key there is nothing to compare, so do not
    # claim that the values are "the same".
    if not unique_values:
        print("No keys were found in the first dictionary; nothing to compare.")
    elif len(unique_values) > 1:
        print("The corresponding values are different.")
    else:
        print("All corresponding values are the same.")

# HISCLASS Dictionary

In [3]:
# Load the HISCLASS .do file (the "recode hisco ..." rules) into memory
with open(r"C:\Users\soere\OneDrive\Desktop\Python MA\0.1 Classification Data\HISCLASS.do") as file:
    # Iterating a text file yields its lines one by one, newline characters included
    lines = list(file)

# Preview the first 200 lines
lines[:200]

['\n',
 'recode hisco \t(99997=1) (99998=1) (01110=2) (01120=2) (01130=2) (01140=2) (01150=2) (01190=2) (01210=2) (01220=2) (01230=2) (01240=2) ///\n',
 '\t\t\t\t(01250=2) (01260=2) (01270=2) (01280=2) (01290=2) (01320=2) (01330=2) (01340=2) (01350=2) (01390=2) (01400=4) (01420=4) ///\n',
 '\t\t\t\t(01430=4) (01490=4) (02000=2) (02120=2) (02130=2) (02140=2) (02210=2) (02220=2) (02230=2) (02235=2) (02240=2) (02245=2) ///\n',
 '\t\t\t\t(02250=2) (02255=2) (02260=4) (02290=2) (02305=2) (02310=2) (02320=2) (02330=2) (02340=2) (02390=2) (02410=2) (02420=2) ///\n',
 '\t\t\t\t(02430=2) (02440=2) (02450=2) (02460=2) (02470=2) (02480=2) (02485=2) (02490=2) (02510=2) (02520=2) (02590=2) (02620=2) ///\n',
 '\t\t\t\t(02630=2) (02690=2) (02710=2) (02720=2) (02730=2) (02740=2) (02790=2) (02810=2) (02820=2) (02830=2) (02890=2) (02920=2) ///\n',
 '\t\t\t\t(02930=2) (02940=2) (02950=2) (02990=2) (03010=4) (03020=4) (03030=4) (03040=4) (03050=2) (03090=4) (03110=4) (03120=4) ///\n',
 '\t\t\t\t(03130=4) 

In [4]:
# A recode rule looks like "(01110=2)": five digits, "=", then one or more digits
pattern = r"\((\d{5})=(\d+)\)"

# Collect every rule found on every line into a single dictionary.
# Codes stay strings (so leading zeros survive); classes are converted to int.
mappings = {
    code: int(target_class)
    for text in lines
    for code, target_class in re.findall(pattern, text)
}

# Preview the first 100 entries
dict(list(mappings.items())[:100])

{'99997': 1,
 '99998': 1,
 '01110': 2,
 '01120': 2,
 '01130': 2,
 '01140': 2,
 '01150': 2,
 '01190': 2,
 '01210': 2,
 '01220': 2,
 '01230': 2,
 '01240': 2,
 '01250': 2,
 '01260': 2,
 '01270': 2,
 '01280': 2,
 '01290': 2,
 '01320': 2,
 '01330': 2,
 '01340': 2,
 '01350': 2,
 '01390': 2,
 '01400': 4,
 '01420': 4,
 '01430': 4,
 '01490': 4,
 '02000': 2,
 '02120': 2,
 '02130': 2,
 '02140': 2,
 '02210': 2,
 '02220': 2,
 '02230': 2,
 '02235': 2,
 '02240': 2,
 '02245': 2,
 '02250': 2,
 '02255': 2,
 '02260': 4,
 '02290': 2,
 '02305': 2,
 '02310': 2,
 '02320': 2,
 '02330': 2,
 '02340': 2,
 '02390': 2,
 '02410': 2,
 '02420': 2,
 '02430': 2,
 '02440': 2,
 '02450': 2,
 '02460': 2,
 '02470': 2,
 '02480': 2,
 '02485': 2,
 '02490': 2,
 '02510': 2,
 '02520': 2,
 '02590': 2,
 '02620': 2,
 '02630': 2,
 '02690': 2,
 '02710': 2,
 '02720': 2,
 '02730': 2,
 '02740': 2,
 '02790': 2,
 '02810': 2,
 '02820': 2,
 '02830': 2,
 '02890': 2,
 '02920': 2,
 '02930': 2,
 '02940': 2,
 '02950': 2,
 '02990': 2,
 '03010': 4,

In [5]:
# Re-pair every key with the string form of its value (keys and their order are unchanged)
mappings = dict(zip(mappings, map(str, mappings.values())))

In [6]:
# Skill label per class number, keyed by the class number as a string ("1" .. "12").
# The labels are listed in class order; enumerate supplies the matching number.
skill_mappings = {
    str(class_number): skill
    for class_number, skill in enumerate(
        (
            "high",       # 1
            "high",       # 2
            "medium",     # 3
            "medium",     # 4
            "low",        # 5
            "medium",     # 6
            "medium",     # 7
            "medium",     # 8
            "low",        # 9
            "low",        # 10
            "unskilled",  # 11
            "unskilled",  # 12
        ),
        start=1,
    )
}

# Spot check: run two codes through both lookup tables and compare the results
HISCO_try = ["33160", "59940"]
get_mapped_values(HISCO_try, mappings, skill_mappings)

Key: 33160, Intermediate Value: 5, Final Value: low
Key: 59940, Intermediate Value: 9, Final Value: low
All corresponding values are the same.
