In [4]:
import pandas as pd

# Function to calculate the frequency of each AI verb-noun pair in the uncleaned AI data
def calculate_ai_pair_frequencies(ai_file):
    """Load AI verb-noun pairs from a CSV and compute each pair's relative frequency.

    Parameters
    ----------
    ai_file : str or path-like
        CSV file with 'Verb' and 'Noun' columns; every row is one pair occurrence.

    Returns
    -------
    tuple
        ``(frequency_dict, total_ai_pairs)`` where ``frequency_dict`` maps each
        ``(verb, noun)`` tuple to its share of all rows, and ``total_ai_pairs``
        is the number of rows (pair occurrences) read from the file.
    """
    pairs_table = pd.read_csv(ai_file)

    # One (verb, noun) tuple per row, in file order.
    pair_occurrences = [
        (verb, noun) for verb, noun in zip(pairs_table['Verb'], pairs_table['Noun'])
    ]

    # normalize=True turns the raw counts into fractions of the total.
    relative_frequencies = pd.Series(pair_occurrences).value_counts(normalize=True)

    return relative_frequencies.to_dict(), len(pair_occurrences)

# Function to read the data from a CSV and split the comma-separated matched verb-noun pairs into lists of strings
def read_occupation_data(file_name, file_type='csv'):
    """Read occupation task data and split the matched AI pairs into lists.

    Parameters
    ----------
    file_name : str or path-like
        Path of the file to read.
    file_type : str, default 'csv'
        Format of ``file_name``. Only 'csv' is supported.

    Returns
    -------
    pandas.DataFrame
        The loaded table. If it has a 'Matched Verb-Noun Pairs from AI' column,
        each string cell is split on ', ' into a list of strings and every
        non-string cell (e.g. a missing value) becomes an empty list. If the
        column is absent, an error message is printed and the table is
        returned unchanged.

    Raises
    ------
    ValueError
        If ``file_type`` is not 'csv'.
    """
    # Any other file_type used to fall through with `df` unassigned and crash
    # with an UnboundLocalError further down; fail early with a clear message.
    if file_type != 'csv':
        raise ValueError(
            f"Unsupported file_type {file_type!r}; only 'csv' is supported."
        )

    df = pd.read_csv(file_name)

    pairs_column = 'Matched Verb-Noun Pairs from AI'
    if pairs_column in df.columns:
        # Cells hold comma-separated "verb noun" strings; NaN cells are not str
        # and therefore map to an empty list.
        df[pairs_column] = df[pairs_column].apply(
            lambda x: x.split(', ') if isinstance(x, str) else []
        )
    else:
        print("Error: No valid 'Matched Verb-Noun Pairs from AI' column found.")
    return df

# Function to replace 'Not available' in Task Importance with the average of available task importance values for each occupation
def fill_missing_importance(df):
    """Fill missing 'Task Importance' values with the per-occupation mean.

    The 'Not available' placeholder is replaced by a missing value, the column
    is converted to numeric (anything non-numeric becomes NaN), and each NaN is
    then filled with the mean importance of the rows sharing the same
    'Occupation Name'. An occupation with no numeric importance keeps NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with 'Occupation Name' and 'Task Importance' columns. The
        'Task Importance' column is overwritten in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    importance_col = 'Task Importance'

    def _fill_with_group_mean(values):
        # `values` holds the importances of a single occupation.
        return values.fillna(values.mean())

    # Placeholder text -> missing marker, then force the column to numeric.
    without_placeholder = df[importance_col].replace('Not available', pd.NA)
    df[importance_col] = pd.to_numeric(without_placeholder, errors='coerce')

    # Fill the gaps occupation by occupation.
    df[importance_col] = (
        df.groupby('Occupation Name')[importance_col].transform(_fill_with_group_mean)
    )

    return df

# Function to calculate the exposure score using AI pair frequencies
def calculate_exposure_score(task_importances, matched_verb_noun_pairs, frequency_dict):
    """Compute the importance-weighted AI exposure score for a set of tasks.

    For every task that has at least one matched pair and a usable importance,
    each distinct, well-formed "verb noun" pair adds ``importance * frequency``
    to the numerator, and the task adds ``importance * len(task_pairs)`` to the
    denominator.

    Parameters
    ----------
    task_importances : sequence
        Importance of each task, indexed by task position (0-based). Values
        that cannot be converted to ``float`` (including ``None``) or that are
        NaN cause the task to be skipped.
    matched_verb_noun_pairs : iterable of list of str
        For each task, the matched pairs written as "verb noun" strings.
    frequency_dict : dict
        Maps ``(verb, noun)`` tuples to a frequency; unknown pairs count as 0.

    Returns
    -------
    float or int
        Numerator divided by denominator, or 0 when the denominator is not
        positive (e.g. no task contributed).
    """
    weighted_sum_frequencies = 0
    total_weighted_pairs = 0

    for task_idx, task_pairs in enumerate(matched_verb_noun_pairs):
        if not task_pairs:  # Nothing matched for this task
            continue

        task_importance = task_importances[task_idx]
        try:
            task_importance = float(task_importance)
        except (TypeError, ValueError):
            # TypeError covers None / pd.NA, ValueError covers non-numeric text.
            continue

        # A NaN importance would turn both running sums into NaN, and the final
        # `> 0` check would then silently report 0 for the whole occupation.
        if pd.isna(task_importance):
            continue

        for pair in set(task_pairs):  # Each distinct pair counts once per task
            verb_noun = pair.split(' ')
            if len(verb_noun) != 2:
                print(f"Skipping invalid pair: {pair}")
                continue

            verb, noun = verb_noun
            frequency = frequency_dict.get((verb, noun), 0)
            weighted_sum_frequencies += task_importance * frequency

        # NOTE(review): the denominator counts every entry of task_pairs,
        # including duplicates and malformed pairs that the numerator skips.
        # Kept as-is to preserve existing scores; confirm this is intended.
        total_weighted_pairs += task_importance * len(task_pairs)

    return weighted_sum_frequencies / total_weighted_pairs if total_weighted_pairs > 0 else 0

# === Main Execution ===
# Step 1: Relative frequency of every AI verb-noun pair, plus the total pair count
ai_frequencies, total_extracted_pairs = calculate_ai_pair_frequencies(
    '/Users/twylazhang/Desktop/Directed Research/code_output/3_filter_prepare_compare/filtered_AI_verb_noun_meaningful.csv'
)

# Step 2: Occupation task table with the matched pairs parsed into lists
occupation_df = read_occupation_data(
    '/Users/twylazhang/Desktop/Directed Research/code_output/5_compare/cleaned_matched_pairs.csv',
    file_type='csv',
)

# Step 3: Missing task importances become the mean importance of the same occupation
occupation_df = fill_missing_importance(occupation_df)

{('use', 'network'): 0.020706543789485887,
 ('use', 'model'): 0.009780907668231613,
 ('use', 'learning'): 0.00907088622268591,
 ('use', 'datum'): 0.008491276879383296,
 ('learn', 'method'): 0.007650843331594505,
 ('use', 'intelligence'): 0.006172839506172839,
 ('face', 'method'): 0.005013620819567612,
 ('train', 'network'): 0.00392685330087521,
 ('train', 'model'): 0.0037819509650495564,
 ('use', 'device'): 0.0035645974613110762,
 ('provide', 'service'): 0.003550107227728511,
 ('learn', 'device'): 0.003463165826233119,
 ('use', 'system'): 0.003376224424737727,
 ('generate', 'model'): 0.0032747927896597693,
 ('generate', 'datum'): 0.0028690662493479394,
 ('use', 'analysis'): 0.0027966150814351127,
 ('drive', 'method'): 0.0027821248478525473,
 ('provide', 'information'): 0.0026517127456094592,
 ('learn', 'system'): 0.0024633397090361095,
 ('position', 'method'): 0.002434359241870979,
 ('use', 'technique'): 0.0024198690082884136,
 ('use', 'method'): 0.002318437373210456,
 ('use', 'informa

In [5]:
# Display the number of AI verb-noun pair rows returned by calculate_ai_pair_frequencies.
total_extracted_pairs

69012

In [6]:
# Step 4: Calculate the exposure score for each occupation.
# Each row of occupation_df is one task; the importance-weighted sums are
# accumulated per occupation and divided once all rows have been processed.
weighted_sum_frequencies = {}  # occupation -> sum of importance * pair frequency
weighted_total_pairs = {}      # occupation -> sum of importance * number of matched pairs

task_rows = zip(
    occupation_df['Occupation Name'],
    occupation_df['Task Importance'],
    occupation_df['Matched Verb-Noun Pairs from AI'],
)
for occupation_name, task_importance, matched_pairs in task_rows:
    if occupation_name not in weighted_sum_frequencies:
        # First task of this occupation: announce it once (this message used
        # to be printed for every task row) and start its running totals.
        print(f"\nProcessing occupation: {occupation_name}")
        weighted_sum_frequencies[occupation_name] = 0
        weighted_total_pairs[occupation_name] = 0

    # A missing importance would turn the running totals into NaN; skip the
    # task so the occupation keeps the contributions of its other tasks.
    if pd.isna(task_importance):
        continue

    # Importance-weighted frequency of the distinct, well-formed pairs of this task
    current_weighted_sum = 0
    for pair in set(matched_pairs):  # Use set to avoid duplicates
        verb_noun = pair.split(' ')
        if len(verb_noun) != 2:
            print(f"Skipping invalid pair: {pair}")
            continue  # Skip invalid pairs if not exactly two parts

        verb, noun = verb_noun
        # Pairs that never occur in the AI data contribute a frequency of 0
        current_weighted_sum += task_importance * ai_frequencies.get((verb, noun), 0)

    # Update cumulative sums for the occupation; the pair count includes
    # duplicates and malformed entries, as before.
    weighted_sum_frequencies[occupation_name] += current_weighted_sum
    weighted_total_pairs[occupation_name] += task_importance * len(matched_pairs)

# After processing all rows, calculate the exposure score for each occupation
# (0 when an occupation accumulated no positive weight).
exposure_scores = {
    occupation: weighted_sum_frequencies[occupation] / weighted_total_pairs[occupation] if weighted_total_pairs[occupation] > 0 else 0
    for occupation in weighted_sum_frequencies
}

# Convert exposure scores into a DataFrame for better visualization
exposure_scores_df = pd.DataFrame(list(exposure_scores.items()), columns=['Occupation Name', 'Exposure Score'])

# Save the exposure scores to a CSV file
#exposure_scores_df.to_csv('/Users/twylazhang/Desktop/Directed Research/code_output/6_calculate/occupation_exposure_scores.csv', index=False)
#exposure_scores_df.to_excel('/Users/twylazhang/Desktop/Directed Research/code_output/6_calculate/occupation_exposure_scores.xlsx', index=False)

# Print a preview of the result
print("\nExposure Scores Preview:")
print(exposure_scores_df.head())

In [8]:
# Display the occupation task table (one row per task, with its importance and matched AI pairs).
occupation_df

Unnamed: 0,Occupation Name,Task Verb-Noun,Task Importance,Matched Verb-Noun Pairs from AI
0,Accountants and Auditors,prepare report,90.0,[identify report]
1,Accountants and Auditors,recommend change,89.0,"[determine change, discover change, assist cha..."
2,Accountants and Auditors,analyze datum,88.0,"[identify datum, visualize datum, evaluate dat..."
3,Accountants and Auditors,detect control,88.0,"[detect distraction, detect substance, control..."
4,Accountants and Auditors,inspect book,88.0,"[reproduce book, book availability, distribute..."
...,...,...,...,...
18039,Zoologists and Wildlife Biologists,conduct study,67.0,"[conduct research, study analysis, conduct exp..."
18040,Zoologists and Wildlife Biologists,coordinate program,61.0,"[construct program, program generator, convert..."
18041,Zoologists and Wildlife Biologists,control outbreak,61.0,"[control deployment, control transmission, con..."
18042,Zoologists and Wildlife Biologists,prepare collection,52.0,"[prioritize collection, prepare instruction, p..."


In [None]:

# Step 4: Calculate the exposure score for each occupation
# calculate_exposure_score takes (task_importances, matched_verb_noun_pairs,
# frequency_dict) and expects one entry per task, so the tasks of an occupation
# are collected first and scored in a single call. The earlier per-row call
# passed four arguments and overwrote the occupation's score on every task row.
exposure_scores = {}  # start fresh instead of relying on a dict left over from another cell

# sort=False keeps occupations in order of first appearance; groupby's default
# dropna=True leaves out rows without an occupation name.
for occupation_name, occupation_tasks in occupation_df.groupby('Occupation Name', sort=False):
    print(f"\nProcessing occupation: {occupation_name}")

    # Calculate the exposure score based on matched verb-noun pairs, task importance, and AI frequencies
    exposure_scores[occupation_name] = calculate_exposure_score(
        occupation_tasks['Task Importance'].tolist(),
        occupation_tasks['Matched Verb-Noun Pairs from AI'].tolist(),
        ai_frequencies,
    )

# Convert exposure scores into a DataFrame for better visualization
exposure_scores_df = pd.DataFrame(list(exposure_scores.items()), columns=['Occupation Name', 'Exposure Score'])

# Save the exposure scores to a CSV file
#exposure_scores_df.to_csv('/Users/twylazhang/Desktop/Directed Research/code_output/6_calculate/occupation_exposure_scores.csv', index=False)
#exposure_scores_df.to_excel('/Users/twylazhang/Desktop/Directed Research/code_output/6_calculate/occupation_exposure_scores.xlsx', index=False)

# Print a preview of the result
print("\nExposure Scores Preview:")
print(exposure_scores_df.head())