In [1]:
import pandas as pd
import xml.etree.ElementTree as ET
from pathlib import Path

In [2]:
def load_subtlex_us(file_path):
    """
    Read the SUBTLEX-US spreadsheet at `file_path` into a DataFrame.

    After a successful read, a short summary (row count and column names)
    is printed. Any exception raised while reading or summarising is caught,
    reported on stdout, and turned into a None return value.

    Returns:
        The loaded DataFrame, or None if anything went wrong.
    """
    loaded_frame = None  # stays None unless the whole try-body succeeds
    try:
        frame = pd.read_excel(file_path)
        print("\n=== SUBTLEX-US Data Loaded ===")
        print("Number of total words:", len(frame))
        print("Columns:", frame.columns.tolist())
        loaded_frame = frame
    except Exception as err:
        # Best-effort loader: report the problem instead of raising
        print(f"Error reading SUBTLEX-US file: {err}")
    return loaded_frame

In [3]:
def extract_emotion_words(xml_file):
    """
    Collect the distinct `categ` attribute values of all <noun-syn> elements.

    Values are lower-cased; elements whose `categ` is missing or empty are
    skipped. The number of unique values found is printed.

    Returns:
        A sorted list of the unique lower-cased values, or None (after
        printing the error) if parsing or extraction raises.
    """
    try:
        document_root = ET.parse(xml_file).getroot()

        # Lazily pull the categ attribute off every <noun-syn> descendant
        categ_values = (
            node.get('categ') for node in document_root.findall('.//noun-syn')
        )
        # Lower-case so later matching can ignore case; drop None / empty values
        unique_words = {value.lower() for value in categ_values if value}

        print(f"\n=== Extracted {len(unique_words)} unique emotion words ===")
        return sorted(unique_words)
    except Exception as err:
        print(f"Error parsing XML file: {err}")
        return None

In [4]:
def create_emotion_lexicon(subtlex_df, emotion_words, output_path):
    """
    Build an emotion-word lexicon from SUBTLEX-US rows and save it as CSV.

    Rows of `subtlex_df` whose 'Word' value matches one of `emotion_words`
    (compared case-insensitively) are kept, sorted by 'FREQcount' in
    descending order, written to `output_path` without the index, and
    returned. A short report (match count, sample rows, missing words) is
    printed along the way.

    Args:
        subtlex_df: DataFrame with at least the columns 'Word', 'FREQcount'
            and 'SUBTLWF'.
        emotion_words: Iterable of strings to look up; any casing is accepted.
        output_path: Destination passed to `DataFrame.to_csv`.

    Returns:
        The filtered and sorted DataFrame (a copy, not a view of `subtlex_df`).
    """
    # Lower-case the targets here rather than trusting the caller: the 'Word'
    # column is lower-cased for the comparison, so a mixed-case target could
    # otherwise never match.
    emotion_words_set = {word.lower() for word in emotion_words}

    # Case-insensitive filter of the SUBTLEX-US rows
    is_emotion_word = subtlex_df['Word'].str.lower().isin(emotion_words_set)
    emotion_df = subtlex_df[is_emotion_word].copy()

    # Most frequent words first
    emotion_df = emotion_df.sort_values('FREQcount', ascending=False)

    print("\n=== Emotion Lexicon Creation ===")
    print(f"Found {len(emotion_df)} emotion words in SUBTLEX-US")
    print("\nSample of matched words:")
    print(emotion_df[['Word', 'FREQcount', 'SUBTLWF']].head())

    # Save to CSV
    emotion_df.to_csv(output_path, index=False)
    print(f"\nSaved emotion lexicon to: {output_path}")

    # Report target words that have no SUBTLEX-US entry
    found_words = set(emotion_df['Word'].str.lower())
    missing_words = sorted(emotion_words_set - found_words)
    print(f"\nMissing words ({len(missing_words)}):")
    preview = missing_words[:10]
    if len(missing_words) > len(preview):
        # Only hint at more entries when the preview really is truncated
        print(preview, "...")
    else:
        print(preview)

    return emotion_df

In [5]:
# Root of the project data tree.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
HOME_DIR = Path("/Volumes/ssd/01-ckj-postdoc/LLM-alignment-data-generation/")

# SUBTLEX-US frequency spreadsheet
subtlex_path = HOME_DIR.joinpath(
    'lexicon',
    'subtlex-us',
    'SUBTLEX-US frequency list with PoS and Zipf information.xlsx',
)

# WordNet-Affect synsets XML
wordnet_path = HOME_DIR.joinpath(
    'lexicon',
    'wn-domains-3.2',
    'wn-affect-1.1',
    'a-synsets.xml',
)