In [3]:
import pandas as pd
import re
import os
import platform

def extract_keywords(text):
    """Return every ``#word#`` placeholder found in *text*.

    A placeholder is one or more word characters (``\\w``) enclosed by a
    pair of ``#`` characters, for example ``#Person1#``. Matches are
    returned in order of appearance and duplicates are kept.
    """
    return [hit.group(0) for hit in re.finditer(r"#\w+#", text)]

def extract_keywords_from_file(file_path, column='dialogue'):
    """Collect the ``#word#`` placeholders from one column of a CSV file.

    Args:
        file_path: Path of the CSV file, passed to ``pandas.read_csv``.
        column: Name of the column whose text is scanned. Defaults to
            ``'dialogue'``.

    Returns:
        A list with every placeholder found, in row order, duplicates
        included.

    Raises:
        KeyError: If *column* is not present in the file.
    """
    df = pd.read_csv(file_path)

    all_keywords = []
    # Empty cells are read as NaN (a float), which the regex functions
    # reject with TypeError, so missing values are skipped. str() guards
    # against a column that pandas parsed as a non-string dtype.
    for text in df[column].dropna():
        all_keywords.extend(extract_keywords(str(text)))

    return all_keywords

if __name__ == "__main__":
    # Choose the root that contains the 'data' directory for this platform.
    os_name = platform.system()
    if os_name in ['Linux', 'Darwin']:  # Linux or Mac
        PRE_PATH = '/kkh/'
    else:
        # Windows uses a path relative to the working directory. Any other
        # platform value takes the same fallback; previously PRE_PATH was
        # left unassigned in that case and the script died with NameError.
        PRE_PATH = ''

    # os.path.join ignores an empty leading component, so this yields
    # 'data' on the relative branch and '/kkh/data' otherwise.
    data_dir = os.path.join(PRE_PATH, 'data')

    # Process train.csv
    file_path = os.path.join(data_dir, 'train.csv')
    keywords_train = extract_keywords_from_file(file_path)
    unique_keywords_train = set(keywords_train)
    print(f"train: {sorted(unique_keywords_train)}")

    # Process dev.csv
    file_path = os.path.join(data_dir, 'dev.csv')
    keywords_dev = extract_keywords_from_file(file_path)
    unique_keywords_dev = set(keywords_dev)
    print(f"dev: {sorted(unique_keywords_dev)}")

    # Process test.csv
    file_path = os.path.join(data_dir, 'test.csv')
    keywords_test = extract_keywords_from_file(file_path)
    unique_keywords_test = set(keywords_test)
    print(f"test: {sorted(unique_keywords_test)}")

    # Union of the placeholders seen in any of the three splits.
    combined_keywords = unique_keywords_train | unique_keywords_dev | unique_keywords_test

    # Sorted so the printed list is stable between runs.
    sorted_combined_keywords = sorted(combined_keywords)
    print(f"Combined keywords: {sorted_combined_keywords}")


train: ['#Address#', '#CarNumber#', '#CardNumber#', '#DateOfBirth#', '#Email#', '#PassportNumber#', '#Person#', '#Person1#', '#Person2#', '#Person3#', '#Person4#', '#Person5#', '#Person6#', '#Person7#', '#PhoneNumber#', '#SSN#']
dev: ['#Address#', '#DateOfBirth#', '#Person1#', '#Person2#', '#Person3#', '#Person4#', '#PhoneNumber#']
test: ['#Address#', '#PassportNumber#', '#Person1#', '#Person2#', '#Person3#', '#PhoneNumber#']
Combined keywords: ['#Address#', '#CarNumber#', '#CardNumber#', '#DateOfBirth#', '#Email#', '#PassportNumber#', '#Person#', '#Person1#', '#Person2#', '#Person3#', '#Person4#', '#Person5#', '#Person6#', '#Person7#', '#PhoneNumber#', '#SSN#']
