In [13]:
import json

def classify_utterance(text):
    # 将文本转换为小写并确保是字符串列表
    if isinstance(text, list):
        text = ' '.join(text).lower()
    else:
        text = text.lower()
    
    # 初始化标签
    tags = {
        'has_direct_genres': False,
        'has_descriptive_terms': False,
        'has_comparative_terms': False,
        'is_other': False
    }
    
    # 定义电影类型及其关键词
    direct_genres = {
        'comedy': ['comedy', 'comedic', 'funny', 'hilarious', 'humorous', 'laugh', 'laughing', 'jokes', 'silly', 'goofy'],
        'action': ['action', 'intense', 'explosive', 'fighting', 'chase', 'adventure', 'exciting', 'thrilling', 'packed'],
        'drama': ['drama', 'dramatic', 'touching', 'emotional', 'serious', 'deep', 'moving', 'powerful', 'thought-provoking'],
        'horror': ['horror', 'scary', 'frightening', 'terrifying', 'spooky', 'creepy', 'disturbing', 'haunting', 'supernatural'],
        'romance': ['romance', 'romantic', 'love story', 'sweet', 'relationship', 'dating', 'couple', 'heartwarming'],
        'thriller': ['thriller', 'suspense', 'suspenseful', 'mystery', 'tense', 'psychological', 'twist', 'crime'],
        'sci-fi': ['sci-fi', 'science fiction', 'space', 'future', 'alien', 'robots', 'technological', 'dystopian'],
        'family': ['family', 'kids', 'children', 'animated', 'animation', 'disney', 'pixar', 'cartoon'],
        'war': ['war', 'military', 'battle', 'army', 'soldier', 'combat', 'historical'],
        'western': ['western', 'cowboy', 'wild west', 'gunslinger'],
        'fantasy': ['fantasy', 'magical', 'magic', 'mythical', 'fairy tale', 'supernatural'],
        'documentary': ['documentary', 'true story', 'real life', 'educational', 'historical']
    }
    
    # 定义描述性词汇
    descriptive_terms = [
        'good', 'great', 'excellent', 'amazing', 'awesome', 'fantastic', 'wonderful', 'brilliant',
        'outstanding', 'superb', 'perfect', 'masterpiece', 'favorite', 'best',
        'bad', 'terrible', 'awful', 'boring', 'disappointing', 'worst', 'mediocre', 'poor',
        'very', 'really', 'extremely', 'quite', 'pretty', 'super',
        'entertaining', 'enjoyable', 'interesting', 'engaging', 'compelling', 'gripping',
        'powerful', 'moving', 'touching', 'emotional',
        'slow', 'fast-paced', 'intense', 'light', 'dark', 'complex', 'simple',
        'classic', 'modern', 'unique', 'different'
    ]
    
    # 定义比较性词汇
    comparative_terms = [
        'like', 'similar to', 'better than', 'worse than', 'reminds me of',
        'same as', 'different from', 'comparable to', 'as good as',
        'in the style of', 'just like', 'close to'
    ]
    
    # 检查电影类型
    for genre, keywords in direct_genres.items():
        if any(keyword in text for keyword in keywords):
            tags['has_direct_genres'] = True
            break
            
    # 检查描述性词汇
    if any(term in text for term in descriptive_terms):
        tags['has_descriptive_terms'] = True
        
    # 检查比较性词汇
    if any(term in text for term in comparative_terms):
        tags['has_comparative_terms'] = True
    
    # 如果没有任何标签，标记为其他
    if not (tags['has_direct_genres'] or tags['has_descriptive_terms'] or 
            tags['has_comparative_terms']):
        tags['is_other'] = True
        
    return tags

def add_tags_to_dialogs(dialogs):
    tagged_dialogs = []
    
    for dialog in dialogs:
        # 复制原始对话数据
        tagged_dialog = dialog.copy()
        tagged_dialog['dialog'] = []
        
        # 为每个utterance添加标签
        for utterance in dialog['dialog']:
            tagged_utterance = utterance.copy()
            tagged_utterance['utterance_tags'] = classify_utterance(utterance['text'])
            tagged_dialog['dialog'].append(tagged_utterance)
            
        tagged_dialogs.append(tagged_dialog)
    
    return tagged_dialogs

# 读取原始数据
with open('./data/test_data.json', 'r') as f:
    dialogs = json.load(f)

# 添加标签
tagged_dialogs = add_tags_to_dialogs(dialogs)

# 保存带标签的数据
with open('./data/tagged_test_data.json', 'w') as f:
    json.dump(tagged_dialogs, f, indent=2)

# 打印示例
print("示例对话的第一个utterance的标签：")
print(json.dumps(tagged_dialogs[0]['dialog'][0]['utterance_tags'], indent=2))

示例对话的第一个utterance的标签：
{
  "has_direct_genres": false,
  "has_descriptive_terms": false,
  "has_comparative_terms": true,
  "is_other": false
}


In [15]:
def calculate_statistics(dialogs):
    stats = {
        'total_utterances': 0,
        'tags_count': {
            'has_direct_genres': 0,
            'has_descriptive_terms': 0,
            'has_comparative_terms': 0,
            'is_other': 0
        },
        'role_stats': {
            'Seeker': {
                'total': 0,
                'has_direct_genres': 0,
                'has_descriptive_terms': 0,
                'has_comparative_terms': 0,
                'is_other': 0
            },
            'Recommender': {
                'total': 0,
                'has_direct_genres': 0,
                'has_descriptive_terms': 0,
                'has_comparative_terms': 0,
                'is_other': 0
            }
        }
    }
    
    for dialog in dialogs:
        for utterance in dialog['dialog']:
            stats['total_utterances'] += 1
            tags = utterance['utterance_tags']
            role = utterance['role']
            
            # 更新总体统计
            for tag, value in tags.items():
                if value:
                    stats['tags_count'][tag] += 1
            
            # 更新角色相关统计
            stats['role_stats'][role]['total'] += 1
            for tag, value in tags.items():
                if value:
                    stats['role_stats'][role][tag] += 1
    
    return stats

# 在添加标签后计算统计结果
tagged_dialogs = add_tags_to_dialogs(dialogs)
statistics = calculate_statistics(tagged_dialogs)

# 打印统计结果
print("\n=== 标签统计结果 ===")
print(f"总对话回合数: {statistics['total_utterances']}")
print("\n标签分布:")
for tag, count in statistics['tags_count'].items():
    percentage = (count / statistics['total_utterances']) * 100
    print(f"{tag}: {count} ({percentage:.2f}%)")

print("\n按角色分布:")
for role, role_stats in statistics['role_stats'].items():
    print(f"\n{role}:")
    print(f"总回合数: {role_stats['total']}")
    for tag, count in role_stats.items():
        if tag != 'total':
            percentage = (count / role_stats['total']) * 100
            print(f"{tag}: {count} ({percentage:.2f}%)")

# 保存带标签的数据和统计结果
output_data = {
    'tagged_dialogs': tagged_dialogs,
    'statistics': statistics
}

with open('./data/tagged_test_data.json', 'w') as f:
    json.dump(output_data, f, indent=2)


=== 标签统计结果 ===
总对话回合数: 23952

标签分布:
has_direct_genres: 2555 (10.67%)
has_descriptive_terms: 7424 (31.00%)
has_comparative_terms: 3326 (13.89%)
is_other: 13033 (54.41%)

按角色分布:

Seeker:
总回合数: 12401
has_direct_genres: 1322 (10.66%)
has_descriptive_terms: 3659 (29.51%)
has_comparative_terms: 1870 (15.08%)
is_other: 6817 (54.97%)

Recommender:
总回合数: 11551
has_direct_genres: 1233 (10.67%)
has_descriptive_terms: 3765 (32.59%)
has_comparative_terms: 1456 (12.60%)
is_other: 6216 (53.81%)
