In [1]:
import hanlp
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Load a pretrained HanLP multi-task pipeline. Judging by the identifier it bundles
# tok/pos/ner/srl/dep/sdp/con heads on an ELECTRA-small Chinese encoder — TODO confirm
# against the HanLP model list. Later cells call this object on a list of sentences.
HanLP = hanlp.load(hanlp.pretrained.mtl.CLOSE_TOK_POS_NER_SRL_DEP_SDP_CON_ELECTRA_SMALL_ZH)

                                             

In [6]:
def process_dep(dep_list):
    """Subtract one from every head index in a batch of dependency parses.

    Args:
        dep_list: One entry per sentence; each entry is a sequence of arcs
            whose first item is the head index and whose second item is the
            relation label.

    Returns:
        A new nested list of ``(head - 1, relation)`` tuples. The input is not
        modified. Downstream cells treat a resulting head of -1 as the root.
    """
    return [
        [(arc[0] - 1, arc[1]) for arc in sentence_arcs]
        for sentence_arcs in dep_list
    ]

# Parse one demo sentence and show the three annotation layers the rest of the notebook uses.
target_sentence = "阿婆主来到北京立方庭参观自然语义科技公司。"
target_results = HanLP([target_sentence])

demo_layers = (
    ('分词后的结果: ', target_results["tok/fine"]),
    ('词性分析结果: ', target_results["pos/pku"]),
    ('依存句法分析结果: ', process_dep(target_results["dep"])),
)
for layer_label, layer_value in demo_layers:
    print(layer_label, layer_value)

分词后的结果:  [['阿婆主', '来到', '北京', '立方庭', '参观', '自然', '语义', '科技', '公司', '。']]
词性分析结果:  [['n', 'v', 'ns', 'ns', 'v', 'n', 'n', 'n', 'n', 'w']]
依存句法分析结果:  [[(1, 'nsubj'), (-1, 'root'), (3, 'nn'), (1, 'dobj'), (1, 'conj'), (6, 'nn'), (8, 'nn'), (8, 'nn'), (4, 'dobj'), (1, 'punct')]]


In [63]:
# Keep the shifted arcs (still nested per sentence) and the flat token list of the single demo sentence.
dependency_result = process_dep(target_results["dep"])
segmented_result = target_results["tok/fine"][0]
print(dependency_result)
print(segmented_result)

# Dump each arc as "token index *** head index *** relation", then the token itself.
separator = '*' * 30
for word_idx, arc in enumerate(dependency_result[0]):
    parent_idx, relation_type = arc
    print(f"{word_idx} *** {parent_idx} *** {relation_type}")
    print(segmented_result[word_idx])
    print(separator)

[[(1, 'nsubj'), (-1, 'root'), (3, 'nn'), (1, 'dobj'), (1, 'conj'), (6, 'nn'), (8, 'nn'), (8, 'nn'), (4, 'dobj'), (1, 'punct')]]
['阿婆主', '来到', '北京', '立方庭', '参观', '自然', '语义', '科技', '公司', '。']
0 *** 1 *** nsubj
阿婆主
******************************
1 *** -1 *** root
来到
******************************
2 *** 3 *** nn
北京
******************************
3 *** 1 *** dobj
立方庭
******************************
4 *** 1 *** conj
参观
******************************
5 *** 6 *** nn
自然
******************************
6 *** 8 *** nn
语义
******************************
7 *** 8 *** nn
科技
******************************
8 *** 4 *** dobj
公司
******************************
9 *** 1 *** punct
。
******************************


In [8]:
# Describe every token's position in the dependency tree.
for word_idx, arc in enumerate(dependency_result[0]):
    parent_idx, relation_type = arc
    word = segmented_result[word_idx]
    if parent_idx != -1:
        parent_word = segmented_result[parent_idx]
        print(f"{word} 的父节点是 {parent_word}，边的类型是 {relation_type}")
        continue
    # A head index of -1 is what process_dep produces for the root token.
    print(f"{word} 是 根节点")

阿婆主 的父节点是 来到，边的类型是 nsubj
来到 是 根节点
北京 的父节点是 立方庭，边的类型是 nn
立方庭 的父节点是 来到，边的类型是 dobj
参观 的父节点是 来到，边的类型是 conj
自然 的父节点是 语义，边的类型是 nn
语义 的父节点是 公司，边的类型是 nn
科技 的父节点是 公司，边的类型是 nn
公司 的父节点是 参观，边的类型是 dobj
。 的父节点是 来到，边的类型是 punct


In [92]:
# Reorder each arc into (token index, relation, head index) and wrap the
# sentence in a one-element list so the result is still batch-shaped.
new_dependency_result = [[
    (word_idx, relation_type, parent_idx)
    for word_idx, (parent_idx, relation_type) in enumerate(dependency_result[0])
]]

print(new_dependency_result)
print(segmented_result)

[[(0, 'nsubj', 1), (1, 'root', -1), (2, 'nn', 3), (3, 'dobj', 1), (4, 'conj', 1), (5, 'nn', 6), (6, 'nn', 8), (7, 'nn', 8), (8, 'dobj', 4), (9, 'punct', 1)]]
['阿婆主', '来到', '北京', '立方庭', '参观', '自然', '语义', '科技', '公司', '。']


In [210]:
def extract_aspect_and_sentiment(dependency_result, segmented_result):
    """Extract aspect words and sentiment words from one dependency-parsed sentence.

    Every arc is matched against an ordered chain of relation-based rules (the
    circled numbers keep the notebook's original rule numbering); the first
    rule that matches an arc is the only one applied to it.

    Args:
        dependency_result: Re-iterable sequence of
            ``(word_idx, relation_type, parent_idx)`` triples for a single
            sentence. It is scanned several times, so a one-shot iterator
            will not work.
        segmented_result: Tokens of the same sentence; ``word_idx`` and
            ``parent_idx`` are looked up as positions in it. A position with
            no token (for example a root head of -1) resolves to ``""``.

    Returns:
        tuple[list, list]: ``(aspect_words, sentiment_words)`` in discovery
        order. Both lists may contain duplicates.
    """
    word_index_dict = dict(enumerate(segmented_result))

    def token_at(position):
        # Out-of-range positions resolve to an empty string.
        return word_index_dict.get(position, "")

    # ⑨ Negation: map each negated head to the first negation word attached to it.
    # setdefault keeps the first word when one head carries several negations.
    negation_by_head = {}
    for word_idx, relation_type, parent_idx in dependency_result:
        if relation_type == "neg":
            negation_by_head.setdefault(parent_idx, token_at(word_idx))

    aspect_words = []
    sentiment_words = []
    for word_idx, relation_type, parent_idx in dependency_result:
        word = token_at(word_idx)
        parent_word = token_at(parent_idx)

        # ①②③ Subjects, direct objects and topics are aspect candidates.
        if relation_type in ("nsubj", "dobj", "nmod:topic", "top"):
            aspect_words.append(word)
        # ④ Adjectival modifier: lexicon adjectives are prefixed with their
        # negation word and with every adverb attached to them.
        elif relation_type == "amod":
            sentiment = word
            if is_emotional_adjective(word):
                if word_idx in negation_by_head:
                    sentiment = negation_by_head[word_idx] + sentiment
                for deg_idx, deg_type, deg_parent in dependency_result:
                    if deg_type == "advmod" and deg_parent == word_idx:
                        sentiment = token_at(deg_idx) + sentiment
            # NOTE(review): every amod token is recorded, including ones outside
            # the lexicon — confirm this is intended.
            sentiment_words.append(sentiment)
        # ⑤ Adverbial modifier: kept when its head is negated, is a lexicon
        # adjective, or itself carries a lexicon adjective as an amod child.
        elif relation_type == "advmod":
            if parent_idx in negation_by_head:
                sentiment_words.append(word)
            elif is_emotional_adjective(parent_word):
                sentiment_words.append(word)
                sentiment_words.append(parent_word)
            else:
                for other_idx, other_type, other_parent in dependency_result:
                    if other_parent == parent_idx and other_type == "amod":
                        adj_word = token_at(other_idx)
                        if is_emotional_adjective(adj_word):
                            sentiment_words.append(word)
                            sentiment_words.append(adj_word)
        # ⑥ Coordinated / loosely attached pair: both ends must be lexicon adjectives.
        elif relation_type == "conj" or relation_type == "dep":
            if is_emotional_adjective(parent_word) and is_emotional_adjective(word):
                sentiment_words.append(word)
                sentiment_words.append(parent_word)
        # ⑦ Open clausal complement: the aspect is the subject of the governing word.
        elif relation_type == "xcomp":
            if is_emotional_adjective(word):
                for subj_idx, subj_type, subj_parent in dependency_result:
                    if subj_type == "nsubj" and subj_parent == parent_idx:
                        aspect_words.append(token_at(subj_idx))
                sentiment_words.append(word)
        # ⑧ Clausal complement: the aspect is the complement's own subject.
        elif relation_type == "ccomp":
            if is_emotional_adjective(word):
                for subj_idx, subj_type, subj_parent in dependency_result:
                    if subj_type == "nsubj" and subj_parent == word_idx:
                        aspect_words.append(token_at(subj_idx))
                sentiment_words.append(word)
        # Supplementary rule: an emotional root word, prefixed with its negation if any.
        elif relation_type == "root" and is_emotional_word(word):
            if word_idx in negation_by_head:
                word = negation_by_head[word_idx] + word
            sentiment_words.append(word)
        # ⑩ Any other emotional verb: its subject and object become aspects.
        elif is_emotional_verb(word):
            sentiment_words.append(word)
            for other_idx, other_type, other_parent in dependency_result:
                if other_parent == word_idx and other_type in ("nsubj", "dobj"):
                    aspect_words.append(token_at(other_idx))

    return aspect_words, sentiment_words
# ⑩
def is_emotional_adjective(word):
    """Return True when ``word`` is in the hard-coded emotional-adjective lexicon."""
    return word in {
        "好", "坏", "优秀", "糟糕", "完美", "差", "舒适", "难受", "干净", "脏",
        "漂亮", "丑", "美丽", "难看", "满意", "不满", "热情", "冷淡", "细致",
        "粗糙", "便宜", "贵", "合适", "不当", "新鲜", "陈旧", "高效", "低效",
        "强大", "弱小", "宽敞", "狭窄", "明亮", "昏暗", "温馨", "冷清",
    }
# ⑩
def is_emotional_verb(word):
    """Return True when ``word`` is in the hard-coded emotional-verb lexicon.

    Args:
        word: Token to look up.

    Returns:
        bool: Whether the token is one of the listed verbs.
    """
    # "反对" was listed twice in the original literal; a set keeps one copy
    # either way, so the duplicate is dropped here.
    emotional_verbs = {
        "喜欢", "讨厌", "满意", "不满", "称赞", "批评", "赞同", "反对",
        "欣赏", "厌恶", "推荐", "认可", "质疑", "表扬", "责备",
    }
    return word in emotional_verbs
# ⑩
def is_emotional_word(word):
    """Return True when ``word`` is in the emotional-adjective or the emotional-verb lexicon."""
    if is_emotional_adjective(word):
        return True
    return is_emotional_verb(word)

In [211]:
# Parse a simile sentence to see how the extractor copes with it.
target_sentence = "你像公主一样美丽"
target_results = HanLP([target_sentence])
dependency_result = process_dep(target_results["dep"])
segmented_result = target_results["tok/fine"][0]
print('分词后的结果: ', segmented_result)
print('依存句法分析结果: ', dependency_result)

# Flat list of (token index, relation, head index) triples for the single sentence.
new_dependency_result = [
    (word_idx, relation_type, parent_idx)
    for word_idx, (parent_idx, relation_type) in enumerate(dependency_result[0])
]

分词后的结果:  ['你', '像', '公主', '一样', '美丽']
依存句法分析结果:  [[(3, 'nsubj'), (3, 'prep'), (1, 'pobj'), (-1, 'root'), (3, 'dep')]]


In [212]:
aspect_words, sentiment_words = extract_aspect_and_sentiment(new_dependency_result, segmented_result)
# De-duplicate while keeping first-seen order. list(set(...)) orders strings by
# their randomized hashes, so the printed lists could differ between kernel runs.
aspect_words = list(dict.fromkeys(aspect_words))
sentiment_words = list(dict.fromkeys(sentiment_words))
print("方面词:", aspect_words)
print("情感词:", sentiment_words)

方面词: ['你']
情感词: []


In [54]:
# NOTE(review): absolute, machine-specific path — the cell only reruns on a host that has this file.
data = pd.read_csv(r'C:\Users\86178\Desktop\shiyan3\sentiment.tsv', sep='\t')
data.info()
print(data['text_a'][0])

# Run the pipeline once per review and reuse the result for all three columns.
# Calling HanLP separately for each column repeated the same inference three times per row.
parsed_docs = [HanLP([text]) for text in data['text_a']]

# Each cell keeps the one-sentence batch shape returned by HanLP([text]).
print('Segmented Beginning')
data['segmented_text'] = pd.Series([doc["tok/fine"] for doc in parsed_docs], index=data.index)
print('Pos Beginning')
data['pos_result'] = pd.Series([doc["pos/pku"] for doc in parsed_docs], index=data.index)
print('Dependency Beginning')
data['dependency_result'] = pd.Series([process_dep(doc["dep"]) for doc in parsed_docs], index=data.index)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1200 entries, 0 to 1199
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   label   1200 non-null   int64 
 1   text_a  1200 non-null   object
dtypes: int64(1), object(1)
memory usage: 18.9+ KB
這間酒店環境和服務態度亦算不錯,但房間空間太小~~不宣容納太大件行李~~且房間格調還可以~~ 中餐廳的廣東點心不太好吃~~要改善之~~~~但算價錢平宜~~可接受~~ 西餐廳格調都很好~~但吃的味道一般且令人等得太耐了~~要改善之~~
Segmented Beginning
Pos Beginning
Dependency Beginning


In [61]:
# Write the annotated frame to a UTF-8 CSV without the row index.
# NOTE(review): the annotation columns hold Python lists, which CSV presumably stores
# as their text representation — confirm how they are parsed when the file is read back.
new_csv_path = r'C:\Users\86178\Desktop\shiyan3\new_sentiment.csv'
data.to_csv(new_csv_path, encoding='utf-8', index=False)

In [224]:
def extract_aspect_and_sentiment(dependency_result, segmented_result):
    """Extract aspect and sentiment words, with special handling for simile sentences.

    Pass 1 records negation words and, for every simile marker, pairs the
    marker with each token attached either to the marker or to the marker's
    head. Pass 2 emits a lexicon adjective as a sentiment word when its
    position lies inside one of those pairs, and emits non-stop-word subjects
    as aspects together with lexicon adjectives that share the subject's head.

    NOTE(review): only the simile and subject rules are implemented in this
    definition. The original trailing comment said the remaining rules were
    kept unchanged, but none of them appear in this function body.

    Args:
        dependency_result: Re-iterable sequence of
            ``(word_idx, relation_type, parent_idx)`` triples for a single
            sentence; it is scanned several times.
        segmented_result: Tokens of the same sentence, indexed by position.
            A position with no token resolves to ``""``.

    Returns:
        tuple[list, list]: ``(aspect_words, sentiment_words)`` in discovery
        order. Both lists may contain duplicates.
    """
    word_index_dict = dict(enumerate(segmented_result))

    def token_at(position):
        # Out-of-range positions (e.g. a root head of -1) resolve to "".
        return word_index_dict.get(position, "")

    # Pass 1: collect negations and simile spans.
    negation_by_head = {}  # negated head index -> first negation word attached to it
    simile_pairs = []      # (simile marker index, index of a token in the simile structure)
    for word_idx, relation_type, parent_idx in dependency_result:
        word = token_at(word_idx)
        if relation_type == "neg":
            negation_by_head.setdefault(parent_idx, word)
        elif relation_type == "advmod":
            # As in the original rule order, an adverbial modifier is never
            # treated as a simile marker, even if it is in the simile lexicon.
            continue
        elif is_simile_word(word):
            for other_idx, _other_type, other_parent in dependency_result:
                if other_parent == word_idx or other_parent == parent_idx:
                    simile_pairs.append((word_idx, other_idx))

    # Pass 2: extract aspect and sentiment words.
    aspect_words = []
    sentiment_words = []
    for word_idx, relation_type, parent_idx in dependency_result:
        word = token_at(word_idx)

        if is_emotional_adjective(word):
            # Emit the adjective once if it falls inside any simile pair.
            inside_simile = any(
                simile_start <= word_idx <= simile_end
                for simile_start, simile_end in simile_pairs
            )
            if inside_simile:
                sentiment = word
                if word_idx in negation_by_head:
                    sentiment = negation_by_head[word_idx] + sentiment
                for deg_idx, deg_type, deg_parent in dependency_result:
                    if deg_type == "advmod" and deg_parent == word_idx:
                        degree_word = token_at(deg_idx)
                        if is_degree_word(degree_word):
                            sentiment = degree_word + sentiment
                sentiment_words.append(sentiment)
        elif relation_type == "nsubj":
            # Subject (possibly the thing being compared): keep it as an aspect
            # and pick up lexicon adjectives attached to the same head.
            if not is_stop_word(word):
                aspect_words.append(word)
                for other_idx, _other_type, other_parent in dependency_result:
                    sibling_word = token_at(other_idx)
                    if other_parent == parent_idx and is_emotional_adjective(sibling_word):
                        sentiment_words.append(sibling_word)

    return aspect_words, sentiment_words

def is_simile_word(word):
    """Return True when ``word`` is one of the hard-coded simile markers."""
    return word in {"像", "似", "如", "若", "仿佛", "好比", "宛如", "恰似", "犹如"}

def is_degree_word(word):
    """Return True when ``word`` is in the hard-coded degree-adverb lexicon.

    Args:
        word: Token to look up.

    Returns:
        bool: Whether the token is one of the listed degree words.
    """
    # "格外" was listed twice in the original literal; the duplicate is dropped.
    degree_words = {
        "很", "非常", "特别", "格外", "分外", "十分", "极其", "极度",
        "极端", "极为", "最为", "最", "太", "更", "更加",
        # Degree-like words that show up in simile constructions.
        "一样", "那样", "这样", "如此",
    }
    return word in degree_words

def is_emotional_adjective(word):
    """Return True when ``word`` is in the hard-coded emotional-adjective lexicon.

    Args:
        word: Token to look up.

    Returns:
        bool: Whether the token is one of the listed adjectives.
    """
    # The original literal repeated "丑", "漂亮" and "美丽" in its second group;
    # the repeats are dropped, membership is unchanged.
    emotional_adjectives = {
        # Original sentiment vocabulary.
        "好", "坏", "优秀", "糟糕", "完美", "差", "舒适", "难受", "干净", "脏",
        "漂亮", "丑", "美丽", "难看", "满意", "不满", "热情", "冷淡", "细致",
        "粗糙", "便宜", "贵", "合适", "不当", "新鲜", "陈旧", "高效", "低效",
        "强大", "弱小", "宽敞", "狭窄", "明亮", "昏暗", "温馨", "冷清",
        # Words describing appearance and character.
        "美", "英俊", "帅", "可爱", "迷人", "优雅",
        "高贵", "典雅", "端庄", "秀丽", "清秀", "俊俏", "标致",
    }
    return word in emotional_adjectives

def is_stop_word(word):
    """Return True when ``word`` is in the hard-coded stop-word list."""
    return word in {
        "的", "了", "着", "来", "去", "到", "在", "和", "与", "及",
        "或", "而", "但", "却", "且", "被", "把", "将", "向", "往",
        "是", "有", "没", "不", "这", "那", "这个", "那个", "之",
        "都", "也", "还", "又", "就", "很", "比较", "更", "最",
    }

In [225]:
# Re-run the simile sentence through the simile-aware extractor.
target_sentence = "你像公主一样美丽"
target_results = HanLP([target_sentence])
dependency_result = process_dep(target_results["dep"])
segmented_result = target_results["tok/fine"][0]
print('分词后的结果: ', segmented_result)
print('依存句法分析结果: ', dependency_result)

# Flat list of (token index, relation, head index) triples for the single sentence.
new_dependency_result = [
    (word_idx, relation_type, parent_idx)
    for word_idx, (parent_idx, relation_type) in enumerate(dependency_result[0])
]
aspect_words, sentiment_words = extract_aspect_and_sentiment(new_dependency_result, segmented_result)
# De-duplicate while keeping first-seen order. list(set(...)) orders strings by
# their randomized hashes, so the printed lists could differ between kernel runs.
aspect_words = list(dict.fromkeys(aspect_words))
sentiment_words = list(dict.fromkeys(sentiment_words))
print("方面词:", aspect_words)
print("情感词:", sentiment_words)

分词后的结果:  ['你', '像', '公主', '一样', '美丽']
依存句法分析结果:  [[(3, 'nsubj'), (3, 'prep'), (1, 'pobj'), (-1, 'root'), (3, 'dep')]]
方面词: ['你']
情感词: ['美丽']


In [222]:
# Preview the extractor on the first ten annotated reviews.
for index in range(10):
    print(f"第 {index + 1} 个句子:")
    # Each stored cell is a one-sentence batch, so element 0 is the sentence itself.
    segmented_result = data['segmented_text'].iloc[index][0]
    dependency_result = data['dependency_result'].iloc[index]
    new_dependency_result = [
        (word_idx, relation_type, parent_idx)
        for word_idx, (parent_idx, relation_type) in enumerate(dependency_result[0])
    ]

    aspect_words, sentiment_words = extract_aspect_and_sentiment(new_dependency_result, segmented_result)
    # Order-preserving de-duplication; list(set(...)) would make the printed
    # order vary between kernel runs.
    aspect_words = list(dict.fromkeys(aspect_words))
    sentiment_words = list(dict.fromkeys(sentiment_words))

    print("方面词:", aspect_words)
    print("情感词:", sentiment_words)
    print("-" * 30)

第 1 个句子:
方面词: []
情感词: ['可以', '很好', '一般']
------------------------------
第 2 个句子:
方面词: []
情感词: ['推荐']
------------------------------
第 3 个句子:
方面词: ['速度']
情感词: []
------------------------------
第 4 个句子:
方面词: []
情感词: ['很满意', '简单']
------------------------------
第 5 个句子:
方面词: ['电池', '系统']
情感词: ['可以']
------------------------------
第 6 个句子:
方面词: []
情感词: ['一般']
------------------------------
第 7 个句子:
方面词: []
情感词: []
------------------------------
第 8 个句子:
方面词: ['性能']
情感词: ['喜欢', '十分好', '不错']
------------------------------
第 9 个句子:
方面词: ['驱动', '键盘', '显卡', 'N卡']
情感词: ['偏向', '很容易', '足够强大']
------------------------------
第 10 个句子:
方面词: []
情感词: ['很喜欢', '不错']
------------------------------
