In [1]:
from bert_serving.client import BertClient
from sklearn.cluster import KMeans
import pandas as pd
import numpy as np
from Bio.Cluster import kcluster
from sklearn.metrics.pairwise import cosine_similarity
# from pyclustering.cluster.kmeans import kmeans, kmeans_visualizer
# from pyclustering.cluster.center_initializer import kmeans_plusplus_initializer
# from pyclustering.cluster.kmeans import kmeans, kmeans_visualizer
# from pyclustering.cluster.center_initializer import kmeans_plusplus_initializer
import time
import random
import json

In [2]:
def kmeans_cluster(data, cluster_num=105, random_state=None):
    """Cluster feature vectors with scikit-learn's ``KMeans``.

    Args:
        data: Feature vectors to cluster, passed unchanged to ``KMeans.fit``.
        cluster_num: Number of clusters to fit.
        random_state: Optional seed forwarded to ``KMeans`` so that a run can
            be reproduced. ``None`` (the default) keeps the library's own
            unseeded initialisation, i.e. the previous behaviour.

    Returns:
        Tuple ``(centers, labels)``: the fitted model's ``cluster_centers_``
        and ``labels_``.
    """
    model = KMeans(n_clusters=cluster_num, random_state=random_state)
    model.fit(data)
    return model.cluster_centers_, model.labels_

In [3]:
def cluster_result_ranked_output(initial_comments, sen2vec, labels, centers=None, ascending=False):
    """Summarise each cluster as one row, with its sentences ranked by similarity to the center.

    Args:
        initial_comments: Original comment strings, aligned row by row with
            ``sen2vec`` and ``labels``.
        sen2vec: Feature vectors of the comments, one row per comment.
        labels: Cluster label of every comment.
        centers: Optional cluster centers, indexable by cluster label. When
            ``None``, the member whose mean cosine similarity to the members
            of its own cluster is highest serves as the center.
        ascending: Passed to the sort as ``reverse``. ``True`` therefore lists
            sentences from most to least similar to the center, ``False`` from
            least to most similar. The parameter name is kept for existing
            callers.

    Returns:
        ``pandas.DataFrame`` with columns ``cluster_id``, ``cluster_size``,
        ``center_sentence`` and ``cluster_sentences`` (comma-joined, ranked),
        one row per distinct label in sorted label order.
    """
    comments = np.asarray(initial_comments)
    vectors = np.asarray(sen2vec)
    labels = np.asarray(labels)  # boolean masking below needs an array, not a list

    rows = []
    # Cluster ids come from the labels themselves; the function no longer reads
    # a notebook-level ``cluster_num`` global.
    for cluster_id in np.unique(labels).tolist():
        member_mask = labels == cluster_id
        cluster_sentences = comments[member_mask].tolist()
        cluster_vectors = vectors[member_mask]

        center_sentence = None
        if centers is None:
            # Use the sentence with the highest mean similarity to its cluster as the center.
            center_index = int(np.argmax(cosine_similarity(cluster_vectors).mean(axis=1)))
            cluster_center = cluster_vectors[center_index]
            center_sentence = cluster_sentences[center_index]
        else:
            cluster_center = np.asarray(centers[cluster_id])

        similarity = np.asarray(
            cosine_similarity(cluster_vectors, cluster_center.reshape(1, -1))
        ).ravel().tolist()
        # Stable sort on plain floats; reverse=True puts the most similar sentence first.
        order = sorted(range(len(cluster_sentences)), key=similarity.__getitem__, reverse=ascending)
        ranked_sentences = [cluster_sentences[i] for i in order]

        # ``is None`` rather than truthiness: an empty-string sentence is still a valid center.
        if center_sentence is None:
            center_sentence = ranked_sentences[0] if ascending else ranked_sentences[-1]

        rows.append((cluster_id, len(ranked_sentences), center_sentence, ",".join(ranked_sentences)))
    return pd.DataFrame(rows, columns=["cluster_id", "cluster_size", "center_sentence", "cluster_sentences"])

In [4]:
def cluster_result_output(initial_comments, labels, centers=None):
    """Group the original comments by cluster label.

    Args:
        initial_comments: Original comment strings (not the features used
            for clustering), aligned with ``labels``.
        labels: Cluster label of every comment.
        centers: Unused; accepted so existing call sites keep working.

    Returns:
        ``pandas.DataFrame`` with columns ``cluster``, ``size`` and
        ``sentences`` (the cluster's comments joined with commas), one row
        per distinct label in sorted label order.
    """
    # Keep comments and labels in separate arrays: stacking them into a single
    # array turns the labels into strings, after which no integer label matches.
    comments = np.asarray(initial_comments, dtype=object)
    labels = np.asarray(labels)

    cluster_result = []
    for cluster in np.unique(labels).tolist():
        cluster_data = comments[labels == cluster].tolist()
        cluster_result.append((cluster, len(cluster_data), ",".join(cluster_data)))
    return pd.DataFrame(cluster_result, columns=['cluster', 'size', 'sentences'])

In [5]:
def cosine_kmeans(sen2vec, cluster_num):
    """Cluster vectors with ``kcluster`` using ``dist='u'`` and return the assignment only.

    ``kcluster`` returns three values; only the first (the per-row cluster
    ids) is handed back, the remaining two are discarded.
    """
    outcome = kcluster(sen2vec, cluster_num, dist='u')
    return outcome[0]

In [6]:
def read_sentiment_sentences_file_from_step1(senti_type='positive'):
    """Load the step-1 sentence list for one sentiment type.

    Args:
        senti_type: Sentiment bucket used in the file name, e.g.
            ``positive``, ``negative`` or ``neutral``.

    Returns:
        List of sentences, one per line of
        ``model_result/sentiment/{senti_type}_sentence_list.txt``.
    """
    print(f"读取{senti_type}评论的情感分析")
    source_path = f"model_result/sentiment/{senti_type}_sentence_list.txt"
    with open(source_path, mode='r', encoding='utf-8') as handle:
        whole_text = handle.read()
    sentences = whole_text.splitlines()
    # Report how many lines were loaded.
    print("非重复段落数: ", len(sentences))
    return sentences

In [7]:
def load_sen2vec(sentiment_type, is_first_time=True, sample_rate=1):
    """Return a ``{sentence: vector}`` mapping for one sentiment type.

    Args:
        sentiment_type: Sentiment bucket; selects the sentence file and is
            part of the cache file name.
        is_first_time: When true, read the sentences, sample them, encode
            them through ``BertClient`` and write the mapping to a JSON cache
            under ``step_data/``. When false, load that cache instead.
        sample_rate: Fraction of the sentences to sample; also part of the
            cache file name.

    Returns:
        Dict mapping each sampled sentence to its vector (converted with
        ``tolist()`` so it is JSON-serialisable).
    """
    cache_path = f"step_data/{sentiment_type}_sample_{sample_rate}_sentence_vec_dict.json"
    if is_first_time:
        sentiment_comments = read_sentiment_sentences_file_from_step1(senti_type=sentiment_type)
        sample_sentences = random.sample(sentiment_comments, round(sample_rate * len(sentiment_comments)))
        bc = BertClient()
        sen2vec = bc.encode(sample_sentences)
        sentence_vec_dict = dict(zip(sample_sentences, sen2vec.tolist()))
        with open(cache_path, 'w', encoding='utf-8') as f:
            json.dump(sentence_vec_dict, f)
    else:
        # Read with the same encoding used for writing, and close the handle
        # deterministically instead of leaving it to the garbage collector.
        with open(cache_path, 'r', encoding='utf-8') as f:
            sentence_vec_dict = json.load(f)
    return sentence_vec_dict

In [None]:
# Cluster the sentence embeddings of every sentiment bucket and export each
# clustering twice: ranked most-to-least and least-to-most similar to the center.
# sentiment_type = 'neutral'
sample_rate = 1
for sentiment_type in ['positive', 'negative', 'neutral', 'no_keyword_negative', 'no_keyword_positive', 'no_keyword_neutral']:
    # Pass the configured sample_rate (not a literal) so the sample, the cache
    # file name and the output file names always agree.
    sentence_vec_dict = load_sen2vec(sentiment_type, is_first_time=True, sample_rate=sample_rate)
    sample_sentences, sen2vec = list(sentence_vec_dict.keys()), np.array(list(sentence_vec_dict.values()))
    # Cosine-distance k-means: about one cluster per 1000 sentences, but never
    # zero clusters when fewer than 500 sentences are available.
    cluster_num = max(1, round(len(sample_sentences) / 1000))
    print("kcluster聚类")
    clusterid, error, nfound = kcluster(sen2vec, cluster_num, dist='u')
    # ascending=True ranks sentences from most to least similar to the center
    # sentence, ascending=False the other way round. ``a`` and ``b`` keep their
    # names because a later cell reads them.
    for ascending, order_name in ((True, "ascending"), (False, "descending")):
        a = cluster_result_ranked_output(sample_sentences, sen2vec, clusterid, centers=None, ascending=ascending)
        b = pd.concat([pd.Series([sentiment_type]*len(a), name='sentiment'), a], axis=1)
        b.to_excel(f"model_result/cluster/classify_sample_{sample_rate}_{sentiment_type}_cos_kmeans_{order_name}_{cluster_num}.xlsx", index=False)

读取no_keyword_positive评论的情感分析
非重复段落数:  222864


In [9]:
import openpyxl

In [10]:
# Re-export both rankings for the sentiment type processed last by the loop
# cell. Relies on sample_sentences, sen2vec, clusterid, sentiment_type,
# sample_rate and cluster_num still being defined by that cell.
# Each table is rebuilt here rather than reusing a leftover ``b``: after an
# uninterrupted run ``b`` holds the descending table, which would otherwise be
# written under the ascending file name.
a = cluster_result_ranked_output(sample_sentences, sen2vec, clusterid, centers=None, ascending=True)
b = pd.concat([pd.Series([sentiment_type]*len(a), name='sentiment'), a], axis=1)
b.to_excel(f"model_result/cluster/classify_sample_{sample_rate}_{sentiment_type}_cos_kmeans_ascending_{cluster_num}.xlsx", index=False)
# Same clusters, sentences ranked from least to most similar to the center sentence.
a = cluster_result_ranked_output(sample_sentences, sen2vec, clusterid, centers=None, ascending=False)
b = pd.concat([pd.Series([sentiment_type]*len(a), name='sentiment'), a], axis=1)
b.to_excel(f"model_result/cluster/classify_sample_{sample_rate}_{sentiment_type}_cos_kmeans_descending_{cluster_num}.xlsx", index=False)