In [None]:
import numpy
from gensim.models import word2vec
import jieba
from pathlib import Path
from nltk.tokenize import word_tokenize
import multiprocessing
from multistop import Stopwords
import os


In [None]:
class W2VModels(object):
    """Train a Word2Vec model on a corpus and expand seed-word lists with similar words."""

    # Second location the trained model is written to by default. Another cell of
    # this notebook reloads the model from exactly this path.
    LEGACY_MODEL_PATH = r"E:\pythonProject\需求分析词典\wordexpansion\test\词向量法\model\word2vec.w2v"

    def __init__(self, cwd, lang='english'):
        """
        Store the model settings.
        :param cwd:  working directory; `model/` and `candidate_words/` are created inside it
        :param lang:  language of the corpus, 'english' or 'chinese'
        """
        self.cwd = cwd
        self.lang = lang

    def __preproces(self, documents):
        """
        Tokenise every document and drop stop words.
        :param documents:  iterable of raw document strings
        :return:  list of token lists, one per document
        :raises ValueError: if `self.lang` is neither 'english' nor 'chinese'
        """
        # Validate the language first so that an unsupported value fails loudly
        # (the previous `assert '<message>'` was always true and returned None).
        if self.lang == 'english':
            def tokenize(document):
                return word_tokenize(document.lower())
        elif self.lang == 'chinese':
            tokenize = jieba.lcut
        else:
            raise ValueError('Do not support {} language'.format(self.lang))

        sw = Stopwords()
        sw.setlang(lang=self.lang)
        stopwords = sw.stopwords()
        return [[w for w in tokenize(document) if w not in stopwords]
                for document in documents]

    def train(self, documents, min_count=1, extra_save_path=LEGACY_MODEL_PATH):
        """
        Train a Word2Vec model on the corpus and save it.
        :param documents:  iterable of raw document strings
        :param min_count: words occurring fewer than min_count times are ignored
        :param extra_save_path: additional file the model is saved to; pass None to skip it
        :return: None (the trained model is kept in `self.model`)
        """
        print('数据预处理开始.......')
        sentences = self.__preproces(documents=documents)
        print('预处理结束...........')
        print('Word2Vec模型训练开始......')

        # sg=0 selects CBOW, sg=1 selects Skip-Gram
        self.model = word2vec.Word2Vec(sentences, min_count=min_count, sample=1e-3, sg=0, window=3, hs=1,
                                       workers=multiprocessing.cpu_count(), epochs=10)

        # Save below the working directory first so this copy exists even when
        # the machine-specific extra path is unavailable.
        modeldir = Path(self.cwd).joinpath('model')
        modeldir.mkdir(parents=True, exist_ok=True)
        modelpath = str(modeldir.joinpath('your.model'))
        self.model.save(modelpath)
        print('已将模型存入 {} '.format(str(modelpath)))

        if extra_save_path:
            self.model.save(extra_save_path)

    def __search(self, seedwords, n=50):
        """
        Collect the vocabulary indexes of the `n` nearest neighbours of every seed.
        Seeds missing from the vocabulary are skipped. Results are stored in
        `self.seedidxs` (list) and `self.similars_candidate_idxs` (set).
        """
        dictionary = self.model.wv.key_to_index
        # translate seed words into vocabulary indexes
        self.seedidxs = [dictionary[seed] for seed in seedwords if seed in dictionary]
        print(self.seedidxs)

        candidate_idxs = set()
        for seedidx in self.seedidxs:
            # sims_words looks like [('by', 0.99984), ('or', 0.99982), ...]
            sims_words = self.model.wv.similar_by_word(seedidx, topn=n)
            candidate_idxs.update(dictionary[sim[0]] for sim in sims_words)
        self.similars_candidate_idxs = candidate_idxs

    def find(self, seedwords, seedwordsname, topn):
        """
        Rank the candidate words by similarity to the whole seed set and save them.
        :param seedwords: list of seed words
        :param seedwordsname: category name; also the stem of the output file name
        :param topn: maximum number of similar words to keep (None keeps all)
        :return: the kept similar words, most similar first
        """
        print('准备寻找每个seed在语料中所有的相似候选词')
        self.__search(seedwords)
        print('初步搜寻到 {} 个相似的候选词'.format(len(self.similars_candidate_idxs)))

        print('计算每个候选词 与 {seedwordsname} 的相似度， 选出相似度最高的前 {topn} 个候选词'.format(seedwordsname=seedwordsname, topn=topn))
        simidx_scores = [(idx, self.model.wv.n_similarity([idx], self.seedidxs))
                         for idx in self.similars_candidate_idxs]
        simidx_scores.sort(key=lambda pair: pair[1], reverse=True)

        # Honour `topn`: previously the slice was `[:]`, so the argument was ignored.
        simwords = [str(self.model.wv.index_to_key[idx]) for idx, _ in simidx_scores][:topn]

        resultwords = list(seedwords) + simwords

        txtdir = Path(self.cwd).joinpath('candidate_words')
        txtdir.mkdir(parents=True, exist_ok=True)
        candidatetxtfile = txtdir.joinpath('{}.txt'.format(seedwordsname))
        with open(candidatetxtfile, 'w', encoding='utf-8') as f:
            for word in resultwords:
                f.write(word+'\n')
        print('已经 【{seedwordsname} 类】 的词语筛选，并保存于 {txtfile}'.format(seedwordsname=seedwordsname, txtfile=candidatetxtfile))
        return simwords


In [None]:
# Train the Word2Vec model on the Zhihu question corpus.
# For an English corpus use: W2VModels(cwd=os.getcwd(), lang='english')
model = W2VModels(cwd=os.getcwd(), lang='chinese')

# Raw string: the path contains backslashes that are not valid escape sequences.
# The `with` block closes the corpus file once its lines have been read.
with open(r'E:\pythonProject\需求分析词典\知乎问题数据集.txt', encoding='utf-8') as corpus_file:
    corpus_lines = corpus_file.readlines()
model.train(documents=corpus_lines)

In [None]:
def _read_seed_words(path):
    """Return the non-empty lines of a UTF-8 seed-word file, closing the file afterwards."""
    with open(path, encoding='utf-8') as seed_file:
        return [w for w in seed_file.read().split('\n') if w != '']


Existence = _read_seed_words(r'E:\pythonProject\需求分析词典\种子词选择\同义词扩展后的Existence.txt')
Growth = _read_seed_words(r'E:\pythonProject\需求分析词典\种子词选择\同义词扩展后的Growth.txt')
Relatedness = _read_seed_words(r'E:\pythonProject\需求分析词典\种子词选择\同义词扩展后的Relatedness.txt')

# Expand each category; every call writes candidate_words/<name>.txt
model.find(seedwords=Existence, seedwordsname='Existence', topn=50000)

model.find(seedwords=Growth, seedwordsname='Growth', topn=50000)

model.find(seedwords=Relatedness, seedwordsname='Relatedness', topn=50000)

In [None]:
# 判断是否是中文
def is_Chinese(word):  # revised version
    """Return True when every character of `word` lies in U+4E00..U+9FFF.

    An empty string has no offending character and therefore yields True.
    """
    return all('\u4e00' <= ch <= '\u9fff' for ch in word)

In [None]:
# Keep only all-Chinese words and drop duplicated words
import pandas as pd
# `error_bad_lines` was removed in pandas 2.0; its old value True (raise on a
# malformed line) is the default behaviour, so the argument is simply omitted.
Existence = pd.read_table(r'E:\pythonProject\需求分析词典\wordexpansion\test\词向量法\candidate_words\Existence.txt',encoding='utf-8',header=None,quoting=3)
Existence = Existence.drop_duplicates(subset=0, keep='first')
# The `with` block closes the file; no explicit close() is needed afterwards.
with open(r'E:\pythonProject\需求分析词典\wordexpansion\test\词向量法\candidate_words\Existence_仅保留中文.txt',mode='w+',encoding='utf-8') as file:
    for word in map(str, Existence[0]):
        if is_Chinese(word):
            file.write(word+'\n')

In [None]:
# Keep only all-Chinese words and drop duplicated words (Growth category)
import pandas as pd
# `error_bad_lines` was removed in pandas 2.0; its old value True (raise on a
# malformed line) is the default behaviour, so the argument is simply omitted.
Growth = pd.read_table(r'E:\pythonProject\需求分析词典\wordexpansion\test\词向量法\candidate_words\Growth.txt',encoding='utf-8',header=None,quoting=3)
Growth = Growth.drop_duplicates(subset=0, keep='first')
# The `with` block closes the file; no explicit close() is needed afterwards.
with open(r'E:\pythonProject\需求分析词典\wordexpansion\test\词向量法\candidate_words\Growth_仅保留中文.txt',mode='w+',encoding='utf-8') as file:
    for word in map(str, Growth[0]):
        if is_Chinese(word):
            file.write(word+'\n')

In [None]:
# Keep only all-Chinese words and drop duplicated words (Relatedness category)
import pandas as pd
# `error_bad_lines` was removed in pandas 2.0; its old value True (raise on a
# malformed line) is the default behaviour, so the argument is simply omitted.
Relatedness = pd.read_table(r'E:\pythonProject\需求分析词典\wordexpansion\test\词向量法\candidate_words\Relatedness.txt',encoding='utf-8',header=None,quoting=3)
Relatedness = Relatedness.drop_duplicates(subset=0, keep='first')
# The `with` block closes the file; no explicit close() is needed afterwards.
with open(r'E:\pythonProject\需求分析词典\wordexpansion\test\词向量法\candidate_words\Relatedness_仅保留中文.txt',mode='w+',encoding='utf-8') as file:
    for word in map(str, Relatedness[0]):
        if is_Chinese(word):
            file.write(word+'\n')

In [None]:
import jieba
import pandas as pd

# Comma-separated corpus file without a header row; column 1 is used as the text list.
data = pd.read_csv(r'E:\pythonProject\需求分析词典\知乎问题数据集.txt', sep=',', header=None, encoding='utf-8')
data_text = data[1].to_list()
print(len(data_text))

# Tab-separated (read_table default) dictionary file; column 0 is used as the word list.
dictionary1 = pd.read_csv(r'E:\pythonProject\需求分析词典\dictionary.txt', sep='\t', header=None, encoding='utf-8')
dictionary = dictionary1[0].to_list()

In [None]:
# Register the custom dictionary entries with jieba before segmenting.
jieba.load_userdict(dictionary)


def tokenize_zh(text):
    """Segment `text` with jieba and return the tokens joined by newlines."""
    return '\n'.join(jieba.lcut(text))

In [None]:
# Caution: slow. Segments the whole corpus and writes one token per line.
jieba.load_userdict(dictionary)
with open(r'知乎问题数据集_分词结果.txt',encoding='utf-8',mode='w+') as f:
    for count, text in enumerate(data_text):
        f.write(str(tokenize_zh(text)))
        # Separate documents: tokenize_zh() output has no trailing newline, so
        # without this the last token of one document and the first token of
        # the next were written as a single fused token.
        f.write('\n')
        print(count)
# The `with` block has already closed the file.

In [None]:
# import logging
# logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
# sentence = word2vec.Text8Corpus(r'知乎问题数据集_分词结果.txt')
# # sentence = word2vec.Text8Corpus(r'E:\pythonProject\制作趋势词典\知乎问题数据集.txt')
# model = word2vec.Word2Vec(sentence, min_count=5,sample=1e-3, sg=0, window=3, hs=1,workers=multiprocessing.cpu_count(), epochs=10)
# print(model)

In [None]:
# text = pd.read_table(r'知乎问题数据集_分词结果.txt',encoding='utf-8',header=None,sep=',',error_bad_lines=False,quoting=csv.QUOTE_NONE)
# # text.to_csv(r'知乎问题数据集_分词结果.csv',encoding='utf-8',se)
# print(text)

In [None]:
# text1 = text.fillna('none')
# text1

In [None]:
# text2 = np.array(text1)
# text3 = text2.reshape(-1,1)
# text3

In [None]:
# pd.DataFrame(text3).to_csv(r'分词.txt',encoding='utf-8',header=0,index=0)

In [None]:
# Skip the time-consuming training and load the previously saved model instead.
# NOTE: this rebinds the notebook-global `model` to a gensim Word2Vec object.
from gensim.models import word2vec
import logging
import multiprocessing
logging.basicConfig(format='%(asctime)s:%(levelname)s: %(message)s', level=logging.INFO) # emit log output
# words = word2vec.Text8Corpus(r'E:\pythonProject\需求分析词典\wordexpansion\test\词向量法\知乎问题数据集_分词结果.txt')
# sg=0 selects the CBOW model, sg=1 selects Skip-Gram; the default is 0 (CBOW).
# model = word2vec.Word2Vec(words,min_count=1,sample=1e-3, sg=0, window=3, hs=1, workers=multiprocessing.cpu_count(),epochs=10)
# sg=1 is the skip-gram algorithm, which is more sensitive to low-frequency words; the default sg=0 is CBOW.
# size (named vector_size in gensim 4) is the dimensionality of the word vectors; too small loses information, too large costs memory and time; 100-200 is a common choice.
# window is the maximum distance between the current word and the predicted word within a sentence.
# min_count filters the vocabulary: words occurring fewer than min_count times are ignored; the default is 5.
# negative and sample can be tuned from the training results; sample is the threshold above which frequent words are randomly down-sampled, default 1e-3.
# hs=1 uses hierarchical softmax; with the default hs=0 and a non-zero negative, negative sampling is used.
# workers controls training parallelism; it is only effective when Cython is installed, otherwise a single core is used. multiprocessing.cpu_count()==16 on the author's machine.
# print(model)
# model.save(r"E:\pythonProject\需求分析词典\wordexpansion\test\词向量法\model\word2vec.w2v")
model = word2vec.Word2Vec.load(r'E:\pythonProject\需求分析词典\wordexpansion\test\词向量法\model\word2vec.w2v')

In [None]:
# Load the seed and candidate word lists used for relevance scoring
import pandas as pd


def _load_words(path):
    """Read a headerless UTF-8 word file (quoting disabled); return (frame, column-0 list)."""
    frame = pd.read_table(path, encoding='utf-8', header=None, quoting=3)
    return frame, frame[0].to_list()


Existence_seed1, Existence_seed = _load_words(r'E:\pythonProject\需求分析词典\种子词选择\同义词扩展后的Existence.txt')
Existence_candidate1, Existence_candidate = _load_words(r'E:\pythonProject\需求分析词典\wordexpansion\test\词向量法\candidate_words\Existence_仅保留中文.txt')

Growth_seed1, Growth_seed = _load_words(r'E:\pythonProject\需求分析词典\种子词选择\同义词扩展后的Growth.txt')
Growth_candidate1, Growth_candidate = _load_words(r'E:\pythonProject\需求分析词典\wordexpansion\test\词向量法\candidate_words\Growth_仅保留中文.txt')

Relatedness_seed1, Relatedness_seed = _load_words(r'E:\pythonProject\需求分析词典\种子词选择\同义词扩展后的Relatedness.txt')
Relatedness_candidate1, Relatedness_candidate = _load_words(r'E:\pythonProject\需求分析词典\wordexpansion\test\词向量法\candidate_words\Relatedness_仅保留中文.txt')

In [None]:
# Load the keywords to classify and the three seed lists
import os
os.environ['NUMEXPR_MAX_THREADS'] = '8'
import pandas as pd


def _first_column(path):
    """Read a headerless UTF-8 word file (quoting disabled) and return column 0 as a list."""
    return pd.read_table(path, encoding='utf-8', header=None, quoting=3)[0].to_list()


Keywords = _first_column(r'E:\pythonProject\需求分析词典\去重后的关键词.txt')
Existence_seed = _first_column(r'E:\pythonProject\需求分析词典\种子词选择\同义词扩展后的Existence.txt')
Growth_seed = _first_column(r'E:\pythonProject\需求分析词典\种子词选择\同义词扩展后的Growth.txt')
Relatedness_seed = _first_column(r'E:\pythonProject\需求分析词典\种子词选择\同义词扩展后的Relatedness.txt')

# Only the last expression of a cell is displayed
len(Relatedness_seed)

In [None]:
# Assign every keyword to the category whose seed words it is most similar to.
# NOTE(review): other cells of this notebook open these same three file names
# with mode 'w+', so they overwrite what is written here; confirm that is intended.
SEED_LIMIT = 7000  # only the first 7000 seeds of each category are compared


def _usable_seeds(seeds):
    """Return the first SEED_LIMIT seeds that exist in the model vocabulary.

    Seeds missing from the vocabulary always contributed a score of 0, so
    dropping them once up front does not change any total.
    """
    vocab = model.wv.key_to_index
    return [seed for seed in seeds[0:SEED_LIMIT] if seed in vocab]


def _similarity_total(word, seeds):
    """Add up the similarity between `word` and every seed (0 if `word` is unknown)."""
    total = 0
    for seed in seeds:
        try:
            total += model.wv.similarity(word, seed)
        except KeyError:
            # `word` itself is out of vocabulary: every pair scores 0
            break
    return total


existence_seeds = _usable_seeds(Existence_seed)
growth_seeds = _usable_seeds(Growth_seed)
relatedness_seeds = _usable_seeds(Relatedness_seed)

with open(r'Existence_candidate_score.txt',encoding='utf-8',mode='w+') as file_existence, open(r'Growth_candidate_score.txt',encoding='utf-8',mode='w+') as file_growth,open(r'Relatedness_candidate_score.txt',encoding='utf-8',mode='w+') as file_relatedness:
    count1=0
    count2=0
    count3=0
    for i in Keywords:
        score_add_existence = _similarity_total(i, existence_seeds)
        score_add_growth = _similarity_total(i, growth_seeds)
        score_add_relatedness = _similarity_total(i, relatedness_seeds)
        if score_add_existence > score_add_growth and score_add_existence > score_add_relatedness:
            file_existence.write(str(i)+'\t'+str(score_add_existence)+'\n')
            print('file_existence'+'\t'+str(count1))
            count1+=1
        elif score_add_growth >= score_add_existence and score_add_growth >= score_add_relatedness:
            # ties, including the all-zero case of an unknown keyword, land here
            file_growth.write(str(i)+'\t'+str(score_add_growth)+'\n')
            print('file_growth'+'\t'+str(count2))
            count2+=1
        else:
            # Relatedness is the only remaining maximum. The former condition
            # compared against existence twice and never against growth.
            file_relatedness.write(str(i)+'\t'+str(score_add_relatedness)+'\n')
            print('file_relatedness'+'\t'+str(count3))
            count3+=1
# The `with` block has already closed all three files.

In [None]:
# Caution: slow. Polarity score of every Existence candidate:
# 2 * similarity to Existence seeds - similarity to Relatedness seeds - similarity to Growth seeds.
# NOTE(review): only scores are written, one per line in candidate order, so
# any later join with a word list relies on identical row order.
with open(r'Existence_candidate_score.txt',encoding='utf-8',mode='w+') as f:
    for candidate in Existence_candidate:
        totals = []
        for seeds in (Existence_seed, Growth_seed, Relatedness_seed):
            total = 0
            for seed in seeds:
                try:
                    total += model.wv.similarity(candidate, seed)
                except KeyError:
                    # a word missing from the vocabulary contributes 0
                    pass
            totals.append(total)
        own_total, growth_total, relatedness_total = totals
        # Named `polarity`, not `sum`: rebinding the builtin breaks later cells that call sum().
        polarity = own_total*2 - relatedness_total - growth_total
        print(polarity)
        f.write(str(polarity))
        f.write('\n')
# The `with` block has already closed the file.

In [None]:
# Caution: slow. Polarity score of every Growth candidate:
# 2 * similarity to Growth seeds - similarity to Relatedness seeds - similarity to Existence seeds.
# NOTE(review): only scores are written, one per line in candidate order, so
# any later join with a word list relies on identical row order.
with open(r'Growth_candidate_score.txt',encoding='utf-8',mode='w+') as f:
    for candidate in Growth_candidate:
        totals = []
        for seeds in (Growth_seed, Existence_seed, Relatedness_seed):
            total = 0
            for seed in seeds:
                try:
                    total += model.wv.similarity(candidate, seed)
                except KeyError:
                    # a word missing from the vocabulary contributes 0
                    pass
            totals.append(total)
        own_total, existence_total, relatedness_total = totals
        # Named `polarity`, not `sum`: rebinding the builtin breaks later cells that call sum().
        polarity = own_total*2 - relatedness_total - existence_total
        print(polarity)
        f.write(str(polarity))
        f.write('\n')
# The `with` block has already closed the file.

In [None]:
# Caution: slow. Polarity score of every Relatedness candidate:
# 2 * similarity to Relatedness seeds - similarity to Growth seeds - similarity to Existence seeds.
# NOTE(review): only scores are written, one per line in candidate order, so
# any later join with a word list relies on identical row order.
with open(r'Relatedness_candidate_score.txt',encoding='utf-8',mode='w+') as f:
    for candidate in Relatedness_candidate:
        totals = []
        for seeds in (Relatedness_seed, Existence_seed, Growth_seed):
            total = 0
            for seed in seeds:
                try:
                    total += model.wv.similarity(candidate, seed)
                except KeyError:
                    # a word missing from the vocabulary contributes 0
                    pass
            totals.append(total)
        own_total, existence_total, growth_total = totals
        # Named `polarity`, not `sum`: rebinding the builtin breaks later cells that call sum().
        polarity = own_total*2 - growth_total - existence_total
        print(polarity)
        f.write(str(polarity))
        f.write('\n')
# The `with` block has already closed the file.

In [None]:
# Spot check: similarity of one word pair, 0 when either word is out of vocabulary.
probe_pair = (r'全国', r'疫情')
y1 = 0
try:
    y1 = model.wv.similarity(*probe_pair)
except KeyError:
    pass
print(y1)

In [None]:
import pandas as pd

# NOTE(review): these are the unfiltered candidate files. If the score files
# were produced from the `_仅保留中文.txt` lists, the rows will not line up when
# the two are concatenated column-wise; confirm which files belong together.


def _read_candidates(category):
    """Read candidate_words/<category>.txt as a headerless frame with quoting disabled."""
    path = r'E:\pythonProject\需求分析词典\wordexpansion\test\词向量法\candidate_words\{}.txt'.format(category)
    return pd.read_table(path, encoding='utf-8', header=None, quoting=3)


Existence_candidate1 = _read_candidates('Existence')
Existence_candidate = Existence_candidate1[0].to_list()

Growth_candidate1 = _read_candidates('Growth')

Relatedness_candidate1 = _read_candidates('Relatedness')

In [None]:
def _load_scores(category):
    """Read <category>_candidate_score.txt; return (frame, column-0 list)."""
    path = r'E:\pythonProject\需求分析词典\wordexpansion\test\词向量法\{}_candidate_score.txt'.format(category)
    frame = pd.read_table(path, encoding='utf-8', header=None)
    return frame, frame[0].to_list()


Existence_candidate_score1, Existence_candidate_score = _load_scores('Existence')
Growth_candidate_score1, Growth_candidate_score = _load_scores('Growth')
Relatedness_candidate_score1, Relatedness_candidate_score = _load_scores('Relatedness')

In [None]:
# Fraction of the top-scoring rows the ranking cells keep (1 keeps every row).
threshold = 1

In [None]:
# Pair every Existence candidate with its score, clean, normalise and save.
contact1 = pd.concat([Existence_candidate1,Existence_candidate_score1],axis=1)
contact1.columns=['words','score']

# Drop rows whose word is purely numeric (a missing word also fails this test),
# then rows with any missing value.
contact1 = contact1[contact1['words'].str.isdecimal()==False].dropna()
# Sort by descending score BEFORE de-duplicating so the best score of a
# repeated word is the one kept (the sorted result used to be discarded).
contact1 = contact1.sort_values(by='score',ascending=False)
contact1 = contact1.drop_duplicates(subset=['words'], keep='first')

# Min-max normalisation, vectorised. Local names avoid rebinding the builtins
# min/max, which later cells call as functions.
score_low = contact1['score'].min()
score_high = contact1['score'].max()
contact1 = contact1.assign(score=(contact1['score'] - score_low) / (score_high - score_low))

contact1 = contact1.sort_values(by='score',ascending=False)
# keep the top `threshold` fraction of rows
contact1 = contact1.iloc[0:int(len(contact1)*threshold),0:2]
contact1.to_csv(r'E:\pythonProject\需求分析词典\wordexpansion\test\词向量法\Existence_candidate_score_ultimate.txt',encoding='utf-8',header=False,index=False,sep='\t')
contact1

In [None]:
# Pair every Growth candidate with its score, clean, normalise and save.
contact2 = pd.concat([Growth_candidate1,Growth_candidate_score1],axis=1)
contact2.columns=['words','score']

# Drop rows whose word is purely numeric (a missing word also fails this test),
# then rows with any missing value.
contact2 = contact2[contact2['words'].str.isdecimal()==False].dropna()
# Sort by descending score BEFORE de-duplicating so the best score of a
# repeated word is the one kept (the sorted result used to be discarded).
contact2 = contact2.sort_values(by='score',ascending=False)
contact2 = contact2.drop_duplicates(subset=['words'], keep='first')

# Min-max normalisation, vectorised. Local names avoid rebinding the builtins
# min/max, which later cells call as functions.
score_low = contact2['score'].min()
score_high = contact2['score'].max()
contact2 = contact2.assign(score=(contact2['score'] - score_low) / (score_high - score_low))

contact2 = contact2.sort_values(by='score',ascending=False)
# keep the top `threshold` fraction of rows
contact2 = contact2.iloc[0:int(len(contact2)*threshold),0:2]
contact2.to_csv(r'E:\pythonProject\需求分析词典\wordexpansion\test\词向量法\Growth_candidate_score_ultimate.txt',encoding='utf-8',header=False,index=False,sep='\t')
contact2

In [None]:
# Pair every Relatedness candidate with its score, clean, normalise and save.
import pandas as pd
contact3 = pd.concat([Relatedness_candidate1,Relatedness_candidate_score1],axis=1)
contact3.columns=['words','score']

# Drop rows whose word is purely numeric (a missing word also fails this test),
# then rows with any missing value.
contact3 = contact3[contact3['words'].str.isdecimal()==False].dropna()
# Sort by descending score before de-duplicating so the best score of a
# repeated word is the one kept.
contact3 = contact3.sort_values(by='score',ascending=False)
contact3 = contact3.drop_duplicates(subset=['words'], keep='first')

# Min-max normalisation, vectorised. Local names avoid rebinding the builtins
# min/max, which later cells call as functions.
score_low = contact3['score'].min()
score_high = contact3['score'].max()
contact3 = contact3.assign(score=(contact3['score'] - score_low) / (score_high - score_low))

contact3 = contact3.sort_values(by='score',ascending=False)
# keep the top `threshold` fraction of rows
contact3 = contact3.iloc[0:int(len(contact3)*threshold),0:2]
contact3.to_csv(r'E:\pythonProject\需求分析词典\wordexpansion\test\词向量法\Relatedness_candidate_score_ultimate.txt',encoding='utf-8',header=False,index=False,sep='\t')
contact3

In [None]:
# SLPA实现
# -*- coding: UTF-8 -*-

"""
Created on 17-11-30

@summary: SLPA（Speaker-listener Label Propagation Algorithm）算法实现

@author: dreamhome
"""
import networkx as nx
import numpy as np

def read_graph_from_file(path):
    """
    Build an undirected graph from a whitespace-separated edge list.

    Only lines whose first two tokens are both decimal digits become edges;
    blank, short or non-numeric lines are skipped.
    :param path: path of the UTF-8 edge-list file
    :return: networkx Graph whose nodes each carry a 'label' attribute equal to the node id
    """
    edges_list = []
    # `with` closes the file even when parsing fails
    with open(path, encoding='utf-8') as fp:
        for line in fp:
            edge = line.split()
            # A blank line used to end the whole read and a one-token line
            # raised IndexError; both are now skipped.
            if len(edge) >= 2 and edge[0].isdigit() and edge[1].isdigit():
                edges_list.append((int(edge[0]), int(edge[1])))

    graph = nx.Graph()
    graph.add_edges_from(edges_list)

    # every node starts with its own id as label
    for node, data in list(graph.nodes(data=True)):
        data['label'] = node
    return graph

def slpa(path, threshold, iteration):
    """
    Speaker-listener Label Propagation Algorithm (SLPA).

    :param path: path of the edge-list file read by read_graph_from_file
    :param threshold: labels whose share of a node's memory is below this fraction are removed
    :param iteration: number of propagation rounds
    :return: list of {label: count} dicts, one per node, ordered by ascending node id
             (for node ids 1..N, entry n-1 belongs to node n)
    """
    graph = read_graph_from_file(path)

    # Memory is keyed by node id, so ids need not be the contiguous range 1..N.
    nodes = sorted(graph.nodes())
    node_memory = {node: {node: 1} for node in nodes}

    # NumPy reductions are used instead of the builtins sum/max, which other
    # cells of this notebook may have rebound to plain numbers.
    for _ in range(iteration):
        # visit the listeners in random order
        for position in np.random.permutation(len(nodes)):
            listener = nodes[position]
            label_list = {}
            # each neighbour (speaker) sends one label drawn in proportion to its memory counts
            for speaker in graph.neighbors(listener):
                memory = node_memory[speaker]
                labels = list(memory.keys())  # dict views cannot be indexed on Python 3
                counts = np.array(list(memory.values()), dtype=float)
                label = labels[np.random.multinomial(1, counts / counts.sum()).argmax()]
                label_list[label] = label_list.get(label, 0) + 1
            if not label_list:
                continue
            # the listener stores the most popular label (first one on ties; sorted() is stable)
            selected_label = sorted(label_list, key=label_list.get, reverse=True)[0]
            node_memory[listener][selected_label] = node_memory[listener].get(selected_label, 0) + 1

    # remove labels below the threshold share; collect the keys first because a
    # dict must not change size while it is being iterated
    for memory in node_memory.values():
        threshold_num = float(np.sum(list(memory.values()))) * threshold
        for label in [k for k, v in memory.items() if v < threshold_num]:
            del memory[label]

    return [node_memory[node] for node in nodes]

# Run SLPA on the edge list: print the parsed graph, then the per-node label memories.
path = r"E:\pythonProject\pythonProject3\Roberta_wwm_ext提取语义特征.csv"
graph_preview = read_graph_from_file(path)
print(graph_preview)
slpa_memories = slpa(path, 0.1, 20)
print(slpa_memories)


In [None]:
import random
import networkx as nx
import matplotlib.pyplot as plt
%config InlineBackend.figure_format = 'png'
class LPA():
    '''
    Label propagation: nodes repeatedly adopt the most frequent label among
    their neighbours, which partitions the graph into communities.
    Stops after a fixed number of iterations.
    self.G: graph whose nodes carry a 'labels' attribute
    '''
    def __init__(self, G, iters=10):
        """
        :param G: graph; every node must already have a 'labels' attribute
        :param iters: number of propagation rounds
        """
        self.iters = iters
        self.G = G

    def train(self):
        """Run `self.iters` rounds of label propagation, updating node labels in place."""
        max_iter_num = 0 # rounds completed so far

        while max_iter_num < self.iters:
            max_iter_num += 1
            print('迭代次数',max_iter_num)

            for node in self.G:
                count = {} # neighbour label -> number of neighbours carrying it
                for nbr in self.G.neighbors(node):
                    label = self.G.nodes[nbr]['labels']
                    count[label] = count.get(label, 0) + 1

                # An isolated node has no neighbour labels to adopt, so it keeps
                # its own label (sampling from an empty list raised ValueError).
                if not count:
                    continue

                # labels sharing the highest frequency
                count_items = sorted(count.items(),key=lambda x:-x[-1])
                best_labels = [k for k,v in count_items if v == count_items[0][1]]
                # break ties uniformly at random
                self.G.nodes[node]['labels'] = random.choice(best_labels)

    def draw_picture(self):
        """Draw the graph with a spring layout, colouring nodes by their current label."""
        node_color = [float(self.G.nodes[v]['labels']) for v in self.G]
        pos = nx.spring_layout(self.G)
        plt.figure(figsize = (8,6))
        nx.draw_networkx(self.G,pos=pos,node_color=node_color)
        plt.show()

if __name__ == "__main__":
    G = nx.karate_club_graph() # Zachary's karate club data set
    print('G.nodes:')
    print(G.nodes())
    print('G.edges:')
    print(G.edges())
    print(G.graph)
    # every node starts with its own id as label
    for node in G:
        G.nodes[node]['labels'] = node
    # Named `lpa_model`, not `model`: `model` is the notebook-global Word2Vec
    # model and rebinding it breaks a re-run of the scoring cells.
    lpa_model = LPA(G)
    # labels before propagation
    lpa_model.draw_picture()
    lpa_model.train()
    com = set([G.nodes[node]['labels'] for node in G])
    print('社区数量',len(com))
    # labels after propagation
    lpa_model.draw_picture()


In [None]:
import matplotlib.pyplot as plt
import networkx as nx
from networkx.algorithms.community import asyn_lpa_communities as lpa

%config InlineBackend.figure_format = 'png'

# 空手道俱乐部
G   = nx.karate_club_graph()
com = list(lpa(G))
print('社区数量',len(com))


com
[{0, 1, 2, 3, 7, 8, 9, 11, 12, 13, 17, 19, 21, 30},
{4, 5, 6, 10, 16},
{14, 15, 18, 20, 22, 23, 24, 25, 26, 27, 28, 29, 31, 32, 33}]


import matplotlib.pyplot as plt
import networkx as nx
from networkx.algorithms.community import asyn_lpa_communities as lpa
%config InlineBackend.figure_format = 'png'

# 空手道俱乐部
G   = nx.karate_club_graph()
com = list(lpa(G))
print('社区数量',len(com))
print("点：")
print(G.nodes)
print("边：")
print(G.edges)

com
[{0, 1, 2, 3, 7, 8, 9, 11, 12, 13, 17, 19, 21, 30},
{4, 5, 6, 10, 16},
{14, 15, 18, 20, 22, 23, 24, 25, 26, 27, 28, 29, 31, 32, 33}]

# 下面是画图
pos = nx.spring_layout(G) # 节点的布局为spring型
NodeId    = list(G.nodes())
node_size = [G.degree(i)**1.2*90 for i in NodeId] # 节点大小


plt.figure(figsize = (8,6)) # 设置图片大小
nx.draw(G,pos,
        with_labels=True,
        node_size =node_size,
        node_color='w',
        node_shape = '.'
       )
'''
node_size表示节点大小
node_color表示节点颜色
with_labels=True表示节点是否带标签
'''
color_list = ['pink','orange','r','g','b','y','m','gray','black','c','brown']
for i in range(len(com)):
    nx.draw_networkx_nodes(G, pos,
                           nodelist=com[i],
                           node_color = color_list[i+2],
                           label=True)
plt.show()

In [None]:
import pandas as pd

# Show the rows of the Relatedness score file whose first column is positive.
data = pd.read_table(r'E:\pythonProject\需求分析词典\wordexpansion\test\词向量法\Relatedness_candidate_score.txt',encoding='utf-8',header=None)
positive_rows = data[0] > 0
data[positive_rows]

In [None]:
from torchvision import transforms

# Per-channel statistics handed to Normalize
_normalize_mean = [0.485,0.456,0.406]
_normalize_std = [0.229,0.224,0.225]

# Steps applied in order: resize, centre crop, tensor conversion, normalisation
_preprocess_steps = [
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=_normalize_mean, std=_normalize_std),
]
preprocess = transforms.Compose(_preprocess_steps)

In [None]:
from PIL import Image

# Path of the image to preview. It was left empty, which made Image.open("")
# raise on every run; the cell now reports the missing path instead.
image_path = ""
if image_path:
    img = Image.open(image_path)
    img.show()
else:
    print("Set image_path to an image file before running this cell.")
