In [1]:
#!/usr/bin/env python3
# coding: utf-8
# File: so-pmi.py
# Author: lhy<lhy_in_blcu@126.com,https://huangyong.github.io>
# Date: 18-4-4

import jieba.posseg as pseg
import jieba
import math,time
import datetime

class ChineseSoPmi:
    def __init__(self, inputtext_file, seedword_txtfile, pos_candi_txt_file, neg_candi_txtfile):

        self.text_file = inputtext_file
        self.pos_candi_txt_file = pos_candi_txt_file
        self.neg_candi_txtfile = neg_candi_txtfile
        self.seedword_txtfile = seedword_txtfile

    '''分词'''
    def seg_corpus(self, train_data, seedword_txtfile):
        #将情感词加入到用户词典当中，保证分词能够将种子情感词切开
        sentiment_words = [line.strip().split('\t')[0] for line in open(seedword_txtfile, encoding='utf-8')]
        for word in sentiment_words:
            jieba.add_word(word)
        seg_data = list()
        count = 0
        for line in open(train_data, encoding='utf-8'):
            line = line.strip()
            count += 1
            if line:
                seg_data.append([word.word for word in pseg.cut(line) if word.flag[0] not in ['u','w','x','p','q','m']])
            else:
                continue
        return seg_data

    '''统计搭配次数'''
    def collect_cowords(self, seedword_txtfile, seg_data):
        def check_words(sent):
            if set(sentiment_words).intersection(set(sent)):
                return True
            else:
                return False
        cowords_list = list()
        window_size = 5
        count = 0
        sentiment_words = [line.strip().split('\t')[0] for line in open(seedword_txtfile, encoding='utf-8')]
        for sent in seg_data:
            count += 1
            if check_words(sent):
                for index, word in enumerate(sent):
                    if index < window_size:
                        left = sent[:index]
                    else:
                        left = sent[index - window_size: index]
                    if index + window_size > len(sent):
                        right = sent[index + 1:]
                    else:
                        right = sent[index: index + window_size + 1]
                    context = left + right + [word]
                    if check_words(context):
                        for index_pre in range(0, len(context)):
                            if check_words([context[index_pre]]):
                                for index_post in range(index_pre + 1, len(context)):
                                    cowords_list.append(context[index_pre] + '@' + context[index_post])
        return cowords_list

    '''计算So-Pmi值'''
    def collect_candiwords(self, seg_data, cowords_list, seedword_txtfile):
        '''互信息计算公式'''
        def compute_mi(p1, p2, p12):
            return math.log2(p12) - math.log2(p1) - math.log2(p2)
        '''统计词频'''
        def collect_worddict(seg_data):
            word_dict = dict()
            all = 0
            for line in seg_data:
                for word in line:
                    if word not in word_dict:
                        word_dict[word] = 1
                    else:
                        word_dict[word] += 1
            all = sum(word_dict.values())
            return word_dict, all
        '''统计词共现次数'''
        def collect_cowordsdict(cowords_list):
            co_dict = dict()
            candi_words = list()
            for co_words in cowords_list:
                candi_words.extend(co_words.split('@'))
                if co_words not in co_dict:
                    co_dict[co_words] = 1
                else:
                    co_dict[co_words] += 1
            return co_dict, candi_words
        '''收集种子情感词'''
        def collect_sentiwords(seedword_txtfile, word_dict):
            pos_words = set([line.strip().split('\t')[0] for line in open(seedword_txtfile, encoding='utf-8') if
                             line.strip().split('\t')[1] == 'pos']).intersection(set(word_dict.keys()))
            neg_words = set([line.strip().split('\t')[0] for line in open(seedword_txtfile, encoding='utf-8') if
                             line.strip().split('\t')[1] == 'neg']).intersection(set(word_dict.keys()))
            return pos_words, neg_words
        '''计算sopmi值'''
        def compute_sopmi(candi_words, pos_words, neg_words, word_dict, co_dict, all):
            pmi_dict = dict()
            for candi_word in set(candi_words):
                pos_sum = 0.0
                neg_sum = 0.0
                for pos_word in pos_words:
                    p1 = word_dict[pos_word] / all
                    p2 = word_dict[candi_word] / all
                    pair = pos_word + '@' + candi_word
                    if pair not in co_dict:
                        continue
                    p12 = co_dict[pair] / all
                    pos_sum += compute_mi(p1, p2, p12)

                for neg_word in neg_words:
                    p1 = word_dict[neg_word] / all
                    p2 = word_dict[candi_word] / all
                    pair = neg_word + '@' + candi_word
                    if pair not in co_dict:
                        continue
                    p12 = co_dict[pair] / all
                    neg_sum += compute_mi(p1, p2, p12)

                so_pmi = pos_sum - neg_sum
                pmi_dict[candi_word] = so_pmi
            return pmi_dict

        word_dict, all = collect_worddict(seg_data)
        co_dict, candi_words = collect_cowordsdict(cowords_list)
        pos_words, neg_words = collect_sentiwords(seedword_txtfile, word_dict)
        pmi_dict = compute_sopmi(candi_words, pos_words, neg_words, word_dict, co_dict, all)
        return pmi_dict

    '''保存结果'''
    def save_candiwords(self, pmi_dict, pos_candi_txt_file, neg_candi_txtfile):
        def get_tag(word):
            if word:
                return [item.flag for item in pseg.cut(word)][0]
            else:
                return 'x'
        pos_dict = dict()
        neg_dict = dict()
        f_neg = open(neg_candi_txtfile, 'w+', encoding='utf-8')
        f_pos = open(pos_candi_txt_file, 'w+', encoding='utf-8')

        for word, word_score in pmi_dict.items():
            if word_score > 0:
                pos_dict[word] = word_score
            else:
                neg_dict[word] = abs(word_score)

        for word, pmi in sorted(pos_dict.items(), key=lambda asd:asd[1], reverse=True):
            f_pos.write(word + ',' + str(pmi) + ',' + 'pos'+ '\n')
        for word, pmi in sorted(neg_dict.items(), key=lambda asd:asd[1], reverse=True):
            f_neg.write(word + ',' + str(pmi) + ',' + 'neg' + '\n')
        f_neg.close()
        f_pos.close()
        return

    def sopmi(self):
        print('step 1/4:...seg corpus ...')
        start_time  = time.time()
        start_time  = datetime.datetime.now()
        seg_data = self.seg_corpus(self.text_file, self.seedword_txtfile)
        # end_time1 = time.time()
        end_time1 = datetime.datetime.now()
        print('step 1/4 finished:...cost {0}...'.format((end_time1 - start_time)))
        print('step 2/4:...collect cowords ...')
        cowords_list = self.collect_cowords(self.seedword_txtfile, seg_data)
        # end_time2 = time.time()
        end_time2 = datetime.datetime.now()
        print('step 2/4 finished:...cost {0}...'.format((end_time2 - end_time1)))
        print('step 3/4:...compute sopmi ...')
        pmi_dict = self.collect_candiwords(seg_data, cowords_list, self.seedword_txtfile)
        # end_time3 = time.time()
        end_time3 = datetime.datetime.now()
        print('step 1/4 finished:...cost {0}...'.format((end_time3 - end_time2)))
        print('step 4/4:...save candiwords ...')
        self.save_candiwords(pmi_dict, self.pos_candi_txt_file, self.neg_candi_txtfile)
        end_time = datetime.datetime.now()
        print('finished! cost {0}'.format(end_time - start_time))

def test():
    sopmier = ChineseSoPmi(inputtext_file='corpus.txt',
                           seedword_txtfile='seed_words.txt',
                           pos_candi_txt_file='neg_candi.txt',
                           neg_candi_txtfile='pos_candi.txt')
    sopmier.sopmi()

In [2]:
import pandas as pd
data = pd.read_table(r'E:\pythonProject\需求分析词典\种子词选择\同义词扩展后的Existence.txt',encoding='utf-8',header=None)
data
data = data[0].to_list()
data
with open(r'E:\pythonProject\需求分析词典\wordexpansion\test\共现法\Esistence_seed_words.txt',encoding='utf-8',mode='a+') as f:
    for i in data:
        f.write(i+'\t'+'pos'+'\n')
f.close()

In [3]:
data1 = pd.read_table(r'E:\pythonProject\需求分析词典\种子词选择\同义词扩展后的Growth.txt',encoding='utf-8',header=None)
data1 = data1[0].to_list()
data1
with open(r'E:\pythonProject\需求分析词典\wordexpansion\test\共现法\Esistence_seed_words.txt',encoding='utf-8',mode='a+') as f:
    for i in data1:
        f.write(i+'\t'+'neg'+'\n')
f.close()

In [4]:
data2 = pd.read_table(r'E:\pythonProject\需求分析词典\种子词选择\同义词扩展后的Relatedness.txt',encoding='utf-8',header=None)
data2 = data2[0].to_list()
data2
with open(r'E:\pythonProject\需求分析词典\wordexpansion\test\共现法\Esistence_seed_words.txt',encoding='utf-8',mode='a+') as f:
    for i in data2:
        f.write(i+'\t'+'neg'+'\n')
f.close()

In [None]:
import datetime
start  =datetime.datetime.now()
sopmier = ChineseSoPmi(inputtext_file=r'E:\pythonProject\需求分析词典\知乎问题数据集.txt',
                       seedword_txtfile=r'E:\pythonProject\需求分析词典\wordexpansion\test\共现法\Esistence_seed_words.txt',
                       pos_candi_txt_file=r'Existence_pos_candi.txt',
                       neg_candi_txtfile=r'Existence_neg_candi.txt')
sopmier.sopmi()
end = datetime.datetime.now()
print(end-start)

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\Shy0418\AppData\Local\Temp\jieba.cache


step 1/4:...seg corpus ...


Loading model cost 0.592 seconds.
Prefix dict has been built successfully.


In [None]:
import pandas as pd
data = pd.read_table(r'E:\pythonProject\需求分析词典\种子词选择\同义词扩展后的Growth.txt',encoding='utf-8',header=None)
data
data = data[0].to_list()
data
with open(r'E:\pythonProject\需求分析词典\wordexpansion\test\共现法\Growth_seed_words.txt',encoding='utf-8',mode='a+') as f:
    for i in data:
        f.write(i+'\t'+'pos'+'\n')
f.close()

In [None]:
data1 = pd.read_table(r'E:\pythonProject\需求分析词典\种子词选择\同义词扩展后的Existence.txt',encoding='utf-8',header=None)
data1 = data1[0].to_list()
data1
with open(r'E:\pythonProject\需求分析词典\wordexpansion\test\共现法\Growth_seed_words.txt',encoding='utf-8',mode='a+') as f:
    for i in data1:
        f.write(i+'\t'+'neg'+'\n')
f.close()

In [None]:
data1 = pd.read_table(r'E:\pythonProject\需求分析词典\种子词选择\同义词扩展后的Relatedness.txt',encoding='utf-8',header=None)
data1 = data1[0].to_list()
data1
with open(r'E:\pythonProject\需求分析词典\wordexpansion\test\共现法\Growth_seed_words.txt',encoding='utf-8',mode='a+') as f:
    for i in data1:
        f.write(i+'\t'+'neg'+'\n')
f.close()

In [None]:
sopmier = ChineseSoPmi(inputtext_file=r'E:\pythonProject\需求分析词典\知乎问题数据集.txt',
                       seedword_txtfile=r'E:\pythonProject\需求分析词典\wordexpansion\test\共现法\Growth_seed_words.txt',
                       pos_candi_txt_file=r'Growth_pos_candi.txt',
                       neg_candi_txtfile=r'Growth_neg_candi.txt')
sopmier.sopmi()

In [None]:
import pandas as pd
data = pd.read_table(r'E:\pythonProject\需求分析词典\种子词选择\同义词扩展后的Relatedness.txt',encoding='utf-8',header=None)
data
data = data[0].to_list()
data
with open(r'E:\pythonProject\需求分析词典\wordexpansion\test\共现法\Relatedness_seed_words.txt',encoding='utf-8',mode='a+') as f:
    for i in data:
        f.write(i+'\t'+'pos'+'\n')
f.close()

In [None]:
data1 = pd.read_table(r'E:\pythonProject\需求分析词典\种子词选择\同义词扩展后的Existence.txt',encoding='utf-8',header=None)
data1 = data1[0].to_list()
data1
with open(r'E:\pythonProject\需求分析词典\wordexpansion\test\共现法\Relatedness_seed_words.txt',encoding='utf-8',mode='a+') as f:
    for i in data1:
        f.write(i+'\t'+'neg'+'\n')
f.close()

In [None]:
data1 = pd.read_table(r'E:\pythonProject\需求分析词典\种子词选择\同义词扩展后的Growth.txt',encoding='utf-8',header=None)
data1 = data1[0].to_list()
data1
with open(r'E:\pythonProject\需求分析词典\wordexpansion\test\共现法\Relatedness_seed_words.txt',encoding='utf-8',mode='a+') as f:
    for i in data1:
        f.write(i+'\t'+'neg'+'\n')
f.close()

In [None]:
sopmier = ChineseSoPmi(inputtext_file=r'E:\pythonProject\需求分析词典\知乎问题数据集.txt',
                       seedword_txtfile=r'E:\pythonProject\需求分析词典\wordexpansion\test\共现法\Relatedness_seed_words.txt',
                       pos_candi_txt_file=r'Relatedness_pos_candi.txt',
                       neg_candi_txtfile=r'Relatedness_neg_candi.txt')
sopmier.sopmi()

In [87]:
threshold1 = 0.45
threshold2 = 0.45
threshold3 = 0.45

In [88]:
import pandas as pd
Existence_pmi = pd.read_table(r'E:\pythonProject\需求分析词典\wordexpansion\test\共现法\Existence_pos_candi.txt',encoding='utf-8',header=None,sep=',')
# Existence
Existence_pmi.columns=['words','score','sort']
Existence_pmi = Existence_pmi[['words','score']]
len(Existence_pmi)

Existence_pmi.sort_values(by='score',ascending=False)
Existence_pmi.drop_duplicates(subset=['words'],keep='first',inplace=True)
min = Existence_pmi['score'].min()
max = Existence_pmi['score'].max()

for i in range(len(Existence_pmi)):
    score = (Existence_pmi.iloc[i,1] - min) / (max - min)
    Existence_pmi.iloc[i,1] = score

pd.DataFrame(Existence_pmi).to_csv(r'Existence_candidata_score_ultimate_sopmi.txt',encoding='utf-8',sep = '\t',header=0,index=0)
Existence_pmi

Unnamed: 0,words,score
0,病例,1.000000e+00
1,确诊,8.632906e-01
2,例,8.619289e-01
3,首例,8.386238e-01
4,患者,6.875511e-01
...,...,...
43266,刘庆香,2.978847e-06
43267,说恨,1.644969e-06
43268,memoQ,1.625884e-06
43269,惠东,4.866190e-07


In [89]:
import pandas as pd
Growth_pmi = pd.read_table(r'E:\pythonProject\需求分析词典\wordexpansion\test\共现法\Growth_pos_candi.txt',encoding='utf-8',header=None,sep=',')

Growth_pmi.columns=['words','score','sort']
Growth_pmi = Growth_pmi[['words','score']]
len(Existence_pmi)
# Growth_pmi = Growth_pmi.iloc[0:int(threshold*len(Growth_pmi)),0:2]

Growth_pmi.sort_values(by='score',ascending=False)
Growth_pmi.drop_duplicates(subset=['words'],keep='first',inplace=True)
min = Growth_pmi['score'].min()
max = Growth_pmi['score'].max()

for i in range(len(Growth_pmi)):
    score = (Growth_pmi.iloc[i,1] - min) / (max - min)
    Growth_pmi.iloc[i,1] = score

pd.DataFrame(Growth_pmi).to_csv(r'Growth_candidata_score_ultimate_sopmi.txt',encoding='utf-8',sep = '\t',header=0,index=0)
Growth_pmi

Unnamed: 0,words,score
0,捷径,1.000000
1,清北,0.948902
2,参考书,0.891070
3,考不上,0.820108
4,直入,0.781623
...,...,...
21271,涨有,0.000015
21272,厚码,0.000015
21273,赞学,0.000005
21274,系恐,0.000004


In [90]:
import pandas as pd
Relatedness_pmi = pd.read_table(r'E:\pythonProject\需求分析词典\wordexpansion\test\共现法\Relatedness_pos_candi.txt',encoding='utf-8',header=None,sep=',')

Relatedness_pmi.columns=['words','score','sort']
Relatedness_pmi = Relatedness_pmi[['words','score']]
len(Relatedness_pmi)
# Relatedness_pmi = Relatedness_pmi.iloc[0:int(threshold*len(Relatedness_pmi)),0:2]
Relatedness_pmi.sort_values(by='score',ascending=False)
Relatedness_pmi.drop_duplicates(subset=['words'],keep='first',inplace=True)
min = Relatedness_pmi['score'].min()
max = Relatedness_pmi['score'].max()

for i in range(len(Relatedness_pmi)):
    score = (Relatedness_pmi.iloc[i,1] - min) / (max - min)
    Relatedness_pmi.iloc[i,1] = score
pd.DataFrame(Relatedness_pmi).to_csv(r'Relatedness_candidata_score_ultimate_sopmi.txt',encoding='utf-8',sep = '\t',header=0,index=0)
Relatedness_pmi

Unnamed: 0,words,score
0,背着,1.000000
1,聊得,0.875182
2,冷漠,0.808730
3,删了,0.749389
4,不懂事,0.727844
...,...,...
24960,孙庞斗智,0.000022
24961,HREC,0.000021
24962,换屏,0.000012
24963,三生,0.000001


## 定义集成学习的规则：
### 如果so-pmi和w2v中都有，则取两个平均
### 如果so-pmi中有，则取so-pmi
### 如果w2v中有，则取w2v

In [91]:
# 求交集
Existence_w2v = pd.read_table(r'E:\pythonProject\需求分析词典\wordexpansion\test\词向量法\Existence_candidate_score_ultimate.txt',encoding='utf-8',header=None)
Existence_w2v.columns=['words','score']
# Existence_w2v
Existence_pmi.columns=['words','score']
Existence_pmi
Existence_merge = pd.merge(Existence_w2v,Existence_pmi,on=['words'])
Existence_merge
# for i in range(len(Existence_merge)):
#     Existence_merge.iloc[i,1] = (Existence_merge.iloc[i,1] + Existence_merge.iloc[i,2]) /2
#     print(Existence.iloc[i,2])


# Existence_merge.drop(columns='score_y')
Existence_merge.drop(labels='score_y',inplace=True,axis=1)
pd.DataFrame(Existence_merge).to_csv(r'Existence_merge.txt',encoding='utf-8',header=0,index=0)

In [92]:
# 并-交，并去重，有重复的则留下第一个
Existence_w2v = pd.read_table(r'E:\pythonProject\需求分析词典\wordexpansion\test\词向量法\Existence_candidate_score_ultimate.txt',encoding='utf-8',header=None)
Existence_pmi = pd.read_table(r'E:\pythonProject\需求分析词典\wordexpansion\test\共现法\Existence_candidata_score_ultimate_sopmi.txt',encoding='utf-8',header=None)
Existence_merge = pd.read_table(r'E:\pythonProject\需求分析词典\wordexpansion\test\共现法\Existence_merge.txt',encoding='utf-8',header=None,sep=',')
# Existence_merge
# Existence_pmi
# Existence_m2v
Existence = pd.concat([Existence_merge,Existence_pmi,Existence_w2v],axis=0).drop_duplicates(subset=[0],keep='first')
Existence.sort_values(by=1,ascending=False,inplace=True)
Existence =Existence.iloc[0:int(len(Existence)*threshold1),0:2]

pd.DataFrame(Existence).to_csv(r'E:\pythonProject\需求分析词典\wordexpansion\Result\Existence_words_score_ultimate.txt',encoding='utf-8',header=0,index=0)
Existence

Unnamed: 0,0,1
0,柬埔寨,1.000000
1,意大利,0.992389
2,纽约,0.971933
2,印度,0.971818
4,泰国,0.971577
...,...,...
14767,地盘,0.026141
14768,本质特征,0.026140
14769,光复活,0.026138
14771,山姆,0.026134


In [93]:
# 求交集
Growth_w2v = pd.read_table(r'E:\pythonProject\需求分析词典\wordexpansion\test\词向量法\Growth_candidate_score_ultimate.txt',encoding='utf-8',header=None)
Growth_w2v.columns=['words','score']
# Existence_w2v
Growth_pmi.columns=['words','score']
Growth_pmi
Growth_merge = pd.merge(Growth_w2v,Growth_pmi,on=['words'])
# for i in range(len(Existence_merge)):
#     Existence_merge.iloc[i,1] = (Existence_merge.iloc[i,1] + Existence_merge.iloc[i,2]) /2
#     print(Existence.iloc[i,2])
# Existence_merge.drop(columns='score_y')
Growth_merge.drop(labels='score_y',inplace=True,axis=1)
pd.DataFrame(Growth_merge).to_csv(r'Growth_merge.txt',encoding='utf-8',header=0,index=0,sep='\t')
Growth_merge

Unnamed: 0,words,score_x
0,礼智信,0.964601
1,写作,0.930929
2,韩语,0.925577
3,完查,0.919864
4,必修课,0.918332
...,...,...
966,必固,0.421614
967,根因,0.420197
968,ARE,0.420060
969,赤壁之战,0.419904


In [94]:
Growth_w2v = pd.read_table(r'E:\pythonProject\需求分析词典\wordexpansion\test\词向量法\Growth_candidate_score_ultimate.txt',encoding='utf-8',header=None,sep='\t')
Growth_pmi = pd.read_table(r'E:\pythonProject\需求分析词典\wordexpansion\test\共现法\Growth_candidata_score_ultimate_sopmi.txt',encoding='utf-8',header=None,sep='\t')
Growth_merge = pd.read_table(r'E:\pythonProject\需求分析词典\wordexpansion\test\共现法\Growth_merge.txt',encoding='utf-8',header=None,sep='\t')

Growth_merge
Growth = pd.concat([Growth_merge,Growth_pmi,Growth_w2v],axis=0).drop_duplicates(subset=[0],keep='first')
Growth.sort_values(by=1,ascending=False,inplace=True)
Growth = Growth.iloc[0:int(len(Growth)*threshold2),0:2]
pd.DataFrame(Growth).to_csv(r'E:\pythonProject\需求分析词典\wordexpansion\Result\Growth_words_score_ultimate.txt',encoding='utf-8',header=0,index=0)
Growth

Unnamed: 0,0,1
0,二货猪,1.000000
0,捷径,1.000000
0,礼智信,0.964601
1,清北,0.948902
2,写出,0.942362
...,...,...
5213,作善,0.110428
5215,生选,0.110413
5218,生存力,0.110388
5216,可数,0.110388


In [95]:
#求交集
Relatedness_w2v = pd.read_table(r'E:\pythonProject\需求分析词典\wordexpansion\test\词向量法\Relatedness_candidate_score_ultimate.txt',encoding='utf-8',header=None)
Relatedness_w2v.columns=['words','score']
Relatedness_w2v
Relatedness_pmi.columns=['words','score']
Relatedness_pmi
Relatedness_merge = pd.merge(Relatedness_w2v,Relatedness_pmi,on=['words'])
Relatedness_merge.drop(labels='score_y',inplace=True,axis=1)
pd.DataFrame(Relatedness_merge).to_csv(r'Relatedness_merge.txt',encoding='utf-8',header=0,index=0,sep='\t')
Relatedness_merge

Unnamed: 0,words,score_x
0,删掉,1.000000
1,爱生气,0.990521
2,男同学,0.958325
3,radwimps,0.953351
4,女同学,0.952418
...,...,...
901,Tiger,0.391899
902,教师应,0.391119
903,侠客,0.389109
904,高圆圆,0.388872


In [96]:
Relatedness_w2v = pd.read_table(r'E:\pythonProject\需求分析词典\wordexpansion\test\词向量法\Relatedness_candidate_score_ultimate.txt',encoding='utf-8',header=None,sep='\t')
Relatedness_pmi = pd.read_table(r'E:\pythonProject\需求分析词典\wordexpansion\test\共现法\Relatedness_candidata_score_ultimate_sopmi.txt',encoding='utf-8',header=None,sep='\t')
Relatedness_merge = pd.read_table(r'E:\pythonProject\需求分析词典\wordexpansion\test\共现法\Relatedness_merge.txt',encoding='utf-8',header=None,sep='\t')

Relatedness = pd.concat([Relatedness_merge,Relatedness_pmi,Relatedness_w2v],axis=0).drop_duplicates(subset=[0],keep='first')
Relatedness.sort_values(by=1,ascending=False,inplace=True)
Relatedness = Relatedness.iloc[0:int(len(Relatedness)*threshold3),0:2]
pd.DataFrame(Relatedness).to_csv(r'E:\pythonProject\需求分析词典\wordexpansion\Result\Relatedness_words_score_ultimate.txt',encoding='utf-8',header=0,index=0)
Relatedness

Unnamed: 0,0,1
0,删掉,1.000000
1,爱生气,0.990521
2,男票,0.986755
2,男同学,0.958325
3,radwimps,0.953351
...,...,...
7364,酸臭,0.082415
7365,睡出,0.082415
7366,自含,0.082415
7367,对簿公堂,0.082408


In [3]:
import re
import csv
import jieba
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.collocations import BigramAssocMeasures, BigramCollocationFinder

def chinese(text):
    """
    对中文数据进行处理，并将计算出的pmi保存到"中文pmi计算.csv"
    """
    content = ''.join(re.findall(r'[\u4e00-\u9fa5]+', text))

    words = jieba.cut(content)

    words = [w for w in words if len(w)>1]

    bigram_measures = BigramAssocMeasures()

    finder = BigramCollocationFinder.from_words(words)

    with open('中文pmi计算.csv','a+',encoding='utf-8',newline='') as csvf:

        writer = csv.writer(csvf)

        writer.writerow(('word1','word2','pmi_score'))

        for row in finder.score_ngrams(bigram_measures.pmi):

            data = (*row[0],row[1])
            try:
                writer.writerow(data)
            except:
                pass

In [4]:
def english(text):
    """
    对英文数据进行处理，并将计算出的pmi保存到"english_pmi_computer.csv"
    """

    stopwordss = set(stopwords.words('english'))

    stemmer = nltk.stem.snowball.SnowballStemmer('english')

    tokenizer = nltk.tokenize.RegexpTokenizer('\w+')

    words = tokenizer.tokenize(text)

    words = [w for w in words if not w.isnumeric()]

    words = [w.lower() for w in words]

    words = [stemmer.stem(w) for w in words]

    words = [w for w in words if w not in stopwordss]

    bigram_measures = BigramAssocMeasures()

    finder = BigramCollocationFinder.from_words(words)

    with open('english_pmi_computer.csv','a+',encoding='gbk',newline='') as csvf:

        writer = csv.writer(csvf)

        writer.writerow(('word1','word2','pmi_score'))

        for row in finder.score_ngrams(bigram_measures.pmi):

            data = (*row[0],row[1])
            try:
                writer.writerow(data)
            except:
                pass

In [None]:
def pmi_score(file,lang,column='数据列'):
    """
    计算pmi
    :param file: 原始文本数据文件
    :param lang: 数据的语言,参数为chinese或english
    :param column: 如果文件为excel形式的文件，column为excel中的数据列

    """
    #读取数据
    text = ''
    if 'csv' in file:
        df = pd.read_csv(file)
        rows = df.iterrows()
        for row in rows:
            text += row[1][column]
    elif ('xlsx' in file) or ('xls' in file):
        df = pd.read_excel(file)
        rows = df.iterrows()
        for row in rows:
            text += row[1][column]
    else:
        text = open(file).read()

    #对该语言的文本数据计算pmi
    globals()[lang](text)

#计算pmi
pmi_score(file='test.txt',lang='chinese')