# 资源预处理

## 大连理工大学情感词汇本体库

### 辅助情感分类的处理

In [1]:
import pandas as pd
import numpy as np
import joblib

In [2]:
# Load the Dalian University of Technology affective lexicon spreadsheet.
df = pd.read_excel('../情感词汇本体.xlsx')

# Drop the two unnamed columns; nothing below reads them.
df = df.drop(['Unnamed: 10', 'Unnamed: 11'], axis=1)

# Strip stray whitespace from the primary category codes (e.g. 'PH ' on the
# entry for 光怪陆离) across the whole column, instead of patching a single
# hard-coded row label that would silently hit the wrong row if the
# spreadsheet were ever reordered or extended.
df['情感分类'] = df['情感分类'].str.strip()

# The literal category code 'NA' is read back as a missing value
# (388 rows in this spreadsheet), so restore it as the string 'NA'.
df['情感分类'] = df['情感分类'].fillna('NA')

# Rename the '.1'-suffixed columns to descriptive auxiliary-emotion names.
df = df.rename(columns={'强度.1': '辅助情感强度', '极性.1': '辅助情感极性'})

len(df)

27466

In [3]:
# Preview the first rows of the cleaned lexicon table.
df.head()

Unnamed: 0,词语,词性种类,词义数,词义序号,情感分类,强度,极性,辅助情感分类,辅助情感强度,辅助情感极性
0,脏乱,adj,1.0,1.0,NN,7,2,,,
1,糟报,adj,1.0,1.0,NN,5,2,,,
2,早衰,adj,1.0,1.0,NE,5,2,,,
3,责备,verb,1.0,1.0,NN,5,2,,,
4,贼眼,noun,1.0,1.0,NN,5,2,,,


In [4]:
# Sanity check: list rows that still lack a primary emotion category (expected: none).
df.loc[pd.isnull(df['情感分类'])]

Unnamed: 0,词语,词性种类,词义数,词义序号,情感分类,强度,极性,辅助情感分类,辅助情感强度,辅助情感极性


In [5]:
# Keep only the entries that also carry an auxiliary emotion category.
has_auxiliary = pd.notnull(df['辅助情感分类'])
have2_emotion_df = df.loc[has_auxiliary]
len(have2_emotion_df)

3828

In [6]:
# Preview the entries that have both a primary and an auxiliary emotion.
have2_emotion_df.head()

Unnamed: 0,词语,词性种类,词义数,词义序号,情感分类,强度,极性,辅助情感分类,辅助情感强度,辅助情感极性
5,战祸,noun,1.0,1.0,ND,5,2,NC,5.0,2.0
7,折辱,noun,1.0,1.0,NE,5,2,NN,5.0,2.0
20,神采,adj,1.0,1.0,PA,5,1,PH,1.0,1.0
23,盛誉,noun,1.0,1.0,PH,5,1,PA,1.0,1.0
71,友邻,adj,1.0,1.0,PH,5,1,PB,3.0,1.0


In [7]:
# Pull the primary annotation of every entry into four parallel lists.
words, emotions, intensities, polarities = (
    df[column].tolist() for column in ('词语', '情感分类', '强度', '极性')
)

len(words)

27466

In [8]:
# Build the combined lists from scratch: the primary annotation of every
# entry, followed by the auxiliary annotation of the entries that have one.
# Assigning (rather than extending with +=) keeps this cell idempotent, so
# re-running it does not append the auxiliary rows a second time.
words = df['词语'].tolist() + have2_emotion_df['词语'].tolist()
emotions = df['情感分类'].tolist() + have2_emotion_df['辅助情感分类'].tolist()
intensities = df['强度'].tolist() + have2_emotion_df['辅助情感强度'].tolist()
polarities = df['极性'].tolist() + have2_emotion_df['辅助情感极性'].tolist()

# Total number of annotations, and the primary-only count as a cross-check.
len(words), len(words) - len(have2_emotion_df)

(31294, 27466)

In [9]:
# Assemble the long-format table: one row per (word, emotion) annotation.
pure_columns = {
    'word': words,
    'emotion': emotions,
    'intensity': intensities,
    'polarity': polarities,
}
pure_df = pd.DataFrame(pure_columns)
len(pure_df)

31294

In [10]:
# Preview the first rows of the long-format annotation table.
pure_df.head()

Unnamed: 0,word,emotion,intensity,polarity
0,脏乱,NN,7.0,2.0
1,糟报,NN,5.0,2.0
2,早衰,NE,5.0,2.0
3,责备,NN,5.0,2.0
4,贼眼,NN,5.0,2.0


In [11]:
# Compare the number of annotation rows with the number of distinct words.
words = pure_df['word'].tolist()
total_rows, distinct_words = len(words), len(set(words))
print(total_rows)
print(distinct_words)

31294
27351


In [12]:
# Words with the most annotation rows (before duplicates are removed).
pure_df['word'].value_counts().head()

无赖          5
好事          4
燕雀安知鸿鹄之志    3
轻飘          3
呜呼哀哉        3
Name: word, dtype: int64

In [13]:
# Look at the raw lexicon rows for the most repeated word.
df.loc[df['词语'].eq('无赖')]

Unnamed: 0,词语,词性种类,词义数,词义序号,情感分类,强度,极性,辅助情感分类,辅助情感强度,辅助情感极性
3504,无赖,noun,1.0,1.0,NN,7,2,,,
14737,无赖,adv,4.0,1.0,ND,9,2,NN,9.0,2.0
14738,无赖,adv,4.0,3.0,ND,9,2,NN,9.0,2.0


In [14]:
# Same word in the long-format table: note the exact duplicate rows.
pure_df.loc[pure_df['word'].eq('无赖')]

Unnamed: 0,word,emotion,intensity,polarity
3504,无赖,NN,7.0,2.0
14737,无赖,ND,9.0,2.0
14738,无赖,ND,9.0,2.0
30717,无赖,NN,9.0,2.0
30718,无赖,NN,9.0,2.0


In [15]:
# Row count before any cleaning.
print(len(pure_df))
# Remove rows that are exact duplicates across all four columns.
pure_df = pure_df.drop_duplicates()
print(len(pure_df))
# Remove rows that have a missing value in any column.
pure_df = pure_df.dropna()
len(pure_df)

31294
31266


31264

In [16]:
# Sanity check: no row should have a missing emotion after dropna (expected: empty).
pure_df.loc[pd.isnull(pure_df['emotion'])]

Unnamed: 0,word,emotion,intensity,polarity


In [17]:
# Re-count annotation rows and distinct words after cleaning.
words = pure_df['word'].tolist()
total_rows, distinct_words = len(words), len(set(words))
print(total_rows)
print(distinct_words)

31264
27351


In [18]:
# Words with the most annotation rows after duplicates are removed;
# the top count determines how many intensity/polarity slots a word needs.
pure_df['word'].value_counts().head()

好事      4
内乱      3
轻飘      3
郁结      3
呜呼哀哉    3
Name: word, dtype: int64

In [19]:
# Inspect the annotations of the most repeated word.
pure_df.loc[pure_df['word'].eq('好事')]

Unnamed: 0,word,emotion,intensity,polarity
5689,好事,NN,5.0,2.0
9633,好事,PH,5.0,1.0
9659,好事,PA,3.0,3.0
29814,好事,NN,1.0,3.0


出现次数最多的词（“好事”）出现了 4 次，因此每个词向量的维度为 21（情感类别计数）+ 1 \* 4（强度）+ 1 \* 4（极性）= 29

In [20]:
# Write the cleaned table to CSV without the index column; the row count is
# embedded in the file name.
csv_name = 'emotion_words_{}.csv'.format(len(pure_df))
pure_df.to_csv(csv_name, index=False)

### 导出 词语-矩阵

In [21]:
import pandas as pd
import joblib
import numpy as np

In [22]:
# Reload the exported table. keep_default_na=False stops pandas from turning
# the literal emotion code 'NA' into a missing value.
emotion_csv_path = 'emotion_words_31264.csv'
df = pd.read_csv(emotion_csv_path, keep_default_na=False)
len(df)

31264

In [23]:
# Preview the reloaded annotation table.
df.head()

Unnamed: 0,word,emotion,intensity,polarity
0,脏乱,NN,7.0,2.0
1,糟报,NN,5.0,2.0
2,早衰,NE,5.0,2.0
3,责备,NN,5.0,2.0
4,贼眼,NN,5.0,2.0


In [24]:
# Distinct emotion category codes, in sorted order.
categories = sorted(set(df['emotion'].tolist()))
print(len(categories))

categories

21


['NA',
 'NB',
 'NC',
 'ND',
 'NE',
 'NG',
 'NH',
 'NI',
 'NJ',
 'NK',
 'NL',
 'NN',
 'PA',
 'PB',
 'PC',
 'PD',
 'PE',
 'PF',
 'PG',
 'PH',
 'PK']

In [25]:
# Emotion category code -> Chinese display name.
category2name = {
    'PA': '快乐',
    'PE': '安心',
    'PD': '尊敬',
    'PH': '赞扬',
    'PG': '相信',
    'PB': '喜爱',
    'PK': '祝愿',
    'NA': '愤怒',
    'NB': '悲伤',
    'NJ': '失望',
    'NH': '疚',
    'PF': '思',
    'NI': '慌',
    'NC': '恐惧',
    'NG': '羞',
    'NE': '烦闷',
    'ND': '憎恶',
    'NN': '贬责',
    'NK': '妒忌',
    'NL': '怀疑',
    'PC': '惊奇',
}

In [26]:
# Display names in the same order as the sorted category codes.
names = list(map(category2name.__getitem__, categories))
len(names), names

(21,
 ['愤怒',
  '悲伤',
  '恐惧',
  '憎恶',
  '烦闷',
  '羞',
  '疚',
  '慌',
  '失望',
  '妒忌',
  '怀疑',
  '贬责',
  '快乐',
  '喜爱',
  '惊奇',
  '尊敬',
  '安心',
  '思',
  '相信',
  '赞扬',
  '祝愿'])

In [27]:
# Map each category code to its position in the sorted category list.
category2index = {code: position for position, code in enumerate(categories)}

category2index

{'NA': 0,
 'NB': 1,
 'NC': 2,
 'ND': 3,
 'NE': 4,
 'NG': 5,
 'NH': 6,
 'NI': 7,
 'NJ': 8,
 'NK': 9,
 'NL': 10,
 'NN': 11,
 'PA': 12,
 'PB': 13,
 'PC': 14,
 'PD': 15,
 'PE': 16,
 'PF': 17,
 'PG': 18,
 'PH': 19,
 'PK': 20}

In [28]:
# Pull the four columns into parallel lists for the vector-building loop.
words, emotions, intensities, polarities = (
    df[column].tolist() for column in ('word', 'emotion', 'intensity', 'polarity')
)

In [29]:
# Build one fixed-length vector per word. Layout of each vector:
#   [0, n_categories)                                  occurrence count per emotion category
#   [intensity_offset, intensity_offset + max_occur)   intensity of each annotation, in row order
#   [polarity_offset,  polarity_offset + max_occur)    polarity of each annotation, in row order
#
# The sizes are derived from the data instead of being hard-coded (29/21/25).
# With fixed constants, a word with more annotations than expected would write
# its intensity into the polarity slots and then index past the end of the
# array. For 21 categories and at most 4 annotations per word this yields the
# same 29-dimensional layout as before.
n_categories = len(category2index)

# First pass: count annotations per word to find the widest word.
word_counts = dict()
for word in words:
    word_counts[word] = word_counts.get(word, 0) + 1
max_occur = max(word_counts.values(), default=0)

intensity_offset = n_categories
polarity_offset = intensity_offset + max_occur
vector_dim = polarity_offset + max_occur

words2array = dict()
words2occur = dict()

# Second pass: fill the vectors.
for word, emotion, intensity, polarity in zip(words, emotions, intensities, polarities):
    if word not in words2array:
        words2array[word] = np.zeros(vector_dim)
        words2occur[word] = 0

    # Number of annotations already stored for this word = next free slot.
    slot = words2occur[word]
    arr = words2array[word]

    arr[category2index[emotion]] += 1
    arr[intensity_offset + slot] = intensity
    arr[polarity_offset + slot] = polarity

    words2occur[word] = slot + 1

In [30]:
# Number of distinct words that received a vector.
len(words2array)

27351

In [31]:
# Words with the most annotation rows in the reloaded table.
df['word'].value_counts().head()

好事      4
内乱      3
靠不住     3
心虚      3
暗送秋波    3
Name: word, dtype: int64

In [32]:
# Annotation rows of a word with three entries, to spot-check against its vector.
df.loc[df['word'].eq('暗送秋波')]

Unnamed: 0,word,emotion,intensity,polarity
2787,暗送秋波,PB,5.0,1.0
2788,暗送秋波,NN,5.0,2.0
28035,暗送秋波,ND,3.0,2.0


In [33]:
# Vector built for that word: category counts, then intensities, then polarities.
words2array['暗送秋波']

array([0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 1., 0., 1., 0., 0., 0.,
       0., 0., 0., 0., 5., 5., 3., 0., 1., 2., 2., 0.])

In [34]:
# Preview the first rows again for comparison with the first vectors.
df.head()

Unnamed: 0,word,emotion,intensity,polarity
0,脏乱,NN,7.0,2.0
1,糟报,NN,5.0,2.0
2,早衰,NE,5.0,2.0
3,责备,NN,5.0,2.0
4,贼眼,NN,5.0,2.0


In [35]:
# Persist the category index map together with the word -> vector dict;
# the number of words is embedded in the file name.
pkl_path = 'words2array_{}.pkl'.format(len(words2array))
joblib.dump((category2index, words2array), pkl_path)

['words2array_27351.pkl']

In [36]:
# Reload the dump to confirm it round-trips: `a` is the category index map,
# `b` is the word -> vector dict.
# NOTE: joblib.load unpickles the file, so only load dumps from a trusted source.
loaded = joblib.load('words2array_27351.pkl')
a, b = loaded
a

{'NA': 0,
 'NB': 1,
 'NC': 2,
 'ND': 3,
 'NE': 4,
 'NG': 5,
 'NH': 6,
 'NI': 7,
 'NJ': 8,
 'NK': 9,
 'NL': 10,
 'NN': 11,
 'PA': 12,
 'PB': 13,
 'PC': 14,
 'PD': 15,
 'PE': 16,
 'PF': 17,
 'PG': 18,
 'PH': 19,
 'PK': 20}

In [37]:
# Show the first five (word, vector) pairs of the reloaded mapping.
list(b.items())[:5]

[('脏乱',
  array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 7., 0., 0., 0., 2., 0., 0., 0.])),
 ('糟报',
  array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 5., 0., 0., 0., 2., 0., 0., 0.])),
 ('早衰',
  array([0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 5., 0., 0., 0., 2., 0., 0., 0.])),
 ('责备',
  array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 5., 0., 0., 0., 2., 0., 0., 0.])),
 ('贼眼',
  array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 5., 0., 0., 0., 2., 0., 0., 0.]))]