### FastText词向量训练

In [1]:
from utils import load_corpus

#### 对语料库进行清洗和分词

In [2]:
import pandas as pd

# Build one corpus from the train split followed by the test split.
# Presumably each row is (token list, sentiment label) — confirm against utils.load_corpus.
train_rows = load_corpus("weibo2018/train.txt")
test_rows = load_corpus("weibo2018/test.txt")
data = train_rows + test_rows

# Two named columns: the text in "content", the label in "sentiment".
df = pd.DataFrame(data, columns=["content", "sentiment"])

Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 0.502 seconds.
Prefix dict has been built succesfully.


In [3]:
# Preview the first five rows of the corpus frame.
df.head(n=5)

Unnamed: 0,content,sentiment
0,"[书中, 自有, 黄金屋, 书中, 自有, 颜如玉, 沿着, 岁月, 的, 长河, 跋涉, ...",1
1,"[这是, 英超, 被, 黑, 的, 最惨, 的, 一次, [二哈], [二哈], 十几年来,...",0
2,"[中国, 远洋, 海运, 集团, 副总经理, 俞曾, 港, 月, 日, 在, 上, 表示, ...",1
3,"[看, 流星花园, 其实, 也, 还好, 啦, 现在, 的, 观念, 以及, 时尚, 眼光,...",1
4,"[汉武帝, 的, 罪己, 诏, 的, 真实性, 尽管, 存在, 着, 争议, 然而, 轮台,...",1


#### 训练词向量

In [4]:
from gensim.models import FastText
# Train FastText word vectors on the "content" column of df.
# NOTE(review): `size` and `iter` are the gensim 3.x keyword names; gensim >= 4.0
# renamed them to `vector_size` and `epochs` — confirm the installed version before upgrading.
model = FastText(df["content"], 
                 size=100,
                 window=5, 
                 min_count=3, # drop words that occur fewer than 3 times
                 iter=1000,  # 1000 training epochs
                 min_n=2,     # library default is 3; lowered to 2 because the text is Chinese
                 max_n=4,     # library default is 6; lowered to 4 because the text is Chinese
                 word_ngrams=1)

#### 词向量效果展示

In [5]:
# Query through the KeyedVectors (`model.wv`): calling most_similar on the model
# itself is deprecated in gensim 3.x and removed in gensim 4.0.
model.wv.most_similar("我")

  """Entry point for launching an IPython kernel.
  if np.issubdtype(vec.dtype, np.int):


[('你', 0.8299396634101868),
 ('他', 0.7914878129959106),
 ('她', 0.7295881509780884),
 ('自己', 0.6827410459518433),
 ('不', 0.649764358997345),
 ('你们', 0.6482383608818054),
 ('了', 0.6053299307823181),
 ('有人', 0.5951108932495117),
 ('他们', 0.5880480408668518),
 ('别人', 0.5770147442817688)]

In [6]:
# Query through the KeyedVectors (`model.wv`): calling most_similar on the model
# itself is deprecated in gensim 3.x and removed in gensim 4.0.
model.wv.most_similar("开心")

  """Entry point for launching an IPython kernel.
  if np.issubdtype(vec.dtype, np.int):


[('喜欢', 0.5164913535118103),
 ('小心', 0.49298298358917236),
 ('幸福', 0.4906578063964844),
 ('难过', 0.4871729612350464),
 ('难受', 0.4736190438270569),
 ('容易', 0.47024640440940857),
 ('生气', 0.46578270196914673),
 ('好开心', 0.4513933062553406),
 ('天天开心', 0.44929811358451843),
 ('好', 0.4428994655609131)]

In [7]:
# Query through the KeyedVectors (`model.wv`): calling most_similar on the model
# itself is deprecated in gensim 3.x and removed in gensim 4.0.
model.wv.most_similar("卧槽")

  """Entry point for launching an IPython kernel.
  if np.issubdtype(vec.dtype, np.int):


[('管', 0.33231693506240845),
 ('tm', 0.3150600790977478),
 ('生菜', 0.3112187087535858),
 ('大神', 0.3058115243911743),
 ('卧', 0.30501842498779297),
 ('破产', 0.30370914936065674),
 ('逛逛', 0.30254125595092773),
 ('好好看', 0.29780805110931396),
 ('薛之谦', 0.2912497818470001),
 ('诶', 0.2890874147415161)]

In [8]:
# Query through the KeyedVectors (`model.wv`): calling most_similar on the model
# itself is deprecated in gensim 3.x and removed in gensim 4.0.
model.wv.most_similar("[二哈]")

  """Entry point for launching an IPython kernel.
  if np.issubdtype(vec.dtype, np.int):


[('了', 0.45228898525238037),
 ('[doge]', 0.44459354877471924),
 ('啊啊啊', 0.39727917313575745),
 ('[阴险]', 0.39499837160110474),
 ('[笑cry]', 0.39007097482681274),
 ('我', 0.3849623203277588),
 ('[允悲]', 0.3687397241592407),
 ('[跪了]', 0.3672424852848053),
 ('[困]', 0.35909155011177063),
 ('啊', 0.35804903507232666)]

#### 保存模型

In [9]:
import os

# Create the output directory first so the save does not fail on a fresh checkout.
os.makedirs("model", exist_ok=True)

# NOTE(review): gensim's save() writes its own serialized model format, not plain text,
# despite the ".txt" suffix. The path is kept unchanged so existing loaders keep working.
model.save("model/model_100.txt")