# word2vec Basic
## Loading data

In [1]:
import pandas as pd
df = pd.read_csv('data/sentiment.csv')
df.head(5)

Unnamed: 0,tag,text
0,P,店家很給力，快遞也是相當快，第三次光顧啦
1,N,這樣的配置用Vista系統還是有點卡。 指紋收集器。 沒送原裝滑鼠還需要自己買，不太好。
2,P,不錯，在同等檔次酒店中應該是值得推薦的！
3,N,哎！ 不會是蒙牛乾的吧 嚴懲真凶！
4,N,空尤其是三立電視臺女主播做的序尤其無趣像是硬湊那麼多字


In [2]:
print(len(df))
print(df['tag'].value_counts())

6388
N    3347
P    3041
Name: tag, dtype: int64


## Tokenization

In [3]:
import jieba
df['token_text'] = df['text'].apply(lambda x:list(jieba.cut(x)))
df.head(5)

Building prefix dict from the default dictionary ...
Dumping model to file cache /var/folders/0p/7xy1_dzx0_s5rnf06c0b316w0000gn/T/jieba.cache
Loading model cost 0.653 seconds.
Prefix dict has been built succesfully.


Unnamed: 0,tag,text,token_text
0,P,店家很給力，快遞也是相當快，第三次光顧啦,"[店家, 很, 給力, ，, 快遞, 也, 是, 相當快, ，, 第三次, 光顧, 啦]"
1,N,這樣的配置用Vista系統還是有點卡。 指紋收集器。 沒送原裝滑鼠還需要自己買，不太好。,"[這樣, 的, 配置, 用, Vista, 系統, 還是, 有點, 卡, 。, , 指紋,..."
2,P,不錯，在同等檔次酒店中應該是值得推薦的！,"[不錯, ，, 在, 同等, 檔次, 酒店, 中應, 該, 是, 值得, 推薦, 的, ！]"
3,N,哎！ 不會是蒙牛乾的吧 嚴懲真凶！,"[哎, ！, , 不會, 是, 蒙牛, 乾, 的, 吧, , 嚴懲, 真凶, ！]"
4,N,空尤其是三立電視臺女主播做的序尤其無趣像是硬湊那麼多字,"[空, 尤其, 是, 三立, 電視, 臺, 女主播, 做, 的, 序, 尤其, 無趣, 像是..."


## Building model

In [4]:
from gensim.models import Word2Vec
model = Word2Vec(df['token_text'], min_count=1, size=300, window=5, sg=0, workers=4)

In [29]:
model.wv.save_word2vec_format('model.bin', binary=True)

## Using model

In [31]:
print(model.most_similar(positive=['結婚', '他'], negative=['她'], topn=20))
print("-"*40)
print(model.most_similar(positive=['結婚', '她'], negative=['他'], topn=20))
print("-"*40)


[('結婚然', 0.6256321668624878), ('生小孩', 0.624354362487793), ('結婚過', 0.620686948299408), ('想結', 0.6196929216384888), ('不結', 0.6189882159233093), ('嫁娶', 0.6148558259010315), ('有計畫', 0.6132557392120361), ('要結', 0.6122066974639893), ('先買房', 0.6104307174682617), ('成家', 0.609870433807373), ('再結', 0.608663022518158), ('結婚後來', 0.6075947284698486), ('裸婚', 0.6005551218986511), ('今年年底', 0.596331775188446), ('會結', 0.5920178890228271), ('結婚現', 0.5911446809768677), ('當結婚', 0.5892318487167358), ('生女兒', 0.5877693295478821), ('奉子成婚', 0.5866853594779968), ('試婚', 0.5865316987037659)]
----------------------------------------
[('結婚然', 0.6303492784500122), ('生小孩', 0.6300292611122131), ('嫁娶', 0.6172428727149963), ('步入', 0.6136747598648071), ('有結', 0.6112129092216492), ('奉子成婚', 0.6074534058570862), ('結婚過', 0.6059277057647705), ('之後結婚', 0.6053087115287781), ('禮堂', 0.6033338308334351), ('婚生', 0.6031174659729004), ('明年初', 0.6030763387680054), ('結婚且', 0.6014657616615295), ('親事', 0.600039541721344), ('婚生子', 0.599635

  """Entry point for launching an IPython kernel.
  This is separate from the ipykernel package so we can avoid doing imports until


In [30]:
print(len(model['好吃']))
print(model.most_similar('好吃'))
print("-"*40)
print(model.most_similar(positive=['工程師', '他'], negative=['她'], topn=20))
print("-"*40)
print(model.most_similar(positive=['工程師', '她'], negative=['他'], topn=20))
print("-"*40)
print(model.most_similar(positive=['科學家', '男'], negative=['女'], topn=20))
print("-"*40)
print(model.most_similar(positive=['科學家', '女'], negative=['男'], topn=20))
print("-"*40)
print(model.most_similar(positive=['醫生', '他'], negative=['她'], topn=20))
print("-"*40)
print(model.most_similar(positive=['醫生', '她'], negative=['他'], topn=20))
print("-"*40)
print(model.most_similar(positive=['家長', '他'], negative=['她'], topn=20))
print("-"*40)
print(model.most_similar(positive=['家長', '她'], negative=['他'], topn=20))
print("-"*40)
print(model.most_similar(positive=['結婚', '他'], negative=['她'], topn=20))
print("-"*40)
print(model.most_similar(positive=['結婚', '她'], negative=['他'], topn=20))
print("-"*40)
print(model.most_similar(positive=['同性', '他'], negative=['她'], topn=20))
print("-"*40)
print(model.most_similar(positive=['同性', '她'], negative=['他'], topn=20))
print("-"*40)
print(model.most_similar(positive=['同志', '他'], negative=['她'], topn=20))
print("-"*40)
print(model.most_similar(positive=['同志', '她'], negative=['他'], topn=20))
print("-"*40)
print(model.most_similar(positive=['不婚', '他'], negative=['她'], topn=20))
print("-"*40)
print(model.most_similar(positive=['不婚', '她'], negative=['他'], topn=20))
print("-"*40)
print(model.most_similar(positive=['未婚', '他'], negative=['她'], topn=20))
print("-"*40)
print(model.most_similar(positive=['未婚', '她'], negative=['他'], topn=20))
print("-"*40)
print(model.most_similar(positive=['成功', '他'], negative=['她'], topn=20))
print("-"*40)
print(model.most_similar(positive=['成功', '她'], negative=['他'], topn=20))
print("-"*40)
print(model.most_similar(positive=['外遇', '他'], negative=['她'], topn=20))
print("-"*40)
print(model.most_similar(positive=['外遇', '她'], negative=['他'], topn=20))
print("-"*40)
print(model.most_similar(positive=['離婚', '他'], negative=['她'], topn=20))
print("-"*40)
print(model.most_similar(positive=['離婚', '她'], negative=['他'], topn=20))
print("-"*40)
print(model.most_similar(positive=['失敗', '他'], negative=['她'], topn=20))
print("-"*40)
print(model.most_similar(positive=['失敗', '她'], negative=['他'], topn=20))

  """Entry point for launching an IPython kernel.
  


300
[('不愛吃', 0.6662341952323914), ('海鮮', 0.6537903547286987), ('吃', 0.6478301286697388), ('不新鮮', 0.6473957300186157), ('我愛吃', 0.645952045917511), ('燒肉', 0.6451542973518372), ('愛吃', 0.6437597274780273), ('生魚片', 0.6403005123138428), ('很愛吃', 0.6384100914001465), ('沒吃過', 0.6372779607772827)]
----------------------------------------


  after removing the cwd from sys.path.


KeyError: "word '工程師' not in vocabulary"

## Loading pre-trained model

In [6]:
from gensim.models import KeyedVectors
model = KeyedVectors.load_word2vec_format("data/CBOW_iter15_2017-2018.bin", binary = True)

In [8]:
print(len(model['好吃']))
print(model.most_similar('好吃'))

SyntaxError: unexpected EOF while parsing (<ipython-input-8-e3fead0f3b67>, line 3)

# R RDS to word2vec model


## Convert RDS to pandas dataframe

In [8]:
# !pip install pyreadr
import pyreadr
result = pyreadr.read_r('../R/Crawler/boy-girl_201904121250.rda')


In [9]:
print(result.keys())
post = result["allp.df"]
print(post.keys())

odict_keys(['allc.df', 'allp.df'])
Index(['plink', 'board', 'pcontent', 'poster', 'ptitle', 'ptime'], dtype='object')


## tokenizing post content

In [10]:
import re
post['ptext'] = post['pcontent'].apply(lambda x:x.replace("\n", ""))
post['ptext'] = post['ptext'].apply(lambda x:re.sub("\s", "", x))
post.head(5)

Unnamed: 0,plink,board,pcontent,poster,ptitle,ptime,ptext
0,https://www.ptt.cc/bbs/Boy-Girl/M.1165687832.A...,Boy-Girl,\n我記得我前男友影響我最深的就是：\n\n別人的感情世界不要管、不要插手、不要評論\n\n...,candyzz (嚕拉拉),大小姐,Sun Dec 10 02:37:15 2006,我記得我前男友影響我最深的就是：別人的感情世界不要管、不要插手、不要評論*-*因為女生很容易...
1,https://www.ptt.cc/bbs/Boy-Girl/M.1166975303.A...,Boy-Girl,\n我和她 已經交往快8年了 從國二開始認識 一直到大四上學期的今天聖誕夜\n\n想起一開始...,abcc122333 (小巴),[分享]她答應了!她答應了!,Mon Dec 25 01:11:34 2006,我和她已經交往快8年了從國二開始認識一直到大四上學期的今天聖誕夜想起一開始我們還經常鬥嘴誇口...
2,https://www.ptt.cc/bbs/Boy-Girl/M.1167097252.A...,Boy-Girl,\n\n\n\n故事是這樣的\n\n男生希望女生來找他 兩個人為了早一個小時 晚一個小時\n...,soppy158 (Rosy),(兩性成長)兩性平等與錯誤投射,Tue Dec 26 10:11:03 2006,故事是這樣的男生希望女生來找他兩個人為了早一個小時晚一個小時爭執不休吵架時間就已經多過這一個...
3,https://www.ptt.cc/bbs/Boy-Girl/M.1168227921.A...,Boy-Girl,\n從小學開始 正常人都會對男女有好奇心 但我們都不成熟 對了 健康第一^O^\n\n甚...,voicespq (男稿的人),[分享]人際關係 永續發展,Mon Jan 8 11:45:20 2007,從小學開始正常人都會對男女有好奇心但我們都不成熟對了健康第一^O^甚至大人都有不成熟之時所以...
4,https://www.ptt.cc/bbs/Boy-Girl/M.1168753589.A...,Boy-Girl,\n前陣子我家閃光一直叫我去玩\n\n但是我對這種打打殺殺的遊戲非常沒興趣\n\n所以就一直...,flower319 (*花*),[討論] 我該怎樣跟我家閃光開口,Sun Jan 14 13:46:24 2007,前陣子我家閃光一直叫我去玩但是我對這種打打殺殺的遊戲非常沒興趣所以就一直拒絕但是他一直遊說我...


In [12]:
jeiba.cut("你最近好嗎？")

NameError: name 'jeiba' is not defined

In [13]:
len(post["ptext"])

87276

In [25]:
import jieba
token_post = []
i = 0
for p in post['ptext']:
    token_post.append(list(jieba.cut(p)))
    i += 1
    if(i%1000 == 0):
        print(i)

1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
21000
22000
23000
24000
25000
26000
27000
28000
29000
30000
31000
32000
33000
34000
35000
36000
37000
38000
39000
40000
41000
42000
43000
44000
45000
46000
47000
48000
49000
50000
51000
52000
53000
54000
55000
56000
57000
58000
59000
60000
61000
62000
63000
64000
65000
66000
67000
68000
69000
70000
71000
72000
73000
74000
75000
76000
77000
78000
79000
80000
81000
82000
83000
84000
85000
86000
87000


In [26]:
token_post[1]

['我',
 '和',
 '她',
 '已經',
 '交往',
 '快',
 '8',
 '年',
 '了',
 '從',
 '國二開始',
 '認識',
 '一直',
 '到',
 '大四',
 '上學期',
 '的',
 '今天',
 '聖誕夜',
 '想起',
 '一開始',
 '我們',
 '還經常鬥',
 '嘴',
 '誇口',
 '絕對',
 '不會',
 '喜歡',
 '上',
 '對方',
 '但',
 '相處',
 '越久越',
 '發現',
 '自己',
 '不能',
 '沒有',
 '彼此',
 '當時',
 '我',
 '這個',
 '毛頭',
 '小子',
 '為',
 '了',
 '打動',
 '她',
 '芳',
 '心想',
 '盡辦法',
 '最',
 '後',
 '我',
 '做出',
 '了',
 '一個',
 '硫酸',
 '銅藍色',
 '斜晶體',
 '就',
 '像',
 '一顆',
 '海洋',
 '之星',
 '般地',
 '送給',
 '她',
 '我終',
 '於',
 '得到',
 '她',
 '的',
 '心',
 '....',
 '高中',
 '時',
 '她',
 '考上',
 '了',
 '台北',
 '的',
 '學校',
 '我則',
 '繼續',
 '留在',
 '中壢',
 '大學',
 '我',
 '考上',
 '台中',
 '而',
 '她則',
 '是',
 '出國',
 '留學',
 '修習',
 '國際',
 '貿易',
 '這段',
 '日子',
 '我們',
 '經常',
 '是',
 '利用',
 'MSN',
 '聯絡',
 '感情',
 '她',
 '在',
 '國外',
 '的',
 '生活',
 '支出',
 '幾乎',
 '都',
 '由',
 '我',
 '來',
 '支付',
 '(',
 '我',
 '家境',
 '算不錯',
 ')',
 '而',
 '她',
 '也',
 '因為',
 '成績',
 '優異',
 '提前',
 '在',
 '大四',
 '上',
 '畢業',
 '今天',
 '聖誕夜',
 '她',
 '特地',
 '下台',
 '中',
 '陪',
 '我',
 'ㄧ',
 '起度過',
 '晚上',
 '我載',
 '她',
 '到',

In [35]:
from gensim.models import Word2Vec
model = Word2Vec(token_post, min_count=20, size=300, window=8, workers=4)

# Chinese Pretrained - eland

In [None]:
from gensim.models import KeyedVectors
model_tw = KeyedVectors.load_word2vec_format("data/CBOW_iter15_2017-2018.bin", binary = True)

## Case: 罷工 #protest and social movement

In [None]:
print(model_tw.most_similar('罷工', topn = 20))


In [None]:
print(model_tw.most_similar('抗爭', topn = 50))

## Case: gender stereotype

In [None]:
female = ["她", "妳", "女生", "女孩", "女士", "女人", "婦人", "婦", "女", "婦女",\
          "女性", "女孩子", "她們", "媽媽", "女兒", "女子","人妻", "姊姊", "姐姐",\
          "妹妹", "少婦", "熟女", "小女孩", "母親", "表妹", "姪女", "婆婆", "閨蜜",\
          "孫女", "女友", "少女", "阿姨", "姑姑", "閨密", "奶奶", "老婆", "阿嬤", "外婆",\
          "堂妹", "大嫂", "外甥女", "媳婦", "妻子" ,"太太", "表姊", "嫂嫂", "大女兒",\
          "小姑", "老媽", "表姐", "堂姐", "弟妹", "弟媳", "祖母", "舅媽", "繼女", "岳母",\
          "乾媽", "女方", "愛女", "養母", "大姊", "兒媳", "大姑", "前妻", "嫂子", "繼母",\
          "嬸嬸", "長女", "么女", "王女", "侄女", "伯母", "大妹", "外孫女"]
male = ["他", "你", "男", "男生", "男孩", "男士", "男人", "男性"]
print(model_tw.most_similar(positive=female, topn = 50))

## Case: genderization
* 看起來不太管用，因為很可能在文本裡面「他」的數量遠過於「她」，導致算相似度的時候，蔡英文與柯文哲兩個人名都是跟「他」比較接近。


In [None]:
print(model_tw.similarity("她", "蔡英文"))
print(model_tw.similarity("他", "蔡英文"))
print(model_tw.similarity("它", "蔡英文"))
print(model_tw.similarity("女", "蔡英文"))
print(model_tw.similarity("男", "蔡英文"))

In [None]:
print(model_tw.similarity("她", "柯文哲"))
print(model_tw.similarity("他", "柯文哲"))
print(model_tw.similarity("女生", "柯文哲"))
print(model_tw.similarity("男生", "柯文哲"))

# Pretained GoogleNews
Garg, N., Schiebinger, L., Jurafsky, D., & Zou, J. (2017). Word Embeddings Quantify 100 Years of Gender and Ethnic Stereotypes, 115(16). https://doi.org/10.1073/pnas.1720347115

In [None]:
from gensim.models import KeyedVectors
model_en = KeyedVectors.load_word2vec_format("../../../Downloads/GoogleNews-vectors-negative300.bin", binary = True)
print("Number of words: %d" % len(model.vocab))

## Case: Ideology

In [None]:
for t, v in model.most_similar('ideology', topn = 50):
    print(t, "\t\t", v)

# Pretrained FastText

# Case: Homo

## to-do
1. Drawing keyword networks

## Generating term-to-term network

In [None]:
edgedict = dict()
nodedict = dict()
key = "甲甲"
res = model_tw.most_similar(key, topn = 300)
nodedict[key] = res
for k, v in res:
    edgedict[key, k] = v
    edgedict[k, key] = v

In [None]:
for key, v in res:
    if key not in nodedict:
        res = model_tw.most_similar(key, topn = 300)
        nodedict[key] = res
        for k, v in res:
            edgedict[key, k] = v
            edgedict[k, key] = v
len(edgedict)

In [None]:
len(nodedict)
nodedict['甲甲']

In [None]:
for k, v in sorted(edgedict.items(), key=lambda x: x[1], reverse = True)[:200]:
    print(k, v)

In [None]:
for k, v in sorted(edgedict.items(), key=lambda x: x[1], reverse = True)[:50]:
    print(k, v)

## Drawing network