In [1]:
import MeCab
from gensim.models import word2vec
import logging
import numpy as np
from  urllib  import request
import re
from tqdm import tqdm_notebook as tqdm

In [2]:
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [3]:
positive_data_file = "data/amazon_ja/pos.txt"
negative_data_file = "data/amazon_ja/neg.txt"

In [4]:
def load_data_and_labels(positive_data_file, negative_data_file, level="char", lang="En", dic_dir=None, t=None):
       
    positive_examples = list(open(positive_data_file, "r").readlines())
    negative_examples = list(open(negative_data_file, "r").readlines())
    if level == "char":
        positive_examples = [s.replace(" ", "").replace("", " ").lower() for s in positive_examples]
        negative_examples = [s.replace(" ", "").replace("", " ").lower() for s in negative_examples]
    elif level == "word":
        if lang == "Ja":
            #t = Tokenizer(dic_dir)
            positive_examples = [t.tokenize(s) for s in positive_examples]
            negative_examples = [t.tokenize(s) for s in negative_examples]
        else:
            positive_examples = [s.strip() for s in positive_examples]
            negative_examples = [s.strip() for s in negative_examples]
    else:
        print("invaid value of 'level'. ('char' or 'word') ")
        
    n_pos = len(positive_examples)
    n_neg = len(negative_examples)
    ratio = n_pos/n_neg
    print("# pos: ", n_pos)
    print("# neg: ", n_neg)
    print("pos/neg:", ratio)
    x_text = positive_examples + negative_examples

    positive_labels = [[0, 1] for _ in positive_examples]
    negative_labels = [[1, 0] for _ in negative_examples]
    y = np.concatenate([positive_labels, negative_labels], 0)
    
    return x_text, y, ratio

In [5]:
dic_dir = "/usr/lib/mecab/dic/mecab-ipadic-neologd"
#dic_dir = "/usr/local/lib/mecab/dic/mecab-ipadic-neologd/"
class Tokenizer:
    def __init__(self, stopwords, parser=None, include_pos=None, exclude_posdetail=None, exclude_reg=None):
    
        self.stopwords = stopwords
        self.include_pos = include_pos if include_pos else  ["名詞", "動詞", "形容詞"]
        self.exclude_posdetail = exclude_posdetail if exclude_posdetail else ["接尾", "数"]
        self.exclude_reg = exclude_reg if exclude_reg else r"$^"  # no matching reg
        if parser:
            self.parser = parser
        else:
            mecab = MeCab.Tagger("-Ochasen -d /usr/local/lib/mecab/dic/mecab-ipadic-neologd/")
            self.parser = mecab.parse
            

    def tokenize(self, text, show_pos=False):
        text = re.sub(r"https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+", "", text)    #URL
        text = re.sub(r"\"?([-a-zA-Z0-9.`?{}]+\.jp)\"?" ,"", text)  # xxx.jp 
        text = text.lower()
        l = [line.split("\t") for line in self.parser(text).split("\n")]
        res = [
            i[2] if not show_pos else (i[2],i[3]) for i in l 
                if len(i) >=4 # has POS.
                    and i[3].split("-")[0] in self.include_pos
                    and i[3].split("-")[1] not in self.exclude_posdetail
                    and not re.search(r"(-|−)\d", i[2])
                    and not re.search(self.exclude_reg, i[2])
                    and i[2] not in self.stopwords          
            ]
        return res

In [6]:
dic_dir = "/usr/local/lib/mecab/dic/mecab-ipadic-neologd/" #mac
dic_dir = "/usr/lib/mecab/dic/mecab-ipadic-neologd"
mecab = MeCab.Tagger("-Ochasen -d {}".format(dic_dir))

In [7]:
res = request.urlopen("http://svn.sourceforge.jp/svnroot/slothlib/CSharp/Version1/SlothLib/NLP/Filter/StopWord/word/Japanese.txt")
stopwords = [line.decode("utf-8").strip() for line in res]
print(stopwords[:3])

['あそこ', 'あたり', 'あちら']


In [8]:
res = request.urlopen("http://svn.sourceforge.jp/svnroot/slothlib/CSharp/Version1/SlothLib/NLP/Filter/StopWord/word/English.txt")
stopwords += [line.decode("utf-8").strip() for line in res]
print(stopwords[-3:])

["you've", 'z', 'zero']


In [9]:
t = Tokenizer(stopwords, mecab.parse)

In [10]:
t.tokenize("認めたくないものだな。自分自身の若さ故の過ちというものを。")

['認める', '自分自身', '若さ故の過ち']

In [11]:
level = "word"
x_text, y, ratio = load_data_and_labels(positive_data_file, negative_data_file, level=level, lang="Ja", t=t)

# pos:  62402
# neg:  9060
pos/neg: 6.887637969094922


In [12]:
x_text[:2]

[['書き込む',
  '読み出し',
  '転送速度',
  'いずれ',
  '満足',
  '画素',
  'コンパクトカメラ',
  'タイプ',
  'デジカメ',
  '入れる',
  '撮影',
  '使う',
  '撮影後',
  'カード',
  'リーダ',
  '接続',
  'する',
  '撮影',
  'する',
  '膨大',
  '量',
  '画像',
  'データ',
  'サムネイル',
  '表示',
  'する',
  'ピックアップ',
  'する',
  '画像',
  'コピペ',
  'する',
  'する',
  'いる',
  'ストレス',
  '感じる',
  'ない',
  '快適',
  '使える',
  'いる',
  '限定',
  '個体',
  'SDカード',
  '本体',
  'シンプル',
  '小さい',
  'ボール紙',
  '挟む',
  'いる',
  '梱包',
  'シンプル',
  '売価',
  '安い',
  '性能',
  '満足',
  '出来る',
  'いる',
  '買う',
  '良い',
  '思う',
  'いる',
  '耐久性',
  'わかる',
  '経過',
  '観察'],
 ['D6', '使う', '初心者', '1つ', '問題', '使える']]

In [13]:
text = "\n".join([ " ".join(x) for x in x_text ])

In [14]:
with open("wkachi.txt", "w") as f:
    f.write(text)

In [15]:
sentence = word2vec.Text8Corpus("wkachi.txt")

In [16]:
sentence

<gensim.models.word2vec.Text8Corpus at 0x7f50e700c908>

In [17]:
model = word2vec.Word2Vec(sentence)

2018-07-21 15:14:42,471 : INFO : collecting all words and their counts
2018-07-21 15:14:42,473 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2018-07-21 15:14:43,031 : INFO : collected 56624 word types from a corpus of 1920944 raw words and 193 sentences
2018-07-21 15:14:43,032 : INFO : Loading a fresh vocabulary
2018-07-21 15:14:43,181 : INFO : effective_min_count=5 retains 19616 unique words (34% of original 56624, drops 37008)
2018-07-21 15:14:43,182 : INFO : effective_min_count=5 leaves 1857771 word corpus (96% of original 1920944, drops 63173)
2018-07-21 15:14:43,234 : INFO : deleting the raw counts dictionary of 56624 items
2018-07-21 15:14:43,236 : INFO : sample=0.001 downsamples 30 most-common words
2018-07-21 15:14:43,237 : INFO : downsampling leaves estimated 1526415 word corpus (82.2% of prior 1857771)
2018-07-21 15:14:43,283 : INFO : estimated required memory for 19616 words and 100 dimensions: 25500800 bytes
2018-07-21 15:14:43,283 : INFO : rese

In [18]:
results = model.wv.most_similar(positive=["速度"])
for result in results:
    print(result)

2018-07-21 15:15:45,277 : INFO : precomputing L2-norms of word weight vectors


('スピード', 0.8742904663085938)
('読み込み', 0.8708919882774353)
('速い', 0.8581838607788086)
('USB3.0', 0.8490318059921265)
('書き込み', 0.8419095277786255)
('pro', 0.8294757008552551)
('通信速度', 0.8217605352401733)
('SPEC', 0.8190112113952637)
('転送速度', 0.8144389390945435)
('HDD', 0.8089838027954102)


  if np.issubdtype(vec.dtype, np.int):


In [19]:
len(model.wv["コンパクトカメラ"])

KeyError: "word 'コンパクトカメラ' not in vocabulary"

In [None]:
model.save("./amazon_ja_word2vec.model")

In [None]:
model = word2vec.Word2Vec.load("./amazon_ja_word2vec.model")

In [None]:
model.wv["速度"]

In [None]:
with open("data/amazon_ja/pos.txt") as f:
    pos_doc = [t.tokenize(doc) for doc in tqdm(f.readlines())]
print(pos_doc[:2])

In [None]:
[model.wv[d] for d in pos_doc[0]]