## 単語感情極性対応表
http://www.lr.pi.titech.ac.jp/~takamura/pndic_ja.html

In [1]:
import requests
import logging
from pathlib import Path
import pandas as pd
import numpy as np
import MeCab
import re
import jaconv
from gensim import corpora, models,matutils
import random
from tqdm import tqdm_notebook as tqdm

In [2]:
# Download the word-polarity dictionary (pn_ja.dic).
# The timeout keeps the cell from hanging indefinitely on a stalled server,
# and raise_for_status() prevents an HTTP error page from being parsed as
# dictionary data later on.
res = requests.get("http://www.lr.pi.titech.ac.jp/~takamura/pubs/pn_ja.dic", timeout=30)
res.raise_for_status()

In [3]:
# Decoded body of the HTTP response (the raw dictionary as one string).
text = res.text

In [4]:
# Split the raw dictionary text on CRLF into one string per entry.
l = text.split("\r\n")

In [5]:
# Number of pieces produced by the split.
len(l)

55126

In [6]:
# Peek at the first three raw entries (colon-separated fields).
l[:3]

['優れる:すぐれる:動詞:1', '良い:よい:形容詞:0.999995', '喜ぶ:よろこぶ:動詞:0.999979']

In [7]:
# Break every entry into its colon-separated fields.
l = [row.split(":") for row in l]

In [8]:
# Peek at the first three entries after splitting into fields.
l[:3]

[['優れる', 'すぐれる', '動詞', '1'],
 ['良い', 'よい', '形容詞', '0.999995'],
 ['喜ぶ', 'よろこぶ', '動詞', '0.999979']]

In [9]:
# Load the entries into a DataFrame: surface form, reading, part of speech, polarity.
df = pd.DataFrame(l, columns=["word", "kana", "pos", "rating"])

In [10]:
# Show the first rows of the dictionary table.
df.head()

Unnamed: 0,word,kana,pos,rating
0,優れる,すぐれる,動詞,1.0
1,良い,よい,形容詞,0.999995
2,喜ぶ,よろこぶ,動詞,0.999979
3,褒める,ほめる,動詞,0.999979
4,めでたい,めでたい,形容詞,0.999645


In [11]:
# Replace missing ratings with 0.0 and convert the column to float so it can
# be compared and averaged numerically.
df[["rating"]] = df[["rating"]].fillna(0.0).astype(float) 

In [12]:
# Entries with a strictly positive polarity.
pos_df = df[df["rating"]>0]

In [13]:
# Entries with a strictly negative polarity.
neg_df = df[df["rating"]<0]

In [14]:
# Number of positive entries.
len(pos_df.index)

5122

In [15]:
# Number of negative entries.
len(neg_df.index)

49983

In [16]:
# Total number of rows, including those with a rating of exactly 0.
len(df.index)

55126

In [17]:
# Count the non-null values per column for each surface form; a count above 1
# means the same surface form has several dictionary entries.
d = df.groupby("word").count()

In [18]:
# Surface forms that occur at least twice, most frequent first.
d[d["kana"]>=2].sort_values(["kana"], ascending=False)

Unnamed: 0_level_0,kana,pos,rating
word,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
大和,11,11,11
ホーム,11,11,11
太刀,9,9,9
頭,8,8,8
大人,7,7,7
端,7,7,7
一人,7,7,7
メーン,6,6,6
縁,6,6,6
オーバー,6,6,6


In [19]:
# Summary statistics of the entry counts for the duplicated surface forms.
d[d["kana"]>= 2].describe()

Unnamed: 0,kana,pos,rating
count,2050.0,2050.0,2050.0
mean,2.197073,2.197073,2.197073
std,0.64498,0.64498,0.64498
min,2.0,2.0,2.0
25%,2.0,2.0,2.0
50%,2.0,2.0,2.0
75%,2.0,2.0,2.0
max,11.0,11.0,11.0


In [22]:
# Inspect one duplicated surface form: the rows differ in reading and rating.
df[df["word"]=="助言"]

Unnamed: 0,word,kana,pos,rating
2105,助言,じょげん,名詞,0.952095
16686,助言,じょごん,名詞,-0.208164


In [23]:
# Alternative (disabled): use the mecab-ipadic-neologd dictionary instead of
# MeCab's default dictionary.
#mecab = MeCab.Tagger("-Ochasen -d /usr/local/lib/mecab/dic/mecab-ipadic-neologd/")
# MeCab tagger producing ChaSen-style, tab-separated output.
mecab = MeCab.Tagger("-Ochasen")

In [24]:
# Sanity-check the tagger output on a sample sentence: one token per line,
# tab-separated fields, terminated by "EOS".
mecab.parse("認めたくないものだな。自分自身の若さ故の過ちというものを。")

'認め\tミトメ\t認める\t動詞-自立\t一段\t連用形\nたく\tタク\tたい\t助動詞\t特殊・タイ\t連用テ接続\nない\tナイ\tない\t助動詞\t特殊・ナイ\t基本形\nもの\tモノ\tもの\t名詞-非自立-一般\t\t\nだ\tダ\tだ\t助動詞\t特殊・ダ\t基本形\nな\tナ\tな\t助詞-終助詞\t\t\n。\t。\t。\t記号-句点\t\t\n自分\tジブン\t自分\t名詞-一般\t\t\n自身\tジシン\t自身\t名詞-一般\t\t\nの\tノ\tの\t助詞-連体化\t\t\n若\tワカ\t若い\t形容詞-自立\t形容詞・アウオ段\tガル接続\nさ\tサ\tさ\t名詞-接尾-特殊\t\t\n故\tユエ\t故\t名詞-一般\t\t\nの\tノ\tの\t助詞-連体化\t\t\n過ち\tアヤマチ\t過ち\t名詞-一般\t\t\nという\tトイウ\tという\t助詞-格助詞-連語\t\t\nもの\tモノ\tもの\t名詞-非自立-一般\t\t\nを\tヲ\tを\t助詞-格助詞-一般\t\t\n。\t。\t。\t記号-句点\t\t\nEOS\n'

In [25]:
# Fetch the SlothLib Japanese stop-word list (one word per CRLF-terminated line).
# The timeout avoids hanging on a stalled server; raise_for_status() stops an
# HTTP error page from silently becoming the stop-word list.
res = requests.get("http://svn.sourceforge.jp/svnroot/slothlib/CSharp/Version1/SlothLib/NLP/Filter/StopWord/word/Japanese.txt", timeout=30)
res.raise_for_status()
stopwords = res.text.split("\r\n")
print(stopwords[:3])

['あそこ', 'あたり', 'あちら']


In [26]:
# Append the SlothLib English stop-word list to the Japanese one.
# The timeout avoids hanging on a stalled server; raise_for_status() stops an
# HTTP error page from silently being appended as stop words.
res = requests.get("http://svn.sourceforge.jp/svnroot/slothlib/CSharp/Version1/SlothLib/NLP/Filter/StopWord/word/English.txt", timeout=30)
res.raise_for_status()
stopwords += res.text.split("\r\n")
print(stopwords[-3:])

['z', 'zero', '']


In [27]:
# Check that jaconv converts katakana to hiragana.
jaconv.kata2hira('ア')

'あ'

In [62]:
class Tokenizer:
    """Score Japanese text against a word-polarity dictionary.

    The text is run through a morphological parser that must return
    ChaSen-style output: one token per line, tab-separated, with the base
    form in the third field and the hyphen-separated part of speech in the
    fourth. Tokens are filtered by part of speech, regex and stop words, and
    the mean dictionary rating of every surviving base form is summed.
    """

    # Compiled once at class creation instead of on every call / every token.
    _URL_RE = re.compile(r"https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+")
    _JP_DOMAIN_RE = re.compile(r"\"?([-a-zA-Z0-9.`?{}]+\.jp)\"?")
    _SIGNED_DIGIT_RE = re.compile(r"(-|−)\d")

    def __init__(self, stopwords, parser=None, include_pos=None, exclude_posdetail=None, exclude_reg=None):
        """Configure the token filters and the parser.

        Args:
            stopwords: Collection of base forms to ignore.
            parser: Callable taking a string and returning ChaSen-style
                output. When omitted, a MeCab tagger using the
                mecab-ipadic-neologd dictionary is created.
            include_pos: Top-level POS tags to keep. Defaults to nouns,
                verbs and adjectives.
            exclude_posdetail: POS sub-categories (second hyphen-separated
                component) to drop. Defaults to suffixes and numerals.
            exclude_reg: Regex; base forms matching it are dropped. Defaults
                to a pattern that matches no base form.
        """
        self.stopwords = stopwords
        self.include_pos = include_pos if include_pos else ["名詞", "動詞", "形容詞"]
        self.exclude_posdetail = exclude_posdetail if exclude_posdetail else ["接尾", "数"]
        self.exclude_reg = exclude_reg if exclude_reg else r"$^"  # no matching reg
        if parser:
            self.parser = parser
        else:
            mecab = MeCab.Tagger("-Ochasen -d /usr/local/lib/mecab/dic/mecab-ipadic-neologd/")
            self.parser = mecab.parse

    @staticmethod
    def _mean_rating(df, word):
        """Return the mean ``rating`` of *word* in *df*, or 0.0 if it is absent."""
        # Select the rating column explicitly: averaging the whole frame would
        # try to average the string columns too, which pandas >= 2.0 rejects.
        ratings = df.loc[df["word"] == word, "rating"]
        if ratings.empty:
            return 0.0
        mean = ratings.mean()
        # An all-NaN group must not poison the running total.
        return 0.0 if pd.isna(mean) else float(mean)

    def tokenize(self, df, text, show_pos=False):
        """Return the summed polarity of the content words in *text*.

        Args:
            df: Polarity dictionary with a ``word`` column and a numeric
                ``rating`` column. A word listed several times contributes
                the mean of its ratings.
            text: Text to score. URLs and bare ``*.jp`` host names are
                removed and the text is lower-cased before parsing.
            show_pos: Currently unused; kept so existing callers keep working.

        Returns:
            The sum, as a float, of the mean rating of every retained base
            form. Base forms missing from *df* contribute 0.0.
        """
        text = self._URL_RE.sub("", text)
        text = self._JP_DOMAIN_RE.sub("", text)
        text = text.lower()

        total = 0.0
        for line in self.parser(text).split("\n"):
            fields = line.split("\t")
            if len(fields) < 4:
                continue  # "EOS" and blank lines carry no POS field

            lemma = fields[2]
            pos_parts = fields[3].split("-")
            # Some POS tags have no sub-category; treat it as empty instead of
            # failing with IndexError.
            pos_detail = pos_parts[1] if len(pos_parts) > 1 else ""

            if pos_parts[0] not in self.include_pos:
                continue
            if pos_detail in self.exclude_posdetail:
                continue
            if self._SIGNED_DIGIT_RE.search(lemma):
                continue
            if re.search(self.exclude_reg, lemma):
                continue
            if lemma in self.stopwords:
                continue

            total += self._mean_rating(df, lemma)
        return total

In [63]:
# Build a tokenizer that uses the MeCab tagger created earlier and drops base
# forms containing a digit followed by 年/月/日.
t = Tokenizer(stopwords, mecab.parse, exclude_reg=r"\d(年|月|日)")

In [64]:
# Score a sample sentence against the polarity dictionary.
t.tokenize(df, "認めたくないものだな。自分自身の若さ故の過ちというものを。")



-1.0765065

In [65]:
# Inspect the dictionary rows for 認める, which is listed under two readings.
df[df["word"]=="認める"]

Unnamed: 0,word,kana,pos,rating
15547,認める,したためる,動詞,-0.19584
42906,認める,みとめる,動詞,-0.547493


In [66]:
# Start with empty score lists for the positive and negative documents.
pos_doc, neg_doc = [], []

In [None]:
# Score every line of the positive file (presumably one document per line)
# and show the first five scores.
with open("data/amazon_ja/pos.txt") as f:
    pos_lines = f.readlines()
pos_doc = [t.tokenize(df, doc) for doc in tqdm(pos_lines)]
print(pos_doc[:5])



In [None]:
# Score every line of the negative file (presumably one document per line)
# and show the first five scores.
with open("data/amazon_ja/neg.txt") as f:
    neg_lines = f.readlines()
neg_doc = [t.tokenize(df, doc) for doc in tqdm(neg_lines)]
print(neg_doc[:5])