In [1]:
import logging
import random
import re
from pathlib import Path
from urllib import request

import MeCab
import numpy as np
import pandas as pd
from gensim import corpora, models, matutils
from sklearn import model_selection
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from tqdm import tqdm_notebook as tqdm

%load_ext jupyternotify

<IPython.core.display.Javascript object>

In [2]:
# Notebook-wide MeCab tagger; its bound `parse` method is handed to Tokenizer below.
# NOTE(review): the -d dictionary path is an absolute, machine-specific location;
# confirm it exists on the host that runs this notebook.
mecab = MeCab.Tagger("-Ochasen -d /usr/local/lib/mecab/dic/mecab-ipadic-neologd/")

In [3]:
# Download the SlothLib Japanese stop-word list (one UTF-8 encoded word per line).
# The response is used as a context manager so the underlying connection is
# closed as soon as every line has been read.
with request.urlopen("http://svn.sourceforge.jp/svnroot/slothlib/CSharp/Version1/SlothLib/NLP/Filter/StopWord/word/Japanese.txt") as res:
    stopwords = [line.decode("utf-8").strip() for line in res]
print(stopwords[:3])

['あそこ', 'あたり', 'あちら']


In [4]:
# Append the SlothLib English stop-word list to the Japanese one loaded above.
# The response is used as a context manager so the connection is always closed.
# NOTE: this cell extends `stopwords` in place, so re-running it without first
# re-running the previous cell appends the English words a second time.
with request.urlopen("http://svn.sourceforge.jp/svnroot/slothlib/CSharp/Version1/SlothLib/NLP/Filter/StopWord/word/English.txt") as res:
    stopwords += [line.decode("utf-8").strip() for line in res]
print(stopwords[-3:])

["you've", 'z', 'zero']


In [5]:
class Tokenizer:
    """Convert raw text into a list of base-form tokens.

    The parser is expected to return one token per line with tab-separated
    fields, where field index 2 is the token's base form and field index 3 is
    its part-of-speech tag written as hyphen-separated levels
    (e.g. ``名詞-一般``). Lines with fewer than four fields are ignored.

    Args:
        stopwords: Collection of base forms to drop from the output.
        parser: Callable ``str -> str`` producing the format described above.
            When omitted, a MeCab tagger is created and its ``parse`` is used.
        include_pos: Top-level POS tags to keep.
            Defaults to ``["名詞", "動詞", "形容詞"]``.
        exclude_posdetail: Second-level POS tags to drop.
            Defaults to ``["接尾", "数"]``.
        exclude_reg: Regular expression; tokens whose base form matches it
            (via ``re.search``) are dropped. Defaults to ``r"$^"``.
    """

    # Compiled once at class creation instead of on every tokenize() call.
    _URL_RE = re.compile(r"https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+")
    _JP_DOMAIN_RE = re.compile(r"\"?([-a-zA-Z0-9.`?{}]+\.jp)\"?")
    _SIGNED_DIGIT_RE = re.compile(r"(-|−)\d")

    def __init__(self, stopwords, parser=None, include_pos=None, exclude_posdetail=None, exclude_reg=None):
        self.stopwords = stopwords
        self.include_pos = include_pos if include_pos else ["名詞", "動詞", "形容詞"]
        self.exclude_posdetail = exclude_posdetail if exclude_posdetail else ["接尾", "数"]
        self.exclude_reg = exclude_reg if exclude_reg else r"$^"  # no matching reg
        if parser:
            self.parser = parser
        else:
            mecab = MeCab.Tagger("-Ochasen -d /usr/local/lib/mecab/dic/mecab-ipadic-neologd/")
            self.parser = mecab.parse

    def tokenize(self, text, show_pos=False):
        """Tokenize ``text`` and return the filtered base forms.

        URLs and ``*.jp`` host names are removed and the text is lower-cased
        before it is passed to the parser.

        Args:
            text: The raw input string.
            show_pos: When true, return ``(base_form, pos_tag)`` tuples
                instead of bare base forms.

        Returns:
            A list of base forms (or tuples, see ``show_pos``) in parser order.
        """
        text = self._URL_RE.sub("", text)        # URL
        text = self._JP_DOMAIN_RE.sub("", text)  # xxx.jp
        text = text.lower()

        # Hash-based membership test instead of scanning the stop-word
        # sequence once per token; built per call so later changes to
        # self.stopwords are still honoured.
        stopword_set = frozenset(self.stopwords)

        tokens = []
        for line in self.parser(text).split("\n"):
            fields = line.split("\t")
            if len(fields) < 4:  # no POS column (e.g. the end-of-sentence marker)
                continue
            base, pos = fields[2], fields[3]
            levels = pos.split("-")
            if levels[0] not in self.include_pos:
                continue
            # Some POS tags have no second level; treat the detail as empty
            # rather than indexing past the end of the list.
            detail = levels[1] if len(levels) > 1 else ""
            if detail in self.exclude_posdetail:
                continue
            if self._SIGNED_DIGIT_RE.search(base):
                continue
            if re.search(self.exclude_reg, base):
                continue
            if base in stopword_set:
                continue
            tokens.append((base, pos) if show_pos else base)
        return tokens

In [6]:
t = Tokenizer(stopwords, mecab.parse)

In [7]:
t.tokenize("認めたくないものだな。自分自身の若さ故の過ちというものを。")

['認める', '自分自身', '若さ故の過ち']

# Character level + random forest

In [8]:
def load_data_and_labels(positive_data_file, negative_data_file, level="char", lang="En",
                         tokenizer=None, encoding="utf-8"):
    """Load positive/negative documents (one per line) and build one-hot labels.

    Args:
        positive_data_file: Path to the file of positive examples.
        negative_data_file: Path to the file of negative examples.
        level: ``"char"`` to turn each line into lower-cased, space-separated
            characters (spaces in the input are removed first), or ``"word"``
            for word-level processing.
        lang: Only used when ``level == "word"``. ``"Ja"`` tokenizes each line
            with a ``Tokenizer``; any other value just strips the line.
        tokenizer: Optional object with a ``tokenize(str)`` method used for
            ``level="word", lang="Ja"``. When omitted, ``Tokenizer(stopwords)``
            is built from the notebook-level ``stopwords`` list.
        encoding: Text encoding used to read both files.

    Returns:
        ``(x_text, y, ratio)`` where ``x_text`` is the list of processed
        positive examples followed by the negative ones, ``y`` is an integer
        array of shape ``(len(x_text), 2)`` holding ``[0, 1]`` for positive
        and ``[1, 0]`` for negative rows, and ``ratio`` is
        ``n_pos / n_neg`` (``inf`` when there are no negatives, ``nan`` when
        both files are empty).

    Raises:
        ValueError: If ``level`` is neither ``"char"`` nor ``"word"``.
    """
    # Fail before doing any I/O instead of printing a message and silently
    # returning unprocessed lines.
    if level not in ("char", "word"):
        raise ValueError("invalid value of 'level': {!r} ('char' or 'word')".format(level))

    # Context managers guarantee the file handles are closed.
    with open(positive_data_file, "r", encoding=encoding) as f:
        positive_examples = f.readlines()
    with open(negative_data_file, "r", encoding=encoding) as f:
        negative_examples = f.readlines()

    if level == "char":
        # "abc\n" -> " a b c \n ": replace("", " ") inserts a space around
        # every character, including the trailing newline.
        positive_examples = [s.replace(" ", "").replace("", " ").lower() for s in positive_examples]
        negative_examples = [s.replace(" ", "").replace("", " ").lower() for s in negative_examples]
    elif lang == "Ja":
        if tokenizer is None:
            # Tokenizer requires the stop-word list as its first argument.
            tokenizer = Tokenizer(stopwords)
        positive_examples = [tokenizer.tokenize(s) for s in positive_examples]
        negative_examples = [tokenizer.tokenize(s) for s in negative_examples]
    else:
        positive_examples = [s.strip() for s in positive_examples]
        negative_examples = [s.strip() for s in negative_examples]

    n_pos = len(positive_examples)
    n_neg = len(negative_examples)
    if n_neg:
        ratio = n_pos / n_neg
    else:
        # Avoid ZeroDivisionError when the negative file is empty.
        ratio = float("inf") if n_pos else float("nan")
    print("# pos: ", n_pos)
    print("# neg: ", n_neg)
    print("pos/neg:", ratio)
    x_text = positive_examples + negative_examples

    positive_labels = [[0, 1] for _ in positive_examples]
    negative_labels = [[1, 0] for _ in negative_examples]
    # reshape keeps the (n, 2) layout even when one or both sides are empty.
    y = np.array(positive_labels + negative_labels, dtype=int).reshape(-1, 2)

    return x_text, y, ratio

In [10]:
%%notify
# Review corpora; load_data_and_labels reads each file line by line.
positive_data_file = "data/amazon_ja/pos.txt"
negative_data_file = "data/amazon_ja/neg.txt"

# Character-level load. load_data_and_labels only consults `lang` in its
# level == "word" branch, so lang="Ja" has no effect on this call.
x_text, y, ratio = load_data_and_labels(positive_data_file, negative_data_file, level="char", lang="Ja")

# pos:  62240
# neg:  9026
pos/neg: 6.895634832705517


<IPython.core.display.Javascript object>

In [11]:
x_text[0]

' 書 き 込 み 、 読 み 出 し 、 転 送 速 度 、 い ず れ も 満 足 で す 。 1 6 0 0 万 画 素 の コ ン パ ク ト カ メ ラ タ イ プ の デ ジ カ メ に 入 れ て 撮 影 に 使 い 、 撮 影 後 は カ ー ド リ ー ダ に 接 続 し て 、 撮 影 し た 膨 大 な 量 の 画 像 デ ー タ を サ ム ネ イ ル 表 示 に し た り 、 ピ ッ ク ア ッ プ し た 画 像 を コ ピ ペ し た り し て い ま す が 、 と く に ス ト レ ス を 感 じ る こ と な く 、 快 適 に 使 え て い ま す 。 【 a m a z o n . c o . j p 限 定 】 の 個 体 は 、 s d カ ー ド 本 体 が シ ン プ ル な 小 さ い ボ ー ル 紙 に 挟 ま れ て い る だ け 。 梱 包 は 超 シ ン プ ル で す が 、 実 売 価 格 が 安 く 、 性 能 に も 満 足 出 来 て い る の で 買 っ て 良 か っ た と 思 っ て い ま す 。 耐 久 性 は わ か り ま せ ん 。 そ こ は 要 経 過 観 察 で す ね 。 \n '

In [12]:
# Split each space-separated character string into a list of characters.
# str.split() with no argument already discards leading/trailing whitespace
# (including the trailing newline), so nothing is sliced off by hand: cutting a
# fixed two characters would drop the last real character of any line that
# does not end with a newline (e.g. the final line of a file).
x_text_sp = [doc.split() for doc in x_text]

In [13]:
%%notify
d = corpora.Dictionary(x_text_sp)

<IPython.core.display.Javascript object>

In [14]:
d[1000]

'棄'

In [15]:
# bag of char
boc = [d.doc2bow(doc) for doc  in tqdm(x_text_sp)]




In [16]:
# One row per document: the number of entries in its doc2bow result.
df = pd.DataFrame(list(map(len, boc)), columns=["length"])

In [17]:
df.head()

Unnamed: 0,length
0,140
1,25
2,57
3,71
4,72


In [18]:
df.describe()

Unnamed: 0,length
count,71266.0
mean,62.73119
std,39.008697
min,1.0
25%,37.0
50%,53.0
75%,77.0
max,730.0


In [19]:
len(d)

3792

In [20]:
%%notify
# corpus2dense already returns a 2-D NumPy array of shape (num_terms, num_docs);
# keep it as an array instead of expanding it into a Python list of rows, and
# pass num_docs so gensim can fill a preallocated array.
dense = matutils.corpus2dense(boc, num_terms=len(d), num_docs=len(boc))

In [21]:
# asarray returns its input unchanged when it is already an ndarray, avoiding
# a second full copy of the (terms x documents) matrix; list input is still
# converted exactly as np.array would.
dense = np.asarray(dense)

In [22]:
dense.shape

(3792, 71266)

In [23]:
%%notify
# Hold out 5% of the documents for testing. `dense` is (terms, documents), so
# it is transposed to put one document per row. A fixed random_state makes the
# split, and every score computed from it, reproducible across kernel restarts.
data_train_s, data_test_s, label_train_s, label_test_s = model_selection.train_test_split(
    dense.T, y, test_size=0.05, random_state=0
)

<IPython.core.display.Javascript object>

In [24]:
estimator = RandomForestClassifier(verbose=10)

In [25]:
%%notify
estimator.fit(data_train_s, label_train_s)

building tree 1 of 10


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    6.5s remaining:    0.0s


building tree 2 of 10


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   14.8s remaining:    0.0s


building tree 3 of 10


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   20.5s remaining:    0.0s


building tree 4 of 10


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:   26.6s remaining:    0.0s


building tree 5 of 10


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   31.4s remaining:    0.0s


building tree 6 of 10


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:   36.2s remaining:    0.0s


building tree 7 of 10


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:   41.1s remaining:    0.0s


building tree 8 of 10


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:   44.3s remaining:    0.0s


building tree 9 of 10


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:   47.7s remaining:    0.0s


building tree 10 of 10


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:   53.1s finished


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=10,
            warm_start=False)

<IPython.core.display.Javascript object>

In [26]:
estimator.score(data_test_s, label_test_s)

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.1s finished


0.9003928170594837

In [31]:
# Candidate forest sizes for the grid search: 50 to 150 trees in steps of 20.
tuned_parameters = [{"n_estimators": list(range(50, 151, 20))}]

# 2-fold cross-validated grid search, scored by accuracy, run in one process.
clf = GridSearchCV(
    RandomForestClassifier(),
    tuned_parameters,
    cv=2,
    scoring='accuracy',
    n_jobs=1,
    verbose=10,
)

In [None]:
%%notify
clf.fit(data_train_s, label_train_s)

Fitting 2 folds for each of 6 candidates, totalling 12 fits
[CV] n_estimators=50 .................................................
[CV] ......... n_estimators=50, score=0.901155061888866, total= 2.1min
[CV] n_estimators=50 .................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  2.2min remaining:    0.0s


[CV] ........ n_estimators=50, score=0.9026321231278249, total= 2.3min
[CV] n_estimators=70 .................................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  4.6min remaining:    0.0s


[CV] ......... n_estimators=70, score=0.900505154943724, total= 2.9min


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:  7.6min remaining:    0.0s


[CV] n_estimators=70 .................................................
[CV] ........ n_estimators=70, score=0.9026025819030457, total= 3.0min
[CV] n_estimators=90 .................................................


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed: 10.7min remaining:    0.0s


# Character level + LogisticRegression

In [None]:
# Score the grid-searched model on the held-out split.
# accuracy_score accepts only the true and predicted labels (plus the optional
# normalize / sample_weight); it has no target_names argument, so passing one
# raises TypeError.
y_true, y_pred = label_test_s, clf.predict(data_test_s)
print(accuracy_score(y_true, y_pred))

In [None]:
y_ = [v.argmax() for v in y]

In [None]:
y_[:10]

In [None]:
# Same 95/5 split as before, but with the integer class labels in `y_`.
# A fixed random_state keeps the split reproducible across runs.
data_train_s, data_test_s, label_train_s, label_test_s = model_selection.train_test_split(
    dense.T, y_, test_size=0.05, random_state=0
)

In [None]:
estimator = LogisticRegression(verbose=10)

In [None]:
estimator.fit(data_train_s, label_train_s)

In [None]:
estimator.score(data_test_s, label_test_s)

In [None]:
# Grid over LogisticRegression's C parameter: 10**-2, 10**-1, ..., 10**3.
diparameter={"C": [10**i for i in range(-2,4)]}
# 2-fold cross-validated grid search scored by accuracy, run in a single process.
licv=GridSearchCV(LogisticRegression(),param_grid=diparameter, cv=2, scoring='accuracy', n_jobs=1,verbose=10)
licv.fit(data_train_s, label_train_s)
# Keep a handle on the best estimator found by the search.
# NOTE(review): `predictor` is not referenced by any later cell visible here.
predictor=licv.best_estimator_

In [None]:
# Per-class precision/recall/F1 for the grid-searched logistic regression.
# Labels come from argmax over the one-hot rows, so class 0 is negative
# ([1, 0]) and class 1 is positive ([0, 1]); target_names follows that order.
y_true, y_pred = label_test_s, licv.predict(data_test_s)
print(classification_report(y_true, y_pred, target_names=["neg","pos"], digits=4))

# BOW + random forest

In [None]:
pos_doc = []
neg_doc = []

In [None]:
# Tokenize every positive review (one per line) with the shared Tokenizer `t`.
# The encoding is given explicitly so the Japanese text is decoded the same
# way regardless of the platform's default locale.
with open("data/amazon_ja/pos.txt", encoding="utf-8") as f:
    pos_doc = [t.tokenize(doc) for doc in tqdm(f.readlines())]
print(pos_doc[:2])

In [None]:
# Tokenize every negative review (one per line) with the shared Tokenizer `t`.
# The encoding is given explicitly so the Japanese text is decoded the same
# way regardless of the platform's default locale.
with open("data/amazon_ja/neg.txt", encoding="utf-8") as f:
    neg_doc = [t.tokenize(doc) for doc in tqdm(f.readlines())]
print(neg_doc[:5])

In [None]:
d = corpora.Dictionary(pos_doc+neg_doc)

In [None]:
pos_bow = [d.doc2bow(doc) for doc  in tqdm(pos_doc)]

In [None]:
neg_bow = [d.doc2bow(doc) for doc  in tqdm(neg_doc)]

In [None]:
df = pd.DataFrame([len(b) for b in pos_bow+neg_bow],columns=["length"])

In [None]:
df.head()

In [None]:
df.describe()

In [None]:
len(d)

In [None]:
# One integer label per tokenized document: 1 for positive, 0 for negative.
pos_label = [1] * len(pos_doc)
neg_label = [0] * len(neg_doc)
print(len(pos_label))
print(len(neg_label))

In [None]:
label = pos_label + neg_label

In [None]:
len(label)

In [None]:
# corpus2dense already returns a 2-D NumPy array of shape (num_terms, num_docs);
# keep it as an array instead of expanding it into a Python list of rows, and
# pass num_docs so gensim can fill a preallocated array.
dense = matutils.corpus2dense(pos_bow+neg_bow, num_terms=len(d), num_docs=len(pos_bow) + len(neg_bow))

In [None]:
# asarray returns its input unchanged when it is already an ndarray, avoiding
# a second full copy of the (terms x documents) matrix; list input is still
# converted exactly as np.array would.
dense = np.asarray(dense)

In [None]:
dense.shape

In [None]:
# Hold out 5% of the bag-of-words rows for testing (documents are columns of
# `dense`, hence the transpose). A fixed random_state keeps the split
# reproducible across runs.
data_train_s, data_test_s, label_train_s, label_test_s = model_selection.train_test_split(
    dense.T, label, test_size=0.05, random_state=0
)

In [None]:
estimator = RandomForestClassifier(verbose=10)

In [None]:
estimator.fit(data_train_s, label_train_s)

In [None]:
estimator.score(data_test_s, label_test_s)

In [None]:
# Candidate forest sizes: 50 to 150 trees in steps of 20.
# (A max_features grid of 'auto', 'sqrt', 'log2', None is left disabled.)
tuned_parameters = [{'n_estimators': list(range(50, 151, 20))}]

# 2-fold cross-validated grid search, scored by accuracy, on 3 worker processes.
clf = GridSearchCV(
    RandomForestClassifier(),
    tuned_parameters,
    cv=2,
    scoring='accuracy',
    n_jobs=3,
    verbose=10,
)

In [None]:
clf.fit(data_train_s, label_train_s)

In [None]:
print("best param")
print(clf.best_estimator_)

In [None]:
# Per-candidate cross-validation summary. GridSearchCV.grid_scores_ was removed
# from scikit-learn; cv_results_ exposes the same information as parallel arrays.
# NOTE(review): the spread is printed as std / 2, as before; confirm that
# std * 2 was not what was intended.
cv_results = clf.cv_results_
for mean_score, std_score, params in zip(
    cv_results["mean_test_score"], cv_results["std_test_score"], cv_results["params"]
):
    print("{:.3f} (+/- {:.3f}) for {}".format(mean_score, std_score / 2, params))

In [None]:
# Per-class precision/recall/F1 on the held-out split.
# Labels are 0 for negative and 1 for positive, so target_names follows that order.
y_true, y_pred = label_test_s, clf.predict(data_test_s)
print(classification_report(y_true, y_pred,target_names=["neg","pos"]))

# Tf-Idf + random forest

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Raw review lines, positives first and negatives after them, which is the
# same order used to build `label`. The encoding is given explicitly so the
# Japanese text is decoded the same way regardless of the default locale.
with open("data/amazon_ja/pos.txt", encoding="utf-8") as f:
    raw_doc = f.readlines()
with open("data/amazon_ja/neg.txt", encoding="utf-8") as f:
    raw_doc += f.readlines()
len(raw_doc)

In [None]:
# TF-IDF features over the raw review lines, tokenized by the shared Tokenizer `t`.
# NOTE(review): the vectorizer is fit on every document before the train/test
# split performed in the next cell, so the held-out rows also contribute to the
# fitted vocabulary and weights; confirm this is intended.
vectorizer = TfidfVectorizer(tokenizer=t.tokenize)
train_matrix = vectorizer.fit_transform(raw_doc)

In [None]:
# Hold out 5% of the TF-IDF rows for testing. `label` is the pos-then-neg label
# list built for the bag-of-words section, which matches the order of raw_doc.
# A fixed random_state keeps the split reproducible across runs.
data_train_s, data_test_s, label_train_s, label_test_s = model_selection.train_test_split(
    train_matrix, label, test_size=0.05, random_state=0
)

In [None]:
estimator = RandomForestClassifier(verbose=10)

In [None]:
estimator.fit(data_train_s, label_train_s)

In [None]:
estimator.score(data_test_s, label_test_s)

In [None]:
# Candidate forest sizes: 50 to 150 trees in steps of 20.
# (A max_features grid of 'auto', 'sqrt', 'log2', None is left disabled.)
tuned_parameters = [{'n_estimators': list(range(50, 151, 20))}]

# 2-fold cross-validated grid search, scored by accuracy, on 3 worker processes.
clf = GridSearchCV(
    RandomForestClassifier(),
    tuned_parameters,
    cv=2,
    scoring='accuracy',
    n_jobs=3,
    verbose=10,
)

In [None]:
clf.fit(data_train_s, label_train_s)

In [None]:
print("best param")
print(clf.best_estimator_)

In [None]:
# Per-candidate cross-validation summary. GridSearchCV.grid_scores_ was removed
# from scikit-learn; cv_results_ exposes the same information as parallel arrays.
# NOTE(review): the spread is printed as std / 2, as before; confirm that
# std * 2 was not what was intended.
cv_results = clf.cv_results_
for mean_score, std_score, params in zip(
    cv_results["mean_test_score"], cv_results["std_test_score"], cv_results["params"]
):
    print("{:.3f} (+/- {:.3f}) for {}".format(mean_score, std_score / 2, params))

In [None]:
# Per-class precision/recall/F1 on the held-out split.
# Labels are 0 for negative and 1 for positive, so target_names follows that order.
y_true, y_pred = label_test_s, clf.predict(data_test_s)
print(classification_report(y_true, y_pred,target_names=["neg","pos"]))