In [4]:
from  urllib  import request
import logging
from pathlib import Path
import numpy as np
import pandas as pd
import re
import MeCab
from gensim import corpora, models,matutils
import random
from tqdm import tqdm_notebook as tqdm
from sklearn import model_selection
from sklearn.ensemble import RandomForestClassifier
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import classification_report

In [5]:
mecab = MeCab.Tagger("-Ochasen -d /usr/local/lib/mecab/dic/mecab-ipadic-neologd/")

In [6]:
res = request.urlopen("http://svn.sourceforge.jp/svnroot/slothlib/CSharp/Version1/SlothLib/NLP/Filter/StopWord/word/Japanese.txt")
stopwords = [line.decode("utf-8").strip() for line in res]
print(stopwords[:3])

['あそこ', 'あたり', 'あちら']


In [7]:
res = request.urlopen("http://svn.sourceforge.jp/svnroot/slothlib/CSharp/Version1/SlothLib/NLP/Filter/StopWord/word/English.txt")
stopwords += [line.decode("utf-8").strip() for line in res]
print(stopwords[-3:])

["you've", 'z', 'zero']


In [8]:
class Tokenizer:
    def __init__(self, stopwords, parser=None, include_pos=None, exclude_posdetail=None, exclude_reg=None):
    
        self.stopwords = stopwords
        self.include_pos = include_pos if include_pos else  ["名詞", "動詞", "形容詞"]
        self.exclude_posdetail = exclude_posdetail if exclude_posdetail else ["接尾", "数"]
        self.exclude_reg = exclude_reg if exclude_reg else r"$^"  # no matching reg
        if parser:
            self.parser = parser
        else:
            mecab = MeCab.Tagger("-Ochasen -d /usr/local/lib/mecab/dic/mecab-ipadic-neologd/")
            self.parser = mecab.parse
            

    def tokenize(self, text, show_pos=False):
        text = re.sub(r"https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+", "", text)    #URL
        text = re.sub(r"\"?([-a-zA-Z0-9.`?{}]+\.jp)\"?" ,"", text)  # xxx.jp 
        text = text.lower()
        l = [line.split("\t") for line in self.parser(text).split("\n")]
        res = [
            i[2] if not show_pos else (i[2],i[3]) for i in l 
                if len(i) >=4 # has POS.
                    and i[3].split("-")[0] in self.include_pos
                    and i[3].split("-")[1] not in self.exclude_posdetail
                    and not re.search(r"(-|−)\d", i[2])
                    and not re.search(self.exclude_reg, i[2])
                    and i[2] not in self.stopwords          
            ]
        return res

In [9]:
t = Tokenizer(stopwords, mecab.parse, exclude_reg=r"\d(年|月|日)")

In [10]:
t.tokenize("認めたくないものだな。自分自身の若さ故の過ちというものを。")

['認める', '自分自身', '若さ故の過ち']

# Character level + random forest

In [57]:
def load_data_and_labels(positive_data_file, negative_data_file, level="char", lang="En"):
       
    positive_examples = list(open(positive_data_file, "r").readlines())
    negative_examples = list(open(negative_data_file, "r").readlines())
    if level == "char":
        positive_examples = [s.replace(" ", "").replace("", " ").lower() for s in positive_examples]
        negative_examples = [s.replace(" ", "").replace("", " ").lower() for s in negative_examples]
    elif level == "word":
        if lang == "Ja":
            t = Tokenizer()
            positive_examples = [t.tokenize(s) for s in positive_examples]
            negative_examples = [t.tokenize(s) for s in negative_examples]
        else:
            positive_examples = [s.strip() for s in positive_examples]
            negative_examples = [s.strip() for s in negative_examples]
    else:
        print("invaid value of 'level'. ('char' or 'word') ")
        
    n_pos = len(positive_examples)
    n_neg = len(negative_examples)
    ratio = n_pos/n_neg
    print("# pos: ", n_pos)
    print("# neg: ", n_neg)
    print("pos/neg:", ratio)
    x_text = positive_examples + negative_examples

    positive_labels = [[0, 1] for _ in positive_examples]
    negative_labels = [[1, 0] for _ in negative_examples]
    y = np.concatenate([positive_labels, negative_labels], 0)
    
    return x_text, y, ratio

In [58]:
positive_data_file = "data/amazon_ja/pos.txt"
negative_data_file = "data/amazon_ja/neg.txt"

x_text, y, ratio = load_data_and_labels(positive_data_file, negative_data_file, level="char", lang="Ja")

# pos:  46363
# neg:  6026
pos/neg: 7.693826750746764


In [64]:
x_text[0]

' ク ロ ッ ク ス の 偽 物 サ ン ダ ル は 今 や ど こ に で も 売 っ て い ま す が 、 昔 買 っ た も の は け っ こ う 丈 夫 に 出 来 て い て 長 く 履 け た の で す が 、 最 近 の 物 は 底 が 柔 ら か く 、 石 な ど を 踏 ん で し ま う と 足 の 裏 が 痛 い し 、 す ぐ に 底 が す り 減 っ て 履 け な く な っ て し ま う 。 安 物 買 い を や め て 、 正 規 の ク ロ ッ ク ス 公 式 品 を 買 っ て み ま し た が 、 や は り 底 が 頑 丈 で 履 き 心 地 が い い 。 底 が 柔 ら か く て ふ に ゃ ふ に ゃ の ク ッ シ ョ ン 性 を 求 め て い る 人 に は 、 固 く て 満 足 出 来 な い か も し れ ま せ ん が 、 散 歩 や 買 い 物 に 行 く 時 な ど サ ン ダ ル を 普 段 履 き す る 自 分 に と っ て は こ の く ら い し っ か り し て い な い と 履 け た も の で は な い で す 。 満 足 の い く 買 い 物 が 出 来 ま し た 。 \n '

In [65]:
x_text_sp = [doc[:-2].split() for doc in x_text]

In [67]:
d = corpora.Dictionary(x_text_sp)

In [70]:
d[1000]

'寸'

In [73]:
# bag of char
boc = [d.doc2bow(doc) for doc  in tqdm(x_text_sp)]




In [74]:
df = pd.DataFrame([len(b) for b in boc],columns=["length"])

In [75]:
df.head()

Unnamed: 0,length
0,91
1,66
2,48
3,91
4,98


In [76]:
df.describe()

Unnamed: 0,length
count,52389.0
mean,62.661417
std,39.216156
min,1.0
25%,37.0
50%,53.0
75%,76.0
max,730.0


In [77]:
len(d)

3626

In [79]:
dense = list(matutils.corpus2dense(boc,  num_terms=len(d)))

In [80]:
dense = np.array(dense)

In [81]:
dense.shape

(3626, 52389)

In [83]:
data_train_s, data_test_s, label_train_s, label_test_s = model_selection.train_test_split(dense.T, y, test_size=0.05)

In [84]:
estimator = RandomForestClassifier(verbose=10)

In [85]:
estimator.fit(data_train_s, label_train_s)

building tree 1 of 10


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.7s remaining:    0.0s


building tree 2 of 10


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    3.7s remaining:    0.0s


building tree 3 of 10


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    5.4s remaining:    0.0s


building tree 4 of 10


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    7.1s remaining:    0.0s


building tree 5 of 10


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    8.7s remaining:    0.0s


building tree 6 of 10


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:   10.6s remaining:    0.0s


building tree 7 of 10


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:   12.7s remaining:    0.0s


building tree 8 of 10


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:   14.8s remaining:    0.0s


building tree 9 of 10


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:   16.7s remaining:    0.0s


building tree 10 of 10


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:   18.4s finished


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=10,
            warm_start=False)

In [86]:
estimator.score(data_test_s, label_test_s)

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.0s finished


0.8961832061068702

In [90]:
tuned_parameters = [{'n_estimators': [50, 70, 90, 110, 130, 150]}]#, 'max_features': ['auto', 'sqrt', 'log2', None]}]

clf = GridSearchCV(RandomForestClassifier(), tuned_parameters, cv=2, scoring='accuracy', n_jobs=3,verbose=10)

In [91]:
clf.fit(data_train_s, label_train_s)

Fitting 2 folds for each of 6 candidates, totalling 12 fits
[CV] n_estimators=50 .................................................
[CV] n_estimators=50 .................................................
[CV] n_estimators=70 .................................................
[CV] ........................ n_estimators=50, score=0.898891 -  49.6s
[CV] n_estimators=70 .................................................
[CV] ........................ n_estimators=50, score=0.899618 -  52.6s
[CV] n_estimators=90 .................................................


[Parallel(n_jobs=3)]: Done   2 tasks      | elapsed:   56.9s


[CV] ........................ n_estimators=70, score=0.899658 - 1.1min
[CV] n_estimators=90 .................................................
[CV] ........................ n_estimators=70, score=0.899735 - 1.0min
[CV] n_estimators=110 ................................................
[CV] ........................ n_estimators=90, score=0.899578 - 1.3min
[CV] n_estimators=110 ................................................
[CV] ........................ n_estimators=90, score=0.899212 - 1.3min
[CV] n_estimators=130 ................................................
[CV] ....................... n_estimators=110, score=0.899377 - 1.6min
[CV] n_estimators=130 ................................................


[Parallel(n_jobs=3)]: Done   7 tasks      | elapsed:  3.6min


[CV] ....................... n_estimators=110, score=0.899052 - 1.6min
[CV] n_estimators=150 ................................................
[CV] ....................... n_estimators=130, score=0.899739 - 2.0min
[CV] n_estimators=150 ................................................


[Parallel(n_jobs=3)]: Done   9 out of  12 | elapsed:  4.5min remaining:  1.5min


[CV] ....................... n_estimators=130, score=0.898931 - 1.8min
[CV] ....................... n_estimators=150, score=0.899297 - 2.2min
[CV] ....................... n_estimators=150, score=0.899574 - 2.0min


[Parallel(n_jobs=3)]: Done  12 out of  12 | elapsed:  6.5min finished


GridSearchCV(cv=2, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params={}, iid=True, n_jobs=3,
       param_grid=[{'n_estimators': [50, 70, 90, 110, 130, 150]}],
       pre_dispatch='2*n_jobs', refit=True, scoring='accuracy', verbose=10)

In [92]:
y_true, y_pred = label_test_s, clf.predict(data_test_s)
print(classification_report(y_true, y_pred,target_names=["nag","pos"]))

             precision    recall  f1-score   support

        nag       0.96      0.24      0.39       330
        pos       0.90      1.00      0.95      2290

avg / total       0.91      0.90      0.88      2620



# BOW + random forest

In [11]:
pos_doc = []
neg_doc = []

In [12]:
with open("data/amazon_ja/pos.txt") as f:
    pos_doc = [t.tokenize(doc) for doc in tqdm(f.readlines())]
print(pos_doc[:2])


[['クロックス', '偽物', 'サンダル', '売る', 'いる', '昔', '買う', '丈夫', '出来る', 'いる', '長い', '履ける', 'の', '最近', '物', '底', '柔らかい', '石', '踏む', 'しまう', '足', '裏', '痛い', '底', 'すり減る', '履ける', 'なる', 'しまう', '安物', 'やめる', '正規', 'crocs', '公式', '買う', 'みる', '底', '頑丈', '履く', 'いい', '底', '柔らかい', 'クッション性', '求める', 'いる', '固い', '満足', '出来る', 'しれる', '散歩', '買い物', '行く', 'サンダル', '普段', '履く', 'する', 'する', 'いる', '履ける', '満足', 'いく', '買い物', '出来る'], ['セール', '切る', '価格', '買う', 'しまう', '見た目', 'baya', '近い', '履く', 'クラッシック', 'あいかわる', '大きめ', '用途', '洗車', '突っ掛け', '代わり', 'ぴったり']]


In [13]:
with open("data/amazon_ja/neg.txt") as f:
    neg_doc = [t.tokenize(doc) for doc in tqdm(f.readlines())]
print(neg_doc[:5])


[['うい', 'ん', '表記', 'サイズ', '左右', '明らか', '小さい', '届く', 'サイズ', '丁度', '良い', '小さい', '日本製'], ['5cm', 'サイズ', '靴', 'スニーカー', '履く', 'いる', '25cm', '購入', 'する', '人差し指', '1本', '1cm', '大きい', '0cm', '指', '当たる', '思う', '5cm', '欲しい', '小走り', '無理', '幅広', 'サイズ', '思う', '横', '幅', '当たる', 'いる', '街', '安い', '買える', '返品', '勿体ない', '残念', '車', '運転', 'なるい'], ['Amazon', '販売', '発送', 'する', 'ページ', '購入', '履く', 'つま先', 'かかる', '体勢', '足', '激痛', '痛む', '場所', '昔', '釘', '踏む', '足', '残る', 'てる', 'せい', '思う', '先日', '友人', '履く', '痛い', '言う', '調べる', '金属片', '混入', '裏', 'くる', '曲げる', '確認', 'する', '貫通', 'する', '無い', 'トゲ', '抜き', '抜ける', '写真', 'カッター', 'ゴム', '切る', 'いく', '抜ける', '完全', 'ゴム', '混入', 'する'], ['公式サイト', '購入', 'する', '正規', '小さい', 'Amazon', '購入', 'する', 'Amazon', '販売', '発送', '偽物', '思える', 'の'], ['小さい', 'なる', '印象', '使う', 'いる', 'サイズ', '購入', 'する', 'きつい', '履く', 'いる', '伸びる', 'の', 'ない']]


In [16]:
d = corpora.Dictionary(pos_doc+neg_doc)

In [17]:
pos_bow = [d.doc2bow(doc) for doc  in tqdm(pos_doc)]




In [18]:
neg_bow = [d.doc2bow(doc) for doc  in tqdm(neg_doc)]




In [19]:
df = pd.DataFrame([len(b) for b in pos_bow+neg_bow],columns=["length"])

In [20]:
df.head()

Unnamed: 0,length
0,44
1,17
2,14
3,28
4,38


In [21]:
df.describe()

Unnamed: 0,length
count,48967.0
mean,22.091266
std,21.957886
min,0.0
25%,10.0
50%,17.0
75%,26.0
max,803.0


In [24]:
len(d)

46858

In [25]:
pos_label = [1 for b in pos_doc]
neg_label = [0 for b in neg_doc]
print(len(pos_label))
print(len(neg_label))

43179
5788


In [26]:
label = pos_label + neg_label

In [27]:
len(label)

48967

In [44]:
dense = list(matutils.corpus2dense(pos_bow+neg_bow,  num_terms=len(d)))

In [45]:
dense = np.array(dense)

In [46]:
dense.shape

(46858, 48967)

In [47]:
data_train_s, data_test_s, label_train_s, label_test_s = model_selection.train_test_split(dense.T, label, test_size=0.05)

In [48]:
estimator = RandomForestClassifier(verbose=10)

In [49]:
estimator.fit(data_train_s, label_train_s)

building tree 1 of 10


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  1.7min remaining:    0.0s


building tree 2 of 10


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  3.5min remaining:    0.0s


building tree 3 of 10


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:  5.2min remaining:    0.0s


building tree 4 of 10


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:  7.4min remaining:    0.0s


building tree 5 of 10


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  9.6min remaining:    0.0s


building tree 6 of 10


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed: 12.0min remaining:    0.0s


building tree 7 of 10


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed: 14.2min remaining:    0.0s


building tree 8 of 10


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed: 16.5min remaining:    0.0s


building tree 9 of 10


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed: 18.8min remaining:    0.0s


building tree 10 of 10


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed: 21.1min finished


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=10,
            warm_start=False)

In [50]:
estimator.score(data_test_s, label_test_s)

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.2s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    0.2s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.3s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:    0.3s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:    0.4s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:    0.4s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:    0.4s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.5s finished


0.9158840342997142

In [51]:
tuned_parameters = [{'n_estimators': [50, 70, 90, 110, 130, 150]}]#, 'max_features': ['auto', 'sqrt', 'log2', None]}]

clf = GridSearchCV(RandomForestClassifier(), tuned_parameters, cv=2, scoring='accuracy', n_jobs=3,verbose=10)

In [52]:
clf.fit(data_train_s, label_train_s)

Fitting 2 folds for each of 6 candidates, totalling 12 fits
[CV] n_estimators=50 .................................................
[CV] n_estimators=50 .................................................
[CV] n_estimators=70 .................................................
[CV] ........................ n_estimators=50, score=0.909368 -62.3min
[CV] n_estimators=70 .................................................
[CV] ........................ n_estimators=50, score=0.907520 -66.0min
[CV] n_estimators=90 .................................................


[Parallel(n_jobs=3)]: Done   2 tasks      | elapsed: 68.7min


[CV] ........................ n_estimators=70, score=0.909024 -89.6min
[CV] n_estimators=90 .................................................
[CV] ........................ n_estimators=70, score=0.909368 -90.6min
[CV] n_estimators=110 ................................................
[CV] ....................... n_estimators=90, score=0.908938 -114.2min
[CV] n_estimators=110 ................................................
[CV] ....................... n_estimators=90, score=0.908552 -118.4min
[CV] n_estimators=130 ................................................
[CV] ...................... n_estimators=110, score=0.908895 -133.8min


[Parallel(n_jobs=3)]: Done   7 tasks      | elapsed: 288.6min


[CV] n_estimators=130 ................................................
[CV] ...................... n_estimators=110, score=0.907907 -137.6min
[CV] n_estimators=150 ................................................
[CV] ...................... n_estimators=130, score=0.908981 -156.0min


[Parallel(n_jobs=3)]: Done   9 out of  12 | elapsed: 368.0min remaining: 122.7min


[CV] n_estimators=150 ................................................
[CV] ...................... n_estimators=130, score=0.908294 -159.3min
[CV] ...................... n_estimators=150, score=0.907864 -160.7min
[CV] ...................... n_estimators=150, score=0.907434 -135.6min


[Parallel(n_jobs=3)]: Done  12 out of  12 | elapsed: 503.6min finished


GridSearchCV(cv=2, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params={}, iid=True, n_jobs=3,
       param_grid=[{'n_estimators': [50, 70, 90, 110, 130, 150]}],
       pre_dispatch='2*n_jobs', refit=True, scoring='accuracy', verbose=10)

In [53]:
print("best param")
print(clf.best_estimator_)

best param
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=70, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)


In [54]:
for params, mean_score, all_scores in clf.grid_scores_:
        print("{:.3f} (+/- {:.3f}) for {}".format(mean_score, all_scores.std() / 2, params))

0.908 (+/- 0.000) for {'n_estimators': 50}
0.909 (+/- 0.000) for {'n_estimators': 70}
0.909 (+/- 0.000) for {'n_estimators': 90}
0.908 (+/- 0.000) for {'n_estimators': 110}
0.909 (+/- 0.000) for {'n_estimators': 130}
0.908 (+/- 0.000) for {'n_estimators': 150}


In [55]:
y_true, y_pred = label_test_s, clf.predict(data_test_s)
print(classification_report(y_true, y_pred,target_names=["nag","pos"]))

             precision    recall  f1-score   support

        nag       0.94      0.34      0.50       278
        pos       0.92      1.00      0.96      2171

avg / total       0.92      0.92      0.91      2449



# Tf-Idf + random forest

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [28]:
with open("data/amazon_ja/pos.txt") as f:
    raw_doc = f.readlines()
with open("data/amazon_ja/neg.txt") as f:
    raw_doc += f.readlines()
len(raw_doc)

48967

In [32]:
vectorizer = TfidfVectorizer(tokenizer=t.tokenize)
train_matrix = vectorizer.fit_transform(raw_doc)

In [35]:
#data_train_s, data_test_s, label_train_s, label_test_s = model_selection.train_test_split(dense.T, label, test_size=0.1)
data_train_s, data_test_s, label_train_s, label_test_s = model_selection.train_test_split(train_matrix, label, test_size=0.05)

In [36]:
estimator = RandomForestClassifier(verbose=10)

In [37]:
estimator.fit(data_train_s, label_train_s)

[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:   44.8s finished


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=True,
            warm_start=False)

In [38]:
estimator.score(data_test_s, label_test_s)

[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.1s finished


0.9089424254797877

In [39]:
tuned_parameters = [{'n_estimators': [50, 70, 90, 110, 130, 150]}]#, 'max_features': ['auto', 'sqrt', 'log2', None]}]

clf = GridSearchCV(RandomForestClassifier(), tuned_parameters, cv=2, scoring='accuracy', n_jobs=3,verbose=10)

In [40]:
clf.fit(data_train_s, label_train_s)

Fitting 2 folds for each of 6 candidates, totalling 12 fits
[CV] n_estimators=50 .................................................
[CV] n_estimators=50 .................................................
[CV] n_estimators=70 .................................................
[CV] ........................ n_estimators=50, score=0.905972 - 2.4min
[CV] n_estimators=70 .................................................
[CV] ........................ n_estimators=50, score=0.908079 - 2.4min
[CV] n_estimators=90 .................................................


[Parallel(n_jobs=3)]: Done   2 tasks      | elapsed:  2.5min


[CV] ........................ n_estimators=70, score=0.907993 - 3.4min
[CV] n_estimators=90 .................................................
[CV] ........................ n_estimators=70, score=0.905284 - 3.1min
[CV] n_estimators=110 ................................................
[CV] ........................ n_estimators=90, score=0.907520 - 4.3min
[CV] n_estimators=110 ................................................
[CV] ........................ n_estimators=90, score=0.906445 - 4.1min
[CV] n_estimators=130 ................................................
[CV] ....................... n_estimators=110, score=0.907477 - 5.2min
[CV] n_estimators=130 ................................................


[Parallel(n_jobs=3)]: Done   7 tasks      | elapsed: 10.8min


[CV] ....................... n_estimators=110, score=0.907391 - 5.3min
[CV] n_estimators=150 ................................................
[CV] ....................... n_estimators=130, score=0.907821 - 6.2min
[CV] n_estimators=150 ................................................


[Parallel(n_jobs=3)]: Done   9 out of  12 | elapsed: 13.7min remaining:  4.6min


[CV] ....................... n_estimators=130, score=0.905585 - 6.0min
[CV] ....................... n_estimators=150, score=0.907563 - 7.0min
[CV] ....................... n_estimators=150, score=0.906316 - 6.6min


[Parallel(n_jobs=3)]: Done  12 out of  12 | elapsed: 20.3min finished


GridSearchCV(cv=2, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params={}, iid=True, n_jobs=3,
       param_grid=[{'n_estimators': [50, 70, 90, 110, 130, 150]}],
       pre_dispatch='2*n_jobs', refit=True, scoring='accuracy', verbose=10)

In [41]:
print("best param")
print(clf.best_estimator_)

best param
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=110, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)


In [42]:
for params, mean_score, all_scores in clf.grid_scores_:
        print("{:.3f} (+/- {:.3f}) for {}".format(mean_score, all_scores.std() / 2, params))

0.907 (+/- 0.001) for {'n_estimators': 50}
0.907 (+/- 0.001) for {'n_estimators': 70}
0.907 (+/- 0.000) for {'n_estimators': 90}
0.907 (+/- 0.000) for {'n_estimators': 110}
0.907 (+/- 0.001) for {'n_estimators': 130}
0.907 (+/- 0.000) for {'n_estimators': 150}


In [43]:
y_true, y_pred = label_test_s, clf.predict(data_test_s)
print(classification_report(y_true, y_pred,target_names=["nag","pos"]))

             precision    recall  f1-score   support

        nag       0.88      0.27      0.42       286
        pos       0.91      0.99      0.95      2163

avg / total       0.91      0.91      0.89      2449

