In [2]:
import logging
from pathlib import Path
import numpy as np
import pandas as pd
import re
from gensim import corpora, models,matutils
import random
from tqdm import tqdm_notebook as tqdm
import sys

import notebookutil as nbu
sys.meta_path.append(nbu.NotebookFinder())
from util import load_data_and_labels



from sklearn import model_selection
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV

%load_ext jupyternotify

# Character level + random forest

In [3]:
# Input files for the two-class task; both paths are handed to load_data_and_labels.
positive_data_file = "data/amazon_ja/pos.txt"
negative_data_file = "data/amazon_ja/neg.txt"

In [4]:
# Paths r_1.txt ... r_5.txt; only referenced by the commented-out multiclass loader call.
files = ["data/amazon_ja/r_{}.txt".format(i) for i in range(1,6)]

In [5]:
%%notify
# Tokenization granularity passed to the loader: "char" in this section, "word" in the BOW section.
level = "char"
# Returns the tokenized texts, per-document label vectors (reduced with argmax later on)
# and a third value bound to `ratio`; the loader itself prints the class counts.
x_text, y, ratio = load_data_and_labels(positive_data_file, negative_data_file, level=level, lang="Ja")
# Multiclass alternative, not used (load_data_and_labels_multiclass is not imported in the visible cells):
#   x_text, y = load_data_and_labels_multiclass(files, level="char", lang="Ja")

# pos:  62402
# neg:  9060
pos/neg: 6.887637969094922


<IPython.core.display.Javascript object>

In [6]:
x_text[0]

' 書 き 込 み 、 読 み 出 し 、 転 送 速 度 、 い ず れ も 満 足 で す 。 1 6 0 0 万 画 素 の コ ン パ ク ト カ メ ラ タ イ プ の デ ジ カ メ に 入 れ て 撮 影 に 使 い 、 撮 影 後 は カ ー ド リ ー ダ に 接 続 し て 、 撮 影 し た 膨 大 な 量 の 画 像 デ ー タ を サ ム ネ イ ル 表 示 に し た り 、 ピ ッ ク ア ッ プ し た 画 像 を コ ピ ペ し た り し て い ま す が 、 と く に ス ト レ ス を 感 じ る こ と な く 、 快 適 に 使 え て い ま す 。 【 a m a z o n . c o . j p 限 定 】 の 個 体 は 、 s d カ ー ド 本 体 が シ ン プ ル な 小 さ い ボ ー ル 紙 に 挟 ま れ て い る だ け 。 梱 包 は 超 シ ン プ ル で す が 、 実 売 価 格 が 安 く 、 性 能 に も 満 足 出 来 て い る の で 買 っ て 良 か っ た と 思 っ て い ま す 。 耐 久 性 は わ か り ま せ ん 。 そ こ は 要 経 過 観 察 で す ね 。 \n '

In [7]:
# Split every document into its whitespace-separated character tokens.
# str.split() with no argument already ignores leading/trailing whitespace
# (including the trailing "\n "), so no fixed-width slice is needed. The former
# doc[:-2] would silently drop a real character from any document that does
# not end in exactly two whitespace characters.
x_text_sp = [doc.split() for doc in x_text]

In [8]:
%%notify
# Build the gensim token <-> integer-id mapping over all tokenized documents.
d = corpora.Dictionary(x_text_sp)

<IPython.core.display.Javascript object>

In [9]:
d[100]

'影'

In [10]:
# Bag of characters: convert each tokenized document with the dictionary's
# doc2bow, showing a progress bar while iterating.
boc = list(map(d.doc2bow, tqdm(x_text_sp)))




In [11]:
# One row per document: number of entries in its bag-of-characters vector.
df = pd.DataFrame({"length": [len(entry) for entry in boc]})

In [12]:
df.head()

Unnamed: 0,length
0,140
1,25
2,57
3,71
4,72


In [13]:
df.describe()

Unnamed: 0,length
count,71462.0
mean,62.733201
std,39.017601
min,1.0
25%,37.0
50%,53.0
75%,77.0
max,730.0


In [14]:
len(d)

3794

In [None]:
%%notify
dense = list(matutils.corpus2dense(boc,  num_terms=len(d)))

In [None]:
# corpus2dense is laid out terms x documents; flip to documents x terms for scikit-learn.
# Caution: this rebinds `dense`, so re-running only this cell transposes it back.
dense = np.asarray(dense).T

In [None]:
dense.shape

In [None]:
%%notify
data_train_s, data_test_s, label_train_s, label_test_s = model_selection.train_test_split(dense, y, test_size=0.05)

In [None]:
estimator = RandomForestClassifier(verbose=10)

In [None]:
%%notify
estimator.fit(data_train_s, label_train_s)

In [None]:
estimator.score(data_test_s, label_test_s)

In [None]:
# Candidate forest sizes: 50 to 150 trees in steps of 20.
tuned_parameters = [{"n_estimators": list(range(50, 151, 20))}]

# 2-fold cross-validated grid search scored by accuracy, run in a single process.
clf = GridSearchCV(
    estimator=RandomForestClassifier(),
    param_grid=tuned_parameters,
    scoring="accuracy",
    cv=2,
    n_jobs=1,
    verbose=10,
)

In [None]:
%%notify
clf.fit(data_train_s, label_train_s)

In [None]:
# Score the tuned forest on the held-out split.
y_true, y_pred = label_test_s, clf.predict(data_test_s)
# accuracy_score accepts only the labels (plus normalize / sample_weight);
# it has no target_names parameter, so passing one raised a TypeError.
print(accuracy_score(y_true, y_pred))

# Character level + LogisticRegression

In [None]:
# Collapse each label vector in y to the index of its largest entry.
y_ = list(map(np.argmax, y))

In [None]:
y_[:10]

In [None]:
# Hold out 5% of the documents, using the integer class ids in y_.
# random_state is fixed so the split and the reported scores are reproducible.
data_train_s, data_test_s, label_train_s, label_test_s = model_selection.train_test_split(
    dense, y_, test_size=0.05, random_state=0
)

In [None]:
estimator = LogisticRegression(verbose=10)

In [None]:
estimator.fit(data_train_s, label_train_s)

In [None]:
estimator.score(data_test_s, label_test_s)

In [None]:
# Regularization strengths on a log grid: 10**-2 through 10**3.
diparameter = {"C": [10 ** exponent for exponent in range(-2, 4)]}

# 2-fold cross-validated grid search over C, scored by accuracy, single process.
licv = GridSearchCV(
    LogisticRegression(),
    param_grid=diparameter,
    cv=2,
    scoring="accuracy",
    n_jobs=1,
    verbose=10,
)
licv.fit(data_train_s, label_train_s)

# Keep a handle on the estimator refit with the best C.
predictor = licv.best_estimator_

In [None]:
# Preview the tuned logistic-regression predictions for the first ten held-out
# documents. The previous `y_pred` at this point was still the random-forest
# output from the earlier section, so it showed the wrong model's predictions.
licv.predict(data_test_s[:10]).tolist()

In [None]:
# First ten held-out labels of the current split. `y_true` at this point was
# still bound to the labels of the earlier random-forest split.
label_test_s[:10]

In [None]:
# Held-out labels and the tuned logistic-regression predictions for the current split.
y_true, y_pred = label_test_s, licv.predict(data_test_s)
# Per-class report, currently disabled:
#print(classification_report(y_true, y_pred, target_names=["nag","pos"], digits=4))

# BOW 

In [15]:
%%notify
# Reload the same two files tokenized at word level; this rebinds x_text, y and ratio
# from the character-level section.
level = "word" 
x_text, y, ratio = load_data_and_labels(positive_data_file, negative_data_file, level=level, lang="Ja")

Japanese stopword:  あそこ, あたり, あちら ...
English stopword: ... you've, z, zero
# pos:  62402
# neg:  9060
pos/neg: 6.887637969094922


<IPython.core.display.Javascript object>

In [16]:
d = corpora.Dictionary(x_text)

In [17]:
len(d)

56271

In [18]:
# Bag of words: convert each tokenized document with the dictionary's doc2bow,
# showing a progress bar while iterating.
bow = list(map(d.doc2bow, tqdm(x_text)))




In [19]:
# One row per document: number of entries in its bag-of-words vector.
df = pd.DataFrame({"length": [len(entry) for entry in bow]})

In [20]:
df.head()

Unnamed: 0,length
0,51
1,6
2,17
3,27
4,29


In [21]:
df.describe()

Unnamed: 0,length
count,71462.0
mean,21.827671
std,21.068867
min,0.0
25%,10.0
50%,16.0
75%,26.0
max,803.0


In [22]:
# Collapse each label vector in y to the index of its largest entry.
label = list(map(np.argmax, y))

In [23]:
len(label)

71462

In [24]:
# Materialize the sparse corpus as a (num_terms, num_docs) ndarray.
# Passing num_docs lets gensim pre-allocate the result, and dropping the list()
# wrapper avoids breaking the array into a Python list of rows that the next
# cell has to copy back together - significant for a matrix this large.
# NOTE(review): at 71462 x 56271 a dense matrix is very large; consider
# matutils.corpus2csc if the downstream estimators can take sparse input.
dense = matutils.corpus2dense(bow, num_terms=len(d), num_docs=len(bow))

In [25]:
# corpus2dense is laid out terms x documents; flip to documents x terms for scikit-learn.
# Caution: this rebinds `dense`, so re-running only this cell transposes it back.
dense = np.asarray(dense).T

In [26]:
dense.shape

(71462, 56271)

# BOW + random forest

In [None]:
# Hold out 5% of the documents for evaluation.
# random_state is fixed so the split and the reported scores are reproducible.
data_train_s, data_test_s, label_train_s, label_test_s = model_selection.train_test_split(
    dense, label, test_size=0.05, random_state=0
)

In [None]:
estimator = RandomForestClassifier(verbose=10)

In [None]:
estimator.fit(data_train_s, label_train_s)

In [None]:
estimator.score(data_test_s, label_test_s)

In [None]:
# Candidate forest sizes: 50 to 150 trees in steps of 20.
# (A max_features sweep over 'auto', 'sqrt', 'log2', None was considered but is not enabled.)
tuned_parameters = [{"n_estimators": list(range(50, 151, 20))}]

# 2-fold cross-validated grid search scored by accuracy, run in a single process.
clf = GridSearchCV(
    estimator=RandomForestClassifier(),
    param_grid=tuned_parameters,
    scoring="accuracy",
    cv=2,
    n_jobs=1,
    verbose=10,
)

In [None]:
clf.fit(data_train_s, label_train_s)

In [None]:
# Show the estimator refit with the best grid point, under a heading line.
print("best param", clf.best_estimator_, sep="\n")

In [None]:
# Per-candidate cross-validation summary.
# `grid_scores_` was removed in scikit-learn 0.20; `cv_results_` (available
# since 0.18) exposes the same mean / std / params per grid point.
cv_results = clf.cv_results_
for mean_score, std_score, params in zip(
    cv_results["mean_test_score"],
    cv_results["std_test_score"],
    cv_results["params"],
):
    # Half the fold standard deviation, matching what this cell printed before.
    print("{:.3f} (+/- {:.3f}) for {}".format(mean_score, std_score / 2, params))

In [None]:
# Per-class precision / recall / F1 of the tuned forest on the held-out split.
y_true, y_pred = label_test_s, clf.predict(data_test_s)
# "nag" was a typo for the negative class in the printed report.
# NOTE(review): assumes class id 0 is the negative class - confirm against load_data_and_labels.
print(classification_report(y_true, y_pred, target_names=["neg", "pos"]))

# BOW + LogisticRegression

In [27]:
# Integer class id per document, taken from the label vectors in y.
# The previous version iterated over `label`, whose entries are already scalar
# ids; np.argmax of a scalar is always 0, so every entry of y_ came out as 0.
y_ = [np.argmax(v) for v in y]

In [28]:
y_[:10]

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

In [34]:
# Hold out 5% of the documents for evaluation.
# random_state is fixed so the split and the reported score are reproducible.
data_train_s, data_test_s, label_train_s, label_test_s = model_selection.train_test_split(
    dense, label, test_size=0.05, random_state=0
)

In [35]:
estimator = LogisticRegression(verbose=10)

In [37]:
estimator.fit(data_train_s, label_train_s)

[LibLinear]

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=10, warm_start=False)

In [38]:
estimator.score(data_test_s, label_test_s)

0.9328483491885842

In [None]:
# Regularization strengths on a log grid: 10**-2 through 10**3.
diparameter = {"C": [10 ** exponent for exponent in range(-2, 4)]}

# 2-fold cross-validated grid search over C, scored by accuracy, single process.
licv = GridSearchCV(
    LogisticRegression(),
    param_grid=diparameter,
    cv=2,
    scoring="accuracy",
    n_jobs=1,
    verbose=10,
)
licv.fit(data_train_s, label_train_s)

# Keep a handle on the estimator refit with the best C.
predictor = licv.best_estimator_

# Tf-Idf + random forest

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Read the raw lines of both input files, positive file first, then negative.
# Uses the path variables from the configuration cell instead of repeating the
# literals, so the two cannot drift apart.
# NOTE(review): no explicit encoding is given, so the platform default is used - confirm.
# NOTE(review): the line order must match the order of `label` - verify against load_data_and_labels.
raw_doc = []
for path in (positive_data_file, negative_data_file):
    with open(path) as f:
        raw_doc.extend(f.readlines())
len(raw_doc)

In [None]:
# NOTE(review): `t` is not defined in any cell visible in this notebook; an object
# exposing a `.tokenize` method must be created before this cell can run - confirm its origin.
vectorizer = TfidfVectorizer(tokenizer=t.tokenize)
# Fit the vectorizer on the raw lines and keep the resulting document-term matrix.
train_matrix = vectorizer.fit_transform(raw_doc)

In [None]:
train_matrix.shape

In [None]:
train_matrix[0]

In [None]:
# Hold out 5% of the TF-IDF rows for evaluation.
# random_state is fixed so the split and the reported scores are reproducible.
data_train_s, data_test_s, label_train_s, label_test_s = model_selection.train_test_split(
    train_matrix, label, test_size=0.05, random_state=0
)

In [None]:
estimator = RandomForestClassifier(verbose=10)

In [None]:
estimator.fit(data_train_s, label_train_s)

In [None]:
estimator.score(data_test_s, label_test_s)

In [None]:
# Candidate forest sizes: 50 to 150 trees in steps of 20.
# (A max_features sweep over 'auto', 'sqrt', 'log2', None was considered but is not enabled.)
tuned_parameters = [{"n_estimators": list(range(50, 151, 20))}]

# 2-fold cross-validated grid search scored by accuracy, using three worker processes.
clf = GridSearchCV(
    estimator=RandomForestClassifier(),
    param_grid=tuned_parameters,
    scoring="accuracy",
    cv=2,
    n_jobs=3,
    verbose=10,
)

In [None]:
clf.fit(data_train_s, label_train_s)

In [None]:
# Show the estimator refit with the best grid point, under a heading line.
print("best param", clf.best_estimator_, sep="\n")

In [None]:
# Per-candidate cross-validation summary.
# `grid_scores_` was removed in scikit-learn 0.20; `cv_results_` (available
# since 0.18) exposes the same mean / std / params per grid point.
cv_results = clf.cv_results_
for mean_score, std_score, params in zip(
    cv_results["mean_test_score"],
    cv_results["std_test_score"],
    cv_results["params"],
):
    # Half the fold standard deviation, matching what this cell printed before.
    print("{:.3f} (+/- {:.3f}) for {}".format(mean_score, std_score / 2, params))

In [None]:
# Per-class precision / recall / F1 of the tuned forest on the held-out split.
y_true, y_pred = label_test_s, clf.predict(data_test_s)
# "nag" was a typo for the negative class in the printed report.
# NOTE(review): assumes class id 0 is the negative class - confirm against load_data_and_labels.
print(classification_report(y_true, y_pred, target_names=["neg", "pos"]))

# Tf-Idf + LogisticRegression

In [None]:
# Integer class id per document, taken from the label vectors in y.
# The previous version iterated over `label`, whose entries are already scalar
# ids; np.argmax of a scalar is always 0, so every entry of y_ came out as 0.
y_ = [np.argmax(v) for v in y]

In [None]:
# Hold out 5% of the TF-IDF rows for evaluation.
# random_state is fixed so the split and the reported scores are reproducible.
data_train_s, data_test_s, label_train_s, label_test_s = model_selection.train_test_split(
    train_matrix, label, test_size=0.05, random_state=0
)

In [None]:
estimator = LogisticRegression(verbose=10)

In [None]:
estimator.fit(data_train_s, label_train_s)

In [None]:
estimator.score(data_test_s, label_test_s)

In [None]:
# Regularization strengths on a log grid: 10**-2 through 10**3.
diparameter = {"C": [10 ** exponent for exponent in range(-2, 4)]}

# 2-fold cross-validated grid search over C, scored by accuracy, single process.
licv = GridSearchCV(
    LogisticRegression(),
    param_grid=diparameter,
    cv=2,
    scoring="accuracy",
    n_jobs=1,
    verbose=10,
)
licv.fit(data_train_s, label_train_s)

# Keep a handle on the estimator refit with the best C.
predictor = licv.best_estimator_