In [52]:
import platform
import MeCab
from  urllib  import request
import re
import numpy as np

In [53]:
class Tokenizer:
    """Japanese tokenizer built on a MeCab parser producing ChaSen-style output.

    Each parser output line is split on tabs; the third field is used as the
    token and the fourth field as its part-of-speech (POS) tag, whose
    components are separated by "-" (e.g. "名詞-一般").

    Attributes:
        stopwords: Container of tokens to drop (anything supporting ``in``;
            a set gives faster lookups than a list).
        include_pos: Major POS categories to keep.
        exclude_posdetail: POS sub-categories (second tag component) to drop.
        exclude_reg: Regex; tokens it matches (``re.search``) are dropped.
        parser: Callable ``text -> str`` returning the ChaSen-style analysis.
    """

    def __init__(self, stopwords=None, parser=None, include_pos=None, exclude_posdetail=None, exclude_reg=None):
        """Configure the token filters and the parser.

        Args:
            stopwords: Tokens to drop. ``None`` downloads the default lists via
                ``get_stopwords()``; an explicit empty collection disables
                stopword filtering (and performs no download).
            parser: Callable returning ChaSen-style output for a text. ``None``
                builds a ``MeCab.Tagger`` using the mecab-ipadic-neologd
                dictionary at a platform-specific path.
            include_pos: Major POS categories to keep. ``None`` or empty falls
                back to nouns, verbs and adjectives (an empty whitelist would
                discard every token).
            exclude_posdetail: POS sub-categories to drop. ``None`` uses
                ``["接尾", "数"]``; an explicit empty list excludes nothing.
            exclude_reg: Regex for tokens to drop. ``None`` or empty uses a
                pattern that matches no non-empty token.
        """
        # `is None` rather than truthiness: an explicit empty collection is a
        # legitimate "filter nothing" request and must not be overridden.
        self.stopwords = self.get_stopwords() if stopwords is None else stopwords
        self.include_pos = include_pos if include_pos else ["名詞", "動詞", "形容詞"]
        self.exclude_posdetail = ["接尾", "数"] if exclude_posdetail is None else exclude_posdetail
        # An empty pattern would match (and therefore drop) every token.
        self.exclude_reg = exclude_reg if exclude_reg else r"$^"  # no matching reg
        if parser is not None:
            self.parser = parser
        else:
            if platform.system() == "Darwin":
                dic_dir = "/usr/local/lib/mecab/dic/mecab-ipadic-neologd/"  # mac
            else:
                dic_dir = "/usr/lib/mecab/dic/mecab-ipadic-neologd"
            mecab = MeCab.Tagger("-Ochasen -d {}".format(dic_dir))
            # The bound method keeps the Tagger instance alive.
            self.parser = mecab.parse

    def tokenize(self, text, show_pos=False):
        """Return the filtered tokens of ``text``.

        Args:
            text: Input string.
            show_pos: When true, return ``(token, pos_tag)`` tuples instead of
                bare tokens.

        Returns:
            List of tokens (or tuples) that pass every filter: POS category in
            ``include_pos``, sub-category not in ``exclude_posdetail``, no
            "-<digit>" sequence, no ``exclude_reg`` match, not a stopword.
        """
        # Strip URLs. The pattern stops at the first "/" after the host, so
        # only the scheme and host portion is removed.
        text = re.sub(r"https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+", "", text)    #URL
        text = re.sub(r"\"?([-a-zA-Z0-9.`?{}]+\.jp)\"?" ,"", text)  # xxx.jp
        text = text.lower()

        tokens = []
        for line in self.parser(text).split("\n"):
            fields = line.split("\t")
            if len(fields) < 4:
                # No POS column (e.g. the "EOS" marker or a blank line).
                continue
            base, pos = fields[2], fields[3]
            pos_parts = pos.split("-")
            if pos_parts[0] not in self.include_pos:
                continue
            # Tags without a sub-category (e.g. "助動詞") have a single
            # component; treat the missing detail as "" instead of raising.
            detail = pos_parts[1] if len(pos_parts) > 1 else ""
            if detail in self.exclude_posdetail:
                continue
            if re.search(r"(-|−)\d", base):
                continue
            if re.search(self.exclude_reg, base):
                continue
            if base in self.stopwords:
                continue
            tokens.append((base, pos) if show_pos else base)
        return tokens

    def get_stopwords(self):
        """Download the SlothLib Japanese and English stopword lists.

        Prints a short preview of each list as it is fetched.

        Returns:
            List of non-empty stopword strings, Japanese entries first.

        Raises:
            urllib.error.URLError: If a list cannot be fetched.
        """
        base_url = "http://svn.sourceforge.jp/svnroot/slothlib/CSharp/Version1/SlothLib/NLP/Filter/StopWord/word/"

        def fetch(name):
            # Context manager closes the HTTP response; blank lines are skipped
            # so the result holds no empty-string entries.
            with request.urlopen(base_url + name) as res:
                words = (line.decode("utf-8").strip() for line in res)
                return [w for w in words if w]

        stopwords = fetch("Japanese.txt")
        print("Japanese stopword: ", ", ".join(stopwords[:3]), "...")
        stopwords += fetch("English.txt")
        print("English stopword: ...", ", ".join(stopwords[-3:]), )
        return stopwords

In [54]:
# Build a tokenizer with default settings. No stopwords are passed, so the
# constructor fetches the stopword lists over HTTP and prints a preview of each.
t = Tokenizer()

Japanese stopword:  あそこ, あたり, あちら ...
English stopword: ... you've, z, zero


In [55]:
# Smoke test: tokenize a sample sentence and display the filtered tokens.
t.tokenize("認めたくないものだな。自分自身の若さ故の過ちというものを。")

['認める', '自分自身', '若さ故の過ち']

In [56]:
def load_data_and_labels(positive_data_file, negative_data_file, level="char", lang="En",
                         tokenizer=None, encoding="utf-8"):
    """Load a two-class text dataset (one example per line) with one-hot labels.

    Args:
        positive_data_file: Path to the file of positive examples.
        negative_data_file: Path to the file of negative examples.
        level: "char" removes spaces, then puts a space around every remaining
            character (the line terminator included) and lower-cases the line;
            "word" tokenizes (``lang == "Ja"``) or strips surrounding
            whitespace (any other ``lang``).
        lang: "Ja" selects tokenization at word level; ignored at char level.
        tokenizer: Optional object with a ``tokenize(str)`` method, used for
            ``level="word"``/``lang="Ja"``. When ``None`` a default
            ``Tokenizer()`` is created; pass one in to reuse it across calls.
        encoding: Text encoding used to read both files.

    Returns:
        Tuple ``(x_text, y, ratio)``: ``x_text`` lists the positive examples
        followed by the negative ones, ``y`` is an integer array of shape
        ``(n_pos + n_neg, 2)`` holding ``[0, 1]`` for positive and ``[1, 0]``
        for negative rows, and ``ratio`` is ``n_pos / n_neg``.

    Raises:
        ValueError: If ``level`` is neither "char" nor "word".
        ZeroDivisionError: If the negative file contains no lines.
    """
    # Fail fast instead of printing a warning and returning unprocessed lines.
    if level not in ("char", "word"):
        raise ValueError(
            "invalid value of 'level': {!r} (expected 'char' or 'word')".format(level)
        )

    def read_lines(path):
        # Context manager guarantees the file handle is closed.
        with open(path, "r", encoding=encoding) as fh:
            return fh.readlines()

    if level == "char":
        def prepare(lines):
            return [s.replace(" ", "").replace("", " ").lower() for s in lines]
    elif lang == "Ja":
        if tokenizer is None:
            tokenizer = Tokenizer()

        def prepare(lines):
            return [tokenizer.tokenize(s) for s in lines]
    else:
        def prepare(lines):
            return [s.strip() for s in lines]

    positive_examples = prepare(read_lines(positive_data_file))
    negative_examples = prepare(read_lines(negative_data_file))

    n_pos = len(positive_examples)
    n_neg = len(negative_examples)
    ratio = n_pos/n_neg
    print("# pos: ", n_pos)
    print("# neg: ", n_neg)
    print("pos/neg:", ratio)
    x_text = positive_examples + negative_examples

    # Build the label matrix in one step; the reshape keeps it two-dimensional
    # even when there are no positive examples.
    y = np.array([[0, 1]] * n_pos + [[1, 0]] * n_neg, dtype=int).reshape(-1, 2)

    return x_text, y, ratio

In [57]:
def load_data_and_labels_multiclass(files, level="char", lang="En", tokenizer=None, encoding="utf-8"):
    """Load a multi-class text dataset (one file per class, one example per line).

    The class index of a file is its position in ``files``. The number of
    examples read from each file is printed.

    Args:
        files: Sequence of file paths, one per class.
        level: "char" removes spaces, then puts a space around every remaining
            character (the line terminator included) and lower-cases the line;
            "word" tokenizes (``lang == "Ja"``) or strips surrounding
            whitespace (any other ``lang``).
        lang: "Ja" selects tokenization at word level; ignored at char level.
        tokenizer: Optional object with a ``tokenize(str)`` method, used for
            ``level="word"``/``lang="Ja"``. When ``None`` a single default
            ``Tokenizer()`` is created and shared by all files.
        encoding: Text encoding used to read the files.

    Returns:
        Tuple ``(x_text, y)``: ``x_text`` lists the examples of all files in
        order and ``y`` is a float array of shape
        ``(len(x_text), len(files))`` with one-hot rows.

    Raises:
        ValueError: If ``level`` is neither "char" nor "word".
    """
    # Fail fast instead of printing a warning and returning unprocessed lines.
    if level not in ("char", "word"):
        raise ValueError(
            "invalid value of 'level': {!r} (expected 'char' or 'word')".format(level)
        )

    n_classes = len(files)
    use_tokenizer = level == "word" and lang == "Ja"
    if use_tokenizer and tokenizer is None:
        # Built once up front rather than once per file.
        tokenizer = Tokenizer()

    x_text = []
    class_ids = []
    for class_id, path in enumerate(files):
        # Context manager guarantees the file handle is closed.
        with open(path, "r", encoding=encoding) as fh:
            lines = fh.readlines()

        if level == "char":
            examples = [s.replace(" ", "").replace("", " ").lower() for s in lines]
        elif use_tokenizer:
            examples = [tokenizer.tokenize(s) for s in lines]
        else:
            examples = [s.strip() for s in lines]

        print(len(examples))
        x_text += examples
        class_ids += [class_id] * len(examples)

    # Select one identity-matrix row per example; this stays two-dimensional
    # for empty files and for an empty `files` sequence.
    y = np.eye(n_classes)[np.array(class_ids, dtype=int)]

    return x_text, y