In [1]:
from pathlib import Path
import re
import pandas as pd
import numpy as np
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from tensorflow.keras.utils import get_file
import nltk
import gensim
import spacy
import spacy.lang.ja
import MeCab
from pyknp import Juman
from IPython.display import display
pd.set_option('max_rows', 5)
pd.set_option('max_columns', 13)

# 自然言語の前処理
---
人間が日常の意思疎通のために用いている言語のことを自然言語という。  
自然言語は前処理を施さないと分析に適しないが、前処理の方法は言語ごとに異なる。

- 一般的な処理手順
 1. 形態素解析 (文書を単語単位に分割+単語の変化形などを1つにまとめる)
   - 英語の場合は[NLTK](https://www.nltk.org/)や[gensim](https://radimrehurek.com/gensim/)を使用し、分割してから[stemming](https://en.wikipedia.org/wiki/Stemming)や[lemmatisation](https://en.wikipedia.org/wiki/Lemmatisation)を実施
   - 日本語の場合は[spaCy](https://spacy.io/)+[GiNZA](https://megagonlabs.github.io/ginza/)・[MeCab](https://taku910.github.io/mecab/)・[JUMAN++](http://nlp.ist.i.kyoto-u.ac.jp/index.php?JUMAN++)などを使用
 1. ストップワードを除去
   - どんな文書にも頻繁に現れ、文書の特徴を表すのに役立たない単語 ("I", "is"や"私"、"です"など)を除去
   - 通常は言語ごとのストップワードの辞書を使用する

In [2]:
remove = ('headers', 'footers', 'quotes')

loader = fetch_20newsgroups(subset='all', random_state=1234, remove=remove)
news = pd.DataFrame(dict(document=loader.data, category=loader.target))
news['category'] = pd.Categorical.from_codes(news['category'],
                                             categories=loader.target_names)
print('news')
display(news)

news


Unnamed: 0,document,category
0,\n\n\nLikewise for me please. First time I've ...,comp.graphics
1,"Sorry, but I just wanted to be the first hypoc...",talk.politics.misc
...,...,...
18844,\nPut up or shut up. Where is your evidence?\n...,talk.politics.misc
18845,\n\n\n\nWe're looking at a series of chips by ...,sci.electronics


## Bag of Words (BoW)
---
ある文書における各単語の出現回数を数え、それをその文書の特徴とする手法。

### PythonでのBag of Words作成方法
---
英語の場合は、`sklearn.feature_extraction.text.CountVectorizer`や`gensim.corpora.Dictionary.doc2bow`を使用する。

`scikit-learn`の例

In [3]:
help(CountVectorizer)

Help on class CountVectorizer in module sklearn.feature_extraction.text:

class CountVectorizer(_VectorizerMixin, sklearn.base.BaseEstimator)
 |  CountVectorizer(*, input='content', encoding='utf-8', decode_error='strict', strip_accents=None, lowercase=True, preprocessor=None, tokenizer=None, stop_words=None, token_pattern='(?u)\\b\\w\\w+\\b', ngram_range=(1, 1), analyzer='word', max_df=1.0, min_df=1, max_features=None, vocabulary=None, binary=False, dtype=<class 'numpy.int64'>)
 |  
 |  Convert a collection of text documents to a matrix of token counts
 |  
 |  This implementation produces a sparse representation of the counts using
 |  scipy.sparse.csr_matrix.
 |  
 |  If you do not provide an a-priori dictionary and you do not use an analyzer
 |  that does some kind of feature selection then the number of features will
 |  be equal to the vocabulary size found by analyzing the data.
 |  
 |  Read more in the :ref:`User Guide <text_feature_extraction>`.
 |  
 |  Parameters
 |  ------

In [4]:
# ノイズが多いのでアルファベットのみで構成され、文書全体で20回以上出現する単語に限定
vectorizer = CountVectorizer(stop_words='english',
                             token_pattern='(?u)\\b[a-z][a-z]+\\b',
                             min_df=20)
vec = vectorizer.fit_transform(news['document'])
news_bow = pd.DataFrame.sparse.from_spmatrix(
    vec, columns=vectorizer.get_feature_names())
news_bow

Unnamed: 0,aa,aaa,aaron,ab,abandon,abandoned,...,zionist,zionists,zip,zone,zoom,zx
0,0,0,0,0,0,0,...,0,0,0,0,0,0
1,0,0,0,0,0,0,...,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
18844,0,0,0,0,0,0,...,0,0,0,0,0,0
18845,0,0,0,0,0,0,...,0,0,0,0,0,0


In [5]:
def check_first_sample(df, original_data):
    exists = df.iloc[0] > 0
    display(pd.DataFrame().append([df.iloc[0, list(exists)]]))
    print(original_data.loc[0, 'document'])

In [6]:
check_first_sample(news_bow, news)

Unnamed: 0,hear,like,likewise,looking,months,past,time,ve
0,1,1,1,1,1,1,1,2





Likewise for me please. First time I've hear of it, but I've beem looking
for something like this for the past few months.


`nltk`+`gensim`の例

In [7]:
help(gensim.corpora.Dictionary.doc2bow)

Help on function doc2bow in module gensim.corpora.dictionary:

doc2bow(self, document, allow_update=False, return_missing=False)
    Convert `document` into the bag-of-words (BoW) format = list of `(token_id, token_count)` tuples.
    
    Parameters
    ----------
    document : list of str
        Input document.
    allow_update : bool, optional
        Update self, by adding new tokens from `document` and updating internal corpus statistics.
    return_missing : bool, optional
        Return missing tokens (tokens present in `document` but not in self) with frequencies?
    
    Return
    ------
    list of (int, int)
        BoW representation of `document`.
    list of (int, int), dict of (str, int)
        If `return_missing` is True, return BoW representation of `document` + dictionary with missing
        tokens and their frequencies.
    
    Examples
    --------
    .. sourcecode:: pycon
    
        >>> from gensim.corpora import Dictionary
        >>> dct = Dictionary(["

In [8]:
def doc2matrix(documents):
    dictionary = gensim.corpora.Dictionary(documents)
    corpus = [dictionary.doc2bow(doc) for doc in documents]
    matrix = gensim.matutils.corpus2csc(corpus, num_terms=len(dictionary)).T
    dictionary[0]  # dictionary.id2tokenの作成に必要
    bow_matrix = pd.DataFrame.sparse.from_spmatrix(
        matrix.astype(int),
        columns=[dictionary.id2token[i] for i in range(len(dictionary))])
    return bow_matrix

In [9]:
nltk.download('punkt')
nltk.download('stopwords')
stop_words = nltk.corpus.stopwords.words('english')
docs = []
for document in news['document']:
    words = [
        w for w in nltk.tokenize.word_tokenize(document) if w not in stop_words
    ]
    docs.append(words)
news_bow = doc2matrix(docs)
news_bow

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,'ve,",",.,First,I,Likewise,...,congo,GROWN,PSD3xx,WSI,_mega_,sourcing
0,2,1,2,1,2,1,...,0,0,0,0,0,0
1,0,1,2,0,3,0,...,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
18844,0,1,5,0,1,0,...,0,1,0,0,0,0
18845,0,6,3,0,0,0,...,0,0,1,1,1,1


In [10]:
check_first_sample(news_bow, news)

Unnamed: 0,'ve,",",.,First,I,Likewise,...,looking,months,past,please,something,time
0,2,1,2,1,2,1,...,1,1,1,1,1,1





Likewise for me please. First time I've hear of it, but I've beem looking
for something like this for the past few months.


日本語の場合は、`GiNZA`・`mecab`・`JUMAN++`などで形態素解析を行なってから、`gensim`を使用する。

In [11]:
url = 'https://www.aozora.gr.jp/cards/001562/files/52409_ruby_51058.zip'
file_name = url.split('/')[-1]
out_path = Path(get_file(file_name, url, extract=True))
with open(out_path.parent / '01jo.txt', 'rb') as f:
    text = f.read().decode('shift_jis')
    text = text.split('底本：')[0]
    text = re.split(r'\-{55,}', text)[2]
    text = re.sub(r'｜(.+?)《.+?》', r'\1', text)
    text = re.sub(r'《.+?》', '', text)
    text = re.sub(r'［＃.+?］', '', text)
document = [paragraph.strip() for paragraph in text.split('×')]
sangokushi = pd.DataFrame(dict(document=document))
print('sangokushi')
display(sangokushi)

sangokushi


Unnamed: 0,document
0,三国志は、いうまでもなく、今から約千八百年前の古典であるが、三国志の中に活躍している登場人物...
1,三国志には、詩がある。\r\n　単に尨大な治乱興亡を記述した戦記軍談の類でない所に、東洋人の...
...,...
4,現在の地名と、原本の誌す地名とは、当然時代による異いがあるので、分っている地方は下に註を加え...
5,原本には「通俗三国志」「三国志演義」その他数種あるが、私はそのいずれの直訳にもよらないで、随...


`spaCy`+`GiNZA`+`gensim`の例

In [12]:
tokenizer = spacy.load('ja_ginza')
docs = []
for document in sangokushi['document']:
    doc = tokenizer(document)
    words = []
    for sentence in doc.sents:
        words += [token.lemma_ for token in sentence if not token.is_stop]
    docs.append(words)
sangokushi_bow = doc2matrix(docs)
sangokushi_bow

Unnamed: 0,1800,―,、,。,かしら,きっと,...,通俗,違う,選る,長所,随時,難渋
0,1,2,14,4,1,1,...,0,0,0,0,0,0
1,0,0,16,7,0,0,...,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4,0,0,6,4,0,0,...,0,0,0,0,0,0
5,0,0,14,3,0,0,...,1,1,1,1,1,1


In [13]:
check_first_sample(sangokushi_bow, sangokushi)

Unnamed: 0,1800,―,、,。,かしら,きっと,...,要人,見る,親しむ,誰,過言,雑多
0,1,2,14,4,1,1,...,1,1,1,1,1,1


三国志は、いうまでもなく、今から約千八百年前の古典であるが、三国志の中に活躍している登場人物は、現在でも中国大陸の至る所にそのまま居るような気がする。――中国大陸へ行って、そこの雑多な庶民や要人などに接し、特に親しんでみると、三国志の中に出て来る人物の誰かしらときっと似ている。或いは、共通したものを感じる場合がしばしばある。
　だから、現代の中国大陸には、三国志時代の治乱興亡がそのままあるし、作中の人物も、文化や姿こそ変っているが、なお、今日にも生きているといっても過言でない。


`macab`+`gensim`の例

In [14]:
chasen = MeCab.Tagger('-Ochasen')
docs = []
for document in sangokushi['document']:
    words = []
    for sentence in document.split('\r\n'):
        words += [
            token.split('\t')[2]
            for token in chasen.parse(sentence).split('\n')
            if token not in ['EOS', '']
        ]
    docs.append(words)
sangokushi_bow = doc2matrix(docs)
sangokushi_bow

Unnamed: 0,――,Unnamed: 2,、,。,ある,いう,...,通俗,酌む,長所,随時,難渋,頃
0,1,1,14,4,3,2,...,0,0,0,0,0,0
1,0,4,16,7,4,1,...,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4,0,0,6,4,3,0,...,0,0,0,0,0,0
5,0,0,14,3,3,1,...,1,1,1,1,1,1


In [15]:
check_first_sample(sangokushi_bow, sangokushi)

Unnamed: 0,――,Unnamed: 2,、,。,ある,いう,...,行く,要人,親しむ,誰,過言,雑多
0,1,1,14,4,3,2,...,1,1,1,1,1,1


三国志は、いうまでもなく、今から約千八百年前の古典であるが、三国志の中に活躍している登場人物は、現在でも中国大陸の至る所にそのまま居るような気がする。――中国大陸へ行って、そこの雑多な庶民や要人などに接し、特に親しんでみると、三国志の中に出て来る人物の誰かしらときっと似ている。或いは、共通したものを感じる場合がしばしばある。
　だから、現代の中国大陸には、三国志時代の治乱興亡がそのままあるし、作中の人物も、文化や姿こそ変っているが、なお、今日にも生きているといっても過言でない。


`JUMAN++`+`gensim`の例

In [16]:
juman = Juman()
docs = []
for document in sangokushi['document']:
    words = []
    for sentence in document.split('\r\n'):
        words += [
            token.genkei for token in juman.analysis(sentence).mrph_list()
        ]
    docs.append(words)
sangokushi_bow = doc2matrix(docs)
sangokushi_bow

Unnamed: 0,――,Unnamed: 2,、,。,ある,いう,...,読者,通俗,酌む,長所,随時,難渋
0,1,1,14,4,2,2,...,0,0,0,0,0,0
1,0,4,16,7,3,1,...,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4,0,0,6,4,2,0,...,0,0,0,0,0,0
5,0,0,14,3,1,1,...,1,1,1,1,1,1


In [17]:
check_first_sample(sangokushi_bow, sangokushi)

Unnamed: 0,――,Unnamed: 2,、,。,ある,いう,...,行く,要人,親しむ,誰,過言,雑多だ
0,1,1,14,4,2,2,...,1,1,1,1,1,1


三国志は、いうまでもなく、今から約千八百年前の古典であるが、三国志の中に活躍している登場人物は、現在でも中国大陸の至る所にそのまま居るような気がする。――中国大陸へ行って、そこの雑多な庶民や要人などに接し、特に親しんでみると、三国志の中に出て来る人物の誰かしらときっと似ている。或いは、共通したものを感じる場合がしばしばある。
　だから、現代の中国大陸には、三国志時代の治乱興亡がそのままあるし、作中の人物も、文化や姿こそ変っているが、なお、今日にも生きているといっても過言でない。


## tf-idf
---
多くの文書に出現する単語の数値は低く、少数の文書にしか出現しない単語の数値は高くなるようにし、文書の特徴を強調する指標。  
tf (term frequency) は、1つの文書 (上のデータフレームでは1サンプル=1行) における各単語の出現頻度を、 idf (inverse document frequency) は全文書において各単語が出現する文書の割合の逆数 (の対数) を表す。

文書数を $n$ 、全文書中に出現する単語の種類を $m$ 、文書 $j$ に単語 $i$ の出現する回数を $c_{i,j}$ 、文書 $j$ に単語 $i$ が出現するかどうかを表す変数 (出現する場合は1、出現しない場合は0をとる2値変数) を $d_{i,j}$ とすると、 tf-idf は以下の $tf_{i,j}$ ・ $idf_{i}$ の積で表される。

$
\displaystyle \begin{cases}
    tf_{i,j} & ={\displaystyle \frac
        {c_{i,j}}
        {{\displaystyle \sum ^{m}_{k=1} c_{k,j}}}
    }\\
    idf_{i} & ={\displaystyle log\frac
        {n}
        {{\displaystyle \sum ^{n}_{k=1} d_{i,k}}}
    }
\end{cases}
$

練習問題

---
`sangokushi`データセットのBoWを表す`sangokushi_bow`を元に、出現回数の値をtfに置き換えた`sangokushi_tf`を作成する。

In [18]:
print('sangokushi_bow')
display(sangokushi_bow)

sangokushi_bow


Unnamed: 0,――,Unnamed: 2,、,。,ある,いう,...,読者,通俗,酌む,長所,随時,難渋
0,1,1,14,4,2,2,...,0,0,0,0,0,0
1,0,4,16,7,3,1,...,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4,0,0,6,4,2,0,...,0,0,0,0,0,0
5,0,0,14,3,1,1,...,1,1,1,1,1,1


解答例

---

In [19]:
sangokushi_tf = sangokushi_bow.div(sangokushi_bow.sum(axis='columns'),
                                   axis='rows')
sangokushi_tf

Unnamed: 0,――,Unnamed: 2,、,。,ある,いう,...,読者,通俗,酌む,長所,随時,難渋
0,0.006757,0.006757,0.094595,0.027027,0.013514,0.013514,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1,0.000000,0.020833,0.083333,0.036458,0.015625,0.005208,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4,0.000000,0.000000,0.071429,0.047619,0.023810,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
5,0.000000,0.000000,0.089172,0.019108,0.006369,0.006369,...,0.006369,0.006369,0.006369,0.006369,0.006369,0.006369


---

練習問題

---
`sangokushi`データセットのBoWを表す`sangokushi_bow`を元に、各単語のidfを表す`sangokushi_idf`を作成する。

解答例

---

In [20]:
sangokushi_binary = sangokushi_bow.astype(bool)
sangokushi_df = sangokushi_binary.sum(axis='rows') / sangokushi_bow.index.size
sangokushi_idf = -np.log(sangokushi_df)
sangokushi_idf

――    1.791759
　     1.098612
        ...   
随時    1.791759
難渋    1.791759
Length: 367, dtype: float64

---

練習問題

---
`sangokushi_tf`と`sangokushi_idf`から、`sangokushi_bow`の各値をtf-idfに置き換えた`sangokushi_tfidf`を作成し、各文書のBoW上位5単語とtf-idf上位5単語を比較する。

解答例

---

In [21]:
sangokushi_tfidf = sangokushi_tf.mul(sangokushi_idf, axis='columns')
columns = sangokushi_bow.columns.values
print('Bag of Words')
print(columns[np.argsort(sangokushi_bow)][:, :-5:-1])
print('tf-idf')
print(columns[np.argsort(sangokushi_tfidf)][:, :-5:-1])

Bag of Words
[['、' 'の' 'に' 'が']
 ['、' 'に' 'を' 'の']
 ['の' '、' 'に' 'と']
 ['、' 'の' 'と' 'も']
 ['、' 'の' '。' 'は']
 ['、' 'に' 'の' 'は']]
tf-idf
[['大陸' '中国' 'そのまま' '中']
 ['\u3000' '詩' '搏' 'しまう']
 ['代' 'さ' '中国' '年']
 ['民俗' 'られる' '彩る' '生']
 ['地名' '文字' '分る' '特有だ']
 ['寝る' '演義' '「' '」']]


---

### Pythonでのtf-idf作成方法
---
`sklearn.feature_extraction.text.TfidfVectorizer`または`sklearn.feature_extraction.text.TfidfTransformer`を使用する。  
デフォルト設定は計算の安定その他の理由から、上記 tf-idf の説明とは少し異なる値を返すようになっている。

In [22]:
help(TfidfVectorizer)

Help on class TfidfVectorizer in module sklearn.feature_extraction.text:

class TfidfVectorizer(CountVectorizer)
 |  TfidfVectorizer(*, input='content', encoding='utf-8', decode_error='strict', strip_accents=None, lowercase=True, preprocessor=None, tokenizer=None, analyzer='word', stop_words=None, token_pattern='(?u)\\b\\w\\w+\\b', ngram_range=(1, 1), max_df=1.0, min_df=1, max_features=None, vocabulary=None, binary=False, dtype=<class 'numpy.float64'>, norm='l2', use_idf=True, smooth_idf=True, sublinear_tf=False)
 |  
 |  Convert a collection of raw documents to a matrix of TF-IDF features.
 |  
 |  Equivalent to :class:`CountVectorizer` followed by
 |  :class:`TfidfTransformer`.
 |  
 |  Read more in the :ref:`User Guide <text_feature_extraction>`.
 |  
 |  Parameters
 |  ----------
 |  input : {'filename', 'file', 'content'}, default='content'
 |      If 'filename', the sequence passed as an argument to fit is
 |      expected to be a list of filenames that need reading to fetch
 |  

In [23]:
vectorizer = TfidfVectorizer(stop_words='english',
                             token_pattern='(?u)\\b[a-z][a-z]+\\b',
                             min_df=20)
vec = vectorizer.fit_transform(news['document'])
news_tfidf = pd.DataFrame.sparse.from_spmatrix(
    vec, columns=vectorizer.get_feature_names())
news_tfidf

Unnamed: 0,aa,aaa,aaron,ab,abandon,abandoned,...,zionist,zionists,zip,zone,zoom,zx
0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
18844,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0
18845,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0


In [24]:
help(TfidfTransformer)

Help on class TfidfTransformer in module sklearn.feature_extraction.text:

class TfidfTransformer(sklearn.base.TransformerMixin, sklearn.base.BaseEstimator)
 |  TfidfTransformer(*, norm='l2', use_idf=True, smooth_idf=True, sublinear_tf=False)
 |  
 |  Transform a count matrix to a normalized tf or tf-idf representation
 |  
 |  Tf means term-frequency while tf-idf means term-frequency times inverse
 |  document-frequency. This is a common term weighting scheme in information
 |  retrieval, that has also found good use in document classification.
 |  
 |  The goal of using tf-idf instead of the raw frequencies of occurrence of a
 |  token in a given document is to scale down the impact of tokens that occur
 |  very frequently in a given corpus and that are hence empirically less
 |  informative than features that occur in a small fraction of the training
 |  corpus.
 |  
 |  The formula that is used to compute the tf-idf for a term t of a document d
 |  in a document set is tf-idf(t, d)

In [25]:
transformer = TfidfTransformer()
vec = transformer.fit_transform(sangokushi_bow)
sangokushi_tfidf = pd.DataFrame.sparse.from_spmatrix(
    vec, columns=sangokushi_bow.columns)
sangokushi_tfidf

Unnamed: 0,――,Unnamed: 2,、,。,ある,いう,...,読者,通俗,酌む,長所,随時,難渋
0,0.07497,0.061476,0.465906,0.133116,0.066558,0.088953,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1,0.00000,0.209835,0.454361,0.198783,0.085193,0.037953,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4,0.00000,0.000000,0.314257,0.209505,0.104752,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
5,0.00000,0.000000,0.442520,0.094826,0.031609,0.042244,...,0.071207,0.071207,0.071207,0.071207,0.071207,0.071207


## 推薦図書
---
- [見て試してわかる機械学習アルゴリズムの仕組み 機械学習図鑑](https://www.amazon.co.jp/%E8%A6%8B%E3%81%A6%E8%A9%A6%E3%81%97%E3%81%A6%E3%82%8F%E3%81%8B%E3%82%8B%E6%A9%9F%E6%A2%B0%E5%AD%A6%E7%BF%92%E3%82%A2%E3%83%AB%E3%82%B4%E3%83%AA%E3%82%BA%E3%83%A0%E3%81%AE%E4%BB%95%E7%B5%84%E3%81%BF-%E6%A9%9F%E6%A2%B0%E5%AD%A6%E7%BF%92%E5%9B%B3%E9%91%91-%E7%A7%8B%E5%BA%AD-%E4%BC%B8%E4%B9%9F/dp/4798155659/)
- [Python 機械学習プログラミング 達人データサイエンティストによる理論と実践](https://www.amazon.co.jp/Python-%E6%A9%9F%E6%A2%B0%E5%AD%A6%E7%BF%92%E3%83%97%E3%83%AD%E3%82%B0%E3%83%A9%E3%83%9F%E3%83%B3%E3%82%B0-%E9%81%94%E4%BA%BA%E3%83%87%E3%83%BC%E3%82%BF%E3%82%B5%E3%82%A4%E3%82%A8%E3%83%B3%E3%83%86%E3%82%A3%E3%82%B9%E3%83%88%E3%81%AB%E3%82%88%E3%82%8B%E7%90%86%E8%AB%96%E3%81%A8%E5%AE%9F%E8%B7%B5-impress-gear/dp/4295003379/)
- [Kaggleで勝つデータ分析の技術](https://www.amazon.co.jp/Kaggle%E3%81%A7%E5%8B%9D%E3%81%A4%E3%83%87%E3%83%BC%E3%82%BF%E5%88%86%E6%9E%90%E3%81%AE%E6%8A%80%E8%A1%93-%E9%96%80%E8%84%87-%E5%A4%A7%E8%BC%94/dp/4297108437/)