<a href="https://colab.research.google.com/github/Re14m/training/blob/master/2022_03_26_recipie22.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# ライブラリのインストール
!pip install scikit-learn==0.23.2



In [2]:
# バージョン確認（python）
import platform
print("python " + platform.python_version())

python 3.7.13


In [3]:
# datasetの読込
from sklearn.datasets import fetch_20newsgroups
train = fetch_20newsgroups(subset='train')

In [4]:
# datasetの確認
train.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [5]:
# datasetからカテゴリ指定して抽出
categories=['sci.crypt', 'sci.electronics', 'sci.med', 'sci.space']
train = fetch_20newsgroups(subset='train', categories=categories)

In [6]:
# datasetのサイズ確認
print(len(train.data))  # train.data=ニュース記事のテキスト
print(len(train.target))  # train.target=ニュース記事のテキストに対する正解ラベル

2373
2373


In [7]:
# ニュース記事のテキストを確認
print(train.data[0])

From: al@escom.com (Al Donaldson)
Subject: Re: Once tapped, your code is no good any more.
Reply-To: al@escom.COM (Al Donaldson)
Organization: ESCOM Corp., Oakton VA (USA)
Distribution: na
Lines: 16

amolitor@nmsu.edu (Andrew Molitor) writes:
>Yes, those evil guys in the FBI can probably, with some
>effort, abuse the system. I got news for you, if the evil guys in
>the FBI decide they want to persecute you, they're gonna, ...

And if Richard Nixon had had this kind of toy, he wouldn't have had
to send people into the Watergate.

But that's not really the issue.  The real issue is whether this 
will be used to justify a ban against individuals' use of private 
(i.e., anything else) encryption methods.

Unrelated question...isn't the term "Clipper," as neat as it is,
already taken by Intergraph?

Al



In [8]:
# ニュース記事のラベルを確認
train.target[0] # ラベルが0=分類が0

0

In [9]:
# 分類をすべて表示
train.target_names

['sci.crypt', 'sci.electronics', 'sci.med', 'sci.space']

In [10]:
# 記事のラベルから分類名を表示
train.target_names[train.target[0]]

'sci.crypt'

In [11]:
# テキストの最適化（TS-IDF）
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(train.data).toarray()
print(len(vectorizer.get_feature_names()))
print(X_train)
print(X_train.shape)

38683
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
(2373, 38683)


In [12]:
# 訓練（RandomForest）

# 分類器の生成
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier()

In [13]:
# 訓練の実施
clf.fit(X_train, train.target) # fitの第一引数に訓練データセットの特徴量、第二引数に訓練データセットの正解ラベルを指定

RandomForestClassifier()

In [14]:
# 精度の検証

# testdatasetの読込
test = fetch_20newsgroups(subset='test', categories=categories)

In [15]:
# TS-IDFで数値化
X_test = vectorizer.transform(test.data).toarray()
X_test.shape

(1579, 38683)

In [16]:
# testdataと分類器で予測
preds = clf.predict(X_test)
preds

array([2, 3, 2, ..., 1, 3, 2])

In [17]:
# 訓練した分類器の正解率を計算（Accurancy）
from sklearn.metrics import accuracy_score
accuracy_score(test.target, preds)

0.8549715009499683