In [1]:
# SNSメッセージを使用して、ナイーブベイズでスパムか分類

In [2]:
import numpy as np
import pandas as pd
import matplotlib

import nltk
from nltk.corpus import stopwords
import string

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

In [3]:
# NLTLモジュールのダウンロード
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\owner\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\owner\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [4]:
# spam.csv読み込み
spam = pd.read_csv('spam.csv', encoding='latin-1')

In [5]:
spam.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [6]:
spam.describe()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
count,5572,5572,50,12,6
unique,2,5169,43,10,5
top,ham,"Sorry, I'll call later","bt not his girlfrnd... G o o d n i g h t . . .@""","MK17 92H. 450Ppw 16""","GNT:-)"""
freq,4825,30,3,2,2


In [7]:
spam.isnull().sum()

v1               0
v2               0
Unnamed: 2    5522
Unnamed: 3    5560
Unnamed: 4    5566
dtype: int64

In [8]:
# スパムデータを確認
spam['v2'][spam['v1'] == 'spam'][0:5]

2     Free entry in 2 a wkly comp to win FA Cup fina...
5     FreeMsg Hey there darling it's been 3 week's n...
8     WINNER!! As a valued network customer you have...
9     Had your mobile 11 months or more? U R entitle...
11    SIX chances to win CASH! From 100 to 20,000 po...
Name: v2, dtype: object

In [9]:
# 不要な Unnamed カラム削除
spam = spam.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], axis=1)

In [10]:
# カラム名の変更
spam = spam.rename(columns={'v1':'target', 'v2':'SNS message'})

In [11]:
spam.head()

Unnamed: 0,target,SNS message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [12]:
spam.describe()

Unnamed: 0,target,SNS message
count,5572,5572
unique,2,5169
top,ham,"Sorry, I'll call later"
freq,4825,30


In [13]:
# メッセージの前処理
# 1.句読点・記号を取り除き、小文字にする。
# 2. ストップワードの削除
def clean_message(ch_message):
    retun_message = ''.join([char for char in ch_message if char not in string.punctuation]).lower()
    retun_message = ' '.join([word for word in retun_message.split() if word not in (stopwords.words('english'))])
    return retun_message

In [14]:
print(spam['SNS message'][4])

Nah I don't think he goes to usf, he lives around here though


In [15]:
print(clean_message(spam['SNS message'][4]))

nah dont think goes usf lives around though


In [16]:
# 句読点と記号
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [17]:
# メッセージを処理
spam['SNS message'] = spam['SNS message'].apply(clean_message)

In [18]:
spam.head()

Unnamed: 0,target,SNS message
0,ham,go jurong point crazy available bugis n great ...
1,ham,ok lar joking wif u oni
2,spam,free entry 2 wkly comp win fa cup final tkts 2...
3,ham,u dun say early hor u c already say
4,ham,nah dont think goes usf lives around though


In [19]:
# 訓練データとテストデータを分割
train_set, test_set = train_test_split(spam, test_size=0.2, random_state=42)

In [20]:
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [19]:
# 日本語はない。。。
print(stopwords.words('japanese'))

OSError: No such file or directory: 'C:\\Users\\owner\\AppData\\Roaming\\nltk_data\\corpora\\stopwords\\japanese'

In [23]:
# 訓練データとテストデータに分割
train_set, test_set = train_test_split(spam, test_size=0.2, random_state=42)

In [24]:
# BoWを試す

In [25]:
print(spam['SNS message'][4])
print(spam['SNS message'][5])

Nah I don't think he goes to usf, he lives around here though
FreeMsg Hey there darling it's been 3 week's now and no word back! I'd like some fun you up for it still? Tb ok! XxX std chgs to send, å£1.50 to rcv


In [23]:
# strオブジェクトだとNG。listオブジェクトで（Iterable）
text = [''.join(spam['SNS message'][4] + spam['SNS message'][5])]

In [24]:
text

['nah dont think goes usf lives around thoughfreemsg hey darling 3 weeks word back id like fun still tb ok xxx std chgs send å£150 rcv']

In [25]:
# CountVectorizer
vectorrizer = CountVectorizer()

In [26]:
# 文章をCountVectorrizerで訓練する
vectorrizer.fit(text)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [28]:
type(spam['SNS message'][4])

str

In [29]:
type(text)

list

In [46]:
print(vectorrizer.vocabulary_)

{'nah': 17, 'don': 7, 'think': 28, 'he': 11, 'goes': 10, 'to': 30, 'usf': 32, 'lives': 16, 'around': 2, 'here': 12, 'thoughfreemsg': 29, 'hey': 13, 'there': 27, 'darling': 6, 'it': 14, 'been': 4, 'week': 33, 'now': 19, 'and': 1, 'no': 18, 'word': 34, 'back': 3, 'like': 15, 'some': 23, 'fun': 9, 'you': 36, 'up': 31, 'for': 8, 'still': 25, 'tb': 26, 'ok': 20, 'xxx': 35, 'std': 24, 'chgs': 5, 'send': 22, '50': 0, 'rcv': 21}


In [33]:
vector_1 = vectorrizer.transform([''.join(spam['SNS message'][4])])
vector_2 = vectorrizer.transform([''.join(spam['SNS message'][5])])
print(vector_1.toarray())
print(vector_2.toarray())                                 

[[0 1 0 0 0 1 0 1 0 0 0 1 1 0 0 0 0 0 0 1 0 1 0 0 0]]
[[1 0 1 1 1 0 1 0 1 1 1 0 0 1 1 1 1 1 1 0 0 0 1 1 1]]


In [34]:
vector_1.shape, vector_2.shape

((1, 25), (1, 25))

In [35]:
print(type(vector_1))

<class 'scipy.sparse.csr.csr_matrix'>


In [38]:
len(vectorrizer.vocabulary_)

25

In [40]:
# 訓練データのメッセージを学習
count = vectorrizer.fit_transform(train_set['SNS message'])

In [41]:
# ターゲットの分割
target_train = train_set['target'].values

In [44]:
# 「多項分布・ナイーブベイズ（MultinomialNB）」
classifier = MultinomialNB()
classifier.fit(count, target_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [45]:
# 訓練データで予測
y_pred = classifier.predict(count)

In [46]:
y_pred

array(['ham', 'spam', 'ham', ..., 'ham', 'ham', 'ham'], dtype='<U4')

In [48]:
# 訓練データ結果の混合行列
confusion_matrix(target_train, y_pred)

array([[3844,   16],
       [  15,  582]], dtype=int64)

In [49]:
# 訓練データの正解率
accuracy_score(target_train, y_pred)

0.9930446488669509

In [50]:
# テストデータの特徴量を Bag of words に変換
test_count = vectorrizer.transform(test_set['SNS message'])

In [51]:
# テストデータのターゲットの分割
test_target = test_set['target']

In [52]:
# テストデータで予測
y_pred_test = classifier.predict(test_count)

In [53]:
# テストデータで予測結果の混合行列
confusion_matrix(test_target, y_pred_test)

array([[961,   4],
       [ 18, 132]], dtype=int64)

In [54]:
# テストデータでの正解率
accuracy_score(test_target, y_pred_test)

0.9802690582959641