In [1]:
# Classify SNS messages as spam or not spam using Naive Bayes

In [4]:
import numpy as np
import pandas as pd
import matplotlib

import nltk
from nltk.corpus import stopwords
import string

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

In [5]:
# Download the NLTK data packages ('stopwords' and 'punkt')
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\owner\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\owner\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [7]:
# Load spam.csv, decoding the file as latin-1
spam = pd.read_csv('spam.csv', encoding='latin-1')

In [8]:
# Preview the first rows of the freshly loaded data
spam.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [9]:
# Per-column summary (count / unique / top / freq) of the loaded data
spam.describe()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
count,5572,5572,50,12,6
unique,2,5169,43,10,5
top,ham,"Sorry, I'll call later","bt not his girlfrnd... G o o d n i g h t . . .@""",GE,"GNT:-)"""
freq,4825,30,3,2,2


In [11]:
# Count the missing values in each column
spam.isna().sum()

v1               0
v2               0
Unnamed: 2    5522
Unnamed: 3    5560
Unnamed: 4    5566
dtype: int64

In [12]:
# Inspect the first five messages whose label is 'spam'
spam.loc[spam['v1'] == 'spam', 'v2'].head(5)

2     Free entry in 2 a wkly comp to win FA Cup fina...
5     FreeMsg Hey there darling it's been 3 week's n...
8     WINNER!! As a valued network customer you have...
9     Had your mobile 11 months or more? U R entitle...
11    SIX chances to win CASH! From 100 to 20,000 po...
Name: v2, dtype: object

In [14]:
# Remove the unneeded "Unnamed" columns.
# `columns=` already selects the axis, so no separate `axis` argument is passed.
spam = spam.drop(
    columns=[
        'Unnamed: 2',
        'Unnamed: 3',
        'Unnamed: 4',
    ]
)

In [15]:
# Give the two remaining columns descriptive names
spam = spam.rename({'v1': 'target', 'v2': 'SNS message'}, axis='columns')

In [16]:
# Confirm the column drop and rename
spam.head()

Unnamed: 0,target,SNS message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [17]:
# Per-column summary of the trimmed, renamed frame
spam.describe()

Unnamed: 0,target,SNS message
count,5572,5572
unique,2,5169
top,ham,"Sorry, I'll call later"
freq,4825,30


In [28]:
# Message preprocessing:
# 1. Strip punctuation/symbols and convert to lower case.
# 2. Remove stop words.
def clean_message(input, stop_words=None):
    """Normalize one message for bag-of-words feature extraction.

    Every character listed in ``string.punctuation`` is removed, the text is
    lower-cased, and whitespace-separated tokens found in the stop-word
    collection are discarded. The surviving tokens are joined with single
    spaces.

    Args:
        input: Raw message text (``str``). The name shadows the builtin but is
            kept so existing callers are unaffected.
        stop_words: Optional iterable of lower-case words to discard. When
            omitted, NLTK's English stop-word list is used.

    Returns:
        The cleaned message as a single string (empty if nothing survives).
    """
    if stop_words is None:
        # Fetch the list once per call; previously it was looked up again for
        # every single token of the message.
        stop_words = stopwords.words('english')
    # A frozenset gives constant-time membership tests instead of a list scan.
    stop_words = frozenset(stop_words)

    # Punctuation is stripped *before* the stop-word check, so contractions
    # are compared without their apostrophe ("don't" is tested as "dont").
    stripped = input.translate(str.maketrans('', '', string.punctuation)).lower()
    return ' '.join(word for word in stripped.split() if word not in stop_words)

In [29]:
# Show the raw text of the row labelled 4
print(spam.loc[4, 'SNS message'])

Nah I don't think he goes to usf, he lives around here though


In [30]:
# Show the same row after cleaning, for comparison with the raw text
print(clean_message(spam.loc[4, 'SNS message']))

nah dont think goes usf lives around though


In [24]:
# Punctuation and symbol characters (the set clean_message strips out)
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [31]:
# Run every message through clean_message, overwriting the column in place
spam['SNS message'] = spam['SNS message'].map(clean_message)

In [32]:
spam.head()

Unnamed: 0,target,SNS message
0,ham,go jurong point crazy available bugis n great ...
1,ham,ok lar joking wif u oni
2,spam,free entry 2 wkly comp win fa cup final tkts 2...
3,ham,u dun say early hor u c already say
4,ham,nah dont think goes usf lives around though


In [34]:
# Split into training and test sets: 20% of the rows are held out for testing,
# and random_state is fixed so the split is the same on every run.
train_set, test_set = train_test_split(spam, test_size=0.2, random_state=42)