In [1]:
import pandas
import numpy as np
import matplotlib.pyplot as plt
import nltk
from nltk import word_tokenize
from nltk.tokenize import wordpunct_tokenize
from nltk.corpus import stopwords
from nltk import RegexpTokenizer
from nltk import stem
from nltk.corpus import stopwords
from nltk.stem.snowball import EnglishStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.grid_search import GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, f1_score, accuracy_score, confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold, cross_val_score, train_test_split
from sklearn.dummy import DummyClassifier
from sklearn.metrics import fbeta_score, make_scorer
from sklearn import cross_validation
from sklearn.base import TransformerMixin
from sklearn.model_selection import learning_curve
from sklearn.metrics import roc_curve, auc



# С чем работаем?

Подготовим данные - прочтём и изучим их сбалансированность.

In [2]:
path = 'smsspamcollection/SMSSpamCollection'
mes = pandas.read_csv(path, sep='\t',names=["label", "message"])
mes['label'] = mes['label'].map({'ham': 0, 'spam': 1}).astype(int)
print(mes.groupby('label').count())

       message
label         
0         4825
1          747


Выборка, очевидно, не сбалансирована - в ~6.5 раз меньше спама чем полезных сообщений.

In [3]:
mes_train, mes_test, label_train, label_test = train_test_split(mes['message'], mes['label'], test_size=0.2)
countvec = CountVectorizer(tokenizer = word_tokenize, stop_words = "english")
#train_bowed_mes = countvec.fit(mes_train)
#test_bowed_mes = countvec.fit(mes_test)
countvec.fit(mes_train)
countvec.fit(mes_test)
train_bowed_mes = countvec.transform(mes_train)
test_bowed_mes = countvec.transform(mes_test)
print(train_bowed_mes.shape, test_bowed_mes.shape)

(4457, 3625) (1115, 3625)


# Dummy classifier

Первым делом проделаем векторизацию, иначе классификатор не сможет обработать то, что мы ему дадим, ведь у нас половина таблицы - это текст. Затем обучим классифиикатор.

In [4]:
#обучение DC
clf = DummyClassifier(strategy='most_frequent', random_state=0)
clf = clf.fit(train_bowed_mes, label_train)
print(classification_report(label_test, clf.predict(test_bowed_mes)))

             precision    recall  f1-score   support

          0       0.87      1.00      0.93       974
          1       0.00      0.00      0.00       141

avg / total       0.76      0.87      0.81      1115



  'precision', 'predicted', average, warn_for)


Мы видим, что DC не справляется с тем, чтобы определить спам. Поэтому работать мы с ним не будем

# Токенизация

Токенизация со знаками препинания. CountVectorizer()

In [5]:
#наивный Байес
naive_model = MultinomialNB()
naive_model.fit(train_bowed_mes, label_train)
#выдача результатов
cv_results = cross_val_score(naive_model, test_bowed_mes, label_test,  cv=10, scoring='accuracy')
print(classification_report(label_test, naive_model.predict(test_bowed_mes)))

             precision    recall  f1-score   support

          0       1.00      0.97      0.98       974
          1       0.81      0.98      0.88       141

avg / total       0.97      0.97      0.97      1115



Токенизация со знаками препинания. TfidfVectorizer()

In [12]:
bow_token_TIV = TfidfVectorizer()
bow_token_TIV.fit(mes_train)
bow_token_TIV.fit(mes_test)
train_bowed_mes = bow_token_TIV.transform(mes_train)
test_bowed_mes = bow_token_TIV.transform(mes_test)
#bowed_messages = bow_token_TIV.transform(mes['message'])
naive_model = MultinomialNB()
naive_model.fit(train_bowed_mes, label_train)
cv_results = cross_val_score(naive_model, test_bowed_mes, label_test,  cv=10, scoring='accuracy')
print(classification_report(label_test, naive_model.predict(test_bowed_mes)))

             precision    recall  f1-score   support

          0       0.98      1.00      0.99       974
          1       0.98      0.84      0.90       141

avg / total       0.98      0.98      0.98      1115



Токенизация без знаков препинания. CountVectorizer()

In [7]:
bow_token_P_CV = CountVectorizer(tokenizer=RegexpTokenizer(r'\w+').tokenize)
#Пытался сунуть другой токенайзер, но он отказался работать((((
bow_token_P_CV.fit(mes_train)
bow_token_P_CV.fit(mes_test)
train_bowed_mes = bow_token_P_CV.transform(mes_train)
test_bowed_mes = bow_token_P_CV.transform(mes_test)
naive_model = MultinomialNB()
naive_model.fit(train_bowed_mes, label_train)
cv_results = cross_val_score(naive_model, test_bowed_mes, label_test,  cv=10, scoring='accuracy')
print(classification_report(label_test, naive_model.predict(test_bowed_mes)))

             precision    recall  f1-score   support

          0       0.99      0.98      0.99       974
          1       0.87      0.96      0.91       141

avg / total       0.98      0.98      0.98      1115



Токенизация без знаков препинания. TfidfVectorizer()

In [13]:
bow_token_P_TIV = TfidfVectorizer(tokenizer=RegexpTokenizer(r'\w+').tokenize)
bow_token_P_TIV.fit(mes_train)
bow_token_P_TIV.fit(mes_test)
train_bowed_mes = bow_token_P_TIV.transform(mes_train)
test_bowed_mes = bow_token_P_TIV.transform(mes_test)
naive_model = MultinomialNB()
naive_model.fit(train_bowed_mes, label_train)
cv_results = cross_val_score(naive_model, test_bowed_mes, label_test,  cv=10, scoring='accuracy')
print(classification_report(label_test, naive_model.predict(test_bowed_mes)))

             precision    recall  f1-score   support

          0       0.98      1.00      0.99       974
          1       0.98      0.84      0.91       141

avg / total       0.98      0.98      0.98      1115



Токенизация со знаками препинания показала себя чуть лучше. Буквально на капельку. Можно сказать, что разницы нет вообще.
Но в обоихъ случаях TfidfVectorizer() явно хуже, чем CountVectorizer()

# Стемминг и лемматизация

Стемминг. CountVectorizer()

In [9]:
bow_stem_CV = CountVectorizer(tokenizer=RegexpTokenizer(r'\w+').tokenize)
bow_stem_CV.fit(mes_train)
bow_stem_CV.fit(mes_test)
train_bowed_mes = bow_stem_CV.transform(mes_train)
test_bowed_mes = bow_stem_CV.transform(mes_test)
naive_model = MultinomialNB()
naive_model.fit(train_bowed_mes, label_train)
cv_results = cross_val_score(naive_model, test_bowed_mes, label_test,  cv=10, scoring='accuracy')
print(classification_report(label_test, naive_model.predict(test_bowed_mes)))

             precision    recall  f1-score   support

          0       0.99      0.98      0.99       974
          1       0.87      0.96      0.91       141

avg / total       0.98      0.98      0.98      1115



Стемминг. TfidfVectorizer()

In [14]:
bow_stem_TIV = TfidfVectorizer()
bow_stem_TIV.fit(mes_train)
bow_stem_TIV.fit(mes_test)
train_bowed_mes = bow_stem_TIV.transform(mes_train)
test_bowed_mes = bow_stem_TIV.transform(mes_test)
naive_model.fit(train_bowed_mes, label_train)
cv_results = cross_val_score(naive_model, test_bowed_mes, label_test,  cv=10, scoring='accuracy')
print(classification_report(label_test, naive_model.predict(test_bowed_mes)))

             precision    recall  f1-score   support

          0       0.98      1.00      0.99       974
          1       0.98      0.84      0.90       141

avg / total       0.98      0.98      0.98      1115



Лемматизация. CountVectorizer()

In [10]:
bow_lem_CV = CountVectorizer(tokenizer=RegexpTokenizer(r'\w+').tokenize)
bow_lem_CV.fit(mes_train)
bow_lem_CV.fit(mes_test)
train_bowed_mes = bow_lem_CV.transform(mes_train)
test_bowed_mes = bow_lem_CV.transform(mes_test)
naive_model = MultinomialNB()
naive_model.fit(train_bowed_mes, label_train)
cv_results = cross_val_score(naive_model, test_bowed_mes, label_test,  cv=10, scoring='accuracy')
print(classification_report(label_test, naive_model.predict(test_bowed_mes)))

             precision    recall  f1-score   support

          0       0.99      0.98      0.99       974
          1       0.87      0.96      0.91       141

avg / total       0.98      0.98      0.98      1115



Лемматизация. TfidfVectorizer()

In [15]:
bow_lem_TIV = TfidfVectorizer(tokenizer=RegexpTokenizer(r'\w+').tokenize)
bow_lem_TIV.fit(mes_train)
bow_lem_TIV.fit(mes_test)
train_bowed_mes = bow_lem_TIV.transform(mes_train)
test_bowed_mes = bow_lem_TIV.transform(mes_test)
naive_model = MultinomialNB()
naive_model.fit(train_bowed_mes, label_train)
cv_results = cross_val_score(naive_model, test_bowed_mes, label_test,  cv=10, scoring='accuracy')
print(classification_report(label_test, naive_model.predict(test_bowed_mes)))

             precision    recall  f1-score   support

          0       0.98      1.00      0.99       974
          1       0.98      0.84      0.91       141

avg / total       0.98      0.98      0.98      1115



Между лемматизацией и стеммингом никакой разницы. При этом CountVectorizer() и TfidfVectorizer() сохраняют свои различия. 

# Стоп-слова

In [11]:
bow_SW_CV = CountVectorizer(tokenizer=RegexpTokenizer(r'\w+').tokenize, stop_words='english')
bow_SW_CV.fit(mes_train)
bow_SW_CV.fit(mes_test)
train_bowed_mes = bow_SW_CV.transform(mes_train)
test_bowed_mes = bow_SW_CV.transform(mes_test)
naive_model = MultinomialNB()
naive_model.fit(train_bowed_mes, label_train)
cv_results = cross_val_score(naive_model, test_bowed_mes, label_test,  cv=10, scoring='accuracy')
print(classification_report(label_test, naive_model.predict(test_bowed_mes)))

             precision    recall  f1-score   support

          0       1.00      0.98      0.99       974
          1       0.86      0.98      0.92       141

avg / total       0.98      0.98      0.98      1115



In [16]:
bow_SW_TIV = TfidfVectorizer(tokenizer=RegexpTokenizer(r'\w+').tokenize)
bow_SW_TIV.fit(mes_train)
bow_SW_TIV.fit(mes_test)
train_bowed_mes = bow_SW_TIV.transform(mes_train)
test_bowed_mes = bow_SW_TIV.transform(mes_test)
naive_model = MultinomialNB()
naive_model.fit(train_bowed_mes, label_train)
cv_results = cross_val_score(naive_model, test_bowed_mes, label_test,  cv=10, scoring='accuracy')
print(classification_report(label_test, naive_model.predict(test_bowed_mes)))

             precision    recall  f1-score   support

          0       0.98      1.00      0.99       974
          1       0.98      0.84      0.91       141

avg / total       0.98      0.98      0.98      1115



Впрочем, ничего нового.

Я искренне надеялся сделать побольше и сам, но увы. Я хотя бы попытался и разобрался в том, что изучил.