In [153]:
import pandas
import numpy as np
import matplotlib.pyplot as plt
from nltk import word_tokenize

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, f1_score, accuracy_score, confusion_matrix, precision_score
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold, cross_val_score, train_test_split

In [63]:
#https://archive.ics.uci.edu/ml/datasets/SMS+Spam+Collection

path = 'C:/Anya/Proga/2/SMSSpamCollection'

messages = pandas.read_csv(path, sep='\t',
                           names=["label", "message"])

In [8]:
print(messages.groupby('label').describe())

                                                        message
label                                                          
ham   count                                                4825
      unique                                               4516
      top                                Sorry, I'll call later
      freq                                                   30
spam  count                                                 747
      unique                                                653
      top     Please call our customer service representativ...
      freq                                                    4


In [48]:
# Датасет не сбалансирован, наблюдений ham - намного больше, чем spam.

In [86]:
# Просто всем новым наблюдениям присваивать класс ham. Оно будет пропускать spam.

In [9]:
messages['length'] = messages['message'].map(lambda text: len(text))

In [188]:
h = 0
messages1 = pandas.DataFrame(columns = ['label', 'message'])
for m in messages.iterrows():
    if m[1]['label'] == 'ham':
        h += 1
        if h < 748:
            messages1.loc[len(messages1)] = [m[1]['label'], m[1]['message']]
    if m[1]['label'] == 'spam':
        messages1.loc[len(messages1)] = [m[1]['label'], m[1]['message']]
print(messages1.groupby('label').describe())

                                                        message
label                                                          
ham   count                                                 747
      unique                                                730
      top                                Sorry, I'll call later
      freq                                                    6
spam  count                                                 747
      unique                                                653
      top     Please call our customer service representativ...
      freq                                                    4


In [121]:
messages1['length'] = messages1['message'].map(lambda text: len(text))
print(messages1.head())

  label                                            message  length
0   ham  Go until jurong point, crazy.. Available only ...     111
1   ham                      Ok lar... Joking wif u oni...      29
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...     155
3   ham  U dun say so early hor... U c already then say...      49
4   ham  Nah I don't think he goes to usf, he lives aro...      61


In [122]:
def split_into_lemmas(message):
    message = message.lower()
    return word_tokenize(message)
bow_transformer = CountVectorizer(analyzer=split_into_lemmas).fit(messages1['message'])

In [123]:
print(bow_transformer.vocabulary_)



In [124]:
messages_bow = bow_transformer.transform(messages1['message'])

In [85]:
print(messages_bow)

  (0, 91)	1
  (0, 118)	2
  (0, 1412)	1
  (0, 1646)	1
  (0, 2117)	1
  (0, 2119)	1
  (0, 2456)	1
  (0, 2746)	1
  (0, 3280)	1
  (0, 4082)	1
  (0, 4137)	1
  (0, 4175)	1
  (0, 4692)	1
  (0, 4995)	1
  (0, 5157)	1
  (0, 5977)	1
  (0, 6341)	1
  (0, 6753)	1
  (0, 8593)	1
  (0, 9027)	1
  (0, 9290)	1
  (0, 9531)	1
  (1, 118)	2
  (1, 4962)	1
  (1, 5194)	1
  :	:
  (5570, 3037)	1
  (5570, 3367)	1
  (5570, 3807)	1
  (5570, 3861)	1
  (5570, 3995)	1
  (5570, 4234)	1
  (5570, 4338)	1
  (5570, 4609)	2
  (5570, 4692)	1
  (5570, 4771)	1
  (5570, 4837)	1
  (5570, 5304)	1
  (5570, 6086)	1
  (5570, 7957)	1
  (5570, 7966)	1
  (5570, 8572)	1
  (5570, 8717)	1
  (5570, 9066)	1
  (5570, 9338)	1
  (5571, 116)	1
  (5571, 4856)	2
  (5571, 5996)	1
  (5571, 7377)	1
  (5571, 8717)	1
  (5571, 8864)	1


In [126]:
import random

In [141]:
ran = random.randint(0, 746)
messageran = messages1['message'][ran]

In [142]:
print(messageran)

Will do. Was exhausted on train this morning. Too much wine and pie. You sleep well too


In [143]:
bow = bow_transformer.transform([messageran])
print(bow)
print(bow.shape)

  (0, 57)	3
  (0, 1030)	1
  (0, 1803)	1
  (0, 1968)	1
  (0, 3038)	1
  (0, 3066)	1
  (0, 3237)	1
  (0, 3378)	1
  (0, 3934)	1
  (0, 4293)	1
  (0, 4359)	2
  (0, 4381)	1
  (0, 4625)	1
  (0, 4663)	1
  (0, 4699)	1
  (0, 4704)	1
  (0, 4871)	1
(1, 4956)


In [144]:
messages_bow = bow_transformer.transform(messages1['message'])

In [145]:
tfidf_transformer = TfidfTransformer().fit(messages_bow)
tfidfran = tfidf_transformer.transform(bow)
print(tfidfran)

  (0, 4871)	0.0881375917584
  (0, 4704)	0.29928522041
  (0, 4699)	0.157215074093
  (0, 4663)	0.205129332571
  (0, 4625)	0.199766731276
  (0, 4381)	0.316113009101
  (0, 4359)	0.449845684328
  (0, 4293)	0.144826888681
  (0, 3934)	0.278084713922
  (0, 3378)	0.316113009101
  (0, 3237)	0.128222833326
  (0, 3066)	0.222678920908
  (0, 3038)	0.245361814744
  (0, 1968)	0.29928522041
  (0, 1803)	0.157215074093
  (0, 1030)	0.122568938149
  (0, 57)	0.199739889516


In [146]:
messages_tfidf = tfidf_transformer.transform(messages_bow)
print(messages_tfidf.shape)

(1494, 4956)


In [147]:
spam_detector = MultinomialNB().fit(messages_tfidf, messages1['label'])

In [148]:
print('predicted:', spam_detector.predict(tfidfran)[0])
print('expected:', messages.label[ran])

predicted: ham
expected: ham


In [149]:
all_predictions = spam_detector.predict(messages_tfidf)
print(all_predictions)

['ham' 'ham' 'spam' ..., 'spam' 'spam' 'spam']


In [None]:
# Это как работает классификатор при токенизации со знаками препинания, без лемматизации, со стоп словами, с TfIdfVectorizer.

In [159]:
print('accuracy', accuracy_score(messages1['label'], all_predictions))
print('precision', precision_score(messages1['label'], all_predictions, average='weighted'))
print('confusion matrix\n', confusion_matrix(messages1['label'], all_predictions))
print('(row=expected, col=predicted)')

accuracy 0.981258366801
precision 0.981603596786
confusion matrix
 [[743   4]
 [ 24 723]]
(row=expected, col=predicted)


In [180]:
import re

In [191]:
punct = ['.', ',', '!', '?', ':', ';', '#', '/', '(', ')', '[', ']', '&']
messages2 = pandas.DataFrame(columns = ['label', 'message'])
for m in messages1.iterrows():
    sent = re.sub('[.,?!;#)(&:]', '', m[1]['message'])
    messages2.loc[len(messages2)] = [m[1]['label'], sent]

In [None]:
messages2['length'] = messages2['message'].map(lambda text: len(text))

In [196]:
def split_into_lemmas(message):
    message = message.lower()
    words = word_tokenize(message)
    return words
bow_transformer2 = CountVectorizer(analyzer=split_into_lemmas).fit(messages2['message'])

In [198]:
print(bow_transformer2.vocabulary_)



In [200]:
messages_bow2 = bow_transformer2.transform(messages2['message'])

In [201]:
print(messages_bow2)

  (0, 952)	1
  (0, 1041)	1
  (0, 1273)	1
  (0, 1275)	1
  (0, 1435)	1
  (0, 1562)	1
  (0, 1796)	1
  (0, 2158)	1
  (0, 2184)	1
  (0, 2202)	1
  (0, 2428)	1
  (0, 2553)	1
  (0, 2617)	1
  (0, 3020)	1
  (0, 3189)	1
  (0, 3363)	1
  (0, 4224)	1
  (0, 4433)	1
  (0, 4565)	1
  (0, 4693)	1
  (1, 2540)	1
  (1, 2641)	1
  (1, 3169)	1
  (1, 3187)	1
  (1, 4393)	1
  :	:
  (1493, 212)	1
  (1493, 361)	1
  (1493, 448)	2
  (1493, 512)	1
  (1493, 1268)	1
  (1493, 1312)	1
  (1493, 1438)	1
  (1493, 1523)	1
  (1493, 1807)	1
  (1493, 2262)	2
  (1493, 2482)	2
  (1493, 2923)	1
  (1493, 3115)	1
  (1493, 3189)	1
  (1493, 3283)	1
  (1493, 3394)	1
  (1493, 3434)	1
  (1493, 4214)	2
  (1493, 4237)	1
  (1493, 4261)	1
  (1493, 4345)	1
  (1493, 4393)	2
  (1493, 4576)	1
  (1493, 4678)	1
  (1493, 4889)	1


In [202]:
ran2 = random.randint(0, 746)
messageran2 = messages2['message'][ran2]

In [203]:
bow2 = bow_transformer2.transform([messageran2])
print(bow2)
print(bow2.shape)

  (0, 822)	1
  (0, 2362)	1
  (0, 2482)	1
  (0, 2626)	1
  (0, 2693)	1
  (0, 3424)	1
  (0, 3842)	1
  (0, 4796)	1
  (0, 4816)	1
(1, 4900)


In [204]:
tfidf_transformer2 = TfidfTransformer().fit(messages_bow2)
tfidfran2 = tfidf_transformer2.transform(bow2)
print(tfidfran2)

  (0, 4816)	0.136785525282
  (0, 4796)	0.317117277522
  (0, 3842)	0.462676464207
  (0, 3424)	0.462676464207
  (0, 2693)	0.278428929016
  (0, 2626)	0.488691186088
  (0, 2482)	0.181524764837
  (0, 2362)	0.28892742535
  (0, 822)	0.140766793195


In [205]:
messages_tfidf2 = tfidf_transformer2.transform(messages_bow2)
print(messages_tfidf2.shape)

(1494, 4900)


In [206]:
spam_detector2 = MultinomialNB().fit(messages_tfidf2, messages2['label'])

In [207]:
print('predicted:', spam_detector2.predict(tfidfran2)[0])
print('expected:', messages.label[ran2])

predicted: ham
expected: ham


In [208]:
all_predictions2 = spam_detector2.predict(messages_tfidf2)
print(all_predictions2)

['ham' 'ham' 'spam' ..., 'spam' 'spam' 'spam']


In [212]:
# Это как работает классификатор при токенизации без знаками препинания, без лемматизации, со стоп словами, с TfIdfVectorizer.

In [209]:
print('accuracy', accuracy_score(messages2['label'], all_predictions2))
print('precision', precision_score(messages2['label'], all_predictions2, average='weighted'))
print('confusion matrix\n', confusion_matrix(messages2['label'], all_predictions2))
print('(row=expected, col=predicted)')

accuracy 0.981258366801
precision 0.98175565494
confusion matrix
 [[745   2]
 [ 26 721]]
(row=expected, col=predicted)


In [211]:
# Accuracy осталась такой же, а precision возросла.

In [234]:
from nltk.stem.snowball import SnowballStemmer
st = SnowballStemmer('english')

In [241]:
messages3 = pandas.DataFrame(columns = ['label', 'message'])
for m in messages2.iterrows():
    words = m[1]['message'].split()
    sent = ''
    for word in words:
        word = st.stem(word)
        sent += word + ' '
    messages3.loc[len(messages3)] = [m[1]['label'],  sent[:-1]]

In [242]:
messages3['length'] = messages3['message'].map(lambda text: len(text))

In [243]:
def split_into_lemmas(message):
    message = message.lower()
    words = word_tokenize(message)
    return words
bow_transformer3 = CountVectorizer(analyzer=split_into_lemmas).fit(messages3['message'])

In [244]:
print(bow_transformer3.vocabulary_)

{'go': 1999, 'until': 3984, 'jurong': 2352, 'point': 3065, 'crazi': 1478, 'avail': 1015, 'onli': 2917, 'in': 2243, 'bugi': 1226, 'n': 2765, 'great': 2041, 'world': 4214, 'la': 2414, 'e': 1679, 'buffet': 1224, 'cine': 1371, 'there': 3800, 'got': 2023, 'amor': 935, 'wat': 4100, 'ok': 2900, 'lar': 2436, 'joke': 2339, 'wif': 4163, 'u': 3945, 'oni': 2916, 'free': 1901, 'entri': 1736, '2': 438, 'a': 812, 'wkli': 4190, 'comp': 1417, 'to': 3845, 'win': 4170, 'fa': 1788, 'cup': 1500, 'final': 1846, 'tkts': 3840, '21st': 460, 'may': 2628, '2005': 449, 'text': 3774, '87121': 757, 'receiv': 3223, 'questionstd': 3172, 'txt': 3930, 'ratetc': 3195, 'appli': 969, '08452810075over18': 113, 'dun': 1675, 'say': 3358, 'so': 3520, 'earli': 1684, 'hor': 2165, 'c': 1249, 'alreadi': 922, 'then': 3798, 'nah': 2770, 'i': 2215, 'do': 1624, "n't": 2766, 'think': 3808, 'he': 2096, 'goe': 2004, 'usf': 4009, 'live': 2493, 'around': 988, 'here': 2124, 'though': 3817, 'freemsg': 1910, 'hey': 2130, 'darl': 1528, 'it': 

In [245]:
messages_bow3 = bow_transformer3.transform(messages2['message'])
print(messages_bow3)

  (0, 1224)	1
  (0, 1371)	1
  (0, 1479)	1
  (0, 1679)	1
  (0, 1999)	1
  (0, 2023)	1
  (0, 2041)	1
  (0, 2243)	1
  (0, 2352)	1
  (0, 2414)	1
  (0, 2765)	1
  (0, 3065)	1
  (0, 3800)	1
  (0, 3984)	1
  (0, 4100)	1
  (0, 4214)	1
  (1, 2436)	1
  (1, 2900)	1
  (1, 2916)	1
  (1, 3945)	1
  (1, 4163)	1
  (2, 113)	1
  (2, 438)	1
  (2, 449)	1
  (2, 460)	1
  :	:
  (1492, 3810)	1
  (1492, 3845)	1
  (1492, 4021)	1
  (1492, 4182)	1
  (1492, 4340)	1
  (1493, 202)	1
  (1493, 351)	1
  (1493, 438)	2
  (1493, 502)	1
  (1493, 1259)	1
  (1493, 1374)	1
  (1493, 1443)	1
  (1493, 2091)	2
  (1493, 2289)	2
  (1493, 2851)	1
  (1493, 2997)	1
  (1493, 3092)	1
  (1493, 3129)	1
  (1493, 3791)	2
  (1493, 3810)	1
  (1493, 3833)	1
  (1493, 3945)	2
  (1493, 4108)	1
  (1493, 4202)	1
  (1493, 4407)	1


In [246]:
ran3 = random.randint(0, 746)
messageran3 = messages3['message'][ran3]

In [247]:
bow3 = bow_transformer3.transform([messageran3])
print(bow3)
print(bow3.shape)

  (0, 928)	1
  (0, 1259)	1
  (0, 1901)	1
  (0, 2215)	1
  (0, 2578)	1
  (0, 2631)	1
  (0, 4334)	1
(1, 4418)


In [248]:
tfidf_transformer3 = TfidfTransformer().fit(messages_bow3)
tfidfran3 = tfidf_transformer3.transform(bow3)
print(tfidfran3)

  (0, 4334)	0.198196406135
  (0, 2631)	0.320737045054
  (0, 2578)	0.708092735637
  (0, 2215)	0.229284007037
  (0, 1901)	0.291328619455
  (0, 1259)	0.224300183128
  (0, 928)	0.410726816407


In [249]:
messages_tfidf3 = tfidf_transformer3.transform(messages_bow3)
print(messages_tfidf3.shape)

(1494, 4418)


In [250]:
spam_detector3 = MultinomialNB().fit(messages_tfidf3, messages3['label'])

In [251]:
print('predicted:', spam_detector3.predict(tfidfran3)[0])
print('expected:', messages.label[ran3])

predicted: ham
expected: ham


In [252]:
all_predictions3 = spam_detector3.predict(messages_tfidf3)
print(all_predictions3)

['ham' 'ham' 'spam' ..., 'spam' 'spam' 'spam']


In [None]:
# Это как работает классификатор при токенизации без знаками препинания, c лемматизацией, со стоп словами, с TfIdfVectorizer.

In [253]:
print('accuracy', accuracy_score(messages3['label'], all_predictions3))
print('precision', precision_score(messages3['label'], all_predictions3, average='weighted'))
print('confusion matrix\n', confusion_matrix(messages3['label'], all_predictions3))
print('(row=expected, col=predicted)')

accuracy 0.973226238286
precision 0.973637056634
confusion matrix
 [[738   9]
 [ 31 716]]
(row=expected, col=predicted)


In [254]:
# И accuracy, и precision упали, поэтому для следующей провекуи мы будем брать предложения без лемматизации

In [257]:
import nltk
from nltk.corpus import stopwords
stops = set(stopwords.words('english'))

In [259]:
messages4 = pandas.DataFrame(columns = ['label', 'message'])
for m in messages2.iterrows():
    words = m[1]['message'].split()
    sent = ''
    for word in words:
        if word not in stops:
            sent += word + ' '
    messages4.loc[len(messages4)] = [m[1]['label'],  sent[:-1]]

In [260]:
messages4['length'] = messages4['message'].map(lambda text: len(text))

In [261]:
def split_into_lemmas(message):
    message = message.lower()
    words = word_tokenize(message)
    return words
bow_transformer4 = CountVectorizer(analyzer=split_into_lemmas).fit(messages4['message'])

In [262]:
print(bow_transformer4.vocabulary_)



In [263]:
messages_bow4 = bow_transformer4.transform(messages4['message'])
print(messages_bow4)

  (0, 951)	1
  (0, 1040)	1
  (0, 1268)	1
  (0, 1270)	1
  (0, 1430)	1
  (0, 1557)	1
  (0, 1788)	1
  (0, 2148)	1
  (0, 2174)	1
  (0, 2192)	1
  (0, 2538)	1
  (0, 2602)	1
  (0, 3003)	1
  (0, 3344)	1
  (0, 4537)	1
  (0, 4664)	1
  (1, 2525)	1
  (1, 2626)	1
  (1, 3151)	1
  (1, 3169)	1
  (1, 4367)	1
  (1, 4604)	1
  (2, 13)	2
  (2, 123)	1
  (2, 448)	1
  :	:
  (1492, 3557)	1
  (1492, 3574)	1
  (1492, 4172)	1
  (1492, 4251)	1
  (1492, 4450)	1
  (1493, 212)	1
  (1493, 361)	1
  (1493, 448)	2
  (1493, 512)	1
  (1493, 1263)	1
  (1493, 1307)	1
  (1493, 1433)	1
  (1493, 1518)	1
  (1493, 1799)	1
  (1493, 2907)	1
  (1493, 3098)	1
  (1493, 3171)	1
  (1493, 3264)	1
  (1493, 3375)	1
  (1493, 3415)	1
  (1493, 4213)	1
  (1493, 4235)	1
  (1493, 4319)	1
  (1493, 4367)	2
  (1493, 4860)	1


In [264]:
ran4 = random.randint(0, 746)
messageran4 = messages4['message'][ran4]

In [265]:
bow4 = bow_transformer4.transform([messageran2])
print(bow4)
print(bow4.shape)

  (0, 822)	1
  (0, 2349)	1
  (0, 2468)	1
  (0, 2611)	1
  (0, 2678)	1
  (0, 3405)	1
  (0, 3822)	1
  (0, 4767)	1
  (0, 4787)	1
(1, 4871)


In [266]:
tfidf_transformer4 = TfidfTransformer().fit(messages_bow4)
tfidfran4 = tfidf_transformer4.transform(bow4)
print(tfidfran4)

  (0, 4787)	0.189196298157
  (0, 4767)	0.288529917855
  (0, 3822)	0.420967294037
  (0, 3405)	0.420967294037
  (0, 2678)	0.253329230892
  (0, 2611)	0.444636851325
  (0, 2468)	0.313214740926
  (0, 2349)	0.290578419657
  (0, 822)	0.286550870327


In [267]:
messages_tfidf4 = tfidf_transformer4.transform(messages_bow4)
print(messages_tfidf4.shape)

(1494, 4871)


In [268]:
spam_detector4 = MultinomialNB().fit(messages_tfidf4, messages4['label'])

In [270]:
print('predicted:', spam_detector4.predict(tfidfran4)[0])
print('expected:', messages.label[ran4])

predicted: ham
expected: ham


In [271]:
all_predictions4 = spam_detector4.predict(messages_tfidf4)
print(all_predictions4)

['ham' 'ham' 'spam' ..., 'spam' 'spam' 'spam']


In [None]:
# Это как работает классификатор при токенизации без знаками препинания, без лемматизации, без стоп слов, с TfIdfVectorizer.

In [273]:
print('accuracy', accuracy_score(messages4['label'], all_predictions4))
print('precision', precision_score(messages4['label'], all_predictions4, average='weighted'))
print('confusion matrix\n', confusion_matrix(messages4['label'], all_predictions4))
print('(row=expected, col=predicted)')

accuracy 0.983266398929
precision 0.983516818475
confusion matrix
 [[743   4]
 [ 21 726]]
(row=expected, col=predicted)


In [274]:
# И accuracy, и precision возросли. Ура! Значит дальше используем предложения без стоп слов.

In [276]:
count_vectorizer = CountVectorizer()

In [277]:
counts = count_vectorizer.fit_transform(messages4['message'].values)

In [281]:
spam_detector5 = MultinomialNB().fit(counts,  messages4['label'].values)

In [283]:
ran5 = random.randint(0, 746)
messageran5 = messages4['message'][ran5]

In [285]:
all_predictions5 = spam_detector5.predict(counts)
print(all_predictions5)

['ham' 'ham' 'spam' ..., 'spam' 'spam' 'spam']


In [288]:
# Это как работает классификатор при токенизации без знаками препинания, без лемматизации, без стоп слов, с CountVectorizer.

In [286]:
print('accuracy', accuracy_score(messages4['label'], all_predictions5))
print('precision', precision_score(messages4['label'], all_predictions5, average='weighted'))
print('confusion matrix\n', confusion_matrix(messages4['label'], all_predictions5))
print('(row=expected, col=predicted)')

accuracy 0.981258366801
precision 0.98128941723
confusion matrix
 [[736  11]
 [ 17 730]]
(row=expected, col=predicted)


In [287]:
# И accuracy, и precision упали, поэтому лучше всего работает классификатор без знаками препинания, без лемматизации, без стоп слов, с TfIdfVectorizer.

In [302]:
from sklearn.model_selection import learning_curve

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [304]:
def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None,
                        n_jobs=-1, train_sizes=np.linspace(.1, 1.0, 5)):   
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Score")
    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    plt.grid()

    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.1,
                     color="r")
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.1, color="g")
    plt.plot(train_sizes, train_scores_mean, 'o-', color="r",
             label="Training score")
    plt.plot(train_sizes, test_scores_mean, 'o-', color="g",
             label="Cross-validation score")

    plt.legend(loc="best")
    return plt

In [307]:
msg_train, msg_test, label_train, label_test = train_test_split(messages['message'], messages['label'], test_size=0.2)

In [None]:
clf = DecisionTreeClassifier(min_samples_split=5)

In [None]:
clf.fit(np.array(X_train), np.array(y_train))