## Определение спама в СМС

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cross_validation import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer



In [2]:
#Загрузим данные
data = pd.read_csv('../Data/SMSSpamCollection.txt', delimiter='\t',header=None,names=['label', 'sms'])
y = np.where(data.label == 'ham',0,1)
sms = data.sms

In [3]:
#Сформируем матрицу признаков
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(sms)
print (X.toarray())

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [4]:
#Оценим модель логистиеской регрессии на униграммах
log_model = LogisticRegression(random_state=2)
results = cross_val_score(log_model, X, y, cv=10, scoring='f1')
print('Mean: ',round(results.mean(),1))
print('Results: ',results)

Mean:  0.9
Results:  [0.95890411 0.89855072 0.91549296 0.95833333 0.93706294 0.91304348
 0.94444444 0.92753623 0.92198582 0.95104895]


In [5]:
#Обучим модельна всей выборке
log_model.fit(X,y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=2, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [6]:
#Сделаем пару предсказаний
test_sms = ["FreeMsg: Txt: CALL to No: 86888 & claim your reward of 3 hours talk time to use from your phone now! Subscribe6GB"
,"FreeMsg: Txt: claim your reward of 3 hours talk time"
,"Have you visited the last lecture on physics?"
,"Have you visited the last lecture on physics? Just buy this book and you will have all materials! Only 99$"
,"Only 99$"]

log_model.predict(vectorizer.transform(test_sms))

array([1, 1, 0, 0, 0])

In [7]:
#Функция для оценки моделей
def test_model(X,y,m,ngram_range):
    v = CountVectorizer(ngram_range=ngram_range)
    x = v.fit_transform(X)
    m.fit(x,y)
    r = cross_val_score(m,x,y,cv=10,scoring='f1')
    return round(r.mean(),2)

In [8]:
#Оценим модель логистической регрессии с добавлением n-грамм
m = LogisticRegression(random_state=2)
log_results = [test_model(sms,y,m,r) for r in [(2,2),(3,3),(1,3)]]
log_results

[0.82, 0.73, 0.93]

In [9]:
#Оценим модель наивного Байеса с добавлением n-грамм
m = MultinomialNB()
bayes_results = [test_model(sms,y,m,r) for r in [(2,2),(3,3),(1,3)]]
bayes_results

[0.65, 0.38, 0.89]

In [10]:
#Оценим модель логистической регрессии на TF-IDF
m = LogisticRegression(random_state=2)
tfid_vectorizer = TfidfVectorizer()
X2 = tfid_vectorizer.fit_transform(sms)
m.fit(X2,y)
tfid_results = cross_val_score(m,X2,y,cv=10,scoring='f1')
tfid_results.mean()

0.8528599554172456