# Классификация текстов: спам-фильтр для SMS

В этом задании вам предстоит взять открытый датасет с SMS-сообщениями, размеченными на спам ("spam") и не спам ("ham"), построить на нём классификатор, относящий тексты к одному из этих двух классов, оценить его качество с помощью кросс-валидации, протестировать его работу на отдельных примерах и посмотреть, что будет происходить с качеством, если менять параметры вашей модели.

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import f1_score

import warnings
warnings.filterwarnings('ignore')

In [2]:
def save_answer(fname, number):
    """Write the text form of ``number`` to ``fname``, replacing any existing content.

    Nothing else is written: no trailing newline and no extra formatting.
    """
    with open(fname, 'w') as handle:
        # print() converts with str(); end='' suppresses the newline.
        print(number, end='', file=handle)

In [16]:
def save_answer_array(fname, array):
    """Write the elements of ``array`` to ``fname`` as space-separated text.

    Each element is converted with ``str``; the file's previous content is
    replaced and no trailing newline is added.
    """
    with open(fname, 'w') as handle:
        handle.write(" ".join(map(str, array)))

In [3]:
# Load the tab-separated corpus; the file has no header row, so column names are supplied here.
# NOTE(review): default quote handling is in effect, so messages containing a double quote
# may be parsed across line boundaries — confirm the resulting row count.
data = pd.read_csv(
    "SMSSpamCollection.txt",
    sep='\t',
    header=None,
    names=['class', 'text'],
)

In [4]:
# Show the first rows as a quick check of the parsed 'class' and 'text' columns.
data.head()

Unnamed: 0,class,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
# Encode the string labels as integers: 'spam' -> 1, 'ham' -> 0; any other value becomes 3.
label_codes = {'spam': 1, 'ham': 0}
data['class'] = data['class'].map(lambda label: label_codes.get(label, 3))

In [6]:
# Target vector: the integer-encoded class column as a NumPy array.
labels = data['class'].to_numpy()

In [7]:
# Message texts as a NumPy array; this is what the vectorizers below consume.
text = data['text'].to_numpy()

In [8]:
# Baseline model: token counts fed into logistic regression.
# The first step is named 'vectorizer', consistent with the other pipelines in this
# notebook, so parameters can be addressed uniformly as 'vectorizer__<param>'.
baseline = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('classifier', LogisticRegression(random_state=2)),
])

In [37]:
# Mean F1 of the baseline pipeline over 10-fold cross-validation.
print(np.mean(cross_val_score(baseline, text, labels, scoring='f1', cv=10)))

0.9326402983610631


In [36]:
# Report the mean 10-fold F1 with one digit after the decimal point.
# '.1f' is used on purpose: a bare '.1' means one *significant* digit and switches to
# exponent notation for values that round to 1.0 (e.g. '1e+00').
print('{:.1f}'.format(cross_val_score(baseline, text, labels, cv=10, scoring='f1').mean()))

0.9


In [39]:
# 0.9 is the baseline's mean cross-validated F1 rounded to one decimal, as printed by the previous cell.
save_answer('answer_1', 0.9)

## Task 2

In [12]:
# Count-vectorizer + logistic-regression pipeline, fitted on the whole dataset
# so that it can label new, unseen messages.
clf_pipeline = Pipeline(steps=[
    ('vectorizer', CountVectorizer()),
    ('classifier', LogisticRegression(random_state=2)),
])

clf_pipeline.fit(text, labels)

Pipeline(memory=None,
         steps=[('vectorizer',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('classifier',
                 LogisticRegression(C=1.0, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_scaling=1,
                                    l1_ratio=None, max_iter=100,
                                    multi_class='warn', n_jobs=None,
             

In [14]:
# Hand-written messages used to probe the fitted classifier.
test = [
    "FreeMsg: Txt: CALL to No: 86888 & claim your reward of 3 hours talk time to use from your phone now! Subscribe6GB",
    "FreeMsg: Txt: claim your reward of 3 hours talk time",
    "Have you visited the last lecture on physics?",
    "Have you visited the last lecture on physics? Just buy this book and you will have all materials! Only 99$",
    "Only 99$",
]

In [17]:
# Predicted class for each probe message, using the label encoding above (1 = spam, 0 = ham).
answer_2 = clf_pipeline.predict(test)

In [18]:
# Store the predictions as space-separated values.
save_answer_array('answer_2', answer_2)

## Task 3

In [21]:
# Compare n-gram ranges for the count features: bigrams only, trigrams only,
# and unigrams through trigrams. Each score is the mean 10-fold F1 rounded to 2 decimals.
ngram = [(2, 2), (3, 3), (1, 3)]
answers = []
for n in ngram:
    model_steps = Pipeline(steps=[
        ('vectorizer', CountVectorizer(ngram_range=n)),
        ('classifier', LogisticRegression(random_state=2)),
    ])
    fold_scores = cross_val_score(model_steps, text, labels, scoring='f1', cv=10)
    answers.append(round(fold_scores.mean(), 2))

In [22]:
# Scores are listed in the same order as the n-gram ranges: (2, 2), (3, 3), (1, 3).
print(answers)

[0.82, 0.73, 0.93]


In [23]:
# Store the three logistic-regression scores as space-separated values.
save_answer_array('answer_3', answers)

## Task 4

In [25]:
from sklearn.naive_bayes import MultinomialNB

In [29]:
# Repeat the n-gram comparison with multinomial naive Bayes.
# Unlike the pipelines above, the count matrix is built once from the full corpus
# and then handed to cross-validation, so every fold shares the same vocabulary.
bayes_answers = []
for n in ngram:
    vec = CountVectorizer(ngram_range=n)
    features = vec.fit_transform(text)
    m = MultinomialNB()
    # Score the estimator created above rather than constructing a second, identical one.
    bayes_answers.append(round(cross_val_score(m, features, labels, cv=10, scoring='f1').mean(), 2))

In [30]:
# Naive Bayes scores, in the same order as the n-gram ranges: (2, 2), (3, 3), (1, 3).
bayes_answers

[0.65, 0.38, 0.89]

In [31]:
# Store the three naive Bayes scores as space-separated values.
save_answer_array('answer_4', bayes_answers)

## Task 5

In [33]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [35]:
# Swap raw counts for TF-IDF weights (unigrams only) and report the mean 10-fold F1.
uni_pipeline = Pipeline(steps=[
    ('vectorizer', TfidfVectorizer(ngram_range=(1, 1))),
    ('classifier', LogisticRegression(random_state=2)),
])
print(np.mean(cross_val_score(uni_pipeline, text, labels, scoring='f1', cv=10)))

0.8785100455343396


In [38]:
# TF-IDF unigrams scored about 0.879 mean F1 versus about 0.933 for raw counts (both printed above).
# NOTE(review): -1 presumably encodes "quality decreased" for the grader — confirm against the task statement.
save_answer('answer_5', -1)

### Note

In [42]:
# str.join only accepts strings, so the numbers are converted first.
print(' - '.join(map(str, [1, 2, 3, 4, 5])))

1 - 2 - 3 - 4 - 5


In [48]:
# A trailing empty string makes join emit a separator at the end of the result.
print(':'.join(['a', 'b', 'c', 'd', 'e', ""]))

a:b:c:d:e:


In [50]:
# enumerate pairs each element with its zero-based position.
list(enumerate([1,2,3,4,5]))

[(0, 1), (1, 2), (2, 3), (3, 4), (4, 5)]

In [52]:
# zip pairs up elements that share the same position in both lists.
list(zip([1,2,3,4,5], [6,7,8,9,10]))

[(1, 6), (2, 7), (3, 8), (4, 9), (5, 10)]

In [59]:
from itertools import product

In [60]:
# product yields every (i, j) combination, with the last iterable varying fastest.
list(product(range(5), range(3)))

[(0, 0),
 (0, 1),
 (0, 2),
 (1, 0),
 (1, 1),
 (1, 2),
 (2, 0),
 (2, 1),
 (2, 2),
 (3, 0),
 (3, 1),
 (3, 2),
 (4, 0),
 (4, 1),
 (4, 2)]