In [69]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import re
import string
from unidecode import unidecode
import emoji

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score


import nltk
from nltk.tokenize import word_tokenize, TweetTokenizer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
# nltk.download('punkt')
# nltk.download('stopwords')
# nltk.download('wordnet')

## Data loading

In [163]:
# Relative path of the labelled CSV; it is resolved against the current working directory.
file_train = 'train_spam.csv'

data_train_all = pd.read_csv(file_train)

In [164]:
# Raw message bodies and their string labels, taken from the loaded frame.
texts_train_all = data_train_all['text']
labels_all = data_train_all['text_type']
# Integer-encode the labels in the listed category order: 'ham' -> 0, 'spam' -> 1.
# pandas assigns code -1 to any label that is not in the category list.
target_all = pd.Categorical(labels_all, categories=['ham', 'spam']).codes

In [165]:
# Print a 20-row slice (300:320) of the raw texts as a NumPy array.
print(texts_train_all[300:320].to_numpy())

['development of a program in econo physics hello shirley i d understood from yannis that he d proposed a brown bag lunch where i d give a talk and then the discussion about various possibilities would follow i m here until about mid may and then will go on leave in europe for a year i could be available to make trips back and forth from time to time to get things started best regards joe mccauley'
 'at 1635465 1635465 pm 1635465 on 1635465 1635465 1635465 mr fork wrote i realize now that after reviewing the past several years of work and career i have been in the wrong business the wrong business this is what i should have been doing url a new manual has been published url or the one i read http http search barnesandnoble com booksearch isbninquiry asp userid 1635465t1635465jtegkqp isbn 1635465 and of course the open source versions url url r a hettinga mailto rah ibuc com the internet bearer underwriting corporation url 1635465 farquhar street boston ma 1635465 usa however it may des

## Preprocessing

In [166]:
# Shared NLTK helpers used by the preprocessing steps defined further down.
lemmatizer = WordNetLemmatizer()

tokenizer = TweetTokenizer()

# Stored as a set so the membership test in the stop-word filter is a hash lookup.
ENGLISH_STOP_WORDS = set(stopwords.words('english'))

In [217]:
def remove_hyperlink(words):
    """Delete every whitespace-delimited chunk that starts with ``http``.

    The match is case-insensitive because the pipelines call this step before
    lower-casing, so links written as ``HTTP://...`` or ``Https://...`` must be
    caught as well.

    Args:
        words: Input text as a single string.

    Returns:
        The text with each matched chunk replaced by the empty string
        (surrounding whitespace is left as is).
    """
    return re.sub(r"http\S+", "", words, flags=re.IGNORECASE)

def to_lower(words):
    """Return a lower-cased copy of the input string."""
    return words.lower()

def remove_underscore(words):
    """Return the text with every underscore turned into a single space."""
    return words.replace('_', ' ')

def remove_punctuation(words):
    """Replace each punctuation character matched by the class below with one space.

    The character class does not contain the underscore, the dollar sign or the
    backslash, so those three characters pass through unchanged.
    """
    punctuation_class = r'[!"#%&\'()*+,-./:;<=>?@\[\]^`{|}~]'
    return re.sub(punctuation_class, ' ', words)

def emoji_to_text(words):
    """Pass the text through ``emoji.demojize`` and return its result."""
    demojized = emoji.demojize(words)
    return demojized

def replace_emoji(words):
    """Swap every emoji found by ``emoji.replace_emoji`` for the ' <EMOJI> ' placeholder."""
    placeholder = ' <EMOJI> '
    return emoji.replace_emoji(words, replace=placeholder)

def replace_non_ascii(words, replace=' <NONASCII> '):
    """Substitute runs of non-ASCII characters that sit between two word boundaries.

    Because the pattern is anchored with ``\\b`` on both sides, a non-ASCII run
    is only replaced when a word boundary exists immediately before and after
    it; a non-ASCII letter inside an otherwise ASCII word is left alone.

    Args:
        words: Input text as a single string.
        replace: Replacement text inserted for each match.

    Returns:
        The text after substitution.
    """
    return re.sub(r'\b[^\x00-\x7F]+\b', replace, words)

def remove_whitespace(words):
    """Trim leading and trailing whitespace; inner whitespace is not touched."""
    trimmed = words.strip()
    return trimmed

def replace_newline(words):
    """Replace every newline character with a single space.

    A space (rather than the empty string) is used so that the last word of one
    line and the first word of the next line stay separate tokens instead of
    fusing into one.

    Args:
        words: Input text as a single string.

    Returns:
        The text with each ``\\n`` replaced by ``' '``.
    """
    return words.replace('\n', ' ')

def remove_number(words, replace=' <NUMBER> '):
    """Substitute every word-character run that contains at least one digit.

    Args:
        words: Input text as a single string.
        replace: Replacement text inserted for each digit-bearing run.

    Returns:
        The text after substitution.
    """
    digit_token = r'\b\w*\d\w*\b'
    return re.sub(digit_token, replace, words)

def remove_currency(words, replace=' <CURRENCY> '):
    """Substitute each pound, dollar, euro or rupee sign in the text.

    Args:
        words: Input text as a single string.
        replace: Replacement text inserted for each currency sign.

    Returns:
        The text after substitution.
    """
    return re.sub(r'[£$€₹]', replace, words)

def remove_stop_words(words):
    """Return a new list holding the items of ``words`` that are not in ENGLISH_STOP_WORDS.

    The original order of the kept items is preserved.
    """
    kept = []
    for token in words:
        if token not in ENGLISH_STOP_WORDS:
            kept.append(token)
    return kept

def word_lemmatizer(words):
    """Apply the shared lemmatizer to each item of ``words`` and return the results as a list."""
    return list(map(lemmatizer.lemmatize, words))

Можно добавить обработку !, %, #, @, ?

In [168]:
def print_tokenized_data(data_tokenized, b=0, e=None):
    """Print the entries ``data_tokenized[b:e]``, one per line, tokens joined by a space.

    Args:
        data_tokenized: Sequence of token sequences.
        b: Start index of the slice to print.
        e: End index of the slice (exclusive); ``None`` means up to the end.
    """
    selected = data_tokenized[b:e]
    for tokens in selected:
        line = ' '.join(tokens)
        print(line)


def preprocess_pipeline(sentence, preprocess_utils=None):
    """Feed ``sentence`` through a chain of callables, each receiving the previous result.

    Args:
        sentence: The initial value handed to the first step.
        preprocess_utils: Ordered list of one-argument callables. When ``None``,
            the default step list below is used.

    Returns:
        Whatever the last step returns (the input itself if the list is empty).
    """
    steps = preprocess_utils
    if steps is None:
        steps = [
            remove_hyperlink,
            replace_newline,
            to_lower,
            replace_emoji,       # alternative: emoji_to_text
            remove_currency,     # alternative: skip this step
            remove_number,       # alternative: skip this step
            replace_non_ascii,   # alternative: something that handles unusual fonts (replace_non_ascii, unidecode)
            remove_punctuation,
            remove_whitespace,
            tokenizer.tokenize,
            remove_stop_words,
            word_lemmatizer,
        ]

    result = sentence
    for step in steps:
        result = step(result)
    return result

In [169]:
# Run every raw text through preprocess_pipeline with its default step list.
data_train_tokenized = [
    preprocess_pipeline(sent) for sent in texts_train_all.to_numpy()
]

In [170]:
# Print entries 20-39 of the tokenized corpus for a visual check.
print_tokenized_data(data_train_tokenized, 20, 40)

hi building rpm resin webserver basically want install entire tarball diretory tarball includes subdirectory spec install NUMBER name version release rpmbuildroot usr local resin getting install resin NUMBER NUMBER NUMBER bin directory install resin NUMBER NUMBER NUMBER conf directory proper nice way handle seem super human misunderstood c dream theater mark url icq NUMBER jid talios url rpm list mailing list rpm list freshrpms net url
main symptom manifestation NUMBER main manifestation NUMBER patient fever dry cough fatigue patient diminished lost sense smell taste first symptom small number patient accompanied nasal congestion runny nose sore throat conjunctivitis myalgia diarrheadefeating NUMBER requires joint effort people india sun pharma best click participate parttime task fighting NUMBER get daily reward
would like get free navigation forever map NUMBER country nokia phone learn install
prospective NUMBER NUMBER houston visit ehud june NUMBER work want firm vince ehud ronn NUM

In [171]:
# Spot check: run the default pipeline on a sentence typed with non-ASCII letter glyphs.
text = '𝑰𝒕𝒔 𝒕𝒉𝒆 𝒔𝒆𝒄𝒐𝒏𝒅 𝒕𝒊𝒎𝒆 𝒐𝒇 𝒓𝒆𝒄𝒆𝒊𝒗𝒊𝒏𝒈 𝒎𝒚 𝒑𝒓𝒐𝒇𝒊𝒕 𝒇𝒓𝒐𝒎 𝒕𝒉𝒊𝒔 𝒊𝒏𝒗𝒆𝒔𝒕𝒎𝒆𝒏𝒕 𝒑𝒍𝒂𝒕𝒇𝒐𝒓𝒎 𝒂𝒏𝒅 𝑰 𝒏𝒆𝒆𝒅 𝒕𝒐 𝒕𝒆𝒔𝒕𝒊𝒇𝒚 𝒕𝒐 𝒚𝒐𝒖 𝒕𝒉𝒂𝒕 𝑰 𝒋𝒖𝒔𝒕 𝒓𝒆𝒄𝒆𝒊𝒗𝒆𝒅'
# NOTE: `text` is rebound here from the input string to the pipeline's return value.
text = preprocess_pipeline(text)
print(text)

['NONASCII', 'NONASCII', 'NONASCII', 'NONASCII', 'NONASCII', 'NONASCII', 'NONASCII', 'NONASCII', 'NONASCII', 'NONASCII', 'NONASCII', 'NONASCII', 'NONASCII', 'NONASCII', 'NONASCII', 'NONASCII', 'NONASCII', 'NONASCII', 'NONASCII', 'NONASCII', 'NONASCII', 'NONASCII', 'NONASCII']


In [172]:
# Seeded, label-stratified hold-out split; no test_size is passed, so the
# scikit-learn default proportion applies.
texts_train, texts_val, y_train, y_val = train_test_split(
    data_train_tokenized,
    target_all,
    shuffle=True,
    random_state=42,
    stratify=target_all,
)

## Feature extraction

In [173]:
# TfidfVectorizer is fed whole strings here, so each token list is re-joined with spaces.
texts_train_strings = [" ".join(sentence) for sentence in texts_train]

# Upper bound on the TF-IDF vocabulary size.
max_features = 40000
# lowercase=False: the vectorizer does not lowercase its input itself.
vectorizer = TfidfVectorizer(lowercase=False, max_features=max_features)
# The vocabulary is fitted on the training split only.
vectorizer.fit(texts_train_strings)

In [174]:
def convert_to_feature(data):
    """Join each token sequence with spaces and transform the strings with the global vectorizer.

    Args:
        data: Iterable of token sequences.

    Returns:
        The result of ``vectorizer.transform`` on the joined strings.
    """
    joined = []
    for tokens in data:
        joined.append(" ".join(tokens))
    return vectorizer.transform(joined)

In [175]:
# Turn both splits into TF-IDF features with the already fitted vectorizer.
X_train = convert_to_feature(texts_train)
X_val = convert_to_feature(texts_val)
# Shapes are printed as (n_samples, n_features) for train and validation.
print(X_train.shape, X_val.shape)

(12208, 33599) (4070, 33599)


## Training naive bayes model

In [176]:
# Gaussian naive Bayes classifier with default settings.
clf = GaussianNB()

# The feature matrix is converted to a dense array before fitting.
clf.fit(X_train.toarray(), y_train)

In [177]:
def calc_metrics(model, X_test, y_test):
    """Score a fitted probabilistic classifier on one feature/label pair.

    Predicted labels are the column index of the highest probability returned by
    ``model.predict_proba``; column 1 is used as the score for ROC AUC.

    Args:
        model: Object exposing ``predict_proba``.
        X_test: Feature matrix exposing ``toarray()``.
        y_test: True labels.

    Returns:
        Tuple ``(accuracy, precision, recall, f1, roc_auc)``.
    """
    probabilities = model.predict_proba(X_test.toarray())
    predicted_labels = np.argmax(probabilities, axis=1)
    positive_scores = probabilities[:, 1]

    accuracy = accuracy_score(y_test, predicted_labels)
    precision = precision_score(y_test, predicted_labels)
    recall = recall_score(y_test, predicted_labels)
    f1 = f1_score(y_test, predicted_labels)
    roc_auc = roc_auc_score(y_test, positive_scores)
    return accuracy, precision, recall, f1, roc_auc

In [178]:
# Score the fitted classifier on the validation split and print all five metrics on one line.
accuracy_val, precision_val, recall_val, f1_val, roc_auc_val = calc_metrics(clf, X_val, y_val)

print(
    f'Accuracy {accuracy_val:.4f}, Precision {precision_val:.4f}, '
    f'Recall {recall_val:.4f}, f1 {f1_val:.4f}, ROC AUC {roc_auc_val:.5f}'
)

Accuracy 0.7609, Precision 0.5652, Recall 0.8253, f1 0.6710, ROC AUC 0.77957


# Testing

## Pipeline 1
Ничего не делаем c эмоджи, ничего не делаем с currency, ничего не делаем с numbers, ничего не делаем с nonascii

In [179]:
# Pipeline 1: baseline steps only, with no emoji, currency, number or non-ASCII handling.
preprocess_utils = [
    remove_hyperlink,
    replace_newline,
    to_lower,
    remove_punctuation,
    remove_whitespace,
    tokenizer.tokenize,
    remove_stop_words,
    word_lemmatizer,
]

In [180]:
# Re-tokenize the whole corpus with the step list currently held in preprocess_utils.
data_train_tokenized = [
    preprocess_pipeline(sent, preprocess_utils) for sent in texts_train_all.to_numpy()
]

# Print entries 20-39 for a visual check.
print_tokenized_data(data_train_tokenized, 20, 40)

hi building rpm resin webserver basically want install entire tarball diretory tarball includes subdirectory spec install 1635465 name version release rpmbuildroot usr local resin getting install resin 1635465 1635 465 1635465 bin directory install resin 1635465 1635 465 1635465 conf directory proper nice way handle seem super human misunderstood c dream theater mark url icq 1635465 jid talios url rpm list mailing list rpm list freshrpms net url
main symptom manifestation covid 19the main manifestation covid 19 patient fever dry cough fatigue patient diminished lost sense smell taste first symptom small number patient accompanied nasal congestion runny nose sore throat conjunctivitis myalgia diarrheadefeating covid 19 requires joint effort people india sun pharma best click participate parttime task fighting covid 19 get daily reward
would like get free navigation forever map 75 country nokia phone learn install
prospective 6 22 houston visit ehud june 22 work want firm vince ehud ronn

In [181]:
# Seeded, label-stratified hold-out split of the freshly tokenized corpus.
texts_train, texts_val, y_train, y_val = train_test_split(
    data_train_tokenized,
    target_all,
    shuffle=True,
    random_state=42,
    stratify=target_all,
)

# Re-join the tokens of each document into one space-separated string.
texts_train_strings = [" ".join(tokens) for tokens in texts_train]
texts_val_strings = [" ".join(tokens) for tokens in texts_val]

In [193]:
# Upper bound on the TF-IDF vocabulary size.
max_features = 35000

# lowercase=False: the vectorizer does not lowercase its input itself.
vectorizer = TfidfVectorizer(lowercase=False, max_features=max_features)
# The vocabulary is fitted on the training strings only.
vectorizer.fit(texts_train_strings)

In [194]:
# Transform both splits with the vectorizer fitted on the training strings.
X_train = vectorizer.transform(texts_train_strings)
X_val = vectorizer.transform(texts_val_strings)
print(X_train.shape, X_val.shape)

(12208, 35000) (4070, 35000)


In [196]:
# New Gaussian naive Bayes model for this pipeline.
clf = GaussianNB()

# The feature matrix is converted to a dense array before fitting.
clf.fit(X_train.toarray(), y_train)

In [197]:
# Validation metrics for the classifier trained on this pipeline's features.
accuracy_val, precision_val, recall_val, f1_val, roc_auc_val = calc_metrics(clf, X_val, y_val)

print(
    f'Accuracy {accuracy_val:.4f}, Precision {precision_val:.4f}, '
    f'Recall {recall_val:.4f}, f1 {f1_val:.4f}, ROC AUC {roc_auc_val:.5f}'
)

Accuracy 0.8106, Precision 0.6375, Recall 0.8311, f1 0.7216, ROC AUC 0.81657


## Pipeline 2

Переводим эмоджи в текст.

Ничего не делаем с currency, ничего не делаем с numbers, ничего не делаем с nonascii.

Гипотеза: все должно остаться как в случае 1.

In [218]:
# Pipeline 2: baseline steps plus underscore removal and emoji-to-text conversion.
preprocess_utils = [
    remove_hyperlink,
    replace_newline,
    to_lower,
    remove_underscore,
    emoji_to_text,
    remove_punctuation,
    remove_whitespace,
    tokenizer.tokenize,
    remove_stop_words,
    word_lemmatizer,
]

In [219]:
# Re-tokenize the whole corpus with the step list currently held in preprocess_utils.
data_train_tokenized = [
    preprocess_pipeline(sent, preprocess_utils) for sent in texts_train_all.to_numpy()
]

# Print entries 20-39 for a visual check.
print_tokenized_data(data_train_tokenized, 20, 40)

hi building rpm resin webserver basically want install entire tarball diretory tarball includes subdirectory spec install 1635465 name version release rpmbuildroot usr local resin getting install resin 1635465 1635 465 1635465 bin directory install resin 1635465 1635 465 1635465 conf directory proper nice way handle seem super human misunderstood c dream theater mark url icq 1635465 jid talios url rpm list mailing list rpm list freshrpms net url
main symptom manifestation covid 19 main manifestation covid 19 patient fever dry cough fatigue patient diminished lost sense smell taste first symptom small number patient accompanied nasal congestion runny nose sore throat conjunctivitis myalgia diarrheadefeating covid 19 requires joint effort people india sun pharma best click participate parttime task fighting covid 19 get daily reward
would like get free navigation forever map 75 country nokia phone learn install
prospective 6 22 houston visit ehud june 22 work want firm vince ehud ronn 05

In [220]:
# Seeded, label-stratified hold-out split of the freshly tokenized corpus.
texts_train, texts_val, y_train, y_val = train_test_split(
    data_train_tokenized,
    target_all,
    shuffle=True,
    random_state=42,
    stratify=target_all,
)

# Re-join the tokens of each document into one space-separated string.
texts_train_strings = [" ".join(tokens) for tokens in texts_train]
texts_val_strings = [" ".join(tokens) for tokens in texts_val]

In [221]:
# Upper bound on the TF-IDF vocabulary size.
max_features = 35000

# lowercase=False: the vectorizer does not lowercase its input itself.
vectorizer = TfidfVectorizer(lowercase=False, max_features=max_features)
# The vocabulary is fitted on the training strings only.
vectorizer.fit(texts_train_strings)

In [222]:
# Transform both splits with the vectorizer fitted on the training strings.
X_train = vectorizer.transform(texts_train_strings)
X_val = vectorizer.transform(texts_val_strings)
print(X_train.shape, X_val.shape)

(12208, 35000) (4070, 35000)


In [223]:
# New Gaussian naive Bayes model for this pipeline.
clf = GaussianNB()

# The feature matrix is converted to a dense array before fitting.
clf.fit(X_train.toarray(), y_train)

In [224]:
# Validation metrics for the classifier trained on this pipeline's features.
accuracy_val, precision_val, recall_val, f1_val, roc_auc_val = calc_metrics(clf, X_val, y_val)

print(
    f'Accuracy {accuracy_val:.4f}, Precision {precision_val:.4f}, '
    f'Recall {recall_val:.4f}, f1 {f1_val:.4f}, ROC AUC {roc_auc_val:.5f}'
)

Accuracy 0.8179, Precision 0.6458, Recall 0.8494, f1 0.7337, ROC AUC 0.82690


ROC AUC чуть улучшился по сравнению с 1, как и все остальные метрики. Гипотеза опровергнута. Это странно.

## Pipeline 3

Переводим эмоджи в текст. Переводим нестандартные шрифты в ascii.

Ничего не делаем с currency, ничего не делаем с numbers.

Гипотеза: должно стать хуже, чем в 2.

In [225]:
# Pipeline 3: as pipeline 2, with unidecode transliteration applied right after the emoji step.
preprocess_utils = [
    remove_hyperlink,
    replace_newline,
    to_lower,
    remove_underscore,
    emoji_to_text,
    unidecode,
    remove_punctuation,
    remove_whitespace,
    tokenizer.tokenize,
    remove_stop_words,
    word_lemmatizer,
]

In [226]:
# Re-tokenize the whole corpus with the step list currently held in preprocess_utils.
data_train_tokenized = [
    preprocess_pipeline(sent, preprocess_utils) for sent in texts_train_all.to_numpy()
]

# Print entries 20-39 for a visual check.
print_tokenized_data(data_train_tokenized, 20, 40)

hi building rpm resin webserver basically want install entire tarball diretory tarball includes subdirectory spec install 1635465 name version release rpmbuildroot usr local resin getting install resin 1635465 1635 465 1635465 bin directory install resin 1635465 1635 465 1635465 conf directory proper nice way handle seem super human misunderstood c dream theater mark url icq 1635465 jid talios url rpm list mailing list rpm list freshrpms net url
main symptom manifestation covid 19 main manifestation covid 19 patient fever dry cough fatigue patient diminished lost sense smell taste first symptom small number patient accompanied nasal congestion runny nose sore throat conjunctivitis myalgia diarrheadefeating covid 19 requires joint effort people india sun pharma best click participate parttime task fighting covid 19 get daily reward
would like get free navigation forever map 75 country nokia phone learn install
prospective 6 22 houston visit ehud june 22 work want firm vince ehud ronn 05

In [227]:
# Seeded, label-stratified hold-out split of the freshly tokenized corpus.
texts_train, texts_val, y_train, y_val = train_test_split(
    data_train_tokenized,
    target_all,
    shuffle=True,
    random_state=42,
    stratify=target_all,
)

# Re-join the tokens of each document into one space-separated string.
texts_train_strings = [" ".join(tokens) for tokens in texts_train]
texts_val_strings = [" ".join(tokens) for tokens in texts_val]

In [228]:
# Upper bound on the TF-IDF vocabulary size.
max_features = 35000

# lowercase=False: the vectorizer does not lowercase its input itself.
vectorizer = TfidfVectorizer(lowercase=False, max_features=max_features)
# The vocabulary is fitted on the training strings only.
vectorizer.fit(texts_train_strings)

In [229]:
# Transform both splits with the vectorizer fitted on the training strings.
X_train = vectorizer.transform(texts_train_strings)
X_val = vectorizer.transform(texts_val_strings)
print(X_train.shape, X_val.shape)

(12208, 35000) (4070, 35000)


In [230]:
# New Gaussian naive Bayes model for this pipeline.
clf = GaussianNB()

# The feature matrix is converted to a dense array before fitting.
clf.fit(X_train.toarray(), y_train)

In [231]:
# Validation metrics for the classifier trained on this pipeline's features.
accuracy_val, precision_val, recall_val, f1_val, roc_auc_val = calc_metrics(clf, X_val, y_val)

print(
    f'Accuracy {accuracy_val:.4f}, Precision {precision_val:.4f}, '
    f'Recall {recall_val:.4f}, f1 {f1_val:.4f}, ROC AUC {roc_auc_val:.5f}'
)

Accuracy 0.7769, Precision 0.5828, Recall 0.8611, f1 0.6951, ROC AUC 0.80123


Стало сильно хуже, чем 1 и 2. Гипотеза верна.

## Pipeline 4

Заменяем все эмоджи на EMOJI.

Ничего не делаем с currency, ничего не делаем с numbers, ничего не делаем с nonascii.

Гипотеза: все должно стать лучше, чем в 1 и 2.

In [232]:
# Pipeline 4: emoji are swapped for the ' <EMOJI> ' placeholder instead of being converted to text.
preprocess_utils = [
    remove_hyperlink,
    replace_newline,
    to_lower,
    remove_underscore,
    replace_emoji,
    remove_punctuation,
    remove_whitespace,
    tokenizer.tokenize,
    remove_stop_words,
    word_lemmatizer,
]

In [233]:
# Re-tokenize the whole corpus with the step list currently held in preprocess_utils.
data_train_tokenized = [
    preprocess_pipeline(sent, preprocess_utils) for sent in texts_train_all.to_numpy()
]

# Print entries 20-39 for a visual check.
print_tokenized_data(data_train_tokenized, 20, 40)

hi building rpm resin webserver basically want install entire tarball diretory tarball includes subdirectory spec install 1635465 name version release rpmbuildroot usr local resin getting install resin 1635465 1635 465 1635465 bin directory install resin 1635465 1635 465 1635465 conf directory proper nice way handle seem super human misunderstood c dream theater mark url icq 1635465 jid talios url rpm list mailing list rpm list freshrpms net url
main symptom manifestation covid 19 main manifestation covid 19 patient fever dry cough fatigue patient diminished lost sense smell taste first symptom small number patient accompanied nasal congestion runny nose sore throat conjunctivitis myalgia diarrheadefeating covid 19 requires joint effort people india sun pharma best click participate parttime task fighting covid 19 get daily reward
would like get free navigation forever map 75 country nokia phone learn install
prospective 6 22 houston visit ehud june 22 work want firm vince ehud ronn 05

In [234]:
# Seeded, label-stratified hold-out split of the freshly tokenized corpus.
texts_train, texts_val, y_train, y_val = train_test_split(
    data_train_tokenized,
    target_all,
    shuffle=True,
    random_state=42,
    stratify=target_all,
)

# Re-join the tokens of each document into one space-separated string.
texts_train_strings = [" ".join(tokens) for tokens in texts_train]
texts_val_strings = [" ".join(tokens) for tokens in texts_val]

In [235]:
# Upper bound on the TF-IDF vocabulary size.
max_features = 35000

# lowercase=False: the vectorizer does not lowercase its input itself.
vectorizer = TfidfVectorizer(lowercase=False, max_features=max_features)
# The vocabulary is fitted on the training strings only.
vectorizer.fit(texts_train_strings)

In [236]:
# Transform both splits with the vectorizer fitted on the training strings.
X_train = vectorizer.transform(texts_train_strings)
X_val = vectorizer.transform(texts_val_strings)
print(X_train.shape, X_val.shape)

(12208, 35000) (4070, 35000)


In [237]:
# New Gaussian naive Bayes model for this pipeline.
clf = GaussianNB()

# The feature matrix is converted to a dense array before fitting.
clf.fit(X_train.toarray(), y_train)

In [238]:
# Validation metrics for the classifier trained on this pipeline's features.
accuracy_val, precision_val, recall_val, f1_val, roc_auc_val = calc_metrics(clf, X_val, y_val)

print(
    f'Accuracy {accuracy_val:.4f}, Precision {precision_val:.4f}, '
    f'Recall {recall_val:.4f}, f1 {f1_val:.4f}, ROC AUC {roc_auc_val:.5f}'
)

Accuracy 0.8059, Precision 0.6314, Recall 0.8236, f1 0.7148, ROC AUC 0.81139


Стало хуже. Гипотеза опровергнута.

## Pipeline 5.

Заменяем все эмоджи на EMOJI. Заменяем нестандартные шрифты на NONASCII.

Ничего не делаем с currency, ничего не делаем с numbers.

Гипотеза: все должно стать лучше, чем в 1 и 2.

In [239]:
# Pipeline 5: ' <EMOJI> ' placeholder for emoji plus ' <NONASCII> ' placeholder for non-ASCII runs.
preprocess_utils = [
    remove_hyperlink,
    replace_newline,
    to_lower,
    remove_underscore,
    replace_emoji,
    replace_non_ascii,
    remove_punctuation,
    remove_whitespace,
    tokenizer.tokenize,
    remove_stop_words,
    word_lemmatizer,
]

In [240]:
# Re-tokenize the whole corpus with the step list currently held in preprocess_utils.
data_train_tokenized = [
    preprocess_pipeline(sent, preprocess_utils) for sent in texts_train_all.to_numpy()
]

# Print entries 20-39 for a visual check.
print_tokenized_data(data_train_tokenized, 20, 40)

hi building rpm resin webserver basically want install entire tarball diretory tarball includes subdirectory spec install 1635465 name version release rpmbuildroot usr local resin getting install resin 1635465 1635 465 1635465 bin directory install resin 1635465 1635 465 1635465 conf directory proper nice way handle seem super human misunderstood c dream theater mark url icq 1635465 jid talios url rpm list mailing list rpm list freshrpms net url
main symptom manifestation covid 19 main manifestation covid 19 patient fever dry cough fatigue patient diminished lost sense smell taste first symptom small number patient accompanied nasal congestion runny nose sore throat conjunctivitis myalgia diarrheadefeating covid 19 requires joint effort people india sun pharma best click participate parttime task fighting covid 19 get daily reward
would like get free navigation forever map 75 country nokia phone learn install
prospective 6 22 houston visit ehud june 22 work want firm vince ehud ronn 05

In [241]:
# Seeded, label-stratified hold-out split of the freshly tokenized corpus.
texts_train, texts_val, y_train, y_val = train_test_split(
    data_train_tokenized,
    target_all,
    shuffle=True,
    random_state=42,
    stratify=target_all,
)

# Re-join the tokens of each document into one space-separated string.
texts_train_strings = [" ".join(tokens) for tokens in texts_train]
texts_val_strings = [" ".join(tokens) for tokens in texts_val]

In [242]:
# Upper bound on the TF-IDF vocabulary size.
max_features = 35000

# lowercase=False: the vectorizer does not lowercase its input itself.
vectorizer = TfidfVectorizer(lowercase=False, max_features=max_features)
# The vocabulary is fitted on the training strings only.
vectorizer.fit(texts_train_strings)

In [243]:
# Transform both splits with the vectorizer fitted on the training strings.
X_train = vectorizer.transform(texts_train_strings)
X_val = vectorizer.transform(texts_val_strings)
print(X_train.shape, X_val.shape)

(12208, 35000) (4070, 35000)


In [244]:
# New Gaussian naive Bayes model for this pipeline.
clf = GaussianNB()

# The feature matrix is converted to a dense array before fitting.
clf.fit(X_train.toarray(), y_train)

In [245]:
# Validation metrics for the classifier trained on this pipeline's features.
accuracy_val, precision_val, recall_val, f1_val, roc_auc_val = calc_metrics(clf, X_val, y_val)

print(
    f'Accuracy {accuracy_val:.4f}, Precision {precision_val:.4f}, '
    f'Recall {recall_val:.4f}, f1 {f1_val:.4f}, ROC AUC {roc_auc_val:.5f}'
)

Accuracy 0.7676, Precision 0.5729, Recall 0.8369, f1 0.6802, ROC AUC 0.78769


Стало значительно хуже.

## Pipeline 6

Переводим эмоджи в текст. Заменяем все числа на NUMBER.

Ничего не делаем с currency, ничего не делаем с nonascii.

Гипотеза: должно стать лучше случая 2.

In [246]:
# Pipeline 6: emoji converted to text; digit-bearing tokens swapped for the ' <NUMBER> ' placeholder.
preprocess_utils = [
    remove_hyperlink,
    replace_newline,
    to_lower,
    remove_underscore,
    emoji_to_text,
    remove_number,
    remove_punctuation,
    remove_whitespace,
    tokenizer.tokenize,
    remove_stop_words,
    word_lemmatizer,
]

In [247]:
# Re-tokenize the whole corpus with the step list currently held in preprocess_utils.
data_train_tokenized = [
    preprocess_pipeline(sent, preprocess_utils) for sent in texts_train_all.to_numpy()
]

# Print entries 20-39 for a visual check.
print_tokenized_data(data_train_tokenized, 20, 40)

hi building rpm resin webserver basically want install entire tarball diretory tarball includes subdirectory spec install NUMBER name version release rpmbuildroot usr local resin getting install resin NUMBER NUMBER NUMBER bin directory install resin NUMBER NUMBER NUMBER conf directory proper nice way handle seem super human misunderstood c dream theater mark url icq NUMBER jid talios url rpm list mailing list rpm list freshrpms net url
main symptom manifestation NUMBER main manifestation NUMBER patient fever dry cough fatigue patient diminished lost sense smell taste first symptom small number patient accompanied nasal congestion runny nose sore throat conjunctivitis myalgia diarrheadefeating NUMBER requires joint effort people india sun pharma best click participate parttime task fighting NUMBER get daily reward
would like get free navigation forever map NUMBER country nokia phone learn install
prospective NUMBER NUMBER houston visit ehud june NUMBER work want firm vince ehud ronn NUM

In [248]:
# Seeded, label-stratified hold-out split of the freshly tokenized corpus.
texts_train, texts_val, y_train, y_val = train_test_split(
    data_train_tokenized,
    target_all,
    shuffle=True,
    random_state=42,
    stratify=target_all,
)

# Re-join the tokens of each document into one space-separated string.
texts_train_strings = [" ".join(tokens) for tokens in texts_train]
texts_val_strings = [" ".join(tokens) for tokens in texts_val]

In [249]:
# Upper bound on the TF-IDF vocabulary size.
max_features = 35000

# lowercase=False: the vectorizer does not lowercase its input itself.
vectorizer = TfidfVectorizer(lowercase=False, max_features=max_features)
# The vocabulary is fitted on the training strings only.
vectorizer.fit(texts_train_strings)

In [250]:
# Transform both splits with the vectorizer fitted on the training strings.
X_train = vectorizer.transform(texts_train_strings)
X_val = vectorizer.transform(texts_val_strings)
print(X_train.shape, X_val.shape)

(12208, 35000) (4070, 35000)


In [251]:
# New Gaussian naive Bayes model for this pipeline.
clf = GaussianNB()

# The feature matrix is converted to a dense array before fitting.
clf.fit(X_train.toarray(), y_train)

In [252]:
# Validation metrics for the classifier trained on this pipeline's features.
accuracy_val, precision_val, recall_val, f1_val, roc_auc_val = calc_metrics(clf, X_val, y_val)

print(
    f'Accuracy {accuracy_val:.4f}, Precision {precision_val:.4f}, '
    f'Recall {recall_val:.4f}, f1 {f1_val:.4f}, ROC AUC {roc_auc_val:.5f}'
)

Accuracy 0.7843, Precision 0.5954, Recall 0.8411, f1 0.6972, ROC AUC 0.80059


Стало хуже 2. Гипотеза опровергнута.

## Pipeline 7

Переводим эмоджи в текст. Заменяем все числа на NUMBER. Заменяем символы валют на CURRENCY.

Ничего не делаем с nonascii.

Гипотеза: должно стать лучше случая 2.

In [253]:
# Pipeline 7: as pipeline 6, plus currency signs swapped for the ' <CURRENCY> ' placeholder.
preprocess_utils = [
    remove_hyperlink,
    replace_newline,
    to_lower,
    remove_underscore,
    emoji_to_text,
    remove_number,
    remove_currency,
    remove_punctuation,
    remove_whitespace,
    tokenizer.tokenize,
    remove_stop_words,
    word_lemmatizer,
]

In [254]:
# Re-tokenize the whole corpus with the step list currently held in preprocess_utils.
data_train_tokenized = [
    preprocess_pipeline(sent, preprocess_utils) for sent in texts_train_all.to_numpy()
]

# Print entries 20-39 for a visual check.
print_tokenized_data(data_train_tokenized, 20, 40)

hi building rpm resin webserver basically want install entire tarball diretory tarball includes subdirectory spec install NUMBER name version release rpmbuildroot usr local resin getting install resin NUMBER NUMBER NUMBER bin directory install resin NUMBER NUMBER NUMBER conf directory proper nice way handle seem super human misunderstood c dream theater mark url icq NUMBER jid talios url rpm list mailing list rpm list freshrpms net url
main symptom manifestation NUMBER main manifestation NUMBER patient fever dry cough fatigue patient diminished lost sense smell taste first symptom small number patient accompanied nasal congestion runny nose sore throat conjunctivitis myalgia diarrheadefeating NUMBER requires joint effort people india sun pharma best click participate parttime task fighting NUMBER get daily reward
would like get free navigation forever map NUMBER country nokia phone learn install
prospective NUMBER NUMBER houston visit ehud june NUMBER work want firm vince ehud ronn NUM

In [255]:
# Seeded, label-stratified hold-out split of the freshly tokenized corpus.
texts_train, texts_val, y_train, y_val = train_test_split(
    data_train_tokenized,
    target_all,
    shuffle=True,
    random_state=42,
    stratify=target_all,
)

# Re-join the tokens of each document into one space-separated string.
texts_train_strings = [" ".join(tokens) for tokens in texts_train]
texts_val_strings = [" ".join(tokens) for tokens in texts_val]

In [256]:
# Upper bound on the TF-IDF vocabulary size.
max_features = 35000

# lowercase=False: the vectorizer does not lowercase its input itself.
vectorizer = TfidfVectorizer(lowercase=False, max_features=max_features)
# The vocabulary is fitted on the training strings only.
vectorizer.fit(texts_train_strings)

In [257]:
# Transform both splits with the vectorizer fitted on the training strings.
X_train = vectorizer.transform(texts_train_strings)
X_val = vectorizer.transform(texts_val_strings)
print(X_train.shape, X_val.shape)

(12208, 35000) (4070, 35000)


In [258]:
# New Gaussian naive Bayes model for this pipeline.
clf = GaussianNB()

# The feature matrix is converted to a dense array before fitting.
clf.fit(X_train.toarray(), y_train)

In [259]:
# Validation metrics for the classifier trained on this pipeline's features.
accuracy_val, precision_val, recall_val, f1_val, roc_auc_val = calc_metrics(clf, X_val, y_val)

print(
    f'Accuracy {accuracy_val:.4f}, Precision {precision_val:.4f}, '
    f'Recall {recall_val:.4f}, f1 {f1_val:.4f}, ROC AUC {roc_auc_val:.5f}'
)

Accuracy 0.7862, Precision 0.5978, Recall 0.8444, f1 0.7000, ROC AUC 0.80300


Примерно как предыдущий случай 6. Гипотеза опровергнута.