### Импорт

In [None]:
!pip install pymorphy2
!python3 -m spacy download en_core_web_sm

In [None]:
import json
import os
from pathlib import Path
from zipfile import ZipFile

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pymorphy2
import spacy

# enable_halving_search_cv must be imported before HalvingGridSearchCV.
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingGridSearchCV

from sklearn.feature_extraction._stop_words import ENGLISH_STOP_WORDS
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier


# Single seed reused by the train/test split, the halving grid searches and the
# final LogisticRegression / DecisionTreeClassifier fits.
random_state = 9

In [None]:
!mkdir ~/.kaggle
!touch ~/.kaggle/kaggle.json

In [None]:
# Credentials must never be stored in the notebook: the Kaggle username and API
# key are read from the KAGGLE_USERNAME / KAGGLE_KEY environment variables.
# NOTE(review): the key that used to be hard-coded in this cell has to be treated
# as leaked and should be revoked in the Kaggle account settings.
missing = [name for name in ("KAGGLE_USERNAME", "KAGGLE_KEY") if not os.environ.get(name)]
if missing:
    raise RuntimeError(
        f"Set the environment variable(s) {', '.join(missing)} before running this cell"
    )

api_token = {
    "username": os.environ["KAGGLE_USERNAME"],
    "key": os.environ["KAGGLE_KEY"],
}

# Resolve the file under the current user's home directory, i.e. the same
# ~/.kaggle/kaggle.json that the shell cells of this notebook operate on.
kaggle_json = Path.home() / ".kaggle" / "kaggle.json"
kaggle_json.parent.mkdir(parents=True, exist_ok=True)

with open(kaggle_json, 'w') as file:
    json.dump(api_token, file)

In [None]:
# Make the credentials file readable and writable by its owner only.
!chmod 600 ~/.kaggle/kaggle.json

In [None]:
# Fetch the ozlerhakan/spam-or-not-spam-dataset archive with the Kaggle CLI.
!kaggle datasets download -d ozlerhakan/spam-or-not-spam-dataset

In [None]:
# Unpack every member of the downloaded archive into the current working directory.
with ZipFile('spam-or-not-spam-dataset.zip') as archive:
    archive.extractall()

In [None]:
# Read the CSV as ISO-8859-1 and expose the 'email' column under the name 'text'.
emails_raw = pd.read_csv('spam_or_not_spam.csv', encoding='iso-8859-1')
data = emails_raw.rename(columns={'email': 'text'})

### Предобработка

In [None]:
# Load the small English spaCy pipeline used for tokenisation and lemmatisation.
nlp = spacy.load("en_core_web_sm")
# Default stop-word set of the loaded language (not referenced by the cells shown here).
stopwords = nlp.Defaults.stop_words

In [None]:
# Drop every row that contains a missing value in any column.
data = data.dropna()

In [None]:
def _clean_doc(doc):
    """Return the lower-cased lemmas of the informative tokens of a spaCy Doc.

    Stop words, punctuation, digits, e-mail-like tokens, number-like tokens and
    whitespace tokens are dropped; the remaining lemmas are joined with single
    spaces.
    """
    return ' '.join(
        token.lemma_.lower()
        for token in doc
        if not (
            token.is_stop
            or token.is_punct
            or token.is_digit
            or token.like_email
            or token.like_num
            or token.is_space
        )
    )


# nlp.pipe streams the texts through the pipeline in batches, which is
# considerably faster than calling nlp(x) once per row inside .apply.
# The list is assigned positionally and has exactly one entry per row.
data['cleaned_text'] = [_clean_doc(doc) for doc in nlp.pipe(data['text'])]

### Сравнение Bag of words & TF-IDF

#### Bag of Words

In [None]:
# Hold out a test split (default proportions), seeded for reproducibility.
features, target = data['cleaned_text'], data['label']
X_train, X_test, y_train, y_test = train_test_split(features, target, random_state=random_state)

In [None]:
# Bag-of-words counts; max_df / min_df are given as document-frequency proportions.
vectorizer = CountVectorizer(max_df=0.7, min_df=0.003)
# The vocabulary is learned on the training split only ...
X_train_vectorized = vectorizer.fit_transform(X_train)
# ... and reused unchanged to encode the test split.
X_test_vectorized = vectorizer.transform(X_test)

In [None]:
# Baseline: default logistic regression on the bag-of-words features.
logreg = LogisticRegression()
logreg.fit(X_train_vectorized, y_train)

preds = logreg.predict(X_test_vectorized)
print(classification_report(y_test, preds))

#### TF-IDF

In [None]:
# TF-IDF weights with the same document-frequency cut-offs as the bag-of-words run.
vectorizer = TfidfVectorizer(max_df=0.7, min_df=0.003)
# Vocabulary and IDF weights are learned on the training split only ...
X_train_vectorized = vectorizer.fit_transform(X_train)
# ... and reused unchanged to encode the test split.
X_test_vectorized = vectorizer.transform(X_test)

In [None]:
# Same default logistic regression, now on the TF-IDF features.
logreg = LogisticRegression()
logreg.fit(X_train_vectorized, y_train)

preds = logreg.predict(X_test_vectorized)
print(classification_report(y_test, preds))

### Обучение моделей

#### Logreg

In [None]:
# Bag-of-words counts feeding a logistic-regression classifier.
pipe = Pipeline([
    ('counter', CountVectorizer()),
    ('clf', LogisticRegression()),
])

# Candidate values, addressed as <step name>__<parameter name>.
parameter_grid = dict(
    counter__max_df=np.linspace(0.3, 0.7, 10),
    counter__min_df=[0.0, 0.001, 0.003, 0.005],
    # unigrams only, unigrams + bigrams, or bigrams + trigrams
    counter__ngram_range=((1, 1), (1, 2), (2, 3)),
    clf__C=np.linspace(0.1, 1, 10),
)

# Successive-halving search: 2-fold CV, F1 scoring, all available cores.
grid_search = HalvingGridSearchCV(
    estimator=pipe,
    param_grid=parameter_grid,
    scoring='f1',
    cv=2,
    n_jobs=-1,
    verbose=1,
    random_state=random_state,
)
grid_search.fit(X_train, y_train)

##### Визуализация

In [None]:
# Tabulate the search results and keep one row per (candidate, iteration).
results = pd.DataFrame(grid_search.cv_results_)
results["params_str"] = results["params"].apply(str)
results = results.drop_duplicates(subset=["params_str", "iter"])

# Wide layout: iterations as rows, one column (one plotted line) per candidate.
mean_scores = results.pivot(index="iter", columns="params_str", values="mean_test_score")
ax = mean_scores.plot(legend=False, alpha=0.6)

# Each tick shows the iteration number, the resources and the candidate count.
n_iterations = grid_search.n_iterations_
labels = [
    f"iter={i}\nn_samples={grid_search.n_resources_[i]}\nn_candidates={grid_search.n_candidates_[i]}"
    for i in range(n_iterations)
]
ax.set_xticks(range(n_iterations))
ax.set_xticklabels(labels, rotation=45, multialignment="left")

ax.set_title("Scores of candidates over iterations")
ax.set_xlabel("Iterations", fontsize=15)
ax.set_ylabel("Mean test score", fontsize=15)

plt.tight_layout()
ax.grid()
plt.show()

##### Обучение с найденными параметрами

In [None]:
# Display the parameter combination selected by the search.
grid_search.best_params_

In [None]:
# Build the vectorizer from the configuration selected by the search instead of
# hand-copied constants, so this cell cannot drift out of sync with
# grid_search.best_params_ when the search is re-run.
best_params = grid_search.best_params_
vectorizer = CountVectorizer(
    max_df=best_params["counter__max_df"],
    min_df=best_params["counter__min_df"],
    ngram_range=best_params["counter__ngram_range"],
)
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

In [None]:
# Train the final model with the regularisation strength selected by the search;
# previously the tuned clf__C was ignored and the default C was used.
logreg = LogisticRegression(
    C=grid_search.best_params_["clf__C"],
    random_state=random_state,
).fit(X_train_vectorized, y_train)

preds = logreg.predict(X_test_vectorized)
print(classification_report(y_test, preds))

#### DecisionTree

In [None]:
# Bag-of-words counts feeding a decision tree.
pipe = Pipeline(
    steps=[
        ('counter', CountVectorizer()),
        # Seed the tree: an unseeded DecisionTreeClassifier permutes features
        # randomly at each split, so the ranking of candidates (and therefore
        # best_params_) could change between runs even with a seeded search.
        ('clf', DecisionTreeClassifier(random_state=random_state)),
    ]
)

# Candidate values, addressed as <step name>__<parameter name>.
parameter_grid = {
    "counter__max_df": np.linspace(0.3, 0.7, 10),
    "counter__min_df": [0.0, 0.001, 0.003, 0.005],
    # unigrams only, unigrams + bigrams, or bigrams + trigrams
    "counter__ngram_range": ((1, 1), (1, 2), (2, 3)),
    "clf__max_depth": np.arange(10, 100, 5),
    # NOTE(review): scikit-learn documents "log_loss" and "entropy" as the same
    # criterion, so one of them looks redundant - confirm before removing.
    "clf__criterion": ("gini", "entropy", "log_loss"),
}

# Successive-halving search: 2-fold CV, F1 scoring, all available cores.
grid_search = HalvingGridSearchCV(
    pipe,
    param_grid=parameter_grid,
    n_jobs=-1,
    verbose=1,
    cv=2,
    scoring='f1',
    random_state=random_state,
)
grid_search.fit(X_train, y_train)

##### Визуализация

In [None]:
# Tabulate the search results and keep one row per (candidate, iteration).
results = pd.DataFrame(grid_search.cv_results_)
results["params_str"] = results["params"].apply(str)
results = results.drop_duplicates(subset=["params_str", "iter"])

# Wide layout: iterations as rows, one column (one plotted line) per candidate.
mean_scores = results.pivot(index="iter", columns="params_str", values="mean_test_score")
ax = mean_scores.plot(legend=False, alpha=0.6)

# Each tick shows the iteration number, the resources and the candidate count.
n_iterations = grid_search.n_iterations_
labels = [
    f"iter={i}\nn_samples={grid_search.n_resources_[i]}\nn_candidates={grid_search.n_candidates_[i]}"
    for i in range(n_iterations)
]
ax.set_xticks(range(n_iterations))
ax.set_xticklabels(labels, rotation=45, multialignment="left")

ax.set_title("Scores of candidates over iterations")
ax.set_xlabel("Iterations", fontsize=15)
ax.set_ylabel("Mean test score", fontsize=15)

plt.tight_layout()
ax.grid()
plt.show()

##### Обучение с найденными параметрами

In [None]:
# Display the parameter combination selected by the search.
grid_search.best_params_

In [None]:
# Build the vectorizer from the configuration selected by the search instead of
# hand-copied (and rounded) constants, so this cell stays in sync with
# grid_search.best_params_ when the search is re-run.
best_params = grid_search.best_params_
vectorizer = CountVectorizer(
    max_df=best_params["counter__max_df"],
    min_df=best_params["counter__min_df"],
    ngram_range=best_params["counter__ngram_range"],
)
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

In [None]:
# Train the final tree with the split criterion and depth selected by the
# search rather than constants copied by hand from a previous run.
best_params = grid_search.best_params_
tree = DecisionTreeClassifier(
    criterion=best_params["clf__criterion"],
    max_depth=best_params["clf__max_depth"],
    random_state=random_state,
).fit(X_train_vectorized, y_train)

preds = tree.predict(X_test_vectorized)
print(classification_report(y_test, preds))

#### Naive Bayes

In [None]:
# Bag-of-words counts feeding a multinomial naive Bayes classifier.
pipe = Pipeline([
    ('counter', CountVectorizer()),
    ('clf', MultinomialNB()),
])

# Only the vectorizer is tuned here; keys are <step name>__<parameter name>.
parameter_grid = dict(
    counter__max_df=np.linspace(0.3, 0.7, 10),
    counter__min_df=[0.0, 0.001, 0.003, 0.005],
    counter__ngram_range=((1, 1), (1, 2), (2, 3)),
)

# Successive-halving search: 2-fold CV, F1 scoring, all available cores.
grid_search = HalvingGridSearchCV(
    estimator=pipe,
    param_grid=parameter_grid,
    scoring='f1',
    cv=2,
    n_jobs=-1,
    verbose=1,
    random_state=random_state,
)
grid_search.fit(X_train, y_train)

##### Визуализация

In [None]:
# Tabulate the search results and keep one row per (candidate, iteration).
results = pd.DataFrame(grid_search.cv_results_)
results["params_str"] = results["params"].apply(str)
results = results.drop_duplicates(subset=["params_str", "iter"])

# Wide layout: iterations as rows, one column (one plotted line) per candidate.
mean_scores = results.pivot(index="iter", columns="params_str", values="mean_test_score")
ax = mean_scores.plot(legend=False, alpha=0.6)

# Each tick shows the iteration number, the resources and the candidate count.
n_iterations = grid_search.n_iterations_
labels = [
    f"iter={i}\nn_samples={grid_search.n_resources_[i]}\nn_candidates={grid_search.n_candidates_[i]}"
    for i in range(n_iterations)
]
ax.set_xticks(range(n_iterations))
ax.set_xticklabels(labels, rotation=45, multialignment="left")

ax.set_title("Scores of candidates over iterations")
ax.set_xlabel("Iterations", fontsize=15)
ax.set_ylabel("Mean test score", fontsize=15)

plt.tight_layout()
ax.grid()
plt.show()

##### Обучение с найденными параметрами

In [None]:
# Display the parameter combination selected by the search.
grid_search.best_params_

In [None]:
# Build the vectorizer from the configuration selected by the search instead of
# hand-copied (and rounded) constants, so this cell stays in sync with
# grid_search.best_params_ when the search is re-run.
best_params = grid_search.best_params_
vectorizer = CountVectorizer(
    max_df=best_params["counter__max_df"],
    min_df=best_params["counter__min_df"],
    ngram_range=best_params["counter__ngram_range"],
)
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

In [None]:
# Fit multinomial naive Bayes on the count features and report test metrics.
bayes = MultinomialNB()
bayes.fit(X_train_vectorized, y_train)

preds = bayes.predict(X_test_vectorized)
print(classification_report(y_test, preds))