### Импорт

In [None]:
!python3 -m spacy download en_core_web_sm

In [2]:
import json
import spacy
import pandas as pd
import numpy as np
import gensim.models

from zipfile import ZipFile
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

random_state=9

In [3]:
!mkdir ~/.kaggle
!touch ~/.kaggle/kaggle.json

In [4]:
api_token = {"username":"w1nston","key":"81bfe28fc3d2a7574476ab1222e695db"}

with open('/root/.kaggle/kaggle.json', 'w+') as file:
    json.dump(api_token, file)

In [5]:
!chmod 600 ~/.kaggle/kaggle.json

In [6]:
!kaggle datasets download -d ozlerhakan/spam-or-not-spam-dataset

Downloading spam-or-not-spam-dataset.zip to /content
  0% 0.00/1.16M [00:00<?, ?B/s]
100% 1.16M/1.16M [00:00<00:00, 54.5MB/s]


In [7]:
with ZipFile('spam-or-not-spam-dataset.zip', 'r') as zip_ref:
    zip_ref.extractall()

### Предобработка

In [8]:
data = pd.read_csv('spam_or_not_spam.csv', encoding='iso-8859-1').rename(columns={'email': 'text'})

In [9]:
data = data[['text', 'label']].dropna()

In [10]:
nlp = spacy.load("en_core_web_sm")

In [11]:
data['cleaned_text'] = data['text'].apply(
    lambda x: ' '.join(
        token.lemma_.lower() for token in nlp(x) if
        not token.is_stop
        and not token.is_punct
        and not token.is_digit
        and not token.like_email
        and not token.like_num
        and not token.is_space
    )
)

In [12]:
X_train, X_test, y_train, y_test = train_test_split(data['cleaned_text'], data['label'],
                                                    random_state=random_state, shuffle = True)

In [13]:
sentences = [elem.split() for elem in data['cleaned_text']]

### SG, CBOW, FastText сравнение

In [14]:
# SG
sg_model = gensim.models.Word2Vec(
    sentences=sentences,
    vector_size=256,
    window=7,
    min_count=10,
    sg=1,
    hs=0,
    negative=5,
    epochs=25,
    seed=random_state,
)

In [15]:
# CBOW
cbow_model = gensim.models.Word2Vec(
    sentences=sentences,
    vector_size=256,
    window=7,
    min_count=10,
    sg=0,
    hs=0,
    negative=5,
    epochs=25,
    seed=random_state,
)

In [16]:
# FastText
gensim.models.fasttext.FastText
ft_model = gensim.models.Word2Vec(
    sentences=sentences,
    vector_size=256,
    window=7,
    min_count=10,
    sg=0,
    hs=0,
    negative=5,
    epochs=25,
    seed=random_state,
)

In [None]:
sg_model.wv.most_similar(positive=['tree'], topn=5)

In [None]:
cbow_model.wv.most_similar(positive=['tree'], topn=5)

In [None]:
ft_model.wv.most_similar(positive=['tree'], topn=5)

In [None]:
print(sg_model.wv.doesnt_match(['january', 'february', 'march', 'tree']))
print(cbow_model.wv.doesnt_match(['january', 'february', 'march', 'tree']))
print(ft_model.wv.doesnt_match(['january', 'february', 'march', 'tree']))

### Обучение LogisticRegression

#### SG

In [21]:
def text_to_vector(text, model):
    word_vectors = model.wv
    text_vec = []
    for word in text.split():
        try:
          text_vec.append(word_vectors[word])
        except KeyError: #обработка Out of Vocabulary
          text_vec.append([0 for i in range(256)])

    if len(text_vec) == 0: #для пустых текстов после предобработки
        text_vec.append([0 for i in range(256)])

    text_vec_out = np.mean(text_vec, axis=0)
    return text_vec_out

In [22]:
X_train_vectorized = [text_to_vector(text, sg_model) for text in X_train]
X_test_vectorized = [text_to_vector(text, sg_model) for text in X_test]


In [23]:
for i, elem in enumerate(X_train_vectorized):
  if len(elem) != 256:
    print(len(elem))
    #print(f'elem {i} removed. Len: {elem.shape}')


In [24]:
logreg = LogisticRegression().fit(X_train_vectorized, y_train)
preds = logreg.predict(X_test_vectorized)
print(classification_report(y_test, preds))

              precision    recall  f1-score   support

           0       0.98      0.99      0.99       616
           1       0.97      0.90      0.93       134

    accuracy                           0.98       750
   macro avg       0.97      0.94      0.96       750
weighted avg       0.98      0.98      0.98       750



#### CBOW

In [25]:
def text_to_vector(text, model):
    word_vectors = model.wv
    text_vec = []
    for word in text.split():
        try:
          text_vec.append(word_vectors[word])
        except KeyError: #обработка Out of Vocabulary
          text_vec.append([0 for i in range(256)])

    if len(text_vec) == 0: #для пустых текстов после предобработки
        text_vec.append([0 for i in range(256)])

    text_vec_out = np.mean(text_vec, axis=0)
    return text_vec_out

In [26]:
X_train_vectorized = [text_to_vector(text, cbow_model) for text in X_train]
X_test_vectorized = [text_to_vector(text, cbow_model) for text in X_test]


In [27]:
for i, elem in enumerate(X_train_vectorized):
  if len(elem) != 256:
    print(len(elem))
    #print(f'elem {i} removed. Len: {elem.shape}')


In [28]:
logreg = LogisticRegression().fit(X_train_vectorized, y_train)
preds = logreg.predict(X_test_vectorized)
print(classification_report(y_test, preds))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99       616
           1       0.94      0.96      0.95       134

    accuracy                           0.98       750
   macro avg       0.97      0.97      0.97       750
weighted avg       0.98      0.98      0.98       750



#### FastText

In [29]:
def text_to_vector(text, model):
    word_vectors = model.wv
    text_vec = []
    for word in text.split():
        try:
          text_vec.append(word_vectors[word])
        except KeyError: #обработка Out of Vocabulary
          text_vec.append([0 for i in range(256)])

    if len(text_vec) == 0: #для пустых текстов после предобработки
        text_vec.append([0 for i in range(256)])

    text_vec_out = np.mean(text_vec, axis=0)
    return text_vec_out

In [30]:
X_train_vectorized = [text_to_vector(text, ft_model) for text in X_train]
X_test_vectorized = [text_to_vector(text, ft_model) for text in X_test]


In [31]:
logreg = LogisticRegression().fit(X_train_vectorized, y_train)
preds = logreg.predict(X_test_vectorized)
print(classification_report(y_test, preds))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99       616
           1       0.95      0.96      0.95       134

    accuracy                           0.98       750
   macro avg       0.97      0.97      0.97       750
weighted avg       0.98      0.98      0.98       750

