### Библиотеки

In [None]:
!wget https://raw.githubusercontent.com/Vladimir-Dimitrov-Ngu/NLP_course/master/hw_2/data%20spam/spam_or_not_spam.csv

In [None]:
!pip install fasttext

In [170]:
import pandas as pd
import spacy
import fasttext
from tqdm import trange
import os
import gensim.models

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, f1_score

import warnings

warnings.filterwarnings("ignore")

RANDOM_STATE = 123
VECTOR_SIZE = 100

### Работа с данными

In [133]:
df = pd.read_csv("spam_or_not_spam.csv")

In [110]:
df.sample(1)

Unnamed: 0,email,label
2542,help wanted we are a NUMBER year old fortune N...,1


In [134]:
df.dropna(inplace=True)

In [112]:
nlp = spacy.load("en_core_web_sm")
stopwords = nlp.Defaults.stop_words

In [135]:
df["cleaned_email"] = df["email"].apply(
    lambda x: " ".join(
        token.lemma_.lower()
        for token in nlp(x)
        if not token.is_stop
        and not token.is_punct
        and not token.is_digit
        and not token.like_email
        and not token.like_num
        and not token.is_space
        and len(token) > 1
    )
)
df.drop(columns="email", inplace=True)

In [138]:
index = []
for i, email in enumerate(df["cleaned_email"]):
    if len(email) == 0:
        index.append(i)
df.drop(index, inplace=True)
del index

In [9]:
# df.to_csv('Cleaned_data.csv')
# df = pd.read_csv('Cleaned_data.csv', index_col=0)
# df.reset_index(drop=True)

In [204]:
X_train, X_test, y_train, y_test = train_test_split(
    df["cleaned_email"],
    df["label"],
    random_state=RANDOM_STATE,
    test_size=0.2,
    stratify=df["label"],
    shuffle=True,
)

### Word2Vec

#### Scipgram

In [152]:
X_train_token = [j.split() for j in X_train]
X_train_token[0][::15]

['url', 'iraq']

In [153]:
X_test_token = [j.split() for j in X_test]
X_test_token[0][::15]

['url', 'sentence', 'annoying', 'look']

In [171]:
model_sg = gensim.models.Word2Vec(
    sentences=X_train_token,
    vector_size=VECTOR_SIZE,  # default = 100
    window=7,  # default = 5
    min_count=10,
    sg=1,  # Training algorithm: 1 for skip-gram; otherwise CBOW
    hs=0,  #  If 1, hierarchical softmax will be used for model training. If 0, and negative is non-zero, negative sampling will be used.
    negative=5,  # If > 0, negative sampling will be used, if set to 0, no negative sampling is used.
    epochs=25,  # Number of iterations (epochs) over the corpus
    seed=RANDOM_STATE,
)

In [62]:
model_sg.wv.most_similar(positive=["fine"], topn=5)

[('work', 0.5464556813240051),
 ('vjestika', 0.4894667863845825),
 ('ille', 0.46986567974090576),
 ('sweet', 0.467107355594635),
 ('valhalla', 0.44986072182655334)]

In [63]:
model_sg.wv.most_similar(negative=["terrible"], topn=5)

[('ref', 0.19713957607746124),
 ('collection', 0.1423400342464447),
 ('major', 0.12957414984703064),
 ('compensation', 0.1172618642449379),
 ('credit', 0.09586060047149658)]

In [40]:
model_sg.wv.most_similar(positive=["spam"], topn=5)

[('ham', 0.7495163083076477),
 ('corpus', 0.675554096698761),
 ('filter', 0.6496372222900391),
 ('positive', 0.5836422443389893),
 ('sa', 0.5803134441375732)]

In [41]:
model_sg.wv.most_similar(negative=["spam"], topn=5)

[('revolutionary', 0.15025007724761963),
 ('baby', 0.12427569925785065),
 ('wallace', 0.09568031132221222),
 ('rush', 0.09032419323921204),
 ('innovation', 0.08799225091934204)]

In [172]:
for i, sentence in enumerate(X_train_token):
    vector_mean = model_sg.wv.get_mean_vector(sentence).reshape(-1, VECTOR_SIZE)
    if i == 0:
        X_train_mean_vector = pd.DataFrame(vector_mean)
    else:
        X_train_mean_vector = X_train_mean_vector.append(pd.DataFrame(vector_mean))

In [173]:
for i, sentence in enumerate(X_test_token):
    vector_mean = model_sg.wv.get_mean_vector(sentence).reshape(-1, VECTOR_SIZE)
    if i == 0:
        X_test_mean_vector = pd.DataFrame(vector_mean)
    else:
        X_test_mean_vector = X_test_mean_vector.append(pd.DataFrame(vector_mean))

In [194]:
model_lr = LogisticRegression()
model_lr.fit(X_train_mean_vector, y_train)
predict = model_lr.predict(X_test_mean_vector)
print("f1_score: %.2f" % f1_score(y_test, predict))
print("\n")
print(classification_report(y_test, predict))

f1_score: 0.79
              precision    recall  f1-score   support

           0       0.94      1.00      0.97       501
           1       1.00      0.66      0.79        99

    accuracy                           0.94       600
   macro avg       0.97      0.83      0.88       600
weighted avg       0.95      0.94      0.94       600



#### BOW

In [43]:
model_cbow = gensim.models.Word2Vec(
    sentences=X_train_token,
    vector_size=VECTOR_SIZE,  # default = 100
    window=7,  # default = 5
    min_count=10,
    sg=0,  # Training algorithm: 1 for skip-gram; otherwise CBOW
    hs=0,  #  If 1, hierarchical softmax will be used for model training. If 0, and negative is non-zero, negative sampling will be used.
    negative=5,  # If > 0, negative sampling will be used, if set to 0, no negative sampling is used.
    epochs=25,  # Number of iterations (epochs) over the corpus
    seed=RANDOM_STATE,
)

In [44]:
model_cbow.wv.most_similar(positive=["fine"], topn=5)

[('dev', 0.4817737638950348),
 ('vanilla', 0.46075740456581116),
 ('dual', 0.45963799953460693),
 ('scsi', 0.4586797058582306),
 ('rhnumber', 0.45547837018966675)]

In [45]:
model_cbow.wv.most_similar(negative=["terrible"], topn=5)

[('rfc', 0.482142835855484),
 ('fold', 0.4635330140590668),
 ('authorize', 0.4297294318675995),
 ('envelope', 0.40327364206314087),
 ('mh_sequence', 0.3766980767250061)]

In [46]:
model_cbow.wv.most_similar(positive=["spam"], topn=5)

[('ham', 0.603894829750061),
 ('filter', 0.5881020426750183),
 ('spammer', 0.5675072073936462),
 ('corpus', 0.5337340235710144),
 ('positive', 0.5271260142326355)]

In [47]:
model_cbow.wv.most_similar(negative=["spam"], topn=5)

[('colony', 0.3809428811073303),
 ('royalty', 0.3681044578552246),
 ('fruit', 0.36523672938346863),
 ('soldier', 0.3633030652999878),
 ('baby', 0.3588408827781677)]

In [146]:
for i, sentence in enumerate(X_train_token):
    vector_mean = model_cbow.wv.get_mean_vector(sentence).reshape(-1, VECTOR_SIZE)
    if i == 0:
        X_train_mean_vector = pd.DataFrame(vector_mean)
    else:
        X_train_mean_vector = X_train_mean_vector.append(pd.DataFrame(vector_mean))

In [155]:
for i, sentence in enumerate(X_test_token):
    vector_mean = model_cbow.wv.get_mean_vector(sentence).reshape(-1, VECTOR_SIZE)
    if i == 0:
        X_test_mean_vector = pd.DataFrame(vector_mean)
    else:
        X_test_mean_vector = X_test_mean_vector.append(pd.DataFrame(vector_mean))

In [168]:
model_lr = LogisticRegression()
model_lr.fit(X_train_mean_vector, y_train)
predict = model_lr.predict(X_test_mean_vector)
print("f1_score: %.2f" % f1_score(y_test, predict))
print("\n")
print(classification_report(y_test, predict))

f1_score: 0.8
              precision    recall  f1-score   support

           0       0.94      1.00      0.97       501
           1       1.00      0.67      0.80        99

    accuracy                           0.94       600
   macro avg       0.97      0.83      0.88       600
weighted avg       0.95      0.94      0.94       600



### GLOVE

### FastText

##### Внешняя модель

In [11]:
with open("clean_email.txt", "w") as f:
    for email in X_train:
        f.write("%s\n" % email)

In [230]:
model_fast = fasttext.train_unsupervised(
    "clean_email.txt", dim=VECTOR_SIZE, wordNgrams=1, model="cbow", ws=5
)
# model.save_model("fasttextmodel_1.bin")

In [22]:
model_fast.get_nearest_neighbors("terrible", k=5)

[(0.99679034948349, 'divorce'),
 (0.9962118864059448, 'sale'),
 (0.9960272908210754, 'acquire'),
 (0.9959859251976013, 'sufficiently'),
 (0.9955005049705505, 'efficiency')]

In [21]:
model_fast.get_nearest_neighbors("spam", k=5)

[(0.994591236114502, 'spamd'),
 (0.992931604385376, 'spamc'),
 (0.9835136532783508, 'spamme'),
 (0.979175865650177, 'spambaye'),
 (0.9728814363479614, 'spawn')]

In [235]:
for i, sentence in enumerate(X_train):
    vector_mean = model_fast.get_sentence_vector(sentence).reshape(-1, VECTOR_SIZE)
    if i == 0:
        X_train_mean_vector = pd.DataFrame(vector_mean)
    else:
        X_train_mean_vector = X_train_mean_vector.append(pd.DataFrame(vector_mean))

In [232]:
for i, sentence in enumerate(X_test):
    vector_mean = model_fast.get_sentence_vector(sentence).reshape(-1, VECTOR_SIZE)
    if i == 0:
        X_test_mean_vector = pd.DataFrame(vector_mean)
    else:
        X_test_mean_vector = X_test_mean_vector.append(pd.DataFrame(vector_mean))

In [242]:
model_lr = LogisticRegression()
model_lr.fit(X_train_mean_vector, y_train)
predict = model_lr.predict(X_test_mean_vector)
print("f1_score: %.2f" % f1_score(y_test, predict))
print("\n")
print(classification_report(y_test, predict))

f1_score: 0.68


              precision    recall  f1-score   support

           0       0.92      0.99      0.95       501
           1       0.89      0.56      0.68        99

    accuracy                           0.92       600
   macro avg       0.90      0.77      0.82       600
weighted avg       0.91      0.92      0.91       600



##### Внутренняя модель

In [237]:
for index in range(X_train.shape[0]):
    X_train.iloc[index] = X_train.iloc[index] + " __label__" + str(y_train.iloc[index])

In [238]:
with open("train_email.txt", "w") as f:
    for email in X_train:
        f.write("%s\n" % email)
# Внутренняя модель
model_fast = fasttext.train_supervised(
    "train_email.txt", dim=500, thread=4, wordNgrams=2, ws=5
)
# model.save_model("fasttextmodel_2.bin")

In [239]:
predicts = []
for index in trange(X_test.shape[0]):
    predict_label = model_fast.predict(X_test.iloc[index], 1)[0][0]
    if predict_label == "__label__0":
        predicts.append(0)
    else:
        predicts.append(1)

100%|██████████| 600/600 [00:00<00:00, 3184.30it/s]


In [241]:
print("f1_score: %.2f" % f1_score(y_test, predict))
print("\n")
print(classification_report(y_test, predicts, zero_division=0))

f1_score: 0.68
              precision    recall  f1-score   support

           0       0.90      1.00      0.95       501
           1       1.00      0.43      0.61        99

    accuracy                           0.91       600
   macro avg       0.95      0.72      0.78       600
weighted avg       0.92      0.91      0.89       600

