<a href="https://colab.research.google.com/github/Sergey-Kit/itmo_dl_nlp_course/blob/hw_3/hw_3/itmo_dl_nlp_course_dz_3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Подготовка текста под задачу классификации

##### 1. Установка зависимостей

In [1]:
import numpy as np
import pandas as pd
import re
import spacy
from gensim.models import FastText, Word2Vec
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

In [2]:
pd.set_option('max_colwidth', 100)

##### 2. Загрузка датасета

In [3]:
# Пробросить файл из репозитория в файловую систему колаба
data = pd.read_csv('spam_or_not_spam.csv')
data.head()

Unnamed: 0,email,label
0,date wed NUMBER aug NUMBER NUMBER NUMBER NUMBER NUMBER from chris garrigues cwg dated NUMBER NU...,0
1,martin a posted tassos papadopoulos the greek sculptor behind the plan judged that the limestone...,0
2,man threatens explosion in moscow thursday august NUMBER NUMBER NUMBER NUMBER pm moscow ap secur...,0
3,klez the virus that won t die already the most prolific virus ever klez continues to wreak havoc...,0
4,in adding cream to spaghetti carbonara which has the same effect on pasta as making a pizza a d...,0


In [4]:
data['label'].value_counts()

0    2500
1     500
Name: label, dtype: int64

##### 3. Загрузка стоп-слов

In [5]:
nlp = spacy.load("en_core_web_sm")
stopwords = nlp.Defaults.stop_words
print(f'Spacy english stopwords size: {len(stopwords)}', end='\n\n')
' '.join(stopwords)

Spacy english stopwords size: 326



"himself which together ten mine before his both none between have you than always next four on must move empty made twelve hers by themselves last now nothing seeming whatever so top whereas often upon five several using ’s moreover ‘s get more quite across whereby against it wherein if your everyone see us over will 'd fifty any being once another third off am never well ‘d been used 's keep whenever nor side me might whole rather whoever sometimes but afterwards its mostly perhaps were not n’t of much seems further became say besides namely whom each below thereby within beside ’d really less one 'm three that least onto beyond therefore therein some yourself was be name with anywhere from otherwise myself i sometime ’m latter what had front same re is ‘ll ever about others the via up when could along most because amongst six already ours should these doing anyhow hereafter thus show ca ‘m towards n‘t would per 've why else how n't back can their my someone does hence they become th

In [6]:
data = data.dropna(subset=['email'])


##### 4. Чистим данные

In [7]:
%%time

data['cleaned_text'] = data['email'].apply(
    lambda x: ' '.join(
        token.lemma_.lower() for token in nlp(x) if
        not token.is_stop
        and not token.is_punct
        and not token.is_digit
        and not token.like_email
        and not token.like_num
        and not token.is_space
    )
)
data.sample(5)

CPU times: user 2min 21s, sys: 1.07 s, total: 2min 22s
Wall time: 2min 32s


Unnamed: 0,email,label,cleaned_text
1394,daniel quinlan wrote dq before we release it d be great if someone could test a few dq additiona...,0,daniel quinlan write dq release d great test dq additional score range maybe lower fps bit don t...
1668,a program that acts both as a pop client and a pop server you configure it by telling it about ...,0,program act pop client pop server configure tell real pop server point mail reader pop server lo...
854,begin forwarded text date wed NUMBER oct NUMBER NUMBER NUMBER NUMBER NUMBER from john s denker ...,0,begin forward text date d number oct number number number number number john s denker jsd monmou...
593,i d like to claim the parenthood of desktop web services but then there s a ton of people doing ...,0,d like claim parenthood desktop web service s ton people parent jackson alan bolcer realize birt...
1295,once upon a time roi wrote rpm build errors user dude does not exist using root user dude does n...,0,time roi write rpm build error user dude exist root user dude exist root user dude exist root us...


In [8]:
data['cleaned_text'].iloc[3]

'klez virus win t die prolific virus klez continue wreak havoc andrew brandt september number issue pc world magazine post thursday august number number klez worm approach month wriggle web make persistent virus expert warn harbinger new virus use combination pernicious approach pc pc antivirus software maker symantec mcafee report number new infection daily sign letup press time british security firm messagelab estimate number number e mail message hold variation klez virus say klez surpass summer s sircam prolific virus new klez variant aren t merely nuisance carry virus corrupt datum url irregular mailing list irregular url url'

##### 5. Делим выборку

In [9]:
X_train, X_test, y_train, y_test = train_test_split(data['cleaned_text'], data['label'], random_state=42, test_size=0.20)

## Построение моделей

In [10]:
sentences_train = [text.split() for text in X_train]
sentences_test = [text.split() for text in X_test]

In [11]:
def get_embeddings(sentences, model):
    #Ф-я преобразования векторов слов в вектора текстов
    embeddings = []
    for text in sentences:
        vector = np.zeros(model.vector_size)
        count = 0
        for word in text:
            if word in model.wv:
                vector += model.wv[word]
                count += 1
        if count != 0:
            vector /= count
        embeddings.append(vector)
    return np.array(embeddings)

##### 1. Функция векторизации и обучения модели

In [12]:
MODELS = {
    'Word2Vec': Word2Vec,
    'FastText': FastText
}

def train_model(sentences_train, sentences_test, y_train, sg, model_type):
    if model_type not in MODELS:
        raise ValueError("Invalid model type. Must be 'Word2Vec' or 'FastText'.")

    model_class = MODELS[model_type]
    model = model_class(
        sentences=sentences_train,
        vector_size=256,
        window=7,
        min_count=10,
        sg=sg,
        hs=0,
        negative=5,
        epochs=25,
        seed=42,
        workers=1
    )

    # Получение векторов эмбеддингов для каждого текста в обучающем наборе данных
    X_train_embeddings = get_embeddings(sentences_train, model)
    X_test_embeddings = get_embeddings(sentences_test, model)

    logreg = LogisticRegression()
    logreg.fit(X_train_embeddings, y_train)
    y_pred_train = logreg.predict(X_train_embeddings)
    y_pred_test = logreg.predict(X_test_embeddings)

    # Оценка качества предсказаний
    wrong_token = model.wv.doesnt_match(['magazine', 'press', 'post', 'summer'])
    most_similar = model.wv.most_similar(positive=['summer'], topn=5)
    most_similar = [x[0] for x in most_similar]
    f1_score_train = metrics.f1_score(y_train, y_pred_train)
    f1_score_test = metrics.f1_score(y_test, y_pred_test)
    return f1_score_train, f1_score_test, wrong_token, most_similar

##### 2. Итерация по вариантам моделей и принципа SkipGram / CBOW

In [15]:
results = []
for sg in tqdm([0, 1]):
    for model_type in tqdm(['Word2Vec', 'FastText']):
        f1_score_train, f1_score_test, wrong_token, most_similar = train_model(
            sentences_train,
            sentences_test,
            y_train,
            sg,
            model_type
            )
        results.append({
              'sg': sg,
              'model_type': model_type,
              'f1_score_train': f1_score_train,
              'f1_score_test': f1_score_test,
              'which wrong token': wrong_token,
              'most_similar_to_summer': most_similar
              })

# Создание датафрейма из списка словарей с результатами
df_results = pd.DataFrame(results)

  0%|          | 0/2 [00:00<?, ?it/s]
  0%|          | 0/2 [00:00<?, ?it/s][A
 50%|█████     | 1/2 [00:25<00:25, 25.19s/it][A
100%|██████████| 2/2 [02:15<00:00, 67.53s/it]
 50%|█████     | 1/2 [02:15<02:15, 135.09s/it]
  0%|          | 0/2 [00:00<?, ?it/s][A
 50%|█████     | 1/2 [01:15<01:15, 75.00s/it][A
100%|██████████| 2/2 [04:04<00:00, 122.37s/it]
100%|██████████| 2/2 [06:19<00:00, 189.93s/it]


In [16]:
df_results

Unnamed: 0,sg,model_type,f1_score_train,f1_score_test,which wrong token,most_similar_to_summer
0,0,Word2Vec,0.974619,0.979798,summer,"[forest, dry, winter, era, coast]"
1,0,FastText,0.959391,0.949495,summer,"[mother, career, sum, winner, banner]"
2,1,Word2Vec,0.958656,0.95288,summer,"[winter, opening, wet, spring, dry]"
3,1,FastText,0.954486,0.947368,summer,"[wet, winter, dry, spring, su]"
