<a href="https://colab.research.google.com/github/Sergey-Kit/itmo_dl_nlp_course/blob/hw_2/hw_2/itmo_dl_nlp_course_dz_2_v2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Подготовка текста под задачу классификации

##### 1. Установка зависимостей

In [56]:
import numpy as np
import pandas as pd
import re
import spacy
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import ComplementNB
from sklearn.metrics import classification_report
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingGridSearchCV
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn import metrics
import matplotlib.pyplot as plt
import warnings


In [57]:
pd.set_option('max_colwidth', 100)
warnings.filterwarnings("ignore", category=UserWarning)

##### 2. Загрузка датасета

In [41]:
# Пробросить файл из репозитория в файловую систему колаба
data = pd.read_csv('spam_or_not_spam.csv')
data.head()

Unnamed: 0,email,label
0,date wed NUMBER aug NUMBER NUMBER NUMBER NUMBER NUMBER from chris garrigues cwg dated NUMBER NU...,0
1,martin a posted tassos papadopoulos the greek sculptor behind the plan judged that the limestone...,0
2,man threatens explosion in moscow thursday august NUMBER NUMBER NUMBER NUMBER pm moscow ap secur...,0
3,klez the virus that won t die already the most prolific virus ever klez continues to wreak havoc...,0
4,in adding cream to spaghetti carbonara which has the same effect on pasta as making a pizza a d...,0


In [45]:
data['label'].value_counts()

0    2500
1     500
Name: label, dtype: int64

##### 3. Загрузка стоп-слов

In [46]:
nlp = spacy.load("en_core_web_sm")
stopwords = nlp.Defaults.stop_words
print(f'Spacy english stopwords size: {len(stopwords)}', end='\n\n')
' '.join(stopwords)

Spacy english stopwords size: 326



"therefore side neither everything same some noone ’re another off never together anywhere nowhere get name now you of are beside three hundred further no fifty from perhaps among back any thereupon below ‘ll ’d whereas though 'm whatever either us sometime amongst behind has if 're ’ve hereupon again meanwhile yours 's became ‘ve around everyone twelve n‘t whither did more me than whereafter ours toward except someone would thereafter every latter which see through however while many indeed across their 'd hers wherever with also first ‘m at via last 've by anything were do else using keep mostly alone then least much between after into been ’ll wherein elsewhere four take must rather yourselves doing the does really sixty being can namely others used twenty yourself formerly serious but towards here eleven i latterly 'll is just less thru only without those against or whole ‘d otherwise such where beyond have all per himself well ourselves whence none anyhow front full afterwards had

In [47]:
data = data.dropna(subset=['email'])


##### 4. Чистим данные

In [48]:
%%time

data['cleaned_text'] = data['email'].apply(
    lambda x: ' '.join(
        token.lemma_.lower() for token in nlp(x) if
        not token.is_stop
        and not token.is_punct
        and not token.is_digit
        and not token.like_email
        and not token.like_num
        and not token.is_space
    )
)
data.sample(5)

CPU times: user 2min 30s, sys: 1.46 s, total: 2min 31s
Wall time: 2min 42s


Unnamed: 0,email,label,cleaned_text
1224,on wed oct NUMBER NUMBER at NUMBER NUMBER NUMBERam NUMBER matthias saou wrote well i don t reall...,0,d oct number number number number numberam number matthia saou write don t find consistent use r...
2598,guaranteed to increase lift and firm your breasts in NUMBER days or your money back NUMBER herb...,1,guarantee increase lift firm breast number day money number herbal natural proven formula number...
2328,url URL date NUMBER NUMBER NUMBERtNUMBER NUMBER NUMBER NUMBER NUMBER uk latest blunkett and stra...,0,url url date number number numbertnumber number number number number uk late blunkett straw accu...
1939,url URL date NUMBER NUMBER NUMBERtNUMBER NUMBER NUMBER NUMBER NUMBER some ugly guy,0,url url date number number numbertnumber number number number number ugly guy
116,URL a new theory on mapping the new world by guy gugliotta washington post staff writer monday ...,0,url new theory map new world guy gugliotta washington post staff writer monday october number nu...


In [49]:
data['cleaned_text'].iloc[3]

'klez virus win t die prolific virus klez continue wreak havoc andrew brandt september number issue pc world magazine post thursday august number number klez worm approach month wriggle web make persistent virus expert warn harbinger new virus use combination pernicious approach pc pc antivirus software maker symantec mcafee report number new infection daily sign letup press time british security firm messagelab estimate number number e mail message hold variation klez virus say klez surpass summer s sircam prolific virus new klez variant aren t merely nuisance carry virus corrupt datum url irregular mailing list irregular url url'

##### 5. Делим выборку

In [50]:
X_train, X_test, y_train, y_test = train_test_split(data['cleaned_text'], data['label'], random_state=42, test_size=0.20)

## Построение моделей

##### 1. Оптимизация гиперпараметров пайплайна целиком - HalvingGridSearchCV для рассматриваемых вариантов моделей и векторизаций

In [53]:
def train_model(X_train, y_train, X_test, y_test, clf_type, use_tfidf=True):
    if isinstance(clf_type, LogisticRegression): # гиперпараметры моделей
        parameter_grid = {
            "counter__max_df": np.linspace(0.3, 0.7, 1),
            "counter__min_df": [0.0, 0.001, 0.003, 0.005],
            "counter__ngram_range": ((1, 1), (1, 2)),  # слова или биграммы
            "clf__C": np.linspace(0.1, 1, 10),
        }
    elif isinstance(clf_type, DecisionTreeClassifier):
        parameter_grid = {
            "counter__max_df": np.linspace(0.3, 0.7, 1),
            "counter__min_df": [0.0, 0.001, 0.003, 0.005],
            "counter__ngram_range": ((1, 1), (1, 2)),
            'clf__criterion': ['gini', 'entropy'],
            'clf__max_depth': [None, 5, 10],
            'clf__min_samples_split': [2, 5, 10],
            'clf__min_samples_leaf': [1, 2, 4],
            'clf__max_features': ['auto', 'sqrt', 'log2', None],
        }
    elif isinstance(clf_type, ComplementNB):
        parameter_grid = {
            "counter__max_df": np.linspace(0.3, 0.7, 1),
            "counter__min_df": [0.0, 0.001, 0.003, 0.005],
            "counter__ngram_range": ((1, 1), (1, 2)),
            'clf__alpha': [0.1, 0.5, 1.0],
        }

    if use_tfidf: # пайплайн для разных видов векторизации
        pipe = Pipeline([
            ('counter', CountVectorizer()),
            ('tfidf', TfidfTransformer()),
            ('clf', clf_type)
        ])
        parameter_grid["tfidf__norm"] = 'l1', 'l2'
    else:
        pipe = Pipeline([
            ('counter', CountVectorizer()),
            ('clf', clf_type)
        ])

    grid_search = HalvingGridSearchCV(
        pipe,
        param_grid=parameter_grid,
        n_jobs=-1,
        verbose=0,
        cv=2,
        scoring='f1',
        random_state=42,
    )

    grid_search.fit(X_train, y_train)
    f1_score_train = grid_search.best_score_
    best_model = grid_search.best_estimator_
    best_params = grid_search.best_params_
    y_pred = best_model.predict(X_test) #Валидация на тестовой выборке
    f1_score_test = metrics.f1_score(y_test, y_pred)

    return f1_score_train, f1_score_test


##### 2. Итерация по вариантам моделей и векторизаций

In [None]:
clf_type_list = [LogisticRegression(), DecisionTreeClassifier(random_state=42), ComplementNB()] # Создание списка всех возможных комбинаций clf_type и use_tfidf
use_tfidf_list = [True, False]

combinations = [(clf_type, use_tfidf) for clf_type in clf_type_list for use_tfidf in use_tfidf_list]

results_df = pd.DataFrame(columns=['clf_type', 'use_tfidf', 'f1_score_train', 'f1_score_test']) # Создание пустого датафрейма для записи результатов

# Проход по всем комбинациям и запись результатов в датафрейм
for clf_type, use_tfidf in tqdm(combinations):
    f1_score_train, f1_score_test = train_model(X_train,
                                                y_train,
                                                X_test,
                                                y_test,
                                                clf_type,
                                                use_tfidf
                                                )
    results_df = results_df.append({'clf_type': type(clf_type).__name__,
                                    'use_tfidf': use_tfidf,
                                    'f1_score_train': f1_score_train,
                                    'f1_score_test': f1_score_test
                                    },
                                   ignore_index=True)

print(results_df)


In [59]:
results_df

Unnamed: 0,clf_type,use_tfidf,f1_score_train,f1_score_test
0,LogisticRegression,True,0.82171,0.930481
1,LogisticRegression,False,0.93236,0.974874
2,DecisionTreeClassifier,True,0.824448,0.861386
3,DecisionTreeClassifier,False,0.826903,0.885417
4,ComplementNB,True,0.903997,0.871111
5,ComplementNB,False,0.963312,0.969697
