# Глубокое обучение и обработка естественного языка

## Домашняя работа №2

1. Загрузить набор данных Spam Or Not Spam
2. Попробовать и сравнить различные способы векторизации: 3 балла

  *   sklearn.feature_extraction.text.CountVectorizer
  *   sklearn.feature_extraction.text.TfidfVectorizer

3. Обучить на полученных векторах модели, с использованием кросс-валидации и подбором гиперпараметров: 3 балла

  *   sklearn.tree.DecisionTreeClassifier
  *   sklearn.linear_model.LogisticRegression
  *   Naive Bayes

4. Сравнить качество обученных моделей на отложенной выборке - 1 балл
5. Обеспечена воспроизводимость решения: зафиксированы random_state, ноутбук воспроизводится от начала до конца без ошибок - 2 балла
6. Соблюден code style на уровне pep8 и On writing clean Jupyter notebooks - 1 балл

In [None]:
# установка spaCy
!pip install -U spacy

# English pipeline в spaCy
!python3 -m spacy download en_core_web_sm

In [2]:
# подключение библиотек
import numpy as np
import pandas as pd
import spacy
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

In [3]:
from google.colab import files
uploaded = files.upload()

Saving spam_or_not_spam.csv to spam_or_not_spam (1).csv


In [4]:
# функция перебора моделей
def get_best_model(pipe, params, n_jobs=-1, scoring='f1', cv=3):
  grid = GridSearchCV( \
      pipe,
      param_grid=params,
      n_jobs=n_jobs,
      scoring=scoring,
      cv=5)

  grid.fit(X_train, y_train)

  return  { \
            'grid': grid,
            'classifier': grid.best_estimator_,
            'best score': grid.best_score_,
            'best params': grid.best_params_,
            'cv': grid.cv
          }


# функция оценки на отложенной выборке
def estimate_test(model, X_test):
  best_clf = model['classifier']
  y_pred = best_clf.predict(X_test)
  report = classification_report(y_test, y_pred, output_dict=True)
  return report

### 1. Разведочный анализ

In [5]:
df = pd.read_csv('spam_or_not_spam.csv')
df

Unnamed: 0,email,label
0,date wed NUMBER aug NUMBER NUMBER NUMBER NUMB...,0
1,martin a posted tassos papadopoulos the greek ...,0
2,man threatens explosion in moscow thursday aug...,0
3,klez the virus that won t die already the most...,0
4,in adding cream to spaghetti carbonara which ...,0
...,...,...
2995,abc s good morning america ranks it the NUMBE...,1
2996,hyperlink hyperlink hyperlink let mortgage le...,1
2997,thank you for shopping with us gifts for all ...,1
2998,the famous ebay marketing e course learn to s...,1


In [6]:
# типы данных
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000 entries, 0 to 2999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   email   2999 non-null   object
 1   label   3000 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 47.0+ KB


In [7]:
# пропуски в данных
df.isna().sum()

email    1
label    0
dtype: int64

In [8]:
df = df.dropna()

In [9]:
# соотношение классов
df['label'].value_counts()

0    2500
1     499
Name: label, dtype: int64

### 2. Нормализация, токенизация и лемматизация

In [10]:
nlp = spacy.load("en_core_web_sm")

df['cleaned_text'] = df['email'].apply(
    lambda x: ' '.join(
      token.lemma_.lower() for token in nlp(x) if
      not token.is_stop
      and not token.is_punct
      and not token.is_digit
      and not token.like_email
      and not token.like_num
      and not token.is_space
    )
  )

df.sample(5)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cleaned_text'] = df['email'].apply(


Unnamed: 0,email,label,cleaned_text
1488,gary funck said i thought the perllocal pod l...,0,gary funck say think perllocal pod line look o...
1692,i m listed as a developer on sf and have the s...,0,m list developer sf spambayes cvs module check...
12,on mon aug NUMBER NUMBER at NUMBER NUMBER NUMB...,0,mon aug number number number number numberpm n...
1880,url URL date NUMBER NUMBER NUMBERtNUMBER NUMBE...,0,url url date number number numbertnumber numbe...
53,not true on the choice part after three weeks...,0,true choice week tell eircom fact need want nu...


### 3. Сравнение CountVectorizer и TfidfVectorizer


In [11]:
# тренеровочная и тестовая выборки
X_train, X_test, y_train, y_test = train_test_split(df['cleaned_text'], df['label'], random_state=2023)

Сравним результирующие матрицы векторов для CountVectorizer и TfidfVectorizer

In [12]:
vectorizer = CountVectorizer(max_df=0.7, min_df=0.003)
X_train_vectorized = vectorizer.fit_transform(X_train)

pd.DataFrame(X_train_vectorized.toarray(), columns=vectorizer.get_feature_names_out()).head()

Unnamed: 0,aa,aaron,abandon,ability,able,abroad,absence,absolute,absolutely,abstract,...,yesterday,yield,york,young,yup,ziggy,zip,zone,zope,zzzz
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [13]:
vectorizer = TfidfVectorizer(max_df=0.7, min_df=0.003)
X_train_vectorized = vectorizer.fit_transform(X_train)

pd.DataFrame(X_train_vectorized.toarray(), columns=vectorizer.get_feature_names_out()).head()

Unnamed: 0,aa,aaron,abandon,ability,able,abroad,absence,absolute,absolutely,abstract,...,yesterday,yield,york,young,yup,ziggy,zip,zone,zope,zzzz
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.070433,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
# сводная таблица
styles = [dict(selector="caption",
                       props=[("text-align", "center"),
                              ("font-size", "120%"),
                              ("color", 'white')])]

index = ['DecisionTreeClassifier', 'LogisticRegression', 'MultinomialNB']
columns = ['CountVectorizer', 'TfidfVectorizer']

res_df = pd.DataFrame([[0, 0]] * 3, index=index, columns=columns)
res_df

Unnamed: 0,CountVectorizer,TfidfVectorizer
DecisionTreeClassifier,0,0
LogisticRegression,0,0
MultinomialNB,0,0


#### 1. CountVectorizer

In [22]:
# сетка параметров
params = {
    'counter__max_df': np.linspace(0.7, 1.0, 4),
    'counter__min_df': [0.0, 0.001, 0.003, 0.005],
    'counter__ngram_range': [(1, 1), (1, 2)],
}

DecisionTreeClassifier

In [23]:
# бейзлайн
pipe = Pipeline(
    steps=[
        ('counter', CountVectorizer()),
        ('clf', DecisionTreeClassifier(random_state=2023))
    ]
)

model = get_best_model(pipe, params)

In [24]:
# оценка на отложенной выборке
report = estimate_test(model, X_test)

# сохранение результата в таблицу
res_df['CountVectorizer'].iloc[0] = round(report['accuracy'], 3)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  res_df['CountVectorizer'].iloc[0] = round(report['accuracy'], 3)


LogisticRegression

In [25]:
# бейзлайн
pipe = Pipeline(
    steps=[
        ('counter', CountVectorizer()),
        ('clf', LogisticRegression(random_state=2023))
    ]
)

model = get_best_model(pipe, params)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [26]:
# оценка на отложенной выборке
report = estimate_test(model, X_test)

# сохранение результата в таблицу
res_df['CountVectorizer'].iloc[1] = round(report['accuracy'], 3)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  res_df['CountVectorizer'].iloc[1] = round(report['accuracy'], 3)


MultinomialNB

In [27]:
# бейзлайн
pipe = Pipeline(
    steps=[
        ('counter', CountVectorizer()),
        ('clf', MultinomialNB())
    ]
)

model = get_best_model(pipe, params)

In [28]:
# оценка на отложенной выборке
report = estimate_test(model, X_test)

# сохранение результата в таблицу
res_df['CountVectorizer'].iloc[2] = round(report['accuracy'], 3)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  res_df['CountVectorizer'].iloc[2] = round(report['accuracy'], 3)


#### 2. TfidfVectorizer

In [29]:
# сетка параметров
params = {
    'tfidf__max_df': np.linspace(0.7, 1.0, 4),
    'tfidf__min_df': [0.0, 0.001, 0.003, 0.005],
    "tfidf__norm": ["l1", "l2"],
}

DecisionTreeClassifier

In [30]:
# бейзлайн
pipe = Pipeline(
    steps=[
        ('tfidf', TfidfVectorizer()),
        ('clf', DecisionTreeClassifier(random_state=2023))
    ]
)

model = get_best_model(pipe, params)

In [31]:
# оценка на отложенной выборке
report = estimate_test(model, X_test)

# сохранение результата в таблицу
res_df['TfidfVectorizer'].iloc[0] = round(report['accuracy'], 3)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  res_df['TfidfVectorizer'].iloc[0] = round(report['accuracy'], 3)


LogisticRegression

In [32]:
# бейзлайн
pipe = Pipeline(
    steps=[
        ('tfidf', TfidfVectorizer()),
        ('clf', LogisticRegression(random_state=2023))
    ]
)

model = get_best_model(pipe, params)

In [33]:
# оценка на отложенной выборке
report = estimate_test(model, X_test)

# сохранение результата в таблицу
res_df['TfidfVectorizer'].iloc[1] = round(report['accuracy'], 3)

MultinomialNB

In [34]:
# бейзлайн
pipe = Pipeline(
    steps=[
        ('tfidf', TfidfVectorizer()),
        ('clf', MultinomialNB())
    ]
)

model = get_best_model(pipe, params)

In [35]:
# оценка на отложенной выборке
report = estimate_test(model, X_test)

# сохранение результата в таблицу
res_df['TfidfVectorizer'].iloc[2] = round(report['accuracy'], 3)

#### Итог

In [36]:
res_df

Unnamed: 0,CountVectorizer,TfidfVectorizer
DecisionTreeClassifier,0.963,0.96
LogisticRegression,0.988,0.971
MultinomialNB,0.988,0.971


In [37]:
!pip freeze > requirements.txt