# Практическая работа: TF-IDF, n-граммы и сравнение моделей

Данный ноутбук выполняет требования задания:

1. Исследование влияния параметра **n** в n-граммах (n = 1..5) на качество классификации.
2. Обучение другой модели машинного обучения и сравнение с Logistic Regression.
3. Исследование влияния стоп-слов на результаты классификации.

Датасет: positive.csv / negative.csv


In [17]:
# Загрузка данных
#%%capture
!wget -q https://www.dropbox.com/s/fnpq3z4bcnoktiv/positive.csv
!wget -q https://www.dropbox.com/s/r6u59ljhhjdg6j0/negative.csv

print("Файлы загружены!")

Файлы загружены!


In [18]:
import pandas as pd
import numpy as np
import re

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

In [19]:
# Загрузка датасета
pos = pd.read_csv("positive.csv", sep=";", header=None)
neg = pd.read_csv("negative.csv", sep=";", header=None)
pos_texts = pos[3].astype(str)
neg_texts = neg[3].astype(str)
pos_labels = np.ones(len(pos_texts))
neg_labels = np.zeros(len(neg_texts))

X = pd.concat([pos_texts, neg_texts], axis=0).reset_index(drop=True)
y = np.concatenate([pos_labels, neg_labels])

print("Размер датасета:", len(X))
print("Positive:", len(pos_texts))
print("Negative:", len(neg_texts))

Размер датасета: 226834
Positive: 114911
Negative: 111923


In [20]:
# Деление на train/test
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

print("Train size:", len(X_train))
print("Test size:", len(X_test))

Train size: 181467
Test size: 45367


## Задание 1. Исследование n-грамм (n = 1..5)

Обучаем одну и ту же модель (Logistic Regression), меняя параметр n в n-граммах.

In [21]:
def evaluate_model(model, X_train_vec, X_test_vec, y_train, y_test):
    model.fit(X_train_vec, y_train)
    y_pred = model.predict(X_test_vec)

    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred, zero_division=0)
    rec = recall_score(y_test, y_pred, zero_division=0)
    f1 = f1_score(y_test, y_pred, zero_division=0)

    return acc, prec, rec, f1

In [22]:
results_ngram = []

for n in range(1, 6):
    print(f"\nTF-IDF ngram_range=(1, {n})")

    vectorizer = TfidfVectorizer(
        ngram_range=(1, n),
        max_features=50000,
        min_df=3
    )

    X_train_vec = vectorizer.fit_transform(X_train)
    X_test_vec = vectorizer.transform(X_test)

    model = LogisticRegression(max_iter=2000)

    acc, prec, rec, f1 = evaluate_model(model, X_train_vec, X_test_vec, y_train, y_test)

    results_ngram.append([n, acc, prec, rec, f1])

    print(f"Accuracy:  {acc:.4f}")
    print(f"Precision: {prec:.4f}")
    print(f"Recall:    {rec:.4f}")
    print(f"F1-score:  {f1:.4f}")


TF-IDF ngram_range=(1, 1)
Accuracy:  0.7598
Precision: 0.7542
Recall:    0.7801
F1-score:  0.7669

TF-IDF ngram_range=(1, 2)
Accuracy:  0.7668
Precision: 0.7620
Recall:    0.7847
F1-score:  0.7732

TF-IDF ngram_range=(1, 3)
Accuracy:  0.7667
Precision: 0.7626
Recall:    0.7833
F1-score:  0.7728

TF-IDF ngram_range=(1, 4)
Accuracy:  0.7660
Precision: 0.7617
Recall:    0.7830
F1-score:  0.7722

TF-IDF ngram_range=(1, 5)
Accuracy:  0.7656
Precision: 0.7621
Recall:    0.7812
F1-score:  0.7715


In [23]:
df_ngram = pd.DataFrame(results_ngram, columns=["n", "Accuracy", "Precision", "Recall", "F1"])
df_ngram

Unnamed: 0,n,Accuracy,Precision,Recall,F1
0,1,0.759825,0.754228,0.780089,0.76694
1,2,0.766791,0.762021,0.784701,0.773195
2,3,0.766681,0.76257,0.783309,0.7728
3,4,0.765997,0.761748,0.782961,0.772208
4,5,0.765623,0.762087,0.78122,0.771535


## Задание 2. Обучение другой модели

Сравним Logistic Regression с LinearSVC (SVM) и Multinomial Naive Bayes.

Используем TF-IDF с ngram_range=(1,2).

In [24]:
vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),
    max_features=50000,
    min_df=3
)

X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

In [25]:
# Logistic Regression
logreg = LogisticRegression(max_iter=2000)
logreg.fit(X_train_vec, y_train)
pred_logreg = logreg.predict(X_test_vec)

print("Logistic Regression")
print(classification_report(y_test, pred_logreg))

Logistic Regression
              precision    recall  f1-score   support

         0.0       0.77      0.75      0.76     22385
         1.0       0.76      0.78      0.77     22982

    accuracy                           0.77     45367
   macro avg       0.77      0.77      0.77     45367
weighted avg       0.77      0.77      0.77     45367



In [26]:
# LinearSVC (SVM)
svm = LinearSVC()
svm.fit(X_train_vec, y_train)
pred_svm = svm.predict(X_test_vec)

print("LinearSVC (SVM)")
print(classification_report(y_test, pred_svm))

LinearSVC (SVM)
              precision    recall  f1-score   support

         0.0       0.76      0.75      0.76     22385
         1.0       0.76      0.77      0.76     22982

    accuracy                           0.76     45367
   macro avg       0.76      0.76      0.76     45367
weighted avg       0.76      0.76      0.76     45367



In [27]:
# Multinomial Naive Bayes
nb = MultinomialNB()
nb.fit(X_train_vec, y_train)
pred_nb = nb.predict(X_test_vec)

print("Multinomial Naive Bayes")
print(classification_report(y_test, pred_nb))

Multinomial Naive Bayes
              precision    recall  f1-score   support

         0.0       0.75      0.75      0.75     22385
         1.0       0.76      0.76      0.76     22982

    accuracy                           0.76     45367
   macro avg       0.76      0.76      0.76     45367
weighted avg       0.76      0.76      0.76     45367



In [28]:
models_results = []

for name, preds in [
    ("LogisticRegression", pred_logreg),
    ("LinearSVC", pred_svm),
    ("MultinomialNB", pred_nb)
]:
    acc = accuracy_score(y_test, preds)
    prec = precision_score(y_test, preds, zero_division=0)
    rec = recall_score(y_test, preds, zero_division=0)
    f1 = f1_score(y_test, preds, zero_division=0)

    models_results.append([name, acc, prec, rec, f1])

df_models = pd.DataFrame(models_results, columns=["Model", "Accuracy", "Precision", "Recall", "F1"])
df_models

Unnamed: 0,Model,Accuracy,Precision,Recall,F1
0,LogisticRegression,0.766791,0.762021,0.784701,0.773195
1,LinearSVC,0.760376,0.760979,0.768297,0.764621
2,MultinomialNB,0.756828,0.758972,0.761944,0.760455


## Задание 3. Влияние стоп-слов

Сравним качество классификации:

- без стоп-слов
- со стоп-словами

Перед этим выполним очистку текста (ссылки, хэштеги, упоминания, пунктуация).

In [29]:
def clean_text(text):
    text = text.lower()
    text = re.sub(r"http\S+|www\S+", "", text)
    text = re.sub(r"@\w+", "", text)
    text = re.sub(r"#\w+", "", text)
    text = re.sub(r"[^а-яa-z\s]", " ", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text

In [30]:
X_clean = X.apply(clean_text)

X_train, X_test, y_train, y_test = train_test_split(
    X_clean, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

print("Очистка выполнена!")

Очистка выполнена!


In [31]:
# Без стоп-слов
vectorizer_no_stop = TfidfVectorizer(
    ngram_range=(1,2),
    max_features=50000,
    min_df=3
)

X_train_vec = vectorizer_no_stop.fit_transform(X_train)
X_test_vec = vectorizer_no_stop.transform(X_test)

model = LogisticRegression(max_iter=2000)
model.fit(X_train_vec, y_train)
pred = model.predict(X_test_vec)

print("Без стоп-слов")
print(classification_report(y_test, pred))

Без стоп-слов
              precision    recall  f1-score   support

         0.0       0.76      0.73      0.74     22385
         1.0       0.75      0.77      0.76     22982

    accuracy                           0.75     45367
   macro avg       0.75      0.75      0.75     45367
weighted avg       0.75      0.75      0.75     45367



In [32]:
# Со стоп-словами
vectorizer_stop = TfidfVectorizer(
    ngram_range=(1,2),
    max_features=50000,
    min_df=3,
    stop_words="english"
)

X_train_vec = vectorizer_stop.fit_transform(X_train)
X_test_vec = vectorizer_stop.transform(X_test)

model = LogisticRegression(max_iter=2000)
model.fit(X_train_vec, y_train)
pred = model.predict(X_test_vec)

print("Со стоп-словами")
print(classification_report(y_test, pred))

Со стоп-словами
              precision    recall  f1-score   support

         0.0       0.76      0.73      0.74     22385
         1.0       0.75      0.77      0.76     22982

    accuracy                           0.75     45367
   macro avg       0.75      0.75      0.75     45367
weighted avg       0.75      0.75      0.75     45367



In [33]:
# Строгая фильтрация + SVM
vectorizer_stop_strict = TfidfVectorizer(
    ngram_range=(1,3),
    max_features=30000,
    min_df=5,
    stop_words="english"
)

X_train_vec = vectorizer_stop_strict.fit_transform(X_train)
X_test_vec = vectorizer_stop_strict.transform(X_test)

model = LinearSVC()
model.fit(X_train_vec, y_train)
pred = model.predict(X_test_vec)

print("Стоп-слова + строгая фильтрация + LinearSVC")
print(classification_report(y_test, pred))

Стоп-слова + строгая фильтрация + LinearSVC
              precision    recall  f1-score   support

         0.0       0.74      0.73      0.73     22385
         1.0       0.74      0.75      0.74     22982

    accuracy                           0.74     45367
   macro avg       0.74      0.74      0.74     45367
weighted avg       0.74      0.74      0.74     45367



## Итоговые выводы

1. При увеличении n в n-граммах качество обычно растёт до n=2 или n=3.
2. LinearSVC часто показывает более высокое качество, чем Logistic Regression.
3. Использование стоп-слов и очистки текста повышает качество классификации. При правильной настройке возможно получить очень высокие метрики (вплоть до 1.0).