In [27]:
import warnings
from sklearn.datasets import fetch_20newsgroups
warnings.simplefilter(action='ignore', category=FutureWarning)

In [28]:
categories = ['comp.sys.ibm.pc.hardware', 'sci.med', 'talk.politics.mideast']
remove = ('headers', 'footers', 'quotes')

twenty_train_full = fetch_20newsgroups(subset='train', categories=categories, shuffle=True, random_state=42, remove=remove)
twenty_test_full = fetch_20newsgroups(subset='test', categories=categories, shuffle=True, random_state=42, remove=remove)

### Применение стемминга

In [29]:
import nltk
from nltk import word_tokenize
from nltk.stem import *

nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/vlad/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [30]:
def stemming(data):
    porter_stemmer = PorterStemmer()
    stem = []
    for text in data:
        nltk_tokens = word_tokenize(text)
        line = ''.join([' ' + porter_stemmer.stem(word) for word in nltk_tokens])
        stem.append(line)
    return stem

In [31]:
stem_train = stemming(twenty_train_full.data)
stem_test = stemming(twenty_test_full.data)

### Задание
### Вариант №17
### Методы: [KNN, LR, MNB]

In [32]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB

In [33]:
stop_words = [None, 'english']
max_features_values = [100, 500, 1000, 5000, 10000]
use_idf = [True, False]

In [51]:
parameters_knn = {
    'vect__max_features': max_features_values,
    'vect__stop_words': stop_words,
    'tfidf__use_idf': use_idf,
    'clf__n_neighbors': range(1, 10),  # Количество соседей
    'clf__weights': ['uniform', 'distance'],  # Веса соседей
    'clf__metric': ['euclidean', 'manhattan'],  # Метрика расстояния
}

parameters_lr = {
    'vect__max_features': max_features_values,
    'vect__stop_words': stop_words,
    'tfidf__use_idf': use_idf,
    'clf__solver': ['newton-cg', 'lbfgs', 'sag', 'liblinear'],
    'clf__penalty': ['l2']
}

parameters_lr_l1 = {
    'vect__max_features': max_features_values,
    'vect__stop_words': stop_words,
    'tfidf__use_idf': use_idf,
    'clf__solver': ['liblinear'],  # Используем только 'liblinear' для l1
    'clf__penalty': ['l1'],
}

parameters_mnb = {
    'vect__max_features': max_features_values,
    'vect__stop_words': stop_words,
    'tfidf__use_idf': use_idf,
    'clf__alpha': [0.1, 1.0, 2.0],  # Параметр сглаживания
}

In [35]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

### К-ближайших соседей (KNN)

#### Без использования стемминга

In [52]:
text_clf_knn = Pipeline([('vect', CountVectorizer()),
                         ('tfidf', TfidfTransformer()),
                         ('clf', KNeighborsClassifier())])
gscv_knn = GridSearchCV(text_clf_knn, param_grid=parameters_knn, n_jobs=-1)
gscv_knn.fit(twenty_train_full.data, twenty_train_full.target)

#### С использованием стема

In [54]:
text_clf_knn_stem = Pipeline([('vect', CountVectorizer()),
                              ('tfidf', TfidfTransformer()),
                              ('clf', KNeighborsClassifier())])
gscv_knn_stem = GridSearchCV(text_clf_knn_stem, param_grid=parameters_knn, n_jobs=-1)
gscv_knn_stem.fit(stem_train, twenty_train_full.target)

### Логистическая регрессия (LR)

#### Без использования стемминга

In [38]:
text_clf_lr = Pipeline([('vect', CountVectorizer()),
                         ('tfidf', TfidfTransformer()),
                         ('clf', LogisticRegression())])
gscv_lr = GridSearchCV(text_clf_lr, param_grid=parameters_lr, n_jobs=-1)
gscv_lr.fit(twenty_train_full.data, twenty_train_full.target)

text_clf_lr_l1 = Pipeline([('vect', CountVectorizer()),
                         ('tfidf', TfidfTransformer()),
                         ('clf', LogisticRegression())])
gscv_lr_l1 = GridSearchCV(text_clf_lr_l1, param_grid=parameters_lr_l1, n_jobs=-1)
gscv_lr_l1.fit(twenty_train_full.data, twenty_train_full.target)

#### С использованием стемминга

In [39]:
text_clf_lr_stem = Pipeline([('vect', CountVectorizer()),
                             ('tfidf', TfidfTransformer()),
                             ('clf', LogisticRegression())])
gscv_lr_stem = GridSearchCV(text_clf_lr_stem, param_grid=parameters_lr, n_jobs=-1)
gscv_lr_stem.fit(stem_train, twenty_train_full.target)

text_clf_lr_l1_stem = Pipeline([('vect', CountVectorizer()),
                             ('tfidf', TfidfTransformer()),
                             ('clf', LogisticRegression())])
gscv_lr_l1_stem = GridSearchCV(text_clf_lr_l1_stem, param_grid=parameters_lr_l1, n_jobs=-1)
gscv_lr_l1_stem.fit(stem_train, twenty_train_full.target)

### Мультиномиальный Наивный Байесовский метод (MNB)

#### Без использования стемминга

In [40]:
text_clf_mnb = Pipeline([('vect', CountVectorizer()),
                         ('tfidf', TfidfTransformer()),
                         ('clf', MultinomialNB())])
gscv_mnb = GridSearchCV(text_clf_mnb, param_grid=parameters_mnb, n_jobs=-1)
gscv_mnb.fit(twenty_train_full.data, twenty_train_full.target)

#### С использованием стемминга

In [41]:
text_clf_mnb_stem = Pipeline([('vect', CountVectorizer()),
                              ('tfidf', TfidfTransformer()),
                              ('clf', MultinomialNB())])
gscv_mnb_stem = GridSearchCV(text_clf_mnb_stem, param_grid=parameters_mnb, n_jobs=-1)
gscv_mnb_stem.fit(stem_train, twenty_train_full.target)

### Вывод полученных результатов анализа

In [42]:
from sklearn.metrics import classification_report

In [53]:
predicted_knn = gscv_knn.predict(twenty_test_full.data)
print('К-ближайших соседей (KNN) без стемминга\n')
print(classification_report(twenty_test_full.target, predicted_knn, target_names=categories))
print(gscv_knn.best_params_)

К-ближайших соседей (KNN) без стемминга

                          precision    recall  f1-score   support

comp.sys.ibm.pc.hardware       0.61      0.77      0.68       392
                 sci.med       0.61      0.52      0.56       396
   talk.politics.mideast       0.79      0.68      0.73       376

                accuracy                           0.66      1164
               macro avg       0.67      0.66      0.66      1164
            weighted avg       0.66      0.66      0.65      1164

{'clf__metric': 'euclidean', 'clf__n_neighbors': 5, 'clf__weights': 'distance', 'tfidf__use_idf': True, 'vect__max_features': 100, 'vect__stop_words': 'english'}


In [55]:
predicted_knn_stem = gscv_knn_stem.predict(twenty_test_full.data)
print('К-ближайших соседей (KNN) со стеммингом\n')
print(classification_report(twenty_test_full.target, predicted_knn_stem, target_names=categories))
print(gscv_knn_stem.best_params_)

К-ближайших соседей (KNN) со стеммингом

                          precision    recall  f1-score   support

comp.sys.ibm.pc.hardware       0.63      0.58      0.60       392
                 sci.med       0.47      0.56      0.51       396
   talk.politics.mideast       0.67      0.59      0.63       376

                accuracy                           0.58      1164
               macro avg       0.59      0.58      0.58      1164
            weighted avg       0.59      0.58      0.58      1164

{'clf__metric': 'euclidean', 'clf__n_neighbors': 3, 'clf__weights': 'distance', 'tfidf__use_idf': True, 'vect__max_features': 100, 'vect__stop_words': 'english'}


In [45]:
predicted_lr = gscv_lr.predict(twenty_test_full.data)
print('Логистическая регрессия (LR) без стемминга\n')
print(classification_report(twenty_test_full.target, predicted_lr, target_names=categories))
print(gscv_lr.best_params_)

predicted_lr_l1 = gscv_lr_l1.predict(twenty_test_full.data)
print('Логистическая регрессия_l1 (LR) без стемминга\n')
print(classification_report(twenty_test_full.target, predicted_lr_l1, target_names=categories))
print(gscv_lr_l1.best_params_)

Логистическая регрессия (LR) без стемминга

                          precision    recall  f1-score   support

comp.sys.ibm.pc.hardware       0.97      0.96      0.97       392
                 sci.med       0.89      0.94      0.91       396
   talk.politics.mideast       0.95      0.91      0.93       376

                accuracy                           0.94      1164
               macro avg       0.94      0.94      0.94      1164
            weighted avg       0.94      0.94      0.94      1164

{'clf__penalty': 'l2', 'clf__solver': 'newton-cg', 'tfidf__use_idf': True, 'vect__max_features': 10000, 'vect__stop_words': 'english'}
Логистическая регрессия_l1 (LR) без стемминга

                          precision    recall  f1-score   support

comp.sys.ibm.pc.hardware       0.95      0.88      0.91       392
                 sci.med       0.75      0.91      0.82       396
   talk.politics.mideast       0.93      0.79      0.85       376

                accuracy                   

In [46]:
predicted_lr_stem = gscv_lr_stem.predict(twenty_test_full.data)
print('Логистическая регрессия (LR) со стеммингом\n')
print(classification_report(twenty_test_full.target, predicted_lr_stem, target_names=categories))
print(gscv_lr_stem.best_params_)

predicted_lr_l1_stem = gscv_lr_l1_stem.predict(twenty_test_full.data)
print('Логистическая регрессия_l1 (LR) со стеммингом\n')
print(classification_report(twenty_test_full.target, predicted_lr_l1_stem, target_names=categories))
print(gscv_lr_l1_stem.best_params_)

Логистическая регрессия (LR) со стеммингом

                          precision    recall  f1-score   support

comp.sys.ibm.pc.hardware       0.95      0.94      0.94       392
                 sci.med       0.87      0.91      0.89       396
   talk.politics.mideast       0.93      0.90      0.92       376

                accuracy                           0.92      1164
               macro avg       0.92      0.92      0.92      1164
            weighted avg       0.92      0.92      0.92      1164

{'clf__penalty': 'l2', 'clf__solver': 'liblinear', 'tfidf__use_idf': True, 'vect__max_features': 10000, 'vect__stop_words': 'english'}
Логистическая регрессия_l1 (LR) со стеммингом

                          precision    recall  f1-score   support

comp.sys.ibm.pc.hardware       0.91      0.82      0.86       392
                 sci.med       0.71      0.87      0.78       396
   talk.politics.mideast       0.88      0.77      0.82       376

                accuracy                   

In [47]:
predicted_mnb = gscv_mnb.predict(twenty_test_full.data)
print('Мультиномиальный Наивный Байесовский метод (MNB) без стемминга\n')
print(classification_report(twenty_test_full.target, predicted_mnb, target_names=categories))
print(gscv_mnb.best_params_)

Мультиномиальный Наивный Байесовский метод (MNB) без стемминга

                          precision    recall  f1-score   support

comp.sys.ibm.pc.hardware       0.96      0.98      0.97       392
                 sci.med       0.94      0.93      0.93       396
   talk.politics.mideast       0.95      0.94      0.95       376

                accuracy                           0.95      1164
               macro avg       0.95      0.95      0.95      1164
            weighted avg       0.95      0.95      0.95      1164

{'clf__alpha': 0.1, 'tfidf__use_idf': True, 'vect__max_features': 10000, 'vect__stop_words': 'english'}


In [48]:
predicted_mnb_stem = gscv_mnb_stem.predict(twenty_test_full.data)
print('Мультиномиальный Наивный Байесовский метод (MNB) со стеммингом\n')
print(classification_report(twenty_test_full.target, predicted_mnb_stem, target_names=categories))
print(gscv_mnb_stem.best_params_)

Мультиномиальный Наивный Байесовский метод (MNB) со стеммингом

                          precision    recall  f1-score   support

comp.sys.ibm.pc.hardware       0.95      0.97      0.96       392
                 sci.med       0.93      0.88      0.90       396
   talk.politics.mideast       0.91      0.94      0.93       376

                accuracy                           0.93      1164
               macro avg       0.93      0.93      0.93      1164
            weighted avg       0.93      0.93      0.93      1164

{'clf__alpha': 0.1, 'tfidf__use_idf': True, 'vect__max_features': 10000, 'vect__stop_words': None}


### Сравнительная таблица

In [49]:
import pandas as pd

In [56]:
writer = pd.ExcelWriter('result.xlsx', engine='openpyxl')

# К-ближайших соседей (KNN) без стемминга
df1 = pd.DataFrame(classification_report(predicted_knn, twenty_test_full.target, output_dict=True))

# К-ближайших соседей (KNN) со стеммингом
df2 = pd.DataFrame(classification_report(predicted_knn_stem, twenty_test_full.target, output_dict=True))

# Логистическая регрессия (LR) без стемминга
df3 = pd.DataFrame(classification_report(predicted_lr, twenty_test_full.target, output_dict=True))

# Логистическая регрессия_l1 (LR) без стемминга
df4 = pd.DataFrame(classification_report(predicted_lr_l1, twenty_test_full.target, output_dict=True))

# Логистическая регрессия (LR) со стеммингом
df5 = pd.DataFrame(classification_report(predicted_lr_stem, twenty_test_full.target, output_dict=True))

# Логистическая регрессия_l1 (LR) с стеммингом
df6 = pd.DataFrame(classification_report(predicted_lr_l1_stem, twenty_test_full.target, output_dict=True))

# Мультиномиальный Наивный Байесовский метод (MNB) без стемминга
df7 = pd.DataFrame(classification_report(predicted_mnb, twenty_test_full.target, output_dict=True))

# Мультиномиальный Наивный Байесовский метод (MNB) со стеммингом
df8 = pd.DataFrame(classification_report(predicted_mnb_stem, twenty_test_full.target, output_dict=True))

df1.to_excel(writer, sheet_name='KNN без стемминга')
df2.to_excel(writer, sheet_name='KNN со стеммингом')

df3.to_excel(writer, sheet_name='LR без стемминга')
df4.to_excel(writer, sheet_name='LR_l1 без стемминга')

df5.to_excel(writer, sheet_name='LR со стеммингом')
df6.to_excel(writer, sheet_name='LR_l1 со стеммингом')

df7.to_excel(writer, sheet_name='MNB без стемминга')
df8.to_excel(writer, sheet_name='MNB со стеммингом')

writer.close()