# Окутин Денис ИУ9-11М Задание 9

In [1]:
from sklearn.datasets import fetch_20newsgroups
import re
import numpy as np

## Скачивание датасета

In [2]:
categories = [
    'comp.graphics',
    'sci.med',
    'alt.atheism',
]

newsgroups_train = fetch_20newsgroups(subset='train', categories=categories, shuffle=True, random_state=42)
newsgroups_test = fetch_20newsgroups(subset='test', categories=categories, shuffle=True, random_state=42)

X_train_raw, y_train = newsgroups_train.data, newsgroups_train.target
X_test_raw, y_test = newsgroups_test.data, newsgroups_test.target

## Обработка

In [3]:
def preprocessing (text):
    text = text.lower()
    text = re.sub(r'[^a-z\s]', '', text)
    tokens = text.split()
    return tokens

word_set = set()
for text in X_train_raw:
    tokens = preprocessing(text)
    word_set.update(tokens)

vocab = list(word_set)
vocab_size = len(vocab)
print(f"Vocabulary size: {vocab_size}")

Vocabulary size: 29401


## Кодирование

In [4]:
import numpy as np

def one_hot(text,vocab):
    res = np.zeros(len(vocab))
    for word in text:
        if word in vocab:
            index = vocab.index(word)
            res[index]=1
    return res

X_train = []
for text in X_train_raw:
    tokens = preprocessing(text)
    X_train.append(one_hot(tokens,vocab))
X_train = np.array(X_train)

X_test = []
for text in X_test_raw:
    tokens = preprocessing(text)
    X_test.append(one_hot(tokens,vocab))
X_test = np.array(X_test)

## Метрики

In [5]:
def my_accuracy_score(y_true, y_pred):
    correct = np.sum(y_true == y_pred)
    return correct / len(y_true)

def my_precision_score(y_true, y_pred,average='macro'):
    classes = np.unique(y_true)
    precisions = []

    for c in classes:
        tp = np.sum((y_pred == c) & (y_true == c))
        fp = np.sum((y_pred == c) & (y_true != c))

        precision = 0
        if (tp + fp) > 0:
            precision = tp / (tp + fp)

        precisions.append(precision)
    if average == 'macro':
      return np.mean(precisions)
    return precisions

def my_recall_score(y_true, y_pred,average='macro'):
    classes = np.unique(y_true)
    recalls = []

    for c in classes:
        tp = np.sum((y_pred == c) & (y_true == c))
        fn = np.sum((y_true == c) & (y_pred != c))

        recall = 0
        if (tp + fn) > 0:
            recall = tp / (tp + fn)

        recalls.append(recall)

    if average == 'macro':
      return np.mean(recalls)
    return recalls

def my_f1_score(y_true, y_pred, average='macro'):
    precisions = my_precision_score(y_true, y_pred,average=None)
    recalls = my_recall_score(y_true, y_pred,average=None)

    f1_scores = []
    for p, r in zip(precisions, recalls):
      f1 = 0
      if (p + r) != 0:
          f1 = 2 * p * r / (p + r)
      f1_scores.append(f1)

    if average == 'macro':
      return np.mean(f1_scores)

    return f1_scores

# Тестирование

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

svm_param_grid = {
    'C': [0.1],
    'kernel': ['linear', 'rbf'],
    'gamma': ['scale', 'auto']
}

svm_model = SVC(random_state=42)

svm_grid = GridSearchCV(
    svm_model,
    svm_param_grid,
    cv=3,
    scoring='accuracy',
    n_jobs=-1,
    verbose=1
)

print("Подбор параметров для SVM...")
svm_grid.fit(X_train, y_train)

print("Лучшие параметры SVM:", svm_grid.best_params_)
print("Лучшая точность SVM (кросс-валидация):", svm_grid.best_score_)


Подбор параметров для SVM...
Fitting 3 folds for each of 4 candidates, totalling 12 fits
Лучшие параметры SVM: {'C': 0.1, 'gamma': 'scale', 'kernel': 'linear'}
Лучшая точность SVM (кросс-валидация): 0.9445132391610102


In [None]:
y_pred_svm = svm_grid.predict(X_test)
print(f"Accuracy: {my_accuracy_score(y_test, y_pred_svm)}")
print(f"Precision: {my_precision_score(y_test, y_pred_svm)}")
print(f"Recall: {my_recall_score(y_test, y_pred_svm)}")
print(f"F1-Score: {my_f1_score(y_test, y_pred_svm)}")

Accuracy: 0.8686594202898551
Precision: 0.8719575828205968
Recall: 0.8703652218189936
F1-Score: 0.8697453969480732


In [None]:
lr_param_grid = {
    'C': [0.1],
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear', 'saga'],
    'max_iter': [1000]
}

lr_model = LogisticRegression(random_state=42)

lr_grid = GridSearchCV(
    lr_model,
    lr_param_grid,
    cv=3,
    scoring='accuracy',
    n_jobs=-1,
    verbose=1
)

print("Подбор параметров для Logistic Regression...")
lr_grid.fit(X_train, y_train)

print("Лучшие параметры Logistic Regression:", lr_grid.best_params_)
print("Лучшая точность Logistic Regression (кросс-валидация):", lr_grid.best_score_)

Подбор параметров для Logistic Regression...
Fitting 3 folds for each of 4 candidates, totalling 12 fits
Лучшие параметры Logistic Regression: {'C': 0.1, 'max_iter': 1000, 'penalty': 'l2', 'solver': 'saga'}
Лучшая точность Logistic Regression (кросс-валидация): 0.9499425618715658


In [6]:
from sklearn.linear_model import LogisticRegression
lr_model = LogisticRegression(random_state=42,C = 0.1, max_iter = 1000, penalty = 'l2', solver = 'saga')
lr_model.fit(X_train,y_train)

y_pred_lr = lr_model.predict(X_test)
print(f"Accuracy: {my_accuracy_score(y_test, y_pred_lr)}")
print(f"Precision: {my_precision_score(y_test, y_pred_lr)}")
print(f"Recall: {my_recall_score(y_test, y_pred_lr)}")
print(f"F1-Score: {my_f1_score(y_test, y_pred_lr)}")

Accuracy: 0.8831521739130435
Precision: 0.8914592760457539
Recall: 0.883872409047482
F1-Score: 0.8846037292757245
