In [1]:
from pathlib import Path
import json

## Data loading

In [2]:
# Read the cached news dump and keep whatever is stored under its 'catalog' key
# (`.get` yields None if that key is absent).
news_path = Path('../cache/news_6771.json')
with open(news_path, encoding="UTF-8") as f:
    payload = json.load(f)
news = payload.get('catalog')

## Corpus preprocessing

### Lemmatization and stop-word removal

In [3]:
import nltk
from nltk.corpus import stopwords
from pymystem3 import Mystem
from string import punctuation

# Make sure the NLTK stopword lists are available before they are read below.
nltk.download("stopwords")

# Shared lemmatizer instance and Russian stopword list used by the preprocessing code.
russian_stopwords = stopwords.words("russian")
mystem = Mystem()

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/alexgiving/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
def preprocess_text(text):
    """
    Lemmatize text with pymystem3 and drop stopwords, whitespace and punctuation.

    The text is lower-cased and passed to the module-level ``mystem`` lemmatizer;
    the resulting tokens are filtered and the survivors joined with single spaces.

    Args:
        text: Raw text to normalise (a string).

    Returns:
        The kept tokens joined by single spaces; an empty string if none survive.
    """
    # A set makes each stopword membership check constant-time.
    stop_words = set(russian_stopwords)

    def is_noise(token):
        stripped = token.strip()
        # An empty `stripped` means the token was whitespace only (' ', '\n', '\t', '\r').
        # Checking character by character also rejects multi-character punctuation
        # runs such as "..." or "?!", which a substring test against `punctuation`
        # would let through.
        return not stripped or all(char in punctuation for char in stripped)

    tokens = [
        token for token in mystem.lemmatize(text.lower())
        if token not in stop_words and not is_noise(token)
    ]
    return " ".join(tokens)

In [5]:
from sklearn.feature_extraction.text import CountVectorizer
from typing import Optional

class CorpusStructure:
    """
    Bundle of documents, their labels and the vectorizer used to featurise them.

    The vectorizer is expected to be fitted by the time ``matrix`` or
    ``transform`` is used; nothing is transformed at construction time.
    """
    # Instance attributes. They are annotated under their private names because
    # the public names below are read-only properties.
    _corpus: list
    _target: list
    _vectorizer: CountVectorizer
    _matrix: Optional[object]

    def __init__(self, corpus: list, targets: list, vectorizer: CountVectorizer) -> None:
        """
        Args:
            corpus: Documents to featurise.
            targets: Labels, one per document (presumably aligned with ``corpus``).
            vectorizer: Object whose ``transform`` turns the corpus into features.
        """
        self._corpus = corpus
        self._target = targets
        self._vectorizer = vectorizer
        self._matrix = None  # filled lazily by the `matrix` property

    @property
    def corpus(self) -> list:
        """Documents held by this structure."""
        return self._corpus

    @property
    def target(self) -> list:
        """Labels held by this structure."""
        return self._target

    @property
    def corpus_len(self) -> int:
        """Number of documents."""
        return len(self._corpus)

    @property
    def target_len(self) -> int:
        """Number of labels."""
        return len(self._target)

    @property
    def matrix(self):
        """
        Dense feature array for the corpus.

        The vectorizer output is computed on first access and cached; every
        access converts the cached value with ``.toarray()``.
        """
        # Identity test instead of `== None`: once cached, the value is a matrix
        # object whose `==` may be an element-wise comparison rather than a
        # plain boolean check.
        if self._matrix is None:
            self._matrix = self._vectorizer.transform(self._corpus)
        return self._matrix.toarray()

    def transform(self):
        """Return a fresh, uncached ``vectorizer.transform`` of the corpus."""
        return self._vectorizer.transform(self._corpus)

In [6]:
def train_test_split(corpus: list, targets: list, vectorizer: CountVectorizer, test_size: float = 0.2, random_state: int = 42) -> tuple:
    """
    Split documents and labels into train and test ``CorpusStructure`` objects.

    Args:
        corpus: Documents to split.
        targets: Labels matching ``corpus``.
        vectorizer: Vectorizer shared by both resulting structures.
        test_size: Fraction of the data placed in the test structure.
        random_state: Seed forwarded to scikit-learn's splitter; the default
            keeps the previously hard-coded value.

    Returns:
        A ``(train, test)`` tuple of ``CorpusStructure`` instances.
    """
    # Imported under an alias because this wrapper reuses scikit-learn's function name.
    from sklearn.model_selection import train_test_split as _sk_train_test_split
    X_train, X_test, y_train, y_test = _sk_train_test_split(
        corpus, targets, test_size=test_size, random_state=random_state
    )
    return CorpusStructure(X_train, y_train, vectorizer), CorpusStructure(X_test, y_test, vectorizer)

In [7]:
import random

def get_corpus(news: list, n: int = -1, shuffle=True, seed=None):
    """
    Build parallel lists of preprocessed texts and categories from news articles.

    Args:
        news: Articles; each must offer ``.get('text')`` and ``.get('category')``.
        n: Number of articles to take after the optional shuffle; ``-1`` means all.
        shuffle: Whether to shuffle the articles before taking the first ``n``.
        seed: Optional seed for a private random generator. When ``None`` the
            module-level ``random`` state is used, as before.

    Returns:
        ``(corpus_text, corpus_target)``: preprocessed texts and their categories.
    """
    # Work on a shallow copy so shuffling never reorders the caller's list.
    articles = list(news)
    if shuffle:
        shuffler = random if seed is None else random.Random(seed)
        shuffler.shuffle(articles)

    limit = len(articles) if n == -1 else n
    selected = articles[:limit]

    corpus_text = [preprocess_text(article.get('text')) for article in selected]
    corpus_target = [article.get('category') for article in selected]
    return corpus_text, corpus_target

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
corpus_text, corpus_target = get_corpus(news)

train_data, test_data = train_test_split(corpus_text, corpus_target, vectorizer, test_size=0.2)

# Fit the TF-IDF vocabulary and weights on the training documents only, so no
# statistics from the held-out test documents leak into the features.
# CorpusStructure calls vectorizer.transform lazily, so fitting after the split is safe.
vectorizer.fit(train_data.corpus)

## EXPERIMENTS

In [None]:
from sklearn import metrics
from sklearn.model_selection import GridSearchCV

"""Module for grid search and classification of the data"""
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier


# Each entry is (display name, unfitted estimator, parameter grid for GridSearchCV).
# Where an estimator rejects some parameter combinations, the grid is a list of
# sub-grids so that only valid combinations are fitted, instead of relying on
# the search's error_score to hide the failures.
models = [
    (
        'LogisticRegression',
        LogisticRegression(),
        [
            # lbfgs, newton-cg and sag support the l2 penalty but not l1.
            {
                'penalty'   : ['l2'],
                'solver'    : ['lbfgs', 'newton-cg', 'sag', 'saga'],
                'C'         : [0.1, 1, 10],
                'max_iter'  : [1000]
            },
            # Of the solvers searched here, only saga supports l1.
            {
                'penalty'   : ['l1'],
                'solver'    : ['saga'],
                'C'         : [0.1, 1, 10],
                'max_iter'  : [1000]
            }
        ]
    ),
    (
        'MultinomialNB',
        MultinomialNB(),
        {
            'alpha': [0.00001, 0.0001, 0.001, 0.1, 1, 10, 100, 1000]
        }
    ),
    (
        'LinearSVC',
        LinearSVC(),
        [
            {
                'penalty'   : ['l2'],
                'loss'      : ['hinge', 'squared_hinge'],
                'C'         : [0.1, 1, 10],
                'max_iter'  : [1000]
            },
            # l1 is only supported with squared_hinge, and only in the primal form.
            {
                'penalty'   : ['l1'],
                'loss'      : ['squared_hinge'],
                'dual'      : [False],
                'C'         : [0.1, 1, 10],
                'max_iter'  : [1000]
            }
        ]
    ),
    (
        'SGDClassifier',
        SGDClassifier(),
        [
            {
                'penalty'       : ['l1', 'l2'],
                'alpha'         : [0.00001, 0.0001, 0.001, 0.1, 1, 10, 100, 1000],
                'learning_rate' : ['optimal'],
                'max_iter'      : [1000],
                'loss'          : ['hinge', 'log_loss', 'modified_huber', 'squared_hinge', 'perceptron'],
            },
            # These schedules need a positive initial learning rate, so eta0 is
            # set explicitly; 0.01 is a starting value, tune it if needed.
            {
                'penalty'       : ['l1', 'l2'],
                'alpha'         : [0.00001, 0.0001, 0.001, 0.1, 1, 10, 100, 1000],
                'learning_rate' : ['constant', 'invscaling', 'adaptive'],
                'eta0'          : [0.01],
                'max_iter'      : [1000],
                'loss'          : ['hinge', 'log_loss', 'modified_huber', 'squared_hinge', 'perceptron'],
            }
        ]
    ),
    (
        'RandomForestClassifier',
        RandomForestClassifier(),
        {
            'n_estimators'  : [100, 500],
            'criterion'     : ['gini', 'entropy', 'log_loss'],
            'max_features'  : ['sqrt', 'log2'],
            'max_depth'     : [3, 5, 10],
            'min_samples_split' : [2, 5, 10],
            'min_samples_leaf'  : [1, 2, 5]
        }
    ),
    (
        'KNeighborsClassifier',
        KNeighborsClassifier(),
        {
            'weights' : ['uniform', 'distance'],
            'algorithm' : ['auto', 'ball_tree', 'kd_tree', 'brute'],
            'n_neighbors': [3, 5, 10],
            'p': [1, 2]
        }
    ),
    (
        'DecisionTreeClassifier',
        DecisionTreeClassifier(),
        {
            'criterion'     : ['gini', 'entropy', 'log_loss'],
            'max_features'  : ['sqrt', 'log2']
        }
    )
]

In [None]:
def plot_confusion_matrix(name: str, test: list, predicted: list, target: list):
    """
    Plot a confusion matrix for one model's predictions.

    Args:
        name: Model name used in the plot title.
        test: True labels.
        predicted: Predicted labels, aligned with ``test``.
        target: Labels to show on the axes; repeated values are collapsed,
            keeping first-seen order.
    """
    # dict.fromkeys drops repeated labels while preserving order, so passing a
    # full per-document label list still yields one row/column per class.
    labels = list(dict.fromkeys(target))
    matrix = metrics.confusion_matrix(test, predicted, labels=labels)
    # display_labels puts the class names on the axes instead of 0..n-1 indices.
    disp = metrics.ConfusionMatrixDisplay(matrix, display_labels=labels)
    disp.plot()
    disp.ax_.set_title(f'{name} Confusion Matrix')

In [None]:
import matplotlib.pyplot as plt
import pandas as pd

# Per-model test-set scores, in the same order as `models`.
accuracy = []
precision = []
recall = []
f1 = []

# Every access to `.matrix` densifies the features again, so do it once here
# instead of once per model inside the loop.
train_matrix, train_target = train_data.matrix, train_data.target
test_matrix, test_target = test_data.matrix, test_data.target

# Settings shared by the macro-averaged precision / recall / F1 scores.
macro_options = {'average': 'macro', 'zero_division': 0}

for name, model, params in models:
    # 3-fold grid search on the training split; a failing fit scores 0
    # (error_score=0) instead of aborting the search.
    grid_classifier = GridSearchCV(model,
                                    params,
                                    cv=3,
                                    scoring='accuracy',
                                    verbose=0,
                                    error_score=0,
                                    n_jobs=-1
                                    )
    grid_classifier.fit(train_matrix, train_target)
    predicted = grid_classifier.predict(test_matrix)

    accuracy.append(metrics.accuracy_score(test_target, predicted))
    precision.append(metrics.precision_score(test_target, predicted, **macro_options))
    recall.append(metrics.recall_score(test_target, predicted, **macro_options))
    f1.append(metrics.f1_score(test_target, predicted, **macro_options))

    # Optional per-model plot:
    # plot_confusion_matrix(name, test_target, predicted, sorted(set(corpus_target)))

# One row per model, one column per metric.
metrics_frame = pd.DataFrame({
    'accuracy': accuracy,
    'precision': precision,
    'recall': recall,
    'f1': f1
}, index=[name for name, _, _ in models])

metrics_frame