In [None]:
# Импорт собранных и обработанных данных
import pandas as pd

# macOS location of the collected and preprocessed dataset.
df_nlp = pd.read_csv(r'/Users/user/Documents/Mine/nlp/ML.csv')

# Windows alternative: use this line instead of the one above.
# df_nlp = pd.read_csv(r'C:\Users\User\Downloads\ML.csv')

# Drop the columns at positions 0 and 2..9 (selected by position, not by name).
# NOTE(review): later cells read 'rubric' and 'text_lemm' from df_ml, so those
# must be among the columns that survive this drop - confirm against the CSV layout.
unused_columns = df_nlp.columns[[0, *range(2, 10)]]
df_ml = df_nlp.drop(columns=unused_columns)
df_ml


In [None]:
# Кодирование целевой переменной. Формирование тестовой и обучающей выборок
import sklearn
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.metrics import accuracy_score, classification_report
import numpy as np

# Encode the textual rubric names as integer class ids.
encoder = LabelEncoder()
rubrics_list = df_ml['rubric'].to_list()
rubric_labels = encoder.fit_transform(rubrics_list)

# Features are the lemmatised texts, targets are the encoded rubrics.
X = df_ml['text_lemm']
y = rubric_labels

# Hand-written rubric list, kept unchanged for any cell that still refers to it.
rubrics = ['Политика', 'Общество', 'Экономика', 'В мире', 'Спорт', 'Происшествия', 'Культура', 'Технологии', 'Наука']

# LabelEncoder assigns ids in sorted class order, and classification_report pairs
# target_names with the sorted label ids. The display names therefore have to come
# from encoder.classes_; the hand-ordered list above would attach each row of the
# report to the wrong rubric.
my_tags = [str(name) for name in encoder.classes_]

# 70/30 train/test split with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

#X_train.shape

### Байесовский классификатор

In [None]:
from sklearn.naive_bayes import MultinomialNB

def nb_classifier():
    """Fit and evaluate a CountVectorizer -> TF-IDF -> MultinomialNB pipeline.

    Trains on the notebook-level ``X_train``/``y_train``, predicts ``X_test``,
    and prints the accuracy plus the per-class classification report
    (class names are taken from ``my_tags``).

    Returns:
        Test-set accuracy rounded to two decimal places.
    """
    steps = [
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', MultinomialNB()),
    ]
    model = Pipeline(steps)

    # Pipeline.fit returns the fitted pipeline, so fit and predict can be chained.
    predictions = model.fit(X_train, y_train).predict(X_test)

    # Accuracy does not depend on argument order; it is computed once and reused.
    test_accuracy = accuracy_score(y_test, predictions)
    print('accuracy %s' % test_accuracy)
    print(classification_report(y_test, predictions, target_names=my_tags))

    return round(test_accuracy, 2)

nb_classifier()

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB

def opt_nb_classifier():
    """Grid-search a Naive Bayes text pipeline and evaluate the best model.

    Searches ``tfidf__use_idf`` and ``clf__alpha`` with 5-fold cross-validation
    on the notebook-level ``X_train``/``y_train``. Prints the best CV score, the
    selected parameters, the test accuracy and the classification report.

    Returns:
        Test-set accuracy of the refitted best estimator, rounded to two decimals.
    """
    # Parameter grid explored by the search.
    param_grid = {
        'tfidf__use_idf': (True, False),
        'clf__alpha': (0.01, 0.1, 1.0),
    }

    # 5-fold cross-validated search over the whole pipeline, using all CPU cores.
    search = GridSearchCV(
        Pipeline([
            ('vect', CountVectorizer()),
            ('tfidf', TfidfTransformer()),
            ('clf', MultinomialNB()),
        ]),
        param_grid,
        cv=5,
        n_jobs=-1,
        verbose=1,
    )
    search.fit(X_train, y_train)

    # Report the winning configuration, restricted to the searched parameters.
    print("Best score: %0.3f" % search.best_score_)
    print("Best parameters set:")
    tuned = search.best_estimator_.get_params()
    for name in sorted(param_grid):
        print("\t%s: %r" % (name, tuned[name]))

    # GridSearchCV.predict delegates to the refitted best estimator.
    predictions = search.predict(X_test)

    test_accuracy = accuracy_score(y_test, predictions)
    print('accuracy %s' % test_accuracy)
    print(classification_report(y_test, predictions, target_names=my_tags))

    return round(test_accuracy, 2)

opt_nb_classifier()

### Метод опорных векторов

In [None]:
from sklearn.linear_model import SGDClassifier

def sgd_classifier(random_state=42):
    """Fit and evaluate a CountVectorizer -> TF-IDF -> SGDClassifier pipeline.

    Trains on the notebook-level ``X_train``/``y_train``, predicts ``X_test``,
    and prints the accuracy plus the per-class classification report
    (class names are taken from ``my_tags``).

    Args:
        random_state: Seed passed to ``SGDClassifier``. SGD shuffles the training
            data, so without a seed the reported metrics change between runs.

    Returns:
        Test-set accuracy rounded to two decimal places.
    """
    sgd = Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', SGDClassifier(random_state=random_state)),
    ])

    sgd.fit(X_train, y_train)
    y_pred = sgd.predict(X_test)

    # Compute the accuracy once and reuse it for both the printout and the result.
    test_accuracy = accuracy_score(y_test, y_pred)
    print('accuracy %s' % test_accuracy)
    print(classification_report(y_test, y_pred, target_names=my_tags))

    return round(test_accuracy, 2)

sgd_classifier()

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import SGDClassifier

def opt_sgd_classifier(random_state=42):
    """Grid-search an SGD text pipeline and evaluate the best model.

    Searches the n-gram range (with fixed ``alpha`` and ``penalty``) using 5-fold
    cross-validation on the notebook-level ``X_train``/``y_train``. Prints the
    best CV score, the selected parameters, the test accuracy and the
    classification report.

    Args:
        random_state: Seed passed to ``SGDClassifier`` so that the candidates are
            compared on equal footing and the result is repeatable.

    Returns:
        Test-set accuracy of the refitted best estimator, rounded to two decimals.
    """
    sgd_opt = Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', SGDClassifier(random_state=random_state)),
    ])

    # Parameter grid explored by the search.
    parameters = {
        'vect__ngram_range': [(1, 1), (1, 2)],
        'clf__alpha': [0.000001],
        'clf__penalty': ['l2'],
    }

    # 5-fold cross-validated grid search using all CPU cores.
    grid_search = GridSearchCV(sgd_opt, parameters, cv=5, n_jobs=-1, verbose=1)
    grid_search.fit(X_train, y_train)

    # Report the winning configuration.
    print("Best score: %0.3f" % grid_search.best_score_)
    print("Best parameters set:")
    for param_name in sorted(parameters):
        print("\t%s: %r" % (param_name, grid_search.best_params_[param_name]))

    # Predict with the refitted best estimator.
    y_pred = grid_search.predict(X_test)

    test_accuracy = accuracy_score(y_test, y_pred)
    print('accuracy %s' % test_accuracy)
    print(classification_report(y_test, y_pred, target_names=my_tags))

    return round(test_accuracy, 2)

opt_sgd_classifier()

### Логистическая регрессия

In [None]:
from sklearn.linear_model import LogisticRegression

def logreg_classifier():
    """Fit and evaluate a CountVectorizer -> TF-IDF -> LogisticRegression pipeline.

    Trains on the notebook-level ``X_train``/``y_train``, predicts ``X_test``,
    and prints the accuracy plus the per-class classification report
    (class names are taken from ``my_tags``).

    Returns:
        Test-set accuracy rounded to two decimal places.
    """
    model = Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', LogisticRegression()),
    ])

    # Pipeline.fit returns the fitted pipeline, so fit and predict can be chained.
    predictions = model.fit(X_train, y_train).predict(X_test)

    # Accuracy does not depend on argument order; it is computed once and reused.
    test_accuracy = accuracy_score(y_test, predictions)
    print('accuracy %s' % test_accuracy)
    print(classification_report(y_test, predictions, target_names=my_tags))

    return round(test_accuracy, 2)

logreg_classifier()

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

def opt_logreg_classifier():
    """Grid-search a logistic-regression text pipeline and evaluate the best model.

    The pipeline uses sublinear TF scaling and the ``liblinear`` solver. The
    search covers the n-gram range and ``class_weight`` (with fixed ``C`` and
    ``penalty``) using 5-fold cross-validation on the notebook-level
    ``X_train``/``y_train``. Prints the best CV score, the selected parameters,
    the test accuracy and the classification report.

    Returns:
        Test-set accuracy of the refitted best estimator, rounded to two decimals.
    """
    # Parameter grid explored by the search.
    param_grid = {
        'clf__C': [15],
        'vect__ngram_range': [(1, 1), (1, 2)],
        'clf__penalty': ['l2'],
        'clf__class_weight': [None, 'balanced'],
    }

    # 5-fold cross-validated search over the whole pipeline, using all CPU cores.
    search = GridSearchCV(
        Pipeline([
            ('vect', CountVectorizer()),
            ('tfidf', TfidfTransformer(sublinear_tf=True)),
            ('clf', LogisticRegression(solver='liblinear')),
        ]),
        param_grid,
        cv=5,
        n_jobs=-1,
        verbose=1,
    )
    search.fit(X_train, y_train)

    # Report the winning configuration, restricted to the searched parameters.
    print("Best score: %0.3f" % search.best_score_)
    print("Best parameters set:")
    tuned = search.best_estimator_.get_params()
    for name in sorted(param_grid):
        print("\t%s: %r" % (name, tuned[name]))

    # GridSearchCV.predict delegates to the refitted best estimator.
    predictions = search.predict(X_test)

    test_accuracy = accuracy_score(y_test, predictions)
    print('accuracy %s' % test_accuracy)
    print(classification_report(y_test, predictions, target_names=my_tags))

    return round(test_accuracy, 2)

opt_logreg_classifier()

### Дерево решений 

In [None]:
from sklearn.tree import DecisionTreeClassifier

def dtree_classifier(random_state=42):
    """Fit and evaluate a CountVectorizer -> TF-IDF -> DecisionTreeClassifier pipeline.

    Trains on the notebook-level ``X_train``/``y_train``, predicts ``X_test``,
    and prints the accuracy plus the per-class classification report
    (class names are taken from ``my_tags``).

    Args:
        random_state: Seed passed to ``DecisionTreeClassifier``. The tree permutes
            features at each split, so an unseeded tree gives different metrics
            on every run.

    Returns:
        Test-set accuracy rounded to two decimal places.
    """
    dtree = Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', DecisionTreeClassifier(random_state=random_state)),
    ])

    dtree.fit(X_train, y_train)
    y_pred = dtree.predict(X_test)

    # Compute the accuracy once and reuse it for both the printout and the result.
    test_accuracy = accuracy_score(y_test, y_pred)
    print('accuracy %s' % test_accuracy)
    print(classification_report(y_test, y_pred, target_names=my_tags))

    return round(test_accuracy, 2)

dtree_classifier()

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier

def opt_dtree_classifier(random_state=42):
    """Grid-search a decision-tree text pipeline and evaluate the best model.

    Searches the split criterion, depth and leaf/split sizes (with balanced
    class weights) using 5-fold cross-validation on the notebook-level
    ``X_train``/``y_train``. Prints the best CV score, the selected parameters,
    the test accuracy and the classification report.

    Args:
        random_state: Seed passed to ``DecisionTreeClassifier`` so that score
            differences between candidates come from the hyperparameters and
            not from random tie-breaking.

    Returns:
        Test-set accuracy of the refitted best estimator, rounded to two decimals.
    """
    dtree = Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', DecisionTreeClassifier(random_state=random_state)),
    ])

    # Parameter grid explored by the search.
    parameters = {
        'clf__criterion': ['gini', 'entropy'],
        'clf__max_depth': [10, 100],
        'clf__min_samples_split': [2, 5, 10],
        'clf__min_samples_leaf': [1, 5],
        'clf__class_weight': ['balanced'],
    }

    # 5-fold cross-validated grid search using all CPU cores.
    grid_search = GridSearchCV(dtree, parameters, cv=5, n_jobs=-1, verbose=1)
    grid_search.fit(X_train, y_train)

    # Report the winning configuration.
    print("Best score: %0.3f" % grid_search.best_score_)
    print("Best parameters set:")
    for param_name in sorted(parameters):
        print("\t%s: %r" % (param_name, grid_search.best_params_[param_name]))

    # Predict with the refitted best estimator.
    y_pred = grid_search.predict(X_test)

    test_accuracy = accuracy_score(y_test, y_pred)
    print('accuracy %s' % test_accuracy)
    print(classification_report(y_test, y_pred, target_names=my_tags))

    return round(test_accuracy, 2)

opt_dtree_classifier()

### Метод K ближайших соседей (KNN)

In [None]:
from sklearn.neighbors import KNeighborsClassifier

def knn_classifier():
    """Fit and evaluate a CountVectorizer -> TF-IDF -> KNeighborsClassifier pipeline.

    Trains on the notebook-level ``X_train``/``y_train``, predicts ``X_test``,
    and prints the accuracy plus the per-class classification report
    (class names are taken from ``my_tags``).

    Returns:
        Test-set accuracy rounded to two decimal places.
    """
    steps = [
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', KNeighborsClassifier()),
    ]
    model = Pipeline(steps)

    # Pipeline.fit returns the fitted pipeline, so fit and predict can be chained.
    predictions = model.fit(X_train, y_train).predict(X_test)

    # Accuracy does not depend on argument order; it is computed once and reused.
    test_accuracy = accuracy_score(y_test, predictions)
    print('accuracy %s' % test_accuracy)
    print(classification_report(y_test, predictions, target_names=my_tags))

    return round(test_accuracy, 2)

knn_classifier()

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier

def opt_knn_classifier():
    """Grid-search a k-nearest-neighbours text pipeline and evaluate the best model.

    Searches the n-gram range, IDF usage, neighbour count and neighbour
    weighting using 5-fold cross-validation on the notebook-level
    ``X_train``/``y_train``. Prints the best CV score, the selected parameters,
    the test accuracy and the classification report.

    Returns:
        Test-set accuracy of the refitted best estimator, rounded to two decimals.
    """
    # Parameter grid explored by the search.
    param_grid = {
        'vect__ngram_range': [(1, 1), (1, 2)],
        'tfidf__use_idf': [True, False],
        'clf__n_neighbors': [3, 5, 7],
        'clf__weights': ['uniform', 'distance'],
    }

    # 5-fold cross-validated search over the whole pipeline, using all CPU cores.
    search = GridSearchCV(
        Pipeline([
            ('vect', CountVectorizer()),
            ('tfidf', TfidfTransformer()),
            ('clf', KNeighborsClassifier()),
        ]),
        param_grid,
        cv=5,
        n_jobs=-1,
        verbose=1,
    )
    search.fit(X_train, y_train)

    # Report the winning configuration, restricted to the searched parameters.
    print("Best score: %0.3f" % search.best_score_)
    print("Best parameters set:")
    tuned = search.best_estimator_.get_params()
    for name in sorted(param_grid):
        print("\t%s: %r" % (name, tuned[name]))

    # GridSearchCV.predict delegates to the refitted best estimator.
    predictions = search.predict(X_test)

    test_accuracy = accuracy_score(y_test, predictions)
    print('accuracy %s' % test_accuracy)
    print(classification_report(y_test, predictions, target_names=my_tags))

    return round(test_accuracy, 2)

opt_knn_classifier()

### Стекинг

In [None]:
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC

def stack_classifier(random_state=42):
    """Fit and evaluate a stacked ensemble on TF-IDF text features.

    The stack combines logistic regression and a decision tree as base
    learners, with an ``SVC`` as the final estimator. Trains on the
    notebook-level ``X_train``/``y_train``, predicts ``X_test``, and prints the
    accuracy plus the per-class classification report (names from ``my_tags``).

    Args:
        random_state: Seed passed to the decision-tree base learner, which is
            otherwise the source of run-to-run variation in this ensemble.

    Returns:
        Test-set accuracy rounded to two decimal places.
    """
    estimators = [
        ('lr', LogisticRegression()),
        ('dt', DecisionTreeClassifier(random_state=random_state)),
    ]

    stack = Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', StackingClassifier(estimators=estimators, final_estimator=SVC())),
    ])

    stack.fit(X_train, y_train)
    y_pred = stack.predict(X_test)

    # Compute the accuracy once and reuse it for both the printout and the result.
    test_accuracy = accuracy_score(y_test, y_pred)
    print('accuracy %s' % test_accuracy)
    print(classification_report(y_test, y_pred, target_names=my_tags))

    return round(test_accuracy, 2)

stack_classifier()

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC

def opt_stack_classifier(random_state=42):
    """Grid-search a stacked ensemble on TF-IDF text features and evaluate it.

    The stack combines logistic regression and a decision tree as base
    learners, with an ``SVC`` as the final estimator. The search covers the
    n-gram range, IDF usage, the final estimator's ``C``, the stacking method
    and ``passthrough`` using 5-fold cross-validation on the notebook-level
    ``X_train``/``y_train``. Prints the best CV score, the selected parameters,
    the test accuracy and the classification report.

    Args:
        random_state: Seed passed to the decision-tree base learner so that
            candidates are compared on equal footing.

    Returns:
        Test-set accuracy of the refitted best estimator, rounded to two decimals.
    """
    estimators = [
        ('lr', LogisticRegression()),
        ('dt', DecisionTreeClassifier(random_state=random_state)),
    ]

    stack_opt = Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', StackingClassifier(estimators=estimators, final_estimator=SVC())),
    ])

    # Parameter grid explored by the search.
    parameters = {
        'vect__ngram_range': [(1, 1), (1, 2)],
        'tfidf__use_idf': (True, False),
        'clf__final_estimator__C': [0.1, 1, 10],
        # 'decision_function' is left out on purpose: DecisionTreeClassifier does
        # not implement it, so every candidate using it failed to fit and only
        # wasted a third of the search.
        # NOTE(review): 'auto' tries predict_proba first, which both base learners
        # provide, so these two values likely train identical models.
        'clf__stack_method': ['auto', 'predict_proba'],
        'clf__passthrough': [True, False],
    }

    # 5-fold cross-validated grid search using all CPU cores.
    grid_search = GridSearchCV(stack_opt, parameters, cv=5, n_jobs=-1, verbose=1)
    grid_search.fit(X_train, y_train)

    # Report the winning configuration.
    print("Best score: %0.3f" % grid_search.best_score_)
    print("Best parameters set:")
    for param_name in sorted(parameters):
        print("\t%s: %r" % (param_name, grid_search.best_params_[param_name]))

    # Predict with the refitted best estimator.
    y_pred = grid_search.predict(X_test)

    test_accuracy = accuracy_score(y_test, y_pred)
    print('accuracy %s' % test_accuracy)
    print(classification_report(y_test, y_pred, target_names=my_tags))

    return round(test_accuracy, 2)

opt_stack_classifier()


### Бэггинг

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import BaggingClassifier

def bag_classifier(random_state=42):
    """Fit and evaluate a bagging ensemble of logistic regressions on TF-IDF features.

    Trains on the notebook-level ``X_train``/``y_train``, predicts ``X_test``,
    and prints the accuracy plus the per-class classification report
    (class names are taken from ``my_tags``).

    Args:
        random_state: Seed passed to ``BaggingClassifier``, which draws random
            bootstrap samples; without it the metrics change between runs.

    Returns:
        Test-set accuracy rounded to two decimal places.
    """
    base_model = LogisticRegression()
    try:
        # scikit-learn >= 1.2 names the wrapped model ``estimator``; the old
        # ``base_estimator`` keyword was removed in 1.4.
        bagging = BaggingClassifier(estimator=base_model, random_state=random_state)
    except TypeError:
        # Older scikit-learn releases only accept the previous keyword name.
        bagging = BaggingClassifier(base_estimator=base_model, random_state=random_state)

    bag = Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', bagging),
    ])

    bag.fit(X_train, y_train)
    y_pred = bag.predict(X_test)

    # Compute the accuracy once and reuse it for both the printout and the result.
    test_accuracy = accuracy_score(y_test, y_pred)
    print('accuracy %s' % test_accuracy)
    print(classification_report(y_test, y_pred, target_names=my_tags))

    return round(test_accuracy, 2)

bag_classifier()

In [None]:
from sklearn.ensemble import BaggingClassifier
from sklearn.linear_model import LogisticRegression

def opt_bag_classifier(random_state=12):
    """Grid-search a bagging ensemble of logistic regressions and evaluate it.

    Searches the n-gram range, IDF usage, number of estimators and the
    sample/feature fractions using 5-fold cross-validation on the notebook-level
    ``X_train``/``y_train``. Prints the best CV score, the selected parameters,
    the test accuracy and the classification report.

    Args:
        random_state: Seed passed to ``BaggingClassifier``; the default keeps the
            value this cell has always used.

    Returns:
        Test-set accuracy of the refitted best estimator, rounded to two decimals.
    """
    base_model = LogisticRegression()
    try:
        # scikit-learn >= 1.2 names the wrapped model ``estimator``; the old
        # ``base_estimator`` keyword was removed in 1.4.
        bagging = BaggingClassifier(estimator=base_model, random_state=random_state)
    except TypeError:
        # Older scikit-learn releases only accept the previous keyword name.
        bagging = BaggingClassifier(base_estimator=base_model, random_state=random_state)

    bag_opt = Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', bagging),
    ])

    # Parameter grid explored by the search.
    parameters = {
        'vect__ngram_range': [(1, 1), (1, 2)],
        'tfidf__use_idf': (True, False),
        'clf__n_estimators': [10, 20, 30],
        'clf__max_samples': [0.5, 1.0],
        'clf__max_features': [0.5, 1.0],
    }

    # 5-fold cross-validated grid search using all CPU cores.
    grid_search = GridSearchCV(bag_opt, parameters, cv=5, n_jobs=-1, verbose=1)
    grid_search.fit(X_train, y_train)

    # Report the winning configuration.
    print("Best score: %0.3f" % grid_search.best_score_)
    print("Best parameters set:")
    for param_name in sorted(parameters):
        print("\t%s: %r" % (param_name, grid_search.best_params_[param_name]))

    # Predict with the refitted best estimator.
    y_pred = grid_search.predict(X_test)

    test_accuracy = accuracy_score(y_test, y_pred)
    print('accuracy %s' % test_accuracy)
    print(classification_report(y_test, y_pred, target_names=my_tags))

    return round(test_accuracy, 2)

opt_bag_classifier()


### Случайный лес

In [None]:
from sklearn.ensemble import RandomForestClassifier


def rf_classifier(random_state=42):
    """Fit and evaluate a CountVectorizer -> TF-IDF -> RandomForestClassifier pipeline.

    Trains on the notebook-level ``X_train``/``y_train``, predicts ``X_test``,
    and prints the accuracy plus the per-class classification report
    (class names are taken from ``my_tags``).

    Args:
        random_state: Seed passed to ``RandomForestClassifier``. The forest uses
            bootstrap samples and random feature subsets, so an unseeded model
            gives different metrics on every run.

    Returns:
        Test-set accuracy rounded to two decimal places.
    """
    rf = Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', RandomForestClassifier(random_state=random_state)),
    ])

    rf.fit(X_train, y_train)
    y_pred = rf.predict(X_test)

    # Compute the accuracy once and reuse it for both the printout and the result.
    test_accuracy = accuracy_score(y_test, y_pred)
    print('accuracy %s' % test_accuracy)
    print(classification_report(y_test, y_pred, target_names=my_tags))

    return round(test_accuracy, 2)

rf_classifier()

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

def opt_rf_classifier(random_state=42):
    """Cross-validate a random-forest text pipeline on uni+bigrams and evaluate it.

    The grid holds a single candidate (``ngram_range=(1, 2)``), so the search
    mainly provides a 5-fold CV score on the notebook-level
    ``X_train``/``y_train``. Prints the CV score, the selected parameters, the
    test accuracy and the classification report.

    Args:
        random_state: Seed passed to ``RandomForestClassifier`` so the CV score
            and test metrics are repeatable.

    Returns:
        Test-set accuracy of the refitted best estimator, rounded to two decimals.
    """
    rf_opt = Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', RandomForestClassifier(random_state=random_state)),
    ])

    # Parameter grid explored by the search.
    parameters = {
        'vect__ngram_range': [(1, 2)],
    }

    # 5-fold cross-validated grid search using all CPU cores.
    grid_search = GridSearchCV(rf_opt, parameters, cv=5, n_jobs=-1, verbose=1)
    grid_search.fit(X_train, y_train)

    # Report the winning configuration.
    print("Best score: %0.3f" % grid_search.best_score_)
    print("Best parameters set:")
    for param_name in sorted(parameters):
        print("\t%s: %r" % (param_name, grid_search.best_params_[param_name]))

    # Predict with the refitted best estimator.
    y_pred = grid_search.predict(X_test)

    test_accuracy = accuracy_score(y_test, y_pred)
    print('accuracy %s' % test_accuracy)
    print(classification_report(y_test, y_pred, target_names=my_tags))

    return round(test_accuracy, 2)

opt_rf_classifier()

### Метод градиентного бустинга

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

def gb_classifier(random_state=42):
    """Fit and evaluate a CountVectorizer -> TF-IDF -> GradientBoostingClassifier pipeline.

    Trains on the notebook-level ``X_train``/``y_train``, predicts ``X_test``,
    and prints the accuracy plus the per-class classification report
    (class names are taken from ``my_tags``).

    Args:
        random_state: Seed passed to ``GradientBoostingClassifier`` so repeated
            runs produce the same model.

    Returns:
        Test-set accuracy rounded to two decimal places.
    """
    # The loss is left at its default: the former loss='deviance' was renamed to
    # 'log_loss' and the old spelling was removed in scikit-learn 1.3. The default
    # is that same loss in both old and new releases.
    gb = Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', GradientBoostingClassifier(random_state=random_state)),
    ])

    gb.fit(X_train, y_train)
    y_pred = gb.predict(X_test)

    # Compute the accuracy once and reuse it for both the printout and the result.
    test_accuracy = accuracy_score(y_test, y_pred)
    print('accuracy %s' % test_accuracy)
    print(classification_report(y_test, y_pred, target_names=my_tags))

    return round(test_accuracy, 2)

gb_classifier()

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier

def opt_gb_classifier(random_state=42):
    """Cross-validate a gradient-boosting text pipeline on uni+bigrams and evaluate it.

    The grid holds a single candidate (``ngram_range=(1, 2)``), so the search
    mainly provides a 5-fold CV score on the notebook-level
    ``X_train``/``y_train``. Prints the CV score, the selected parameters, the
    test accuracy and the classification report.

    Args:
        random_state: Seed passed to ``GradientBoostingClassifier`` so the CV
            score and test metrics are repeatable.

    Returns:
        Test-set accuracy of the refitted best estimator, rounded to two decimals.
    """
    gb = Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', GradientBoostingClassifier(random_state=random_state)),
    ])

    # Parameter grid explored by the search.
    parameters = {
        'vect__ngram_range': [(1, 2)],
    }

    # 5-fold cross-validated grid search using all CPU cores.
    grid_search = GridSearchCV(gb, parameters, cv=5, n_jobs=-1, verbose=1)
    grid_search.fit(X_train, y_train)

    # Report the winning configuration.
    print("Best score: %0.3f" % grid_search.best_score_)
    print("Best parameters set:")
    for param_name in sorted(parameters):
        print("\t%s: %r" % (param_name, grid_search.best_params_[param_name]))

    # Predict with the refitted best estimator.
    y_pred = grid_search.predict(X_test)

    test_accuracy = accuracy_score(y_test, y_pred)
    print('accuracy %s' % test_accuracy)
    print(classification_report(y_test, y_pred, target_names=my_tags))

    return round(test_accuracy, 2)

opt_gb_classifier()