In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, f1_score

In [2]:
# Load the labelled training set; the preview below shows the columns
# "Unnamed: 0", "Id", "Text" and "Class".
data = pd.read_csv('data/train.csv')
# Display the first rows as a quick sanity check of the schema.
data.head()

Unnamed: 0,Id,Text,Class
0,0,"Обустройство тротуаров, мостовых (в том числе ...",H
1,1,Въ издержкахъ же оныхъ вы имете присылать счет...,H
2,2,"Положи это туда, откуда взял.",H
3,3,Минстрой обозначил способы снижения энергоемко...,M
4,4,В конце 1873 года военный суд вынес решение по...,M


Классы практически идеально сбалансированы:

In [3]:
# Number of non-null texts per class label (checks the class balance).
data.groupby('Class')['Text'].count()

Class
H    64535
M    64531
Name: Text, dtype: int64

Делим train.csv на обучающую и валидационную части (90/10) для подбора модели; отложенные выборки val.csv и test.csv лежат в отдельных файлах

In [4]:
# Reproducible shuffle split of the row indices: the first 90% of a seeded
# permutation go to training, the remainder to validation.
np.random.seed(10)
n_rows = len(data)
train_size = int(0.9 * n_rows)
all_ind = np.random.permutation(n_rows)
train_ind, val_ind = np.split(all_ind, [train_size])

In [5]:
# Load the separately stored validation and test files.
# `val` is scored against the final model further down; `test` is not referenced
# again in the visible cells (write_submission re-reads the test CSV by path).
val = pd.read_csv('data/val.csv')
test = pd.read_csv('data/test.csv')

Для токенизации текста

In [6]:
from razdel import tokenize
def tokenize_text(line):
    """Split ``line`` into a list of token strings.

    Runs razdel's ``tokenize`` over ``line`` and collects the ``text``
    attribute of every token it yields, preserving their order.
    """
    return [token.text for token in tokenize(line)]

In [7]:
def write_submission(model, vec, test_file, out_file):
    """Predict classes for a test CSV and save them as an ``Id,Class`` file.

    Parameters
    ----------
    model : object
        Fitted classifier; only its ``predict`` method is called.
    vec : object
        Fitted vectorizer; only its ``transform`` method is called.
    test_file : str or path-like
        CSV file that contains at least the columns ``Id`` and ``Text``.
    out_file : str or path-like
        Destination CSV. Missing parent directories are created so that the
        predictions are not lost to an ``OSError`` raised by ``to_csv``.
    """
    # Local import keeps this cell runnable without touching the import cell.
    from pathlib import Path

    subm = pd.read_csv(test_file)
    subm["Class"] = model.predict(vec.transform(subm['Text']))

    out_path = Path(out_file)
    # pandas refuses to write into a non-existent directory, so create it first.
    out_path.parent.mkdir(parents=True, exist_ok=True)
    subm[['Id', 'Class']].to_csv(out_path, index=False)

## Logistic Regression

In [8]:
def train_predict(model, train_bow, y_train, test_bow, y_test):
    """Fit ``model`` and print its accuracy and macro-averaged F1 score.

    The model is fitted on ``(train_bow, y_train)`` and then used to predict
    ``test_bow``; the predictions are compared with ``y_test``. The two
    metrics are printed, nothing is returned.
    """
    model.fit(train_bow, y_train)
    predicted = model.predict(test_bow)

    accuracy = accuracy_score(y_test, predicted)
    print(f'Accuracy: {accuracy}')

    macro_f1 = f1_score(y_test, predicted, average="macro")
    print(f'F1-score: {macro_f1}')

Обычный Bag of words

In [9]:
# Baseline: unigram token counts fed into a logistic regression.
# The vectorizer is fitted on the training rows only and then applied to the
# validation rows.
vec = CountVectorizer(tokenizer=tokenize_text, ngram_range=(1, 1))
train_bow = vec.fit_transform(data.loc[train_ind, 'Text'])
test_bow = vec.transform(data.loc[val_ind, 'Text'])
clf = LogisticRegression(solver='saga', random_state=10)

train_predict(
    model=clf,
    train_bow=train_bow,
    y_train=data.loc[train_ind, 'Class'],
    test_bow=test_bow,
    y_test=data.loc[val_ind, 'Class'],
)

Accuracy: 0.6602618734020299
F1-score: 0.6599854705672352




TF-IDF

In [10]:
# Same setup as the bag-of-words run, but with TF-IDF weighted unigrams.
# `train_bow` / `test_bow` are reused by the regularisation sweep that follows.
vec = TfidfVectorizer(tokenizer=tokenize_text, ngram_range=(1, 1))
train_bow = vec.fit_transform(data.loc[train_ind, 'Text'])
test_bow = vec.transform(data.loc[val_ind, 'Text'])
clf = LogisticRegression(solver='saga', random_state=10)

train_predict(
    model=clf,
    train_bow=train_bow,
    y_train=data.loc[train_ind, 'Class'],
    test_bow=test_bow,
    y_test=data.loc[val_ind, 'Class'],
)

Accuracy: 0.6886185790656233
F1-score: 0.6885144405691219


Подберём коэффициент регуляризации C для последнего варианта (TF-IDF)

In [11]:
# Sweep the regularisation parameter C on the current train_bow / test_bow
# features, printing validation metrics for each value.
# NOTE(review): in the recorded output the best score is at C=1, the largest
# value tried — consider extending the grid upwards.
y_train, y_val = data['Class'][train_ind], data['Class'][val_ind]
for C in (0.0001, 0.001, 0.01, 0.1, 1):
    print(f'C={C}')
    clf = LogisticRegression(random_state=10, C=C, solver='saga')
    train_predict(clf, train_bow, y_train, test_bow, y_val)
    print()

C=0.0001
Accuracy: 0.6099790811187727
F1-score: 0.609972991579804

C=0.001
Accuracy: 0.6144727667157357
F1-score: 0.6118895463091297

C=0.01
Accuracy: 0.6303556209808631
F1-score: 0.6288206883219779

C=0.1
Accuracy: 0.6641357402959635
F1-score: 0.6638565116551582

C=1
Accuracy: 0.6886185790656233
F1-score: 0.6885144405691219



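Выбираем логистическую регрессию с коэффициентом регуляризации C=1 на TF-IDF-признаках (C=1 — верхняя граница перебранной сетки, поэтому стоит проверить и большие значения),

обучаем её на всём train.csv и проверяем результат на отложенной выборке val.csv; предсказания для test.csv сохраняем в файл сабмита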

In [12]:
# Refit the chosen configuration on every row of `data` and score it on the
# separately loaded `val` frame.
vec = TfidfVectorizer(tokenizer=tokenize_text, ngram_range=(1, 1))
train_bow = vec.fit_transform(data.loc[:, 'Text'])
val_bow = vec.transform(val.loc[:, 'Text'])

final_log_reg = LogisticRegression(solver='saga', C=1, random_state=10)
final_log_reg.fit(train_bow, data.loc[:, 'Class'])

# Per-class precision / recall / F1 on the held-out validation file.
val_pred = final_log_reg.predict(val_bow)
print(classification_report(val.loc[:, 'Class'], val_pred))

              precision    recall  f1-score   support

           H       0.69      0.71      0.70     10756
           M       0.70      0.68      0.69     10755

    accuracy                           0.69     21511
   macro avg       0.69      0.69      0.69     21511
weighted avg       0.69      0.69      0.69     21511



In [14]:
# Predict the test file with the final model and store the submission CSV.
write_submission(
    model=final_log_reg,
    vec=vec,
    test_file='data/test.csv',
    out_file='submissions/log_c1_tfidf.csv',
)