In [110]:
import pandas as pd
import sklearn.metrics as m
from itertools import product
from nltk.tokenize import word_tokenize
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.preprocessing import FunctionTransformer
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.decomposition import TruncatedSVD
from mlxtend.classifier import StackingClassifier

In [15]:
def normalize(text):
    tokens = word_tokenize(text)
    return ' '.join([token for token in tokens if len(token) > 1])

In [8]:
data_url = 'https://raw.githubusercontent.com/TatianaShavrina/hse_ml_m1/master/ensembles/complaints.csv'
data = pd.read_csv(data_url, sep='\t')
data.head()

Unnamed: 0,COMPLAINT_ID,DATE,PRODUCT_ID,ISSUE_ID,cleaned_text
0,3178905,03/13/2019,44,318,go year . contact advis never took loan . advi...
1,3175952,03/12/2019,44,349,"mail valid debt xx/xx/19 valid receiv , receiv..."
2,3174747,03/09/2019,44,16,xx/xx/xxxx appli receiv onlin loan bluechip fi...
3,3173291,03/08/2019,44,16,xx/xx/xxxx appli receiv onlin loan . loan amou...
4,3172221,03/07/2019,44,48,told husband left bill . debt would pay within...


In [28]:
y = data["PRODUCT_ID"]
X = [normalize(tokens) for tokens in data["cleaned_text"]]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

In [103]:
def get_predictions(weights):
    clf1 = LogisticRegression(multi_class='multinomial', solver='lbfgs', random_state=1)
    clf2 = RandomForestClassifier(n_estimators=50, random_state=1)
    clf3 = GaussianNB()
    clf4 = MultinomialNB(alpha=0.1, fit_prior=True)
    clf5 = KNeighborsClassifier(n_neighbors=2) 

    eclf = VotingClassifier(
        estimators=[
            ('lr', clf1), ('rf', clf2), ('gnb', clf3),
            ('mnb', clf4), ('knc', clf5),
        ],
        voting='hard',
        weights=weights,
    )

    voting = Pipeline([
        ('vect', CountVectorizer( analyzer='word', max_features=500)),
        ('tfidf', TfidfTransformer(sublinear_tf=True)),
        ('to_dense', FunctionTransformer(lambda x: x.todense(), accept_sparse=True)), 
        ('clf', eclf),
        ])
    voting = voting.fit(X_train, y_train)
    predictions = voting.predict(X_test)
    return predictions

In [90]:
def print_metrics(predictions):
    print("Precision: {0:6.2f}".format(m.precision_score(y_test, predictions, average='macro')))
    print("Recall: {0:6.2f}".format(m.recall_score(y_test, predictions, average='macro')))
    print("F1-measure: {0:6.2f}".format(m.f1_score(y_test, predictions, average='macro')))
    print("Accuracy: {0:6.2f}".format(m.accuracy_score(y_test, predictions)))

In [104]:
def get_f1(weights, f=get_predictions):
    return m.f1_score(y_test, f(weights), average='macro')

In [100]:
import warnings
warnings.filterwarnings("ignore")

best_score = 0
best_weights = None
for weights in product(*[[1, 2]] * 5):
    f1 = get_f1(weights)
    if f1 > best_score:
        best_score = f1
        best_weights = weights
best_weights

(2, 2, 1, 1, 2)

In [102]:
predictions = get_predictions(best_weights)
print_metrics(predictions)

Precision:   0.74
Recall:   0.71
F1-measure:   0.71
Accuracy:   0.71


ДОБИЛО ДО СЕМИ ДЕСЯТЫХ!!!

Всё благодаря гениальной штуке для генерации весов, надо обязательно запомнить:

In [84]:
from itertools import product
product(*[[1, 2]] * 5)

<itertools.product at 0x7f8e8e74b360>

Я не вполне понимаю задания, результат в `0.71` уже довольно неплохо. Но попробуем сделать ещё что-нибудь из семинарской тетрадки.

In [106]:
def get_predictions2():
    clf1 = LogisticRegression(multi_class='multinomial', solver='lbfgs', random_state=1)
    clf2 = RandomForestClassifier(n_estimators=50, random_state=1)
    clf3 = GaussianNB()
    clf4 = MultinomialNB(alpha=0.1, fit_prior=True)
    clf5 = KNeighborsClassifier(n_neighbors=2)
    lr = LogisticRegression()
    sclf = StackingClassifier(
        classifiers=[clf1, clf2, clf3, clf4, clf5],
        meta_classifier=lr,
    )
    voting = Pipeline([
        ('vect', CountVectorizer( analyzer='word', max_features=500)),
        ('tfidf', TfidfTransformer(sublinear_tf=True)),
        ('to_dense', FunctionTransformer(lambda x: x.todense(), accept_sparse=True)), 
        ('clf', sclf),
        ])
    voting = voting.fit(X_train, y_train)
    predictions = voting.predict(X_test)
    return predictions

In [111]:
predictions = get_predictions2()
print_metrics(predictions)

Precision:   0.67
Recall:   0.62
F1-measure:   0.62
Accuracy:   0.62


Хорошая попытка, но у нас уже есть лучше. По сути получается, что здесь мы заставляем модель угадывать нужные веса, а в предыдущей модели, мы просто вычислили оптимальные веса. Поэтому не удивительно, что результат там чуть повыше