In [1]:
import pandas as pd
from lxml import html
import numpy as np
from sklearn.decomposition import TruncatedSVD, NMF, PCA
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import cosine_distances
from string import punctuation
from nltk.corpus import stopwords
from tqdm import tqdm_notebook
from pymorphy2 import MorphAnalyzer
%matplotlib inline

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
import gensim
from collections import Counter,defaultdict

In [4]:
corpus_xml = html.fromstring(open('/Users/Stoneberry/Downloads/paraphraser/paraphrases.xml', 'rb').read())

texts_1 = []
texts_2 = []
classes = []

for p in corpus_xml.xpath('//paraphrase'):
    texts_1.append(p.xpath('./value[@name="text_1"]/text()')[0])
    texts_2.append(p.xpath('./value[@name="text_2"]/text()')[0])
    classes.append(p.xpath('./value[@name="class"]/text()')[0])
    
data = pd.DataFrame({'text_1':texts_1, 'text_2':texts_2, 'label':classes})

In [5]:
data.head()

Unnamed: 0,label,text_1,text_2
0,0,Полицейским разрешат стрелять на поражение по ...,Полиции могут разрешить стрелять по хулиганам ...
1,0,Право полицейских на проникновение в жилище ре...,Правила внесудебного проникновения полицейских...
2,0,Президент Египта ввел чрезвычайное положение в...,Власти Египта угрожают ввести в стране чрезвыч...
3,-1,Вернувшихся из Сирии россиян волнует вопрос тр...,Самолеты МЧС вывезут россиян из разрушенной Си...
4,0,В Москву из Сирии вернулись 2 самолета МЧС с р...,Самолеты МЧС вывезут россиян из разрушенной Си...


In [5]:
morph = MorphAnalyzer()
punct = punctuation+'«»—…“”*№–'
stops = set(stopwords.words('russian'))

def normalize(text):
    
    words = [word.strip(punct) for word in text.lower().split()]
    words = [morph.parse(word)[0].normal_form for word in words if word and word not in stops]

    return ' '.join(words)

def tokenize(text):
    
    words = [word.strip(punct) for word in text.lower().split()]

    return ' '.join(words)


In [6]:
data['text_1_norm'] = data['text_1'].apply(normalize)
data['text_2_norm'] = data['text_2'].apply(normalize)

In [8]:
data.head()

Unnamed: 0,label,text_1,text_2,text_1_norm,text_2_norm
0,0,Полицейским разрешат стрелять на поражение по ...,Полиции могут разрешить стрелять по хулиганам ...,полицейский разрешить стрелять поражение гражд...,полиция мочь разрешить стрелять хулиган травма...
1,0,Право полицейских на проникновение в жилище ре...,Правила внесудебного проникновения полицейских...,право полицейский проникновение жилища решить ...,правило внесудебный проникновение полицейский ...
2,0,Президент Египта ввел чрезвычайное положение в...,Власти Египта угрожают ввести в стране чрезвыч...,президент египет ввести чрезвычайный положение...,власть египет угрожать ввести страна чрезвычай...
3,-1,Вернувшихся из Сирии россиян волнует вопрос тр...,Самолеты МЧС вывезут россиян из разрушенной Си...,вернуться сирия россиянин волновать вопрос тру...,самолёт мчс вывезти россиянин разрушить сирия
4,0,В Москву из Сирии вернулись 2 самолета МЧС с р...,Самолеты МЧС вывезут россиян из разрушенной Си...,москва сирия вернуться 2 самолёт мчс россиянин...,самолёт мчс вывезти россиянин разрушить сирия


### Тексты для train

In [7]:
data_rt = pd.read_csv('/Users/Stoneberry/Downloads/news_texts.csv')

In [10]:
data_rt.head()

Unnamed: 0,content,content_norm
0,Канцлер Германии Ангела Меркель в ходе брифинг...,канцлер германия ангел меркель ход брифинг пре...
1,Российские и белорусские войска успешно заверш...,российский белорусский войско успешно завершит...
2,"Дзюба, Шатов и Анюков оказались не нужны «Зени...",дзюба шат анюк оказаться нужный зенит российск...
3,"В Испанию без фанатов\nПожалуй, главной пятнич...",испания фанат пожалуй главный пятничный новост...
4,"Постпред России при ООН Виталий Чуркин, говоря...",постпред россия оон виталий чуркин говорить ве...


In [8]:
data_rt.dropna(inplace=True)

In [9]:
cv = CountVectorizer(min_df=3, max_df=0.4, max_features=1000)
X_cv = cv.fit_transform(data_rt['content_norm'])

In [10]:
tfidf = TfidfVectorizer(min_df=3, max_df=0.4, max_features=1000)
X_tf = tfidf.fit_transform(data_rt['content_norm'])

## Преобразуйте тексты в векторы в каждой паре 4 методами  - SVD, NMF, Word2Vec, Fastext. 

#### Для SVD и NMF сделайте две пары векторов - через TfidfVectorizer и CountVectorizer.

##### SVD 

In [14]:
svd_cv = TruncatedSVD(50)
svd_cv.fit(X_cv)

TruncatedSVD(algorithm='randomized', n_components=50, n_iter=5,
       random_state=None, tol=0.0)

In [15]:
X_text_1 = svd_cv.transform(cv.transform(data['text_1_norm']))
X_text_2 = svd_cv.transform(cv.transform(data['text_2_norm']))

X_svd_cv = [X_text_1, X_text_2]

In [16]:
svd_tf = TruncatedSVD(50)
svd_tf.fit(X_tf)

TruncatedSVD(algorithm='randomized', n_components=50, n_iter=5,
       random_state=None, tol=0.0)

In [17]:
X_text_1 = svd_tf.transform(tfidf.transform(data['text_1_norm']))
X_text_2 = svd_tf.transform(tfidf.transform(data['text_2_norm']))

X_svd_tf = [X_text_1, X_text_2]

In [11]:
def my_svd(svd_cv, svd_tf):
    
    global data, cv, tfidf

    X_text_1 = svd_cv.transform(cv.transform(data['text_1_norm']))
    X_text_2 = svd_cv.transform(cv.transform(data['text_2_norm']))
    X_svd_cv = [X_text_1, X_text_2]
    
    X_text_1 = svd_tf.transform(tfidf.transform(data['text_1_norm']))
    X_text_2 = svd_tf.transform(tfidf.transform(data['text_2_norm']))
    X_svd_tf = [X_text_1, X_text_2]
    
    return [X_svd_cv, X_svd_tf]

##### NMF

In [19]:
nmf_cv = NMF(50)
nmf_cv.fit(X_cv)

NMF(alpha=0.0, beta_loss='frobenius', init=None, l1_ratio=0.0, max_iter=200,
  n_components=50, random_state=None, shuffle=False, solver='cd',
  tol=0.0001, verbose=0)

In [20]:
X_text_1_nmf = nmf_cv.transform(cv.transform(data['text_1_norm']))
X_text_2_nmf = nmf_cv.transform(cv.transform(data['text_2_norm']))

X_text_nmf_cv = [X_text_1_nmf, X_text_2_nmf]

In [21]:
nmf_tf = NMF(50)
nmf_tf.fit(X_tf)

NMF(alpha=0.0, beta_loss='frobenius', init=None, l1_ratio=0.0, max_iter=200,
  n_components=50, random_state=None, shuffle=False, solver='cd',
  tol=0.0001, verbose=0)

In [22]:
X_text_1_nmf = nmf_tf.transform(tfidf.transform(data['text_1_norm']))
X_text_2_nmf = nmf_tf.transform(tfidf.transform(data['text_2_norm']))

X_text_nmf_tf = [X_text_1_nmf, X_text_2_nmf]

In [12]:
def my_NMF(nmf_cv, nmf_tf):
    
    global data, cv, tfidf
    
    X_text_1_nmf = nmf_cv.transform(cv.transform(data['text_1_norm']))
    X_text_2_nmf = nmf_cv.transform(cv.transform(data['text_2_norm']))
    X_text_nmf_cv = [X_text_1_nmf, X_text_2_nmf]

    X_text_1_nmf = nmf_tf.transform(tfidf.transform(data['text_1_norm']))
    X_text_2_nmf = nmf_tf.transform(tfidf.transform(data['text_2_norm']))  
    X_text_nmf_tf = [X_text_1_nmf, X_text_2_nmf]
    
    return [X_text_nmf_cv, X_text_nmf_tf]

##### word2vec

In [24]:
w2v = gensim.models.Word2Vec([text.split() for text in data_rt['content_norm']], size=50, sg=1)

In [13]:
def get_embedding(text, model, dim, tfidf=False):
    
    if tfidf:
        vocab = tfidf.vocabulary_
        arr = tfidf.transform([' '.join(text)]).toarray()[0]
    
    text = text.split()
    words = Counter(text)
    total = len(text)
    vectors = np.zeros((len(words), dim))
    
    for i,word in enumerate(words):
        
        try:
            if tfidf and word in vocab:
                weight = arr[vocab[word]]
            else:
                weight = words[word]/total

            v = model[word]
            vectors[i] = v * weight

        except (KeyError, ValueError):
            continue
    
    if vectors.any():
        vector = np.average(vectors, axis=0)
    else:
        vector = np.zeros((dim))
    
    return vector

In [26]:
dim = 50
X_text_1_w2v = np.zeros((len(data['text_1_norm']), dim))
X_text_2_w2v = np.zeros((len(data['text_2_norm']), dim))

for i, text in enumerate(data['text_1_norm'].values):
    X_text_1_w2v[i] = get_embedding(text, w2v, dim)
    
for i, text in enumerate(data['text_2_norm'].values):
    X_text_2_w2v[i] = get_embedding(text, w2v, dim)

X_text_w2v = [X_text_1_w2v, X_text_2_w2v]

In [27]:
dim = 50
X_text_1_w2v = np.zeros((len(data['text_1_norm']), dim))
X_text_2_w2v = np.zeros((len(data['text_2_norm']), dim))

for i, text in enumerate(data['text_1_norm'].values):
    X_text_1_w2v[i] = get_embedding(text, w2v, dim, tfidf)
    
for i, text in enumerate(data['text_2_norm'].values):
    X_text_2_w2v[i] = get_embedding(text, w2v, dim, tfidf)

X_text_w2v_tf = [X_text_1_w2v, X_text_2_w2v]

In [25]:
def my_w2v(w2v, dim):
    
    global data, tfidf

    X_text_1_w2v = np.zeros((len(data['text_1_norm']), dim))
    X_text_2_w2v = np.zeros((len(data['text_2_norm']), dim))

    for i, text in enumerate(data['text_1_norm'].values):
        X_text_1_w2v[i] = get_embedding(text, w2v, dim)
            
    for i, text in enumerate(data['text_2_norm'].values):
        X_text_2_w2v[i] = get_embedding(text, w2v, dim)
    X_text_w2v = [X_text_1_w2v, X_text_2_w2v]
    

    X_text_1_w2v = np.zeros((len(data['text_1_norm']), dim))
    X_text_2_w2v = np.zeros((len(data['text_2_norm']), dim))
    for i, text in enumerate(data['text_1_norm'].values):
        X_text_1_w2v[i] = get_embedding(text, w2v, dim, tfidf)
    
    for i, text in enumerate(data['text_2_norm'].values):
        X_text_2_w2v[i] = get_embedding(text, w2v, dim, tfidf)
    X_text_w2v_tf = [X_text_1_w2v, X_text_2_w2v]
    
    return [X_text_w2v, X_text_w2v_tf]

##### Fastext

без нормализации и с нормализацией, а через каждую модель постройте две пары векторов -  с взвешиванием по tfidf и без. 

In [29]:
fast_text_norm = gensim.models.FastText([text.split() for text in data_rt['content_norm']], size=50, min_n=4, max_n=8)
fast_text_nenorm = gensim.models.FastText([text.split() for text in data_rt['content']], size=50, min_n=4, max_n=8)

Not norm

In [27]:
data['text_1_notnorm'] = data['text_1'].apply(tokenize)
data['text_2_notnorm'] = data['text_2'].apply(tokenize)

In [30]:
dim = 50

X_text_1_ft = np.zeros((len(data['text_1_notnorm']), dim))
X_text_2_ft = np.zeros((len(data['text_2_notnorm']), dim))

for i, text in enumerate(data['text_1_notnorm'].values):
    X_text_1_ft[i] = get_embedding(text, fast_text_nenorm, dim)
    
for i, text in enumerate(data['text_2_notnorm'].values):
    X_text_2_ft[i] = get_embedding(text, fast_text_nenorm, dim)

X_text_ft_ne = [X_text_1_ft, X_text_2_ft]

In [31]:
dim = 50

X_text_1_ft = np.zeros((len(data['text_1_notnorm']), dim))
X_text_2_ft = np.zeros((len(data['text_2_notnorm']), dim))

for i, text in enumerate(data['text_1_notnorm'].values):
    X_text_1_ft[i] = get_embedding(text, fast_text_nenorm, dim, tfidf)
    
for i, text in enumerate(data['text_2_notnorm'].values):
    X_text_2_ft[i] = get_embedding(text, fast_text_nenorm, dim, tfidf)

X_text_ft_ne_tf = [X_text_1_ft, X_text_2_ft]

Norm

In [32]:
dim = 50
X_text_1_ft = np.zeros((len(data['text_1_norm']), dim))
X_text_2_ft = np.zeros((len(data['text_2_norm']), dim))

for i, text in enumerate(data['text_1_norm'].values):
    X_text_1_ft[i] = get_embedding(text, fast_text_norm, dim)
    
for i, text in enumerate(data['text_2_norm'].values):
    X_text_2_ft[i] = get_embedding(text, fast_text_norm, dim)

X_text_ft_norm = [X_text_1_ft, X_text_2_ft]

In [33]:
dim = 50
X_text_1_ft = np.zeros((len(data['text_1_norm']), dim))
X_text_2_ft = np.zeros((len(data['text_2_norm']), dim))

for i, text in enumerate(data['text_1_norm'].values):
    X_text_1_ft[i] = get_embedding(text, fast_text_norm, dim, tfidf)
    
for i, text in enumerate(data['text_2_norm'].values):
    X_text_2_ft[i] = get_embedding(text, fast_text_norm, dim, tfidf)

X_text_ft_norm_tf = [X_text_1_ft, X_text_2_ft]

In [24]:
def fast(fast_text_norm, fast_text_nenorm, dim):
    
    global data

    def foo(data, dim, name1, name2, fast_text_norm, fast_text_nenorm, tf=False):

        X_text_1_ft = np.zeros((len(data[name1]), dim))
        X_text_2_ft = np.zeros((len(data[name2]), dim))

        for i, text in enumerate(data[name1].values):
            if not tf:
                X_text_1_ft[i] = get_embedding(text, fast_text_nenorm, dim)
            else:
                X_text_1_ft[i] = get_embedding(text, fast_text_norm, dim, tfidf)
    
        for i, text in enumerate(data[name2].values):
            if not tf:
                X_text_2_ft[i] = get_embedding(text, fast_text_nenorm, dim)
            else:
                X_text_2_ft[i] = get_embedding(text, fast_text_norm, dim, tfidf)

        return [X_text_1_ft, X_text_2_ft]
    
    return [foo(data, dim, 'text_1_notnorm', 'text_2_notnorm', fast_text_norm, fast_text_nenorm, tf=False), 
            foo(data, dim, 'text_1_norm', 'text_2_norm', fast_text_norm, fast_text_nenorm, tf=False), 
            foo(data, dim, 'text_1_notnorm', 'text_2_notnorm', fast_text_norm, fast_text_nenorm, tf=True),
            foo(data, dim, 'text_1_norm', 'text_2_norm', fast_text_norm, fast_text_nenorm, tf=True)]
    
    

### Между векторами каждой пары вычислите косинусную близость

In [35]:
all_together = [X_text_ft_norm_tf, X_text_ft_norm, X_text_ft_ne_tf, X_text_ft_ne,
               X_text_w2v_tf, X_text_w2v, X_text_nmf_tf, X_text_nmf_cv,
               X_svd_tf, X_svd_cv]

In [36]:
results = {}
for index, pair in enumerate(all_together):
    x = pair[0]
    y = pair[1]
    res = []
    
    for i in range(len(x)):
        ans = cosine_distances(x[i].reshape(1, -1), y[i].reshape(1, -1))[0]
        res.append(ans[0])
    results[index] = res
    

In [37]:
len(results)

10

In [38]:
table = pd.DataFrame(results)
table.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0.27805,0.263309,0.12957,0.087815,0.095309,0.077762,0.574292,0.474685,0.793834,0.823382
1,0.092989,0.18893,0.101363,0.104367,0.054607,0.08482,0.192967,0.596678,0.551834,0.584042
2,0.364554,0.162135,0.031763,0.045786,0.07638,0.042211,0.320273,0.916755,0.580027,0.834902
3,0.849245,0.386477,0.531361,0.168232,0.47283,0.281317,0.672072,0.347814,0.407976,0.309526
4,0.908951,0.295414,0.774327,0.475915,0.271393,0.084477,0.002551,0.123715,0.016991,0.072903


### Постройте обучающую выборку из этих близостей.

In [16]:
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, make_scorer

In [18]:
y = data['label'].values

In [None]:
train_X, valid_X, train_y, valid_y = train_test_split(table, y, random_state=1)

### Обучите любую модель

In [41]:
params = {'C':[1, 10, 100, 100]}
est = LogisticRegression(class_weight='balanced', random_state=42, multi_class='auto')
clf = GridSearchCV(est, params, cv=3)
clf.fit(train_X, train_y)
clf.best_params_ 

{'C': 1}

In [46]:
clf = LogisticRegression(C=1, class_weight='balanced',  multi_class='auto', random_state=42)
clf.fit(train_X, train_y)
preds = clf.predict(valid_X)
print('test', f1_score(valid_y, preds, average='micro'))

test 0.5379081350304372


### Оцените качество на кросс-валидации

In [47]:
clf = LogisticRegression(C=1, class_weight='balanced', multi_class='auto', random_state=42)
np.mean(cross_val_score(clf, table, y, cv=5, scoring=make_scorer(f1_score, average='micro')))

0.5531783025076636

### С помощью кросс-валидации подберите параметры моделей (количество компонент, размерность в w2v, min_n - в fastext и т.д).

In [92]:
params = [(10, 50, 1, 2), (50, 50, 1, 2), (100, 50, 1, 2), 
          (10, 100, 2, 3), (50, 100, 2, 3), (100, 100, 2, 3),]
          #(10, 300, 4, 5), (50, 300, 4, 5), (100, 300, 4, 5)
         #]

In [23]:
def fitting(param, X_cv, X_tf, data_rt, y, C=10):

    all_together = []
    
    svd_cv = TruncatedSVD(param[0])
    svd_cv.fit(X_cv)
    svd_tf = TruncatedSVD(param[0])
    svd_tf.fit(X_tf)
    all_together += my_svd(svd_cv, svd_tf)

    nmf_cv = NMF(param[0])
    nmf_cv.fit(X_cv)
    nmf_tf = NMF(param[0])
    nmf_tf.fit(X_tf)
    all_together += my_NMF(nmf_cv, nmf_tf)
    
    dim = param[1]

    w2v = gensim.models.Word2Vec([text.split() for text in data_rt['content_norm']], size=param[1], sg=1)
    all_together += my_w2v(w2v, dim)

    fast_text_norm = gensim.models.FastText([text.split() for text in data_rt['content_norm']], 
                        size=param[1], min_n=param[2], max_n=param[3])
    fast_text_nenorm = gensim.models.FastText([text.split() for text in data_rt['content']], 
                          size=param[1], min_n=param[2], max_n=param[3])
    all_together += fast(fast_text_norm, fast_text_nenorm, dim)

    
    results = {}
    for index, pair in enumerate(all_together):
        x = pair[0]
        z = pair[1]
        res = []
        for i in range(len(x)):
            ans = cosine_distances(x[i].reshape(1, -1), z[i].reshape(1, -1))[0]
            res.append(ans[0])
        results[index] = res
    
    table = pd.DataFrame(results)
    train_X, valid_X, train_y, valid_y = train_test_split(table, y, random_state=42)
    
    clf = LogisticRegression(C=C, class_weight='balanced', multi_class='auto', random_state=42)
    return [train_X, valid_X, train_y, valid_y], np.mean(cross_val_score(clf, table, y, cv=3,
                         scoring=make_scorer(f1_score, average='micro')))
    

In [20]:
def grid(params, X_cv, X_tf, data_rt, y):
    
    ans = {}
    t_par = tqdm_notebook(params, desc='params', leave=True)
    
    for param in t_par:
        
        ans[param] = fitting(param, X_cv, X_tf, data_rt, y)[1]

    return sorted(ans.items(), key=lambda kv: kv[1])
    

In [94]:
grid(params, X_cv, X_tf, data_rt, y)

HBox(children=(IntProgress(value=0, description='params', max=6), HTML(value='')))

10
10
10
10
10
10


[((10, 100, 2, 3), 0.45703361516114027),
 ((100, 100, 2, 3), 0.4676874736451764),
 ((50, 100, 2, 3), 0.47045426104559096),
 ((50, 50, 1, 2), 0.5577667100495336),
 ((100, 50, 1, 2), 0.5579048311522172),
 ((10, 50, 1, 2), 0.5630246936177559)]

In [96]:
grid([(10, 300, 4, 5), (50, 300, 4, 5), (100, 300, 4, 5)], X_cv, X_tf, data_rt, y)

HBox(children=(IntProgress(value=0, description='params', max=3), HTML(value='')))

[((10, 300, 4, 5), 0.45758705688475687),
 ((50, 300, 4, 5), 0.46893240161006244),
 ((100, 300, 4, 5), 0.46948473285073533)]

In [102]:
dt = fitting((10, 50, 1, 2), X_cv, X_tf, data_rt, y)

In [None]:
fitting(param, X_cv, X_tf, data_rt, y, C=10)

In [105]:
params = {'C':[0.1, 1, 10, 100, 1000], 'solver':['lbfgs', 'liblinear', 'sag', 'saga']}
est = LogisticRegression(class_weight='balanced', random_state=42, multi_class='auto')
clf = GridSearchCV(est, params, cv=3)
clf.fit(dt[0][0], dt[0][2])
clf.best_params_ 

{'C': 10, 'solver': 'liblinear'}

In [21]:
params = [(10, 50, 1, 5), (50, 50, 1, 5), (5, 50, 1, 5), 
          (10, 10, 2, 3), (50, 10, 2, 3), (5, 10, 2, 3),]

In [28]:
grid(params, X_cv, X_tf, data_rt, y)

HBox(children=(IntProgress(value=0, description='params', max=6), HTML(value='')))

[((50, 10, 2, 3), 0.5430763546387581),
 ((5, 10, 2, 3), 0.5448780384091811),
 ((10, 10, 2, 3), 0.5451555255217464),
 ((50, 50, 1, 5), 0.5652229177117685),
 ((5, 50, 1, 5), 0.5654976466173752),
 ((10, 50, 1, 5), 0.5670209813355582)]

In [29]:
params = [(100, 50, 1, 8), (50, 50, 1, 8), (200, 50, 1, 8), 
          (100, 10, 2, 5), (50, 10, 2, 5), (200, 10, 2, 5),]

grid(params, X_cv, X_tf, data_rt, y)

HBox(children=(IntProgress(value=0, description='params', max=6), HTML(value='')))

[((100, 10, 2, 5), 0.543219838863251),
 ((50, 10, 2, 5), 0.5454312887549626),
 ((200, 10, 2, 5), 0.5459841943248742),
 ((100, 50, 1, 8), 0.5603781846516126),
 ((200, 50, 1, 8), 0.5605171867897976),
 ((50, 50, 1, 8), 0.5639777596578903)]

In [30]:
params = [(10, 50, 1, 8)]

grid(params, X_cv, X_tf, data_rt, y)

HBox(children=(IntProgress(value=0, description='params', max=1), HTML(value='')))

[((10, 50, 1, 8), 0.568405831079361)]

In [31]:
dt = fitting((10, 50, 1, 8), X_cv, X_tf, data_rt, y)


params = {'C':[0.1, 1, 10, 100, 1000], 'solver':['lbfgs', 'liblinear', 'sag', 'saga']}
est = LogisticRegression(class_weight='balanced', random_state=42, multi_class='auto')
clf = GridSearchCV(est, params, cv=3)
clf.fit(dt[0][0], dt[0][2])
clf.best_params_ 

{'C': 1, 'solver': 'liblinear'}

In [49]:
result = pd.concat([dt[0][0], dt[0][1]])
y_new = np.concatenate((dt[0][2], dt[0][3]), axis=None)

In [53]:
clf = LogisticRegression(C=1, class_weight='balanced', multi_class='auto', random_state=42)
np.mean(cross_val_score(clf, result, y_new, cv=3, scoring=make_scorer(f1_score, average='micro')))

0.5675911256989612

In [43]:
est = LogisticRegression(class_weight='balanced', random_state=42, multi_class='auto')
est.fit(dt[0][0], dt[0][2])
pred = est.predict(dt[0][1])
f1_score(dt[0][3], pred, average='micro')

0.5666851134477033