In [1]:
import lightgbm as lgb
import xgboost as xgb
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import f1_score
from sklearn.model_selection import GroupKFold, cross_val_score, GridSearchCV
from functools import partial
from hyperopt import hp, tpe, Trials, STATUS_OK
from hyperopt.fmin import fmin
from hyperopt.pyll import scope
from hyperopt.plotting import main_plot_history
from catboost import CatBoostClassifier
import nltk
from nltk.corpus import stopwords
from pymystem3 import Mystem
from string import punctuation
from nltk.stem.snowball import RussianStemmer

# Препроцессинг данных

In [2]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Олег\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [85]:
#mystem = Mystem()
mystem = RussianStemmer(False)
russian_stopwords = stopwords.words("russian")
english_stopwords = stopwords.words("english")
not_word = []

def is_word(token):
    ok = False
    legal_chars = '0123456789abcdefghijklmnopqrstuvwxyzабвгдеёжзийклмнопрстуфхцчшщъыьэюя'
    for c in token:
        if c not in legal_chars:
            not_word.append(token)
            return False
        if c in '0123456789':
            ok = True
    
    if len(token) <= 1 and not(ok):
        return False
    return True

def preprocess_text(text):
    #tokens = mystem.lemmatize(text.lower())
    data = mystem.stem(text.lower())
    for sign in punctuation:
        data = data.replace(sign, ' ')
    data = data.replace('|', ' ')
    data = data.replace('"\"', ' ')
    data = data.replace('-', ' ')
    tokens = data.strip().split(' ')
    tokens = [token.strip() for token in tokens if token not in russian_stopwords\
              and token not in english_stopwords\
              and token != " " \
              and token.strip() not in punctuation
              and is_word(token)]
    
    text = " ".join(tokens)
    
    return text

#### Пример

In [86]:
doc_to_title = {}
doc_proc_to_title = {}
with open('docs_titles.tsv', encoding='utf-8') as f:
    for num_line, line in enumerate(f):
        if num_line == 0:
            continue
        data = line.strip().split('\t', 1)
        doc_id = int(data[0])
        if len(data) == 1:
            title = ''
        else:
            title = data[1]
        doc_to_title[doc_id] = title# словарь: ID - title
        doc_proc_to_title[doc_id] = preprocess_text(title)
        if num_line == 5:
            break
doc_to_title, doc_proc_to_title

({15731: 'ВАЗ 21213 | Замена подшипников ступицы | Нива',
  14829: 'Ваз 2107 оптом в Сочи. Сравнить цены, купить потребительские товары на Tiu.ru',
  15764: 'Купить ступица Лада калина2. Трансмиссия - переходные ступицы цена, замена, тюнинг.',
  17669: 'Классика 21010 - 21074',
  14852: 'Ступица Нива — замена подшипника своими руками'},
 {15731: 'ваз 21213 замена подшипников ступицы нив',
  14829: 'ваз 2107 оптом сочи сравнить цены купить потребительские товары тиу',
  15764: 'купить ступица лада калина2 трансмиссия переходные ступицы цена замена тюнинг',
  17669: 'классика 21010 21074',
  14852: 'ступица нива замена подшипника своими рук'})

###### Мы можем наблюдать, как слова в заголовках привелись к нижнему регистру. Из текста удалены: предлоги, союзы и тд. - стоп слова; знаки препинания, пробелы, а также те слова, которые содержат не русские/английские буквы и цифры. Помимо этого слова приведены к словоформе с помощью леммитизации. 

### Попробуем применить эту модификацию текста и обучить модель.

In [87]:
doc_to_title = {}
with open('docs_titles.tsv', encoding='utf-8') as f:
    for num_line, line in enumerate(f):
        if num_line == 0:
            continue
        data = line.strip().split('\t', 1)
        doc_id = int(data[0])
        if len(data) == 1:
            title = ''
        else:
            title = data[1]
        doc_to_title[doc_id] = preprocess_text(title)# словарь: ID - title
#doc_to_title

# Выделение признаков

In [88]:
import pandas as pd
train_data = pd.read_csv('train_groups.csv')
traingroups_titledata = {}
for i in range(len(train_data)):
    new_doc = train_data.iloc[i]
    doc_group = new_doc['group_id']
    doc_id = new_doc['doc_id']
    target = new_doc['target']
    title = doc_to_title[doc_id]
    if doc_group not in traingroups_titledata:
        traingroups_titledata[doc_group] = []
    traingroups_titledata[doc_group].append((doc_id, title, target))# словарь: 
    # Group_ID - [(id - title - target),()...]
#traingroups_titledata

In [89]:
test_data = pd.read_csv('test_groups.csv')
testgroups_titledata = {}
for i in range(len(test_data)):
    new_doc = test_data.iloc[i]
    doc_group = new_doc['group_id']
    doc_id = new_doc['doc_id']
    title = doc_to_title[doc_id]
    if doc_group not in testgroups_titledata:
        testgroups_titledata[doc_group] = []
    testgroups_titledata[doc_group].append((doc_id, title))# словарь: 
    # Group_ID - [(id - title - target),()...]b

In [90]:
N = 25# минимальное число документов в группе - 1
K = 25# top K слов в документе

In [91]:
import numpy as np
y_train = []
X_train = []
tmp = []
groups_train = []
for new_group in traingroups_titledata:
    docs = traingroups_titledata[new_group]
    
    top_words = {}
    for k, (doc_id, title, target_id) in enumerate(docs):
        words_j = set(title.strip().split())
        for w in words_j:
            if w in top_words.keys():
                top_words[w] += 1
            else:
                top_words[w] = 1

                
    sorted_list = {k: v for k, v in sorted(top_words.items(), key=lambda kv: kv[1], reverse=True)}
    #print(len(sorted_list))
    topk_words = list(sorted_list.keys())[0:K-1]
    
    for k, (doc_id, title, target_id) in enumerate(docs):
        y_train.append(target_id)
        groups_train.append(new_group)
        all_dist = []
        words = set(title.strip().split())
        tmp.append(len(docs))
        bool_tmp = []
        for z in topk_words:
            bool_tmp.append(int(z in words))
            
        for j in range(0, len(docs)):
            if k == j:
                continue
            doc_id_j, title_j, target_j = docs[j]
            words_j = set(title_j.strip().split())
            if len(words.union(words_j)) == 0:
                all_dist.append(len(words.intersection(words_j)))
            else:
                all_dist.append(len(words.intersection(words_j))/len(words.union(words_j)))

        X_train.append(sorted(all_dist, reverse=True)[0:N] + bool_tmp)
X_train = np.array(X_train)
y_train = np.array(y_train)
groups_train = np.array(groups_train)
print (X_train.shape, y_train.shape, groups_train.shape)

(11690, 49) (11690,) (11690,)


In [92]:
X_test = []
groups_test = []
for new_group in testgroups_titledata:
    docs = testgroups_titledata[new_group]
    
    top_words = {}
    for k, (doc_id, title) in enumerate(docs):
        words_j = set(title.strip().split())
        for w in words_j:
            if w in top_words.keys():
                top_words[w] += 1
            else:
                top_words[w] = 1
    sorted_list = {k: v for k, v in sorted(top_words.items(), key=lambda kv: kv[1], reverse=True)}
    topk_words = list(sorted_list.keys())[0:K-1]
    
    for k, (doc_id, title) in enumerate(docs):
        groups_test.append(new_group)
        all_dist = []
        words = set(title.strip().split())
        
        bool_tmp = []
        for z in topk_words:
            bool_tmp.append(int(z in words))
        
        for j in range(0, len(docs)):
            if k == j:
                continue
            doc_id_j, title_j = docs[j]
            words_j = set(title_j.strip().split())
            if len(words.union(words_j)) == 0:
                all_dist.append(len(words.intersection(words_j)))
            else:
                all_dist.append(len(words.intersection(words_j))/len(words.union(words_j)))

        X_test.append(sorted(all_dist, reverse=True)[0:N] + bool_tmp)
X_test = np.array(X_test)
groups_test = np.array(groups_test)
print(X_test.shape, groups_test.shape)

(16627, 49) (16627,)


# Поиск наилучших параметров XGBoost

In [11]:
? xgb.XGBClassifier

In [93]:
import warnings
warnings.filterwarnings("ignore")
 
trials = Trials()
 
def quality(params, X_train, y_train):
    #pipeline.set_params(**params)
    pipeline = xgb.XGBClassifier(**params)
 
    score = cross_val_score(estimator=pipeline, X=X_train, y=y_train, groups=groups_train,
                            scoring='f1', cv=GroupKFold(n_splits=5), n_jobs=-1) 
                            #fit_params={'categorical_feature' : 'auto'}
    return   {'loss': score.mean(), 'params': params, 'status': STATUS_OK}
 
grid = {
        'n_estimators' : scope.int(hp.quniform(label='n_estimators', 
                        low=50, 
                        high=500, 
                        q=1)),
        'max_depth' : scope.int(hp.quniform(label='max_depth', 
                        low=2, 
                        high=11, 
                        q=1)),
        'learning_rate' : hp.loguniform(label='learning_rate', 
                        low=-3*np.log(10), 
                        high=np.log(1)),
        'subsample' : hp.uniform(label='subsample', 
                        low=0.1, 
                        high=1),
        'colsample_bytree' : hp.uniform(label='colsample_bytree', 
                        low=0.1, 
                        high=1),
        'colsample_bylevel' : hp.uniform(label='colsample_bylevel', 
                        low=0.1, 
                        high=1),
        'colsample_bynode' : hp.uniform(label='colsample_bynode', 
                        low=0.1, 
                        high=1)
                }
        
 
best = fmin(fn=partial(quality, 
                       X_train=X_train, y_train=y_train),
                space=grid,
                algo=tpe.suggest,
                max_evals=1,
                trials=trials,
                verbose= 1,
                rstate=np.random.default_rng(1),
                show_progressbar=True)

100%|██████████████████████████████████████████████████| 1/1 [00:04<00:00,  4.65s/trial, best loss: 0.6362785501352021]


In [19]:
best

{'colsample_bylevel': 0.9422139186669864,
 'colsample_bynode': 0.8947769179863024,
 'colsample_bytree': 0.10622632852938073,
 'learning_rate': 0.0047805268897416235,
 'max_depth': 7.0,
 'n_estimators': 206.0,
 'subsample': 0.5591228131751852}

In [94]:
n_estimators_list = np.arange(50, 501, 50)
max_depth_list = np.arange(2, 9, 1)
learn_rate_list = np.arange(0.01, 1.01, 0.09)
subsample_list = np.arange(0.3, 1.01, 0.1)
colsample_list = np.arange(0.3, 1.01, 0.1)
colsample_by_level_list = np.arange(0.3, 1.01, 0.1)
colsample_by_node_list = np.arange(0.3, 1.01, 0.1)

best_n = 0
best_dep = 0
best_learn_rate = 0
best_subsample = 0
best_colsample = 0
best_colsample_by_level = 0
best_colsample_by_node = 0

par = {'eval_metric': 'logloss'}

### Подбираем n_estimators

In [95]:
import warnings

In [96]:
best_score = 0

for n_est in n_estimators_list:
    clf = xgb.XGBClassifier(
        n_estimators=n_est,
        learning_rate = 0.01,
        max_depth=5
    )
    cv_score = cross_val_score(
        clf,
        X_train,
        y_train,
        groups=groups_train,
        cv =GroupKFold(n_splits=5),
        scoring='f1',
        fit_params= par
    ).mean()
    if cv_score > best_score:
        best_n = n_est
        best_score = cv_score


In [97]:
best_n, best_score

(150, 0.6606855164087648)

### Подбираем max_depth

In [98]:
best_score = 0

for depth in max_depth_list:
    clf = xgb.XGBClassifier(
        n_estimators=best_n,
        learning_rate = 0.01,
        max_depth=depth
    )
    cv_score = cross_val_score(
        clf,
        X_train,
        y_train,
        groups=groups_train,
        cv =GroupKFold(n_splits=5),
        scoring='f1',
        fit_params= par
    ).mean()
    if cv_score > best_score:
        best_dep = depth
        best_score = cv_score

In [99]:
best_dep, best_score

(5, 0.6606855164087648)

### Подбираем learning_rate

In [100]:
best_score = 0

for rate in learn_rate_list:
    clf = xgb.XGBClassifier(
        n_estimators=best_n,
        learning_rate = rate,
        max_depth=best_dep
    )
    cv_score = cross_val_score(
        clf,
        X_train,
        y_train,
        groups=groups_train,
        cv =GroupKFold(n_splits=5),
        scoring='f1',
        fit_params= par
    ).mean()
    if cv_score > best_score:
        best_learn_rate = rate
        best_score = cv_score

In [101]:
if best_learn_rate >= 1.0:
    best_learn_rate = 1.0
best_learn_rate, best_score

(0.01, 0.6606855164087648)

### Подбираем subsample

In [102]:
best_score = 0

for sub in subsample_list:
    clf = xgb.XGBClassifier(
        n_estimators=best_n,
        learning_rate = best_learn_rate,
        max_depth=best_dep,
        subsample = sub
    )
    cv_score = cross_val_score(
        clf,
        X_train,
        y_train,
        groups=groups_train,
        cv =GroupKFold(n_splits=5),
        scoring='f1',
        fit_params= par
    ).mean()
    if cv_score > best_score:
        best_subsample = sub
        best_score = cv_score

In [103]:
if best_subsample >= 1.0:
    best_subsample = 1.0

best_subsample, best_score

(1.0, 0.6606855164087648)

### Подбираем colsample

In [104]:
best_score = 0

for col in colsample_list:
    clf = xgb.XGBClassifier(
        n_estimators=best_n,
        learning_rate = best_learn_rate,
        max_depth=best_dep,
        subsample = best_subsample,
        colsample_bytree = col,
    )
    cv_score = cross_val_score(
        clf,
        X_train,
        y_train,
        groups=groups_train,
        cv =GroupKFold(n_splits=5),
        scoring='f1',
        fit_params= par
    ).mean()
    if cv_score > best_score:
        best_colsample = col
        best_score = cv_score

In [105]:
if best_colsample >= 1.0:
    best_colsample = 1.0
best_colsample, best_score

(0.8000000000000003, 0.6615644175163237)

In [106]:
best_score = 0

for col_level in colsample_by_level_list:
    clf = xgb.XGBClassifier(
        n_estimators=best_n,
        learning_rate = best_learn_rate,
        max_depth=best_dep,
        subsample = best_subsample,
        colsample_bytree = best_colsample,
        colsample_bylevel = col_level
    )
    cv_score = cross_val_score(
        clf,
        X_train,
        y_train,
        groups=groups_train,
        cv =GroupKFold(n_splits=5),
        scoring='f1',
        fit_params= par
    ).mean()
    if cv_score > best_score:
        best_colsample_by_level = col_level
        best_score = cv_score

In [107]:
if best_colsample_by_level >= 1.0:
    best_colsample_by_level = 1.0
best_colsample_by_level, best_score
    

(1.0, 0.6615644175163237)

In [108]:
best_score = 0

for col_node in colsample_by_node_list:
    clf = xgb.XGBClassifier(
        n_estimators=best_n,
        learning_rate = best_learn_rate,
        max_depth=best_dep,
        subsample = best_subsample,
        colsample_bytree = best_colsample,
        colsample_bylevel = best_colsample_by_level,
        colsample_bynode = col_node
    )
    cv_score = cross_val_score(
        clf,
        X_train,
        y_train,
        groups=groups_train,
        cv =GroupKFold(n_splits=5),
        scoring='f1',
        fit_params= par
    ).mean()
    if cv_score > best_score:
        best_colsample_by_node = col_node
        best_score = cv_score

In [109]:
if best_colsample_by_node >= 1.0:
    best_colsample_by_node = 1.0

best_colsample_by_node, best_score

(1.0, 0.6615644175163237)

### Подбираем регуляризацию

In [110]:
L1_list = np.arange(0, 2.01, 0.5)
L2_list = np.arange(0, 2.01, 0.5)

best_L1 = 0
best_L2 = 0

In [111]:
best_score = 0

for alpha in L1_list:
    for lambda_ in L2_list:
        clf = xgb.XGBClassifier(
            n_estimators=best_n,
            learning_rate = best_learn_rate,
            max_depth=best_dep,
            subsample = best_subsample,
            colsample_bytree = best_colsample,
            colsample_bylevel = best_colsample_by_level,
            colsample_bynode = best_colsample_by_node,
            reg_alpha = alpha,
            reg_lambda = lambda_
        )
        cv_score = cross_val_score(
            clf,
            X_train,
            y_train,
            groups=groups_train,
            cv =GroupKFold(n_splits=5),
            scoring='f1',
            fit_params= par
        ).mean()
        if cv_score > best_score:
            best_L1 = alpha
            best_L2 = lambda_
            best_score = cv_score

In [112]:
best_L1, best_L2, best_score

(0.0, 0.5, 0.6628018336593079)

In [113]:
best_params = {'n_estimators': best_n, 'max_depth': best_dep, 'learning_rate': best_learn_rate,
              'subsample': best_subsample, 'colsample_bytree': best_colsample, 
              'colsample_bylevel': best_colsample_by_level, 'colsample_bynode': best_colsample_by_node,
              'reg_alpha': best_L1, 'reg_lambda': best_L2}
best_params

In [114]:
clf = xgb.XGBClassifier(**best_params)
cv_score = cross_val_score(
    clf,
    X_train,
    y_train,
    groups=groups_train,
    cv =GroupKFold(n_splits=5),
    scoring='f1',
    fit_params= par
).mean()
cv_score

0.6628018336593079

# Формируем y_test

In [115]:
clf.fit(X_train, y_train, **par)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1.0,
              colsample_bynode=1.0, colsample_bytree=0.8000000000000003,
              enable_categorical=False, gamma=0, gpu_id=-1,
              importance_type=None, interaction_constraints='',
              learning_rate=0.01, max_delta_step=0, max_depth=5,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=150, n_jobs=8, num_parallel_tree=1, predictor='auto',
              random_state=0, reg_alpha=0.0, reg_lambda=0.5, scale_pos_weight=1,
              subsample=1.0, tree_method='exact', validate_parameters=1,
              verbosity=None)

In [116]:
y_pred = clf.predict(X_test)

In [117]:
test_data 

Unnamed: 0,pair_id,group_id,doc_id
0,11691,130,6710
1,11692,130,4030
2,11693,130,5561
3,11694,130,4055
4,11695,130,4247
...,...,...,...
16622,28313,309,16637
16623,28314,309,16759
16624,28315,309,15358
16625,28316,309,17287


In [118]:
submission = pd.DataFrame({"pair_id": test_data["pair_id"], 
                           "target": y_pred})
submission

Unnamed: 0,pair_id,target
0,11691,1
1,11692,0
2,11693,1
3,11694,1
4,11695,0
...,...,...
16622,28313,0
16623,28314,1
16624,28315,1
16625,28316,1


In [120]:
submission.to_csv("result.csv", index=False)

In [390]:
clf = xgb.XGBClassifier(n_estimators = best_n, max_depth = best_dep, learning_rate = best_learn_rate)
cv_score = cross_val_score(
    clf,
    X_train,
    y_train,
    groups=groups_train,
    cv =GroupKFold(n_splits=5),
    scoring='f1'
).mean()
cv_score



0.6619074643905811

In [392]:
clf.fit(X_train, y_train)



XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.01, max_delta_step=0,
              max_depth=5, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=300, n_jobs=8,
              num_parallel_tree=1, predictor='auto', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [393]:
y_pred = clf.predict(X_test)

In [394]:
submission = pd.DataFrame({"pair_id": test_data["pair_id"], 
                           "target": y_pred})
submission

Unnamed: 0,pair_id,target
0,11691,1
1,11692,0
2,11693,1
3,11694,1
4,11695,0
...,...,...
16622,28313,0
16623,28314,0
16624,28315,1
16625,28316,1


In [396]:
submission.to_csv("result.csv", index=False)