In [1]:
import itertools
import os
import re

import numpy as np
import pandas as pd
import pymorphy2
import sklearn.metrics as metrics

In [210]:
from scipy.spatial.distance import cdist
from scipy import sparse
from functools import reduce
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split, cross_validate, KFold, cross_val_score, GroupKFold
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

In [3]:
def get_df():
    """Load the document titles joined with their group information.

    Reads the titles table and the train/test group tables from ``./data``,
    stacks the train and test rows and inner-joins them with the titles on
    ``doc_id``.

    Returns:
        pandas.DataFrame: the columns of the titles table plus ``group_id``
        and ``target``. Every ``title`` ends with a single space and a
        missing title becomes ``' '``.
    """
    titles_df = pd.read_csv('./data/docs_titles.tsv/docs_titles.tsv', sep='\t')
    docs_id_test = pd.read_csv('./data/test_groups.csv', sep=',')
    docs_id_train = pd.read_csv('./data/train_groups.csv', sep=',')

    # `ignore_index=True` replaces the former `info.reset_index(drop=True)`,
    # whose result was discarded and therefore had no effect.
    info = pd.concat([docs_id_train, docs_id_test], ignore_index=True)

    titles = pd.merge(titles_df, info[['group_id', 'doc_id', 'target']], on='doc_id', how='inner')

    # Assign the result back instead of calling fillna(inplace=True) on a
    # column selection: the chained in-place form is not guaranteed to write
    # through to `titles` (it does not under pandas copy-on-write).
    titles['title'] = titles['title'].fillna('') + ' '

    return titles

In [4]:
def titles_extraction():
    """Concatenate the titles of every group and clean the merged text.

    NOTE(review): `cleaner` is not defined in the visible part of this
    notebook (only `cleaning(titles, group_num, bad_words)` is) — confirm
    where it is supposed to come from.

    Returns:
        numpy.ndarray: one cleaned string per ``group_id``.
    """
    frame = get_df()

    # Summing string columns per group concatenates the titles of the group.
    per_group = frame[['title', 'group_id']].groupby('group_id').sum()
    merged_titles = per_group['title']

    cleaned = merged_titles.apply(lambda text: cleaner(text))
    return cleaned.values

In [5]:
def cleaning(titles, group_num, bad_words):
    """Normalise a title string and blank out the unwanted words of a group.

    Args:
        titles: the text to clean.
        group_num: key into ``bad_words`` selecting the substrings to remove.
        bad_words: mapping (or sequence) from group number to an iterable of
            substrings that are replaced by a space.

    Returns:
        str: lower-cased text with non-word characters and the selected
        substrings removed and every whitespace run collapsed to one space.
    """
    # Every non-word character becomes two spaces; runs are collapsed below.
    text = re.sub(r'\W', '  ', titles.lower())

    for unwanted in bad_words[group_num]:
        text = text.replace(unwanted, ' ')

    return re.sub(r'\s+', ' ', text)

In [6]:
# Porter stemmer for Russian (adapted from an implementation found online).
class Porter:
    """Rule-based suffix-stripping stemmer for Russian words.

    The class only groups the compiled suffix patterns and the stemming
    routine; it holds no instance state. `stem` is a static method, so both
    ``Porter.stem(text)`` and ``Porter().stem(text)`` work.
    """

    PERFECTIVEGROUND = re.compile(u"((ив|ивши|ившись|ыв|ывши|ывшись)|((?<=[ая])(в|вши|вшись)))$")
    REFLEXIVE = re.compile(u"(с[яь])$")
    ADJECTIVE = re.compile(u"(ее|ие|ые|ое|ими|ыми|ей|ий|ый|ой|ем|им|ым|ом|его|ого|ему|ому|их|ых|ую|юю|ая|яя|ою|ею)$")
    PARTICIPLE = re.compile(u"((ивш|ывш|ующ)|((?<=[ая])(ем|нн|вш|ющ|щ)))$")
    VERB = re.compile(u"((ила|ыла|ена|ейте|уйте|ите|или|ыли|ей|уй|ил|ыл|им|ым|ен|ило|ыло|ено|ят|ует|уют|ит|ыт|ены|ить|ыть|ишь|ую|ю)|((?<=[ая])(ла|на|ете|йте|ли|й|л|ем|н|ло|но|ет|ют|ны|ть|ешь|нно)))$")
    NOUN = re.compile(u"(а|ев|ов|ие|ье|е|иями|ями|ами|еи|ии|и|ией|ей|ой|ий|й|иям|ям|ием|ем|ам|ом|о|у|ах|иях|ях|ы|ь|ию|ью|ю|ия|ья|я)$")
    # Splits a word into the prefix up to and including its first vowel and the rest.
    RVRE = re.compile(u"^(.*?[аеиоуыэюя])(.*)$")
    DERIVATIONAL = re.compile(u".*[^аеиоуыэюя]+[аеиоуыэюя].*ость?$")
    DER = re.compile(u"ость?$")
    SUPERLATIVE = re.compile(u"(ейше|ейш)$")
    I = re.compile(u"и$")
    P = re.compile(u"ь$")
    NN = re.compile(u"нн$")

    @staticmethod
    def _stem_word(word):
        """Return the stem of a single, already lower-cased word."""
        word = word.replace(u'ё', u'е')
        m = Porter.RVRE.match(word)
        if not (m and m.groups()):
            # No vowel found: the word is returned as is.
            return word

        pre = m.group(1)
        rv = m.group(2)

        # Step 1: perfective gerund, otherwise reflexive + adjective/verb/noun.
        temp = Porter.PERFECTIVEGROUND.sub('', rv, 1)
        if temp == rv:
            rv = Porter.REFLEXIVE.sub('', rv, 1)
            temp = Porter.ADJECTIVE.sub('', rv, 1)
            if temp != rv:
                rv = temp
                rv = Porter.PARTICIPLE.sub('', rv, 1)
            else:
                temp = Porter.VERB.sub('', rv, 1)
                if temp == rv:
                    rv = Porter.NOUN.sub('', rv, 1)
                else:
                    rv = temp
        else:
            rv = temp

        # Step 2: trailing "и".
        rv = Porter.I.sub('', rv, 1)

        # Step 3: derivational suffix.
        if Porter.DERIVATIONAL.match(rv):
            rv = Porter.DER.sub('', rv, 1)

        # Step 4: soft sign, otherwise superlative and doubled "н".
        temp = Porter.P.sub('', rv, 1)
        if temp == rv:
            rv = Porter.SUPERLATIVE.sub('', rv, 1)
            rv = Porter.NN.sub(u'н', rv, 1)
        else:
            rv = temp

        return pre + rv

    @staticmethod
    def stem(string):
        """Stem every whitespace-separated word of ``string``.

        The text is lower-cased first. Tokens consisting only of digits are
        kept unchanged.

        Args:
            string: text to process.

        Returns:
            str: the processed words, each followed by one space (so the
            result ends with a space, and is ``''`` for empty input).
        """
        processed = []
        for word in string.lower().split():
            if not word.isdigit():
                word = Porter._stem_word(word)
            processed.append(word)

        # Each word keeps its trailing space, exactly as before.
        return ''.join(word + ' ' for word in processed)

In [7]:
# # This function and the three that follow compute the distances without TF-IDF or the cosine metric

# def features_create(mode, groups_titledata):
    
#     if mode == 'train':
#         y = []      
#     X = []
#     groups_train = []
    
#     for new_group in groups_titledata:
#         docs = groups_titledata[new_group]
        
#         for k, info in enumerate(docs):
            
#             doc_id = info[0]
#             title = info[1]
            
#             if mode == 'train':
#                 target_id = info[2]
#                 y.append(target_id)
                
#             groups_train.append(new_group)
#             all_dist = []
#             words = set(title.strip().split())
            
#             for j in range(0, len(docs)):
#                 if k == j:
#                     continue
#                 info = docs[j]
#                 doc_id_j = info[0]
#                 title_j = info[1]

#                 words_j = set(title_j.strip().split())
#                 all_dist.append(len(words.intersection(words_j)))
                
#             X.append(sorted(all_dist, reverse=True)[0:25])
            
#     X = np.array(X)
    
#     if mode == 'train':
#         y = np.array(y)
    
#     groups_train = np.array(groups_train)

#     if mode == 'train':
#         print(X.shape, y.shape, groups_train.shape)
#         return X, y, groups_train
#     else:
#         print(X.shape, groups_train.shape)
#         return X, groups_train

In [9]:
# def tuple_x(a1, a2):
#     return a1, a2

In [8]:
# def title_info_dict(mode, doc_to_title):
    
#     data = pd.read_csv('./data/{}_groups.csv'.format(mode))

#     titledata = {}
    
#     for i in range(len(data)):
        
#         new_doc = data.iloc[i]
#         doc_group = new_doc['group_id']
#         doc_id = new_doc['doc_id']
            
#         title = doc_to_title[doc_id]
        
#         if doc_group not in titledata:
#             titledata[doc_group] = []
            
#         if mode == 'train':
#             titledata[doc_group].append((doc_id, title, new_doc['target']))
#         else:
#             titledata[doc_group].append((doc_id, title))
        
#     return titledata

In [10]:
# def easy_launch():
    
#     doc_to_title = {}
#     with open('./data/unversal_table.csv', encoding = 'utf-8') as f:
#         for num_line, line in enumerate(f):
#             if num_line == 0:
#                 continue

#             line = line.replace('\t', ',')
#             data = line.strip().split(',')

#             doc_id = int(data[0])
#             if len(data) == 1:
#                 title = ''
#             else:
#                 title = data[1]
#             doc_to_title[doc_id] = title
            
#     print('doc titles dict len = {}'.format(len(doc_to_title)))
    
#     train_titledata = title_info_dict('train', doc_to_title)
#     test_titledata = title_info_dict('test', doc_to_title)
    
#     X_train, y_train, groups_train = features_create('train', train_titledata)
#     X_test, groups_test = features_create('test', test_titledata)
    
#     return X_train, y_train, X_test, groups_train, groups_test

In [11]:
# The next three functions enumerate combinations of parameters.
def flatten(x):
    """Return the leaves of an arbitrarily nested iterable as a flat list.

    Anything that has ``__iter__`` is descended into, except strings, which
    are treated as leaves.
    """
    flat = []
    for item in x:
        nested = hasattr(item, "__iter__") and not isinstance(item, str)
        if not nested:
            flat.append(item)
            continue
        flat += flatten(item)

    return flat

In [12]:
def list_concat(list1, list2):
    """Return every ``[a, b]`` pair with ``a`` from ``list1`` and ``b`` from ``list2``.

    Pairs are ordered by position in ``list1`` first, then by position in
    ``list2``.
    """
    size_left, size_right = len(list1), len(list2)

    pairs = []
    for left in range(size_left):
        for right in range(size_right):
            pairs.append([list1[left], list2[right]])
    return pairs

In [13]:
def combinations(params):
    """Expand a parameter grid into the list of all parameter dictionaries.

    Args:
        params: mapping from parameter name to an iterable of candidate
            values, e.g. ``{'a': [1, 2], 'b': ['x']}``.

    Returns:
        list[dict]: one dictionary per element of the Cartesian product of
        the candidate values, with the last parameter varying fastest. An
        empty grid yields ``[{}]``.
    """
    names = list(params.keys())
    # itertools.product keeps each candidate value intact (tuples and lists
    # are not flattened) and also handles grids with zero or one parameter.
    return [dict(zip(names, combo)) for combo in itertools.product(*params.values())]

In [14]:
def frange(start, stop, step):
    """Yield ``start, start + step, start + 2*step, ...`` while the value is below ``stop``.

    Each value is computed as ``start + k * step`` rather than by repeated
    addition, so rounding errors do not accumulate (repeated addition makes
    ``frange(0, 1, 0.1)`` yield an extra ``0.9999999999999999``).

    Args:
        start: first value.
        stop: exclusive upper bound.
        step: increment; must be positive whenever ``start < stop``.

    Raises:
        ValueError: on first iteration, if ``step <= 0`` and ``start < stop``
            (the sequence would never terminate).
    """
    if step <= 0:
        if start < stop:
            raise ValueError('frange() step must be positive when start < stop')
        return

    k = 0
    value = start
    while value < stop:
        yield value
        k += 1
        value = start + k * step

In [346]:
def validation(X_train, train_target, model, params, folds_gen_func, folds_num=10, thresholds=(0.32,), **kwargs):
    """Grid-search model parameters and decision thresholds by cross-validated F1.

    For every parameter combination the model is fitted once per fold; the
    positive-class probabilities of the held-out fold are then turned into
    labels with every threshold and scored with F1.

    Args:
        X_train: feature matrix that supports indexing with an integer index
            array (``X_train[idx]``).
        train_target: target labels, one per row of ``X_train``.
        model: estimator class; instantiated as ``model(**param_set)`` and
            expected to provide ``fit`` and ``predict_proba``.
        params: parameter grid passed to `combinations`.
        folds_gen_func: splitter class, instantiated as
            ``folds_gen_func(folds_num)`` and used through ``.split``.
        folds_num: number of folds.
        thresholds: probabilities below a threshold are labelled 0, the rest 1.
        **kwargs: forwarded to ``split`` (e.g. ``groups=...``).

    Returns:
        tuple: ``(mean_f1, param_set, threshold)`` of the best combination
        (the first one on ties).

    Raises:
        ValueError: if there is nothing to evaluate (no thresholds or no
            parameter combinations).
    """
    # Positional indexing with fold indices; a pandas Series would be indexed by label.
    target = np.asarray(train_target)
    thresholds = list(thresholds)

    main_res = []
    for param_set in combinations(params):

        print(param_set)
        exact_model = model(**param_set)
        fold_generator = folds_gen_func(folds_num)

        # One list of fold scores per threshold (kept positional so that
        # duplicate thresholds are still reported separately).
        scores = [[] for _ in thresholds]
        for train_index, test_index in fold_generator.split(X_train, target, **kwargs):
            # Fit the scaler on the training part of the fold only, so the
            # held-out rows do not leak into the preprocessing.
            scaler = StandardScaler()
            scaler.fit(X_train[train_index])

            # The fitted model does not depend on the threshold: fit once per fold.
            exact_model.fit(scaler.transform(X_train[train_index]), target[train_index])
            proba = np.asarray(exact_model.predict_proba(scaler.transform(X_train[test_index])))[:, 1]

            for fold_scores, th in zip(scores, thresholds):
                y_pred = np.where(proba < th, 0, 1)
                fold_scores.append(metrics.f1_score(target[test_index], y_pred))

        for th, fold_scores in zip(thresholds, scores):
            print('th = ', th)
            mean = sum(fold_scores) / len(fold_scores)
            print(mean)
            main_res.append((mean, param_set, th))

    if not main_res:
        raise ValueError('validation() needs at least one parameter combination and one threshold')

    best = main_res[np.argmax([entry[0] for entry in main_res])]
    print('--------max-------')
    print(best)

    return best

In [16]:
# Saves the submission file.

def save_submission(y_pred, min_positive=2500, max_positive=6000):
    """Write the predictions to ``submission.csv`` and print a sanity summary.

    Reads ``data/test_groups.csv``, attaches ``y_pred`` as the ``target``
    column, drops ``group_id`` and ``doc_id`` and writes the remaining
    columns to ``submission.csv`` (without the index).

    Args:
        y_pred: one predicted label per row of ``data/test_groups.csv``.
        min_positive: a warning is printed when fewer rows than this are
            labelled 1.
        max_positive: a warning is printed when more rows than this are
            labelled 1.

    Returns:
        pandas.DataFrame: the frame that was written to disk.
    """
    data = pd.read_csv('data/test_groups.csv')
    print('len data = ', len(data))
    data['target'] = y_pred

    data = data.drop(columns=['group_id', 'doc_id'])
    data.to_csv("submission.csv", index=False)

    labels, counts = np.unique(data['target'], return_counts=True)

    if labels.shape[0] > 1:
        # Look the counts up by label instead of by position in `labels`.
        count_by_label = dict(zip(labels.tolist(), counts.tolist()))
        n_zero = count_by_label.get(0, 0)
        n_one = count_by_label.get(1, 0)

        print('0: {}, 1: {}'.format(n_zero, n_one))
        if n_one > max_positive or n_one < min_positive:
            print('Warning: {} positive predictions is outside the expected range [{}, {}]'.format(
                n_one, min_positive, max_positive))
    else:
        print('There are only {} in submission'.format(labels[0]))

    return data

In [194]:
def predict(X_train, X_test, train_target, model, scaler=None, *, th=0.35, **kwargs):
    """Fit ``model`` on the training data and return thresholded test labels.

    Args:
        X_train: training feature matrix.
        X_test: test feature matrix.
        train_target: training labels.
        model: estimator class; instantiated as ``model(**kwargs)`` and
            expected to provide ``fit`` and ``predict_proba``.
        scaler: optional scaler class; when given it is instantiated, fitted
            on ``X_train`` and applied to both matrices.
        th: decision threshold (keyword-only). A positive-class probability
            below ``th`` gives label 0, otherwise 1. Defaults to the
            previously hard-coded 0.35.
        **kwargs: constructor arguments for ``model``.

    Returns:
        list[int]: one 0/1 label per row of ``X_test``.
    """
    curr_model = model(**kwargs)

    if scaler is not None:
        your_scaler = scaler()
        your_scaler.fit(X_train)
        X_train = your_scaler.transform(X_train)
        X_test = your_scaler.transform(X_test)

    curr_model.fit(X_train, train_target)

    proba = np.asarray(curr_model.predict_proba(X_test))[:, 1]
    return np.where(proba < th, 0, 1).tolist()

In [18]:
# Shared pymorphy2 analyzer; `str_parser` reads it as a global for lemmatization.
morph = pymorphy2.MorphAnalyzer()

In [19]:
# Lemmatization

def str_parser(words_):
    """Lemmatize the Russian words of ``words_`` with the global `morph` analyzer.

    Only runs of 1-20 lower-case Cyrillic letters (including "ё") delimited by
    word boundaries are kept; each is replaced by the normal form of its first
    parse.

    NOTE(review): the pattern matches lower-case letters only, so a word that
    contains an upper-case letter is skipped entirely unless the caller
    lower-cases the text first — confirm this is intended.

    Side effects:
        Increments the global counter ``j`` (created on first use if no cell
        has initialised it) and prints progress every 100 calls.

    Args:
        words_: text to lemmatize.

    Returns:
        str: the lemmas, each followed by one space.
    """
    global j

    # "ё" lies outside the а-я code-point range, so it must be listed
    # explicitly; otherwise every word containing it is dropped.
    lemmas = [morph.parse(token)[0].normal_form
              for token in re.findall(r'\b[а-яё]{1,20}\b', words_)]

    try:
        j += 1
    except NameError:
        # No cell has run `j = 0` yet.
        j = 1
    if j % 100 == 0:
        print(j, '/28026 loaded')

    return ''.join(lemma + ' ' for lemma in lemmas)

In [20]:
def lemmatization(df):
    """Return the ``title`` column of ``df`` with `str_parser` applied to every value."""
    title_column = df['title']
    return title_column.apply(lambda text: str_parser(text))

In [21]:
def stemming_titles(df):
    """Return the ``title`` column of ``df`` with `Porter.stem` applied to every value."""
    title_column = df['title']
    return title_column.apply(lambda text: Porter.stem(text))

In [22]:
def get_train_test_():
    """Load ``data/core_train.csv`` and ``data/core_test.csv`` as one frame.

    Missing values are replaced by ``' '`` in each file, the train rows are
    followed by the test rows under a fresh 0..n-1 index, and the
    ``'Unnamed: 0'`` column is dropped.

    Returns:
        pandas.DataFrame: the combined table.
    """
    parts = []
    for csv_path in ('data/core_train.csv', 'data/core_test.csv'):
        parts.append(pd.read_csv(csv_path).fillna(' '))

    combined = pd.concat(parts, ignore_index=True)
    return combined.drop(columns={'Unnamed: 0'})

In [408]:
# Computes the distances between the documents of one group, keeps the smallest
# ones for every document (see `count_distances`) and saves the resulting numpy
# ndarray to a file.

def features_save(group_num, max_f=None, vec_type=1):
    """Build and store the distance features of one document group.

    Reads ``./data/unversal_table.csv``, vectorizes the ``title`` texts of
    the rows whose ``group_id`` equals ``group_num``, converts them to
    distance features with `count_distances` and saves the result as
    ``group_features/<group_num>.npy``.

    Args:
        group_num: group to process.
        max_f: ``max_features`` passed to the vectorizer.
        vec_type: 1 for raw term counts (`CountVectorizer`), 2 for TF-IDF
            weights (`TfidfVectorizer`).

    Returns:
        numpy.ndarray: the saved feature matrix.

    Raises:
        ValueError: if ``vec_type`` is neither 1 nor 2.
    """
    df = pd.read_csv('./data/unversal_table.csv')
    df = df.fillna(' ')
    corpus = df[df.group_id == group_num]['title'].values

    if vec_type == 1:
        vectorizer = CountVectorizer(max_features=max_f)
    elif vec_type == 2:
        # Previously this branch built a second CountVectorizer, which made
        # `vec_type` a no-op.
        vectorizer = TfidfVectorizer(max_features=max_f)
    else:
        raise ValueError('vec_type must be 1 (counts) or 2 (tf-idf), got {!r}'.format(vec_type))

    X = vectorizer.fit_transform(corpus)
    features = count_distances(X.toarray())

    # np.save does not create missing directories.
    os.makedirs('group_features', exist_ok=True)
    np.save('group_features/{}'.format(group_num), features)
    return features

In [409]:
# Computes distances from the term matrix (cosine metric).

def count_distances(docs, top_k=25):
    """Return, for every document, its smallest cosine distances to the others.

    Args:
        docs: 2-D array with one row vector per document.
        top_k: maximum number of distances kept per document (the previously
            hard-coded 25).

    Returns:
        numpy.ndarray: shape ``(n_docs, min(top_k, n_docs - 1))``. Each row
        holds the distances from one document to all other documents, sorted
        ascending (NaN values, if any, sort last) and truncated to ``top_k``.
        The distance of a document to itself is excluded.
    """
    distance = cdist(docs, docs, 'cosine')
    n_docs = distance.shape[0]

    # Drop the diagonal (self-distance): boolean selection walks the matrix
    # row by row, so reshaping restores one row per document.
    off_diagonal = distance[~np.eye(n_docs, dtype=bool)].reshape(n_docs, max(n_docs - 1, 0))

    return np.sort(off_diagonal)[:, :top_k]

In [410]:
# Walks over the given groups and saves their features (distances).

def main_parser_and_saver(start_group, finish_group):
    """Run `features_save` for every group number in ``start_group..finish_group`` (inclusive).

    Prints a message after each group is done.

    Returns:
        bool: always ``True``.
    """
    one_past_last = finish_group + 1
    for current_group in range(start_group, one_past_last):
        features_save(current_group)
        print('Скачалась группа:', current_group)

    return True

In [411]:
# Creates X_train, X_test, train_target.

def prepare_data():
    """Assemble the train/test feature matrices and the training target.

    Features of groups 1-129 form the training matrix and those of groups
    130-309 the test matrix; the target is the ``target`` column of
    ``./data/train_groups.csv``. NaN and -inf are replaced by 0 and +inf by 1
    in both matrices.

    Returns:
        tuple: ``(X_train, train_target, X_test)``.
    """
    def _finite(matrix):
        # Replace non-finite entries with fixed finite values.
        return np.nan_to_num(matrix, nan=0, posinf=1, neginf=0)

    train_features = all_group_feature_list(1, 129)
    test_features = all_group_feature_list(130, 309)

    train_groups = pd.read_csv('./data/train_groups.csv')

    return _finite(train_features), train_groups['target'], _finite(test_features)

In [412]:
# Loads the per-group feature files.

def all_group_feature_list(start_group, finish_group):
    """Load ``group_features/<n>.npy`` for every group and stack them vertically.

    Args:
        start_group: first group number; its file is always loaded.
        finish_group: last group number (inclusive).

    Returns:
        numpy.ndarray: the arrays of groups ``start_group..finish_group``
        stacked row-wise in group order. When only one file is loaded its
        array is returned as is.
    """
    parts = [np.load('group_features/{}.npy'.format(start_group))]
    parts.extend(
        np.load('group_features/{}.npy'.format(group_num))
        for group_num in range(start_group + 1, finish_group + 1)
    )

    if len(parts) == 1:
        return parts[0]

    # Stack once at the end: calling vstack inside the loop re-copies the
    # accumulated array for every group.
    return np.vstack(parts)

In [60]:
# Text used for the features: the `title` column concatenated with the `h1` column.
# NOTE(review): no separator is inserted between the two strings, so the last
# word of `title` and the first word of `h1` may be glued together — confirm.
titles_df = get_train_test_()
titles = pd.DataFrame({'doc_id': titles_df['doc_id'] ,
                       'title': titles_df['title'] + titles_df['h1']})

In [402]:
%%time
# Reset the progress counter that `str_parser` increments through `global j`.
j = 0
# titles['title'] = lemmatization(titles)

CPU times: user 4 µs, sys: 10 µs, total: 14 µs
Wall time: 35.8 µs


In [61]:
%%time

# Replace every title with its stemmed form (see `Porter.stem`).
titles['title'] = stemming_titles(titles)

CPU times: user 8.01 s, sys: 151 ms, total: 8.16 s
Wall time: 8.19 s


In [64]:
# Persist the stemmed titles; `features_save` reads this file.
# NOTE(review): the `titles` frame built in cell In [60] has only `doc_id` and
# `title`, while `features_save` filters on `group_id`; the frame written in
# cell In [349] is the one that carries `group_id`.
titles.to_csv('./data/unversal_table.csv', index=False)

In [65]:
%%time
# NOTE(review): `easy_launch` exists only as commented-out code in this
# notebook (cell In [10]), so this cell raises NameError on a fresh kernel.
X_train, y_train, X_test, groups_train, groups_test = easy_launch()

doc titles dict len = 28026
(11690, 25) (11690,) (11690,)
(16627, 25) (16627,)
CPU times: user 12.7 s, sys: 45.4 ms, total: 12.7 s
Wall time: 12.8 s


In [66]:
# Group id of every training row; passed as `groups=` to the fold splitter in the validation cells.
groups_train = pd.read_csv('data/train_groups.csv')['group_id']

In [401]:
%%time
# Grid search for GradientBoostingClassifier with group-wise folds.
# NOTE(review): `params` and `th` are assigned in cells that appear further
# down the notebook (e.g. In [393]); those cells must be run first.
best = validation(X_train, y_train, GradientBoostingClassifier, params, GroupKFold, groups=groups_train, thresholds=th)

{'learning_rate': 0.06, 'n_estimators': 150}
th =  0.22
0.6905247109429726
th =  0.23
0.6918590992745755
th =  0.24
0.6950857871928419
th =  0.25
0.6973429128734442
th =  0.26
0.7002258476889112
th =  0.27
0.7020797215310617
--------max-------
(0.7020797215310617, {'learning_rate': 0.06, 'n_estimators': 150}, 0.27)
CPU times: user 5min 6s, sys: 691 ms, total: 5min 7s
Wall time: 5min 11s


In [204]:
# Refit on the whole training set with the best parameters and write the submission.
# NOTE(review): the threshold selected by `validation` (`best[2]`) is not
# passed on; `predict` is called without it and uses its own threshold.
y_pred = predict(X_train, X_test, y_train, GradientBoostingClassifier, StandardScaler, **best[1])
data = save_submission(y_pred)

len data =  16627
0: 11334, 1: 5293


In [392]:
# Parameter grid and thresholds for `validation`
# (presumably for GradientBoostingClassifier, judging by the keys — confirm).
params = {'learning_rate': [0.07, 0.08],
#           'n_estimators': [372, 374]}
          'n_estimators': [100, 150, 200, 250]}
th = [0.26]

In [393]:
# Single parameter set with a sweep over thresholds; these are the values
# printed in the output of the GradientBoostingClassifier validation cell (In [401]).
params = {'learning_rate': [0.06],
          'n_estimators' : [150]}
th = [0.22, 0.23, 0.24, 0.25, 0.26, 0.27]

In [94]:
# Parameter grid for `validation`
# (presumably for KNeighborsClassifier, judging by the keys — confirm).
params = {'algorithm': ['ball_tree', 'brute'],
# params = {'algorithm': ['ball_tree'],
# params = {'algorithm': ['auto'],
          'leaf_size': [5, 10, 20, 40],
          'n_neighbors': [30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80],
#           'weights': [smart_weights],
          'p': [1],
          'n_jobs': [-1]}

In [146]:
# Single parameter set with a sweep over thresholds
# (presumably for KNeighborsClassifier, judging by the keys — confirm).
params = {'algorithm': ['brute'],
# params = {'algorithm': ['ball_tree'],
# params = {'algorithm': ['auto'],
          'n_neighbors': [60],
#           'weights': [smart_weights],
          'p': [1],
          'n_jobs': [-1]}
th = [0.3,0.31,0.32,0.33, 0.34, 0.35, 0.36, 0.37, 0.38, 0.39, 0.4, 0.41, 0.42, 0.43, 0.44, 0.45]

In [220]:
# Parameter grid and threshold for `validation`
# (presumably for RandomForestClassifier, judging by the keys — confirm).
th = [0.32] 
params = {'n_estimators': [300, 400, 500, 600], 
          'criterion': ['gini', 'entropy'], 
          'max_depth': [10, 15, None], 
          'n_jobs': [-1]}

In [228]:
# Single parameter set; these are the values printed in the output of the
# RandomForestClassifier validation cell (In [230]).
params = {'n_estimators': [1300], 
          'criterion': ['entropy'], 
          'max_depth': [10], 
          'n_jobs': [-1]}
# th = [0.3,0.31,0.32,0.33, 0.34, 0.35, 0.36, 0.37, 0.38, 0.39, 0.4, 0.41, 0.42, 0.43, 0.44, 0.45]

In [230]:
# Grid search for RandomForestClassifier with group-wise folds; uses whatever
# `params` and `th` currently hold.
best = validation(X_train, y_train, RandomForestClassifier, params, GroupKFold, groups=groups_train, thresholds=th)

{'n_estimators': 1300, 'criterion': 'entropy', 'max_depth': 10, 'n_jobs': -1}
th =  0.32
0.7072713780196176
--------max-------
(0.7072713780196176, {'n_estimators': 1300, 'criterion': 'entropy', 'max_depth': 10, 'n_jobs': -1}, 0.32)


In [326]:
# Grid search for KNeighborsClassifier.
# NOTE(review): the recorded run failed with TypeError because `params` still
# held the 'learning_rate'/'n_estimators' grid (see the output below); a grid
# with KNeighborsClassifier parameters has to be assigned before this cell.
best = validation(X_train, y_train, KNeighborsClassifier, params, GroupKFold, groups=groups_train, thresholds=th)

{'learning_rate': 0.06, 'n_estimators': 150}


TypeError: __init__() got an unexpected keyword argument 'learning_rate'

In [403]:
# main_parser_and_saver(1, 129)
# main_parser_and_saver(130, 309)

In [368]:
# Load the train/test tables and replace missing values with ' '
# (the same loading steps as the start of `get_train_test_`).
df_train = pd.read_csv('data/core_train.csv')
df_test = pd.read_csv('data/core_test.csv')
df_train.fillna(' ', inplace=True)
df_test.fillna(' ', inplace=True)

In [396]:
# Train rows followed by test rows; the original row indices are kept (no ignore_index).
titles_all = pd.concat([df_train, df_test])

In [370]:
# Additional per-document text; only its `doc_id` and `title` columns are used in the merge below.
titles_h2h3a = pd.read_csv('./data/no_bad_words_table.csv')

In [383]:
# Inner join on `doc_id`. Both inputs have a `title` column, so the result has
# `title_x` (from `titles_all`) and `title_y` (from `titles_h2h3a`).
titles_df = pd.merge(titles_all, titles_h2h3a[['doc_id', 'title']], on='doc_id', how='inner')

In [385]:
# Concatenate `title_x` with `h1` per document, keep the group id, then stem the text.
titles_df2 = pd.DataFrame({'doc_id': titles_df['doc_id'] ,
                       'title': titles_df['title_x'] + titles_df['h1'],
#                        'title': titles_df['title'] + titles_df['h1'],
#                        'title': titles_df['h2'] + titles_df['h3'] + titles_df['a'],
                       'group_id': titles_df['group_id']})

titles_df2['title'] = stemming_titles(titles_df2)

In [405]:
# Append the `title_y` text (from no_bad_words_table.csv) to the stemmed title+h1 text.
# NOTE(review): if `title_y` is missing for a row, the sum is NaN and the
# fillna below replaces the whole title with ' ', dropping the stemmed text
# as well — confirm this is intended.
titles = pd.DataFrame({'doc_id': titles_df2['doc_id'] ,
                       'title': titles_df2['title'] + titles_df['title_y'],
#                        'title': titles_df['title'] + titles_df['h1'],
#                        'title': titles_df['h2'] + titles_df['h3'] + titles_df['a'],
                       'group_id': titles_df2['group_id']})

titles.fillna(' ', inplace=True)
# Reset the progress counter used by `str_parser`.
j = 0
# titles['title'] = stemming_titles(titles)
# titles['title'] = lemmatization(titles)

# titles['title']

In [382]:
# Inspect the combined text of the row labelled 0.
titles['title'][0]

'ваз зам подшипник ступиц нив зам подшипник ступиц авт автозапчаст амортизатор ваз вал впрысков газ двигател диск задн зам карда колес коробк куз магазин мост неисправн нив общ опор охлажден передач передн подвеск подшипник привод пружин раздаточн регулировк руководств рулев рычаг сайт салон систем снят стабилизатор ступиц схем сцеплен тормозн трансмисс тюнинг управлен установк шаров шин электрооборудован '

In [349]:
# Persist the combined table (`doc_id`, `title`, `group_id`); `features_save` reads this file.
titles.to_csv('./data/unversal_table.csv', index=False)

In [298]:
# Replace any remaining missing values with ' ' (modifies `titles` in place).
titles.fillna(' ', inplace=True)

In [407]:
# Load the saved per-group distance features and the training target.
# Requires the files under group_features/ written by `main_parser_and_saver`.
X_train, y_train, X_test = prepare_data()
groups_train = pd.read_csv('data/train_groups.csv')['group_id']

In [400]:
# Display the training feature matrix.
X_train

array([[0.        , 0.06458565, 0.16333997, ..., 0.62203553, 0.65811827,
        0.66563307],
       [0.44056907, 0.44098301, 0.5       , ..., 0.77777778, 0.78918149,
        0.78918149],
       [0.48823368, 0.53343053, 0.53343053, ..., 0.75256417, 0.75686773,
        0.80712081],
       ...,
       [0.88661066, 0.89793793, 0.90715233, ..., 0.97817821, 0.97867993,
        0.97867993],
       [0.3238766 , 0.78178211, 0.81101776, ..., 1.        , 1.        ,
        1.        ],
       [0.88529213, 0.90634142, 0.93082855, ..., 1.        , 1.        ,
        1.        ]])