In [144]:
import pandas as pd
import numpy as np
import re
import sklearn.metrics as metrics

In [124]:
from scipy import sparse
from functools import reduce
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split, cross_validate, KFold, cross_val_score, GroupKFold
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import StandardScaler

In [20]:
def get_df():
    """Load document titles joined with their group / target information.

    Reads the titles table and the train / test group tables from ``./data``,
    stacks the two group tables and inner-joins them with the titles on
    ``doc_id``.

    Returns:
        pandas.DataFrame: the columns of the titles table plus ``group_id``
        and ``target``. Every ``title`` ends with a single space so that
        titles can later be concatenated per group; a missing title becomes
        ``' '``.
    """
    titles_df = pd.read_csv('./data/docs_titles.tsv/docs_titles.tsv', sep='\t')
    docs_id_test = pd.read_csv('./data/test_groups.csv', sep=',')
    docs_id_train = pd.read_csv('./data/train_groups.csv', sep=',')

    # ignore_index gives the stacked table a fresh 0..n-1 index (the original
    # called reset_index() but discarded its result).
    info = pd.concat([docs_id_train, docs_id_test], ignore_index=True)

    titles = pd.merge(titles_df, info[['group_id', 'doc_id', 'target']], on='doc_id', how='inner')

    # Assign the column back instead of calling fillna(inplace=True) on a
    # column selection, which is not guaranteed to modify `titles`.
    titles['title'] = titles['title'].fillna('') + ' '

    return titles

In [21]:
def titles_extraction():
    """Return one cleaned title string per group as a NumPy array.

    The titles of all documents sharing a ``group_id`` are concatenated
    (string ``sum``), each concatenated string is passed through ``cleaner``
    and the resulting values are returned in group order.

    NOTE(review): ``cleaner`` is not defined in the visible part of this
    notebook (only ``cleaning``, which takes three arguments) - confirm it
    exists before calling this function.
    """
    frame = get_df()

    per_group = frame[['title', 'group_id']].groupby('group_id').sum()['title']
    cleaned = per_group.apply(lambda group_text: cleaner(group_text))

    return cleaned.values

In [85]:
def cleaning(titles, group_num, bad_words):
    """Normalise a title string and blank out the group's unwanted words.

    Args:
        titles: text to clean.
        group_num: key selecting the entry of ``bad_words`` to apply.
        bad_words: mapping (or sequence) from group key to an iterable of
            substrings that are replaced by a space.

    Returns:
        The lower-cased text with non-word characters and the listed
        substrings removed and every whitespace run collapsed to one space.
    """
    # Each non-word character becomes two spaces; the runs are collapsed below.
    text = re.sub(r'\W', '  ', titles.lower())

    # Plain substring replacement: occurrences inside longer words are hit too.
    for unwanted in bad_words[group_num]:
        text = text.replace(unwanted, ' ')

    return re.sub(r'\s+', ' ', text)

In [172]:
class Porter:
    """Rule-based suffix-stripping stemmer for Russian text.

    All state is class-level compiled patterns; ``stem`` is a static method,
    so both ``Porter.stem(text)`` and ``Porter().stem(text)`` work.
    """

    # Suffix patterns, grouped by the word class they strip.
    PERFECTIVEGROUND =  re.compile(u"((ив|ивши|ившись|ыв|ывши|ывшись)|((?<=[ая])(в|вши|вшись)))$")
    REFLEXIVE = re.compile(u"(с[яь])$")
    ADJECTIVE = re.compile(u"(ее|ие|ые|ое|ими|ыми|ей|ий|ый|ой|ем|им|ым|ом|его|ого|ему|ому|их|ых|ую|юю|ая|яя|ою|ею)$")
    PARTICIPLE = re.compile(u"((ивш|ывш|ующ)|((?<=[ая])(ем|нн|вш|ющ|щ)))$")
    VERB = re.compile(u"((ила|ыла|ена|ейте|уйте|ите|или|ыли|ей|уй|ил|ыл|им|ым|ен|ило|ыло|ено|ят|ует|уют|ит|ыт|ены|ить|ыть|ишь|ую|ю)|((?<=[ая])(ла|на|ете|йте|ли|й|л|ем|н|ло|но|ет|ют|ны|ть|ешь|нно)))$")
    NOUN = re.compile(u"(а|ев|ов|ие|ье|е|иями|ями|ами|еи|ии|и|ией|ей|ой|ий|й|иям|ям|ием|ем|ам|ом|о|у|ах|иях|ях|ы|ь|ию|ью|ю|ия|ья|я)$")
    # Splits a word into the prefix up to and including its first vowel, and the rest.
    RVRE = re.compile(u"^(.*?[аеиоуыэюя])(.*)$")
    DERIVATIONAL = re.compile(u".*[^аеиоуыэюя]+[аеиоуыэюя].*ость?$")
    DER = re.compile(u"ость?$")
    SUPERLATIVE = re.compile(u"(ейше|ейш)$")
    I = re.compile(u"и$")
    P = re.compile(u"ь$")
    NN = re.compile(u"нн$")

    @staticmethod
    def _stem_word(word):
        """Stem a single lower-cased token; purely numeric tokens are returned unchanged."""
        if word.isdigit():
            return word

        word = word.replace(u'ё', u'е')
        m = Porter.RVRE.match(word)
        if not m:
            # No Russian vowel in the token (e.g. Latin words): nothing to strip.
            return word

        pre = m.group(1)
        rv = m.group(2)

        # Step 1: perfective gerund, otherwise reflexive + adjective/verb/noun ending.
        temp = Porter.PERFECTIVEGROUND.sub('', rv, 1)
        if temp == rv:
            rv = Porter.REFLEXIVE.sub('', rv, 1)
            temp = Porter.ADJECTIVE.sub('', rv, 1)
            if temp != rv:
                rv = temp
                rv = Porter.PARTICIPLE.sub('', rv, 1)
            else:
                temp = Porter.VERB.sub('', rv, 1)
                if temp == rv:
                    rv = Porter.NOUN.sub('', rv, 1)
                else:
                    rv = temp
        else:
            rv = temp

        # Step 2: trailing "и".
        rv = Porter.I.sub('', rv, 1)

        # Step 3: derivational "ост(ь)" suffix.
        if Porter.DERIVATIONAL.match(rv):
            rv = Porter.DER.sub('', rv, 1)

        # Step 4: soft sign, otherwise superlative suffix and "нн" -> "н".
        temp = Porter.P.sub('', rv, 1)
        if temp == rv:
            rv = Porter.SUPERLATIVE.sub('', rv, 1)
            rv = Porter.NN.sub(u'н', rv, 1)
        else:
            rv = temp

        return pre + rv

    @staticmethod
    def stem(string):
        """Stem every whitespace-separated token of ``string``.

        Args:
            string: text to stem; it is lower-cased first.

        Returns:
            The stemmed tokens, each followed by one space (so a non-empty
            result ends with a trailing space and an empty or blank input
            gives ``''``).
        """
        return ''.join(Porter._stem_word(word) + ' ' for word in string.lower().split())

In [377]:
def features_create(mode, groups_titledata, top_k=25):
    """Build per-document title-overlap features inside each group.

    For every document, the number of distinct title words it shares with
    each other document of the same group is computed; the ``top_k`` largest
    counts (descending) form that document's feature row.

    Args:
        mode: ``'train'`` to also collect targets; any other value is treated
            as test mode.
        groups_titledata: mapping ``group_id -> list of records`` where a
            record is ``(doc_id, title)`` or, in train mode,
            ``(doc_id, title, target)``.
        top_k: number of overlap counts kept per document (default 25).

    Returns:
        ``(X, y, groups)`` in train mode, ``(X, groups)`` otherwise.
        ``X`` has shape ``(n_docs, top_k)``; ``groups`` holds the group id of
        each row. The shapes are printed before returning.
    """
    is_train = mode == 'train'
    rows = []
    targets = []
    group_ids = []

    for group_id in groups_titledata:
        docs = groups_titledata[group_id]

        # Tokenise each title once per group instead of once per document pair.
        word_sets = [set(doc[1].strip().split()) for doc in docs]

        for k, doc in enumerate(docs):
            if is_train:
                targets.append(doc[2])
            group_ids.append(group_id)

            overlaps = sorted(
                (len(word_sets[k] & other) for j, other in enumerate(word_sets) if j != k),
                reverse=True,
            )[:top_k]
            # Groups with fewer than top_k + 1 documents would give short rows
            # and a ragged matrix; pad with zero overlap to keep rows rectangular.
            overlaps.extend([0] * (top_k - len(overlaps)))
            rows.append(overlaps)

    X = np.array(rows, dtype=int).reshape(len(rows), top_k)
    groups_train = np.array(group_ids)

    if is_train:
        y = np.array(targets)
        print(X.shape, y.shape, groups_train.shape)
        return X, y, groups_train

    print(X.shape, groups_train.shape)
    return X, groups_train

In [365]:
def title_info_dict(mode, doc_to_title):
    """Group the documents of ``./data/<mode>_groups.csv`` by their group id.

    Args:
        mode: file-name prefix; with ``'train'`` the ``target`` column is
            included in every record.
        doc_to_title: mapping from ``doc_id`` to title; a missing id raises
            ``KeyError``.

    Returns:
        dict mapping ``group_id`` to a list of ``(doc_id, title)`` tuples, or
        ``(doc_id, title, target)`` tuples in train mode, in file order.
    """
    groups_table = pd.read_csv('./data/{}_groups.csv'.format(mode))
    with_target = mode == 'train'

    titledata = {}
    for position in range(len(groups_table)):
        row = groups_table.iloc[position]
        group_key = row['group_id']
        doc_key = row['doc_id']
        doc_title = doc_to_title[doc_key]

        bucket = titledata.setdefault(group_key, [])
        if with_target:
            bucket.append((doc_key, doc_title, row['target']))
        else:
            bucket.append((doc_key, doc_title))

    return titledata

In [366]:
def tuple_x(a1, a2):
    """Pack the two arguments into a 2-tuple ``(a1, a2)``."""
    pair = (a1, a2)
    return pair

In [378]:
def _read_doc_titles(path):
    """Parse the ``doc_id,title`` table at ``path`` into a ``{doc_id: title}`` dict."""
    doc_to_title = {}
    # Explicit UTF-8: the titles are Cyrillic and the platform default
    # encoding is not UTF-8 everywhere.
    with open(path, encoding='utf-8') as f:
        for num_line, line in enumerate(f):
            if num_line == 0:
                continue  # header row
            if not line.strip():
                continue  # blank line; int('') would raise otherwise

            # Tabs are treated as field separators as well as commas.
            # NOTE(review): this is a plain split, not a CSV parser - a title
            # that itself contains a comma (or is quoted) is cut at the first
            # comma. Confirm the stored titles never contain commas.
            data = line.replace('\t', ',').strip().split(',')

            doc_id = int(data[0])
            title = '' if len(data) == 1 else data[1]
            doc_to_title[doc_id] = title

    return doc_to_title


def easy_launch():
    """Build the train / test feature matrices from the stored title table.

    Loads ``./data/unversal_table.csv`` into a ``doc_id -> title`` mapping,
    groups the train and test documents with ``title_info_dict`` and turns
    them into features with ``features_create``.

    Returns:
        ``(X_train, y_train, X_test, groups_train, groups_test)``.
    """
    doc_to_title = _read_doc_titles('./data/unversal_table.csv')
    print('doc titles dict len = {}'.format(len(doc_to_title)))

    train_titledata = title_info_dict('train', doc_to_title)
    test_titledata = title_info_dict('test', doc_to_title)

    X_train, y_train, groups_train = features_create('train', train_titledata)
    X_test, groups_test = features_create('test', test_titledata)

    return X_train, y_train, X_test, groups_train, groups_test

In [384]:
# Build the title-overlap feature matrices, the training targets and the per-row group ids.
# Note: `groups_train` is also assigned in the cell that reads data/train_groups.csv;
# whichever of the two cells ran last determines its value.
X_train, y_train, X_test, groups_train, groups_test = easy_launch()

doc titles dict len = 28026
(11690, 25) (11690,) (11690,)
(16627, 25) (16627,)


In [286]:
def flatten(x):
    """Flatten arbitrarily nested iterables into one flat list.

    Every element that has ``__iter__`` and is not a ``str`` is expanded in
    place, depth first; strings and non-iterables are kept as leaves.
    """
    flat = []
    # Explicit stack of iterators instead of recursion; the traversal order
    # is the same left-to-right, depth-first order.
    pending = [iter(x)]
    while pending:
        for item in pending[-1]:
            if isinstance(item, str) or not hasattr(item, "__iter__"):
                flat.append(item)
            else:
                pending.append(iter(item))
                break
        else:
            # Current level exhausted: resume its parent.
            pending.pop()

    return flat

In [121]:
def list_concat(list1, list2):
    """Return the Cartesian product of two sequences as a list of 2-element lists.

    Pairs are ordered with ``list1`` varying slowest.
    """
    pairs = []
    for left in list1:
        for right in list2:
            pairs.append([left, right])
    return pairs

In [120]:
def combinations(params):
    """Expand a parameter grid into the list of all parameter combinations.

    Args:
        params: mapping ``name -> iterable of candidate values``.

    Returns:
        list of dicts, one per combination, with the last parameter varying
        fastest. A grid with a single parameter yields one dict per value and
        an empty grid yields ``[{}]``. Candidate values are taken as they are,
        so tuple- or list-valued candidates are preserved.
    """
    # Local import keeps this cell self-contained.
    from itertools import product

    names = list(params.keys())
    return [dict(zip(names, values)) for values in product(*params.values())]

In [146]:
def frange(start, stop, step):
    """Yield ``start, start + step, start + 2*step, ...`` up to but excluding ``stop``.

    Works like ``range`` for floats. Each value is computed as
    ``start + k * step`` rather than by repeated addition, so rounding error
    does not accumulate (repeated addition makes ``frange(0, 1, 0.1)`` yield
    an extra ``0.9999999999999999``). A negative ``step`` counts down.

    Raises:
        ValueError: if ``step`` is zero (raised when iteration starts, as
            this is a generator).
    """
    if step == 0:
        raise ValueError('frange() step must not be zero')

    k = 0
    value = start
    while (value < stop) if step > 0 else (value > stop):
        yield value
        k += 1
        value = start + k * step

In [325]:
def validation(X_train, train_target, model, params, folds_gen_func, groups_num=10, thresholds=(0.27,), **kwargs):
    """Grid-search hyper-parameters and decision thresholds with cross-validation.

    For every combination from ``params`` the model is fit on each training
    fold, class-1 probabilities are predicted for the held-out fold, and the
    F1 score is computed for every threshold. Each parameter set and each mean
    score is printed, followed by the best entry.

    Args:
        X_train: feature matrix, indexable by integer index arrays.
        train_target: target array aligned with ``X_train``.
        model: estimator class, instantiated as ``model(**param_set)``; it
            must provide ``fit`` and ``predict_proba``.
        params: mapping ``name -> list of candidate values``.
        folds_gen_func: splitter class, instantiated as
            ``folds_gen_func(groups_num)``.
        groups_num: argument passed to ``folds_gen_func``.
        thresholds: iterable of probability cut-offs; a sample is labelled 1
            when its class-1 probability is not below the threshold.
        **kwargs: forwarded to the splitter's ``split`` (e.g. ``groups=...``).

    Returns:
        ``(mean_f1, param_set, threshold)`` for the best mean F1; ties are
        resolved in favour of the entry evaluated first.

    Raises:
        ValueError: if there is nothing to evaluate (no parameter set or no
            threshold).
    """
    # Materialise once: a generator (e.g. frange(...)) would otherwise be
    # exhausted after the first parameter set.
    thresholds = list(thresholds)

    main_res = []
    for param_set in combinations(params):

        print(param_set)
        exact_model = model(**param_set)

        fold_generator = folds_gen_func(groups_num)

        # One list of fold scores per threshold; the model is fit once per
        # fold and its probabilities are reused for every threshold.
        fold_scores = [[] for _ in thresholds]
        for train_index, test_index in fold_generator.split(X_train, train_target, **kwargs):

            # Fit the scaler on the training fold only, so statistics of the
            # held-out fold do not leak into the model.
            scaler = StandardScaler().fit(X_train[train_index])

            exact_model.fit(scaler.transform(X_train[train_index]), train_target[train_index])
            proba = exact_model.predict_proba(scaler.transform(X_train[test_index]))[:, 1]

            for scores, th in zip(fold_scores, thresholds):
                y_pred = np.where(proba < th, 0, 1)
                scores.append(metrics.f1_score(train_target[test_index], y_pred))

        for scores, th in zip(fold_scores, thresholds):
            mean = sum(scores) / len(scores)
            print(mean)
            main_res.append((mean, param_set, th))

    if not main_res:
        raise ValueError('validation() needs at least one parameter set and one threshold')

    best = main_res[np.argmax([entry[0] for entry in main_res])]
    print('--------max-------')
    print(best)

    return best

In [315]:
def save_submission(y_pred):
    """Save the solution: write the predictions to ``submission.csv``.

    Reads ``data/test_groups.csv``, attaches ``y_pred`` as the ``target``
    column, drops ``group_id`` and ``doc_id`` and writes the remaining
    columns to ``submission.csv``. The row count and the label distribution
    are printed, together with a warning when the count of the second label
    is above 6000 or below 2500.

    Args:
        y_pred: predicted labels, one per row of the test groups file.

    Returns:
        pandas.DataFrame: the frame that was written.
    """
    submission = pd.read_csv('data/test_groups.csv')
    print('len data = ', len(submission))

    submission['target'] = y_pred
    submission = submission.drop(columns=['group_id', 'doc_id'])
    submission.to_csv("submission.csv", index=False)

    labels, counts = np.unique(submission['target'], return_counts=True)

    if labels.shape[0] <= 1:
        # Only one distinct label was predicted.
        print('There are only {} in submission'.format(labels[0]))
        return submission

    print('0: {}, 1: {}'.format(counts[0], counts[1]))
    positives = counts[1]
    if positives > 6000 or positives < 2500:
        print('Your submisson is shit')

    return submission

In [328]:
def predict(X_train, X_test, train_target, model, scaler=None, threshold=0.27, **kwargs):
    """Fit ``model`` on the training data and return thresholded test labels.

    Args:
        X_train: training feature matrix.
        X_test: test feature matrix.
        train_target: training targets.
        model: estimator class, instantiated as ``model(**kwargs)``; it must
            provide ``fit`` and ``predict_proba``.
        scaler: optional scaler class; when given, an instance is fit on
            ``X_train`` and applied to both matrices.
        threshold: probability cut-off; a sample is labelled 1 when its
            class-1 probability is not below it. Defaults to 0.27, the value
            that used to be hard-coded here.
        **kwargs: hyper-parameters forwarded to ``model``.

    Returns:
        list of ``0`` / ``1`` labels, one per row of ``X_test``.
    """
    curr_model = model(**kwargs)

    if scaler is not None:
        your_scaler = scaler()
        your_scaler.fit(X_train)
        X_train = your_scaler.transform(X_train)
        X_test = your_scaler.transform(X_test)

    curr_model.fit(X_train, train_target)

    positive_proba = curr_model.predict_proba(X_test)[:, 1]
    return [0 if val < threshold else 1 for val in positive_proba]

In [371]:
# Hyper-parameter grid handed to validation(); every combination of the
# listed values is evaluated.
params = {'learning_rate': [0.07, 0.08],
#           'n_estimators': [372, 374]}
          'n_estimators': [175, 180, 185]}
# Group id of every training row, passed as `groups=` to the fold splitter.
# Note: the same name is also assigned from easy_launch()'s return value;
# whichever cell ran last determines its value.
groups_train = pd.read_csv('data/train_groups.csv')['group_id']

In [385]:
# Grid-search the GradientBoostingClassifier hyper-parameters with GroupKFold splits.
# `best` is the tuple (mean F1, parameter dict, threshold) of the top-scoring entry.
best = validation(X_train, y_train, GradientBoostingClassifier, params, GroupKFold, groups=groups_train)

{'learning_rate': 0.07, 'n_estimators': 175}
0.6757455898211158
{'learning_rate': 0.07, 'n_estimators': 180}
0.6762859284694906
{'learning_rate': 0.07, 'n_estimators': 185}
0.6761382585234248
{'learning_rate': 0.08, 'n_estimators': 175}
0.6758614481487066
{'learning_rate': 0.08, 'n_estimators': 180}
0.6758887661351857
{'learning_rate': 0.08, 'n_estimators': 185}
0.6756966908458771
--------max-------
(0.6762859284694906, {'learning_rate': 0.07, 'n_estimators': 180}, 0.27)


In [329]:
# Refit on the whole training set with the winning hyper-parameters (best[1]),
# predict the test labels and write them to submission.csv.
y_pred = predict(X_train, X_test, y_train, GradientBoostingClassifier, StandardScaler, **best[1])
data = save_submission(y_pred)

len data =  16627
0: 10613, 1: 6014
Your submisson is shit


In [240]:
def stemming_titles(df):
    """Return the ``title`` column of ``df`` with every value run through ``Porter.stem``."""
    title_column = df['title']
    return title_column.apply(lambda raw_title: Porter.stem(raw_title))

In [266]:
# Load the extracted page fields for the train and test documents; missing
# cells become a single space so the text columns can be concatenated as strings.
df_train = pd.read_csv('data/core_train.csv').fillna(' ')
df_test = pd.read_csv('data/core_test.csv').fillna(' ')

In [332]:
# Stack the train and test rows into one frame. The row labels of each part are
# kept (no ignore_index), so index values repeat across the two parts.
titles_df = pd.concat([df_train, df_test])
titles_df

Unnamed: 0.1,Unnamed: 0,title,h1,h2,h3,a,group_id,doc_id
0,0,ВАЗ Замена подшипников ступицы Нива,Замена подшипников ступицы,,ВАЗ,Автомануалы Руководства Советы Схемы Фото Рук...,1,15731
1,1,Ваз оптом Сочи Сравнить цены купить потребите...,Ваз оптом Сочи,,Популярные категории,зарегистрируйте свою компанию покупатель Войт...,1,14829
2,2,Купить ступица Лада калина Трансмиссия перехо...,Ступица Лада калина,,Запчасть,Автомобили Спецтехника Запчасти Отзывы Форумы...,1,15764
3,3,Классика,Learn Center,Классика Прайс,,ГЛАВНАЯ Продукция Интернет магазин ДОСТАВКА К...,1,17669
4,4,Ступица Нива замена подшипника своими руками,Ступица Нива как провести ремонт замену подши...,,Устройство особенности cтупицы переднего коле...,Устройство особенности cтупицы переднего коле...,1,14852
...,...,...,...,...,...,...,...,...
16622,16622,Ответы Mail полезно кушать творог утрам если ...,полезно кушать творог утрам если худею поправ...,,,Mail Почта Мой Мир Одноклассники Игры Знакомс...,309,16637
16623,16623,Творог Полезные свойства лечение творогом Жен...,Лечебные свойства продуктов букву Творог Поле...,,,уход залицом уход заволосами уход зателом фит...,309,16759
16624,16624,Творог Полезные опасные свойства творога,Творог,Полезные свойства творога Смотрите также свой...,Опасные свойства творога,Вконтакте Одноклассники Фейсбук Твиттер Витам...,309,15358
16625,16625,Ответы Mail Чем полезен творог,Чем полезен творог,,,Mail Почта Мой Мир Одноклассники Игры Знакомс...,309,17287


In [382]:
# Build the doc_id -> text table: page title followed by the first-level heading.
# The explicit ' ' separator keeps the last word of the title and the first word
# of h1 apart even when neither field carries surrounding whitespace; the stemmer
# splits on whitespace, so an extra space does not change its output.
titles = pd.DataFrame({'doc_id': titles_df['doc_id'],
                       'title': titles_df['title'] + ' ' + titles_df['h1']})

# Replace the raw text with its stemmed form.
titles['title'] = stemming_titles(titles)

titles

Unnamed: 0,doc_id,title
0,15731,ваз зам подшипник ступиц нив зам подшипник сту...
1,14829,ваз опт соч сравн цен куп потребительск товар ...
2,15764,куп ступиц лад калин трансмисс переходн ступиц...
3,17669,классик learn center
4,14852,ступиц нив зам подшипник сво рук ступиц нив ка...
...,...,...
16622,16637,ответ mail полезн куша творог утр есл худ попр...
16623,16759,творог полезн свойств лечен творог женск сайт ...
16624,15358,творог полезн опасн свойств творог творог
16625,17287,ответ mail чем полез творог чем полез творог


In [383]:
# Persist the stemmed titles. easy_launch() reads this exact path, so this cell
# has to be run before easy_launch() even though it appears later in the notebook.
titles.to_csv('./data/unversal_table.csv', index=False)