In [1]:
import pandas as pd
import numpy as np
import re
import sklearn.metrics as metrics

In [2]:
from scipy import sparse
from functools import reduce
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split, cross_validate, KFold, cross_val_score, GroupKFold
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from catboost import CatBoostClassifier
from sklearn.preprocessing import StandardScaler

In [3]:
def get_df():
    
    titles_df = pd.read_csv('./data/docs_titles.tsv/docs_titles.tsv', sep='\t')
    docs_id_test = pd.read_csv('./data/test_groups.csv', sep=',')
    docs_id_train = pd.read_csv('./data/train_groups.csv', sep=',')

    info = pd.concat([docs_id_train, docs_id_test])
    info.reset_index(drop=True)

    titles = pd.merge(titles_df, info[['group_id', 'doc_id', 'target']], on='doc_id', how='inner')
    titles['title'] += ' '
    titles['title'].fillna(' ', inplace=True)
    
    return titles

In [4]:
def cleaner(text):
    
    text = re.sub(r'\W', ' ', text)
    text = re.sub('\s+', ' ', text)
    text = text.lower()
    
    return text

In [5]:
def cleaning(title, group_num, bad_words):
    
#     print(title)
    title = title.lower()
    
    title = re.sub(r'\W', '  ', title)
    
    for i in bad_words[group_num-1]:
            title = title.replace(' ' + i + ' ', ' ')
    title = re.sub(r'\s+', ' ', title)
    
    return title

In [6]:
def bad_words_cleaner(bad_words, table_name='./data/unversal_table_with_bad_words.csv' ):
                            #dict: {group_1 : bad_words_1,
                            #       .....................
                            #       group_n : bad_words_n}     

#     print(len(df))
    
    df = pd.read_csv(table_name)
    df.fillna('', inplace=True)
    df['title'] = df[['title', 'group_id']].apply(lambda x: cleaning(*x, bad_words), axis=1)
    
    return df

In [7]:
class Porter:
    PERFECTIVEGROUND =  re.compile(u"((ив|ивши|ившись|ыв|ывши|ывшись)|((?<=[ая])(в|вши|вшись)))$")
    REFLEXIVE = re.compile(u"(с[яь])$")
    ADJECTIVE = re.compile(u"(ее|ие|ые|ое|ими|ыми|ей|ий|ый|ой|ем|им|ым|ом|его|ого|ему|ому|их|ых|ую|юю|ая|яя|ою|ею)$")
    PARTICIPLE = re.compile(u"((ивш|ывш|ующ)|((?<=[ая])(ем|нн|вш|ющ|щ)))$")
    VERB = re.compile(u"((ила|ыла|ена|ейте|уйте|ите|или|ыли|ей|уй|ил|ыл|им|ым|ен|ило|ыло|ено|ят|ует|уют|ит|ыт|ены|ить|ыть|ишь|ую|ю)|((?<=[ая])(ла|на|ете|йте|ли|й|л|ем|н|ло|но|ет|ют|ны|ть|ешь|нно)))$")
    NOUN = re.compile(u"(а|ев|ов|ие|ье|е|иями|ями|ами|еи|ии|и|ией|ей|ой|ий|й|иям|ям|ием|ем|ам|ом|о|у|ах|иях|ях|ы|ь|ию|ью|ю|ия|ья|я)$")
    RVRE = re.compile(u"^(.*?[аеиоуыэюя])(.*)$")
    DERIVATIONAL = re.compile(u".*[^аеиоуыэюя]+[аеиоуыэюя].*ость?$")
    DER = re.compile(u"ость?$")
    SUPERLATIVE = re.compile(u"(ейше|ейш)$")
    I = re.compile(u"и$")
    P = re.compile(u"ь$")
    NN = re.compile(u"нн$")

    def stem(string):
        
        changed = ''
        
        string = string.lower()

        for word in string.split():
            if not word.isdigit():
                word = word.replace(u'ё', u'е')
                m = re.match(Porter.RVRE, word)

                if m and m.groups():
                    pre = m.group(1)
                    rv = m.group(2)
                    temp = Porter.PERFECTIVEGROUND.sub('', rv, 1)
                    if temp == rv:
                        rv = Porter.REFLEXIVE.sub('', rv, 1)
                        temp = Porter.ADJECTIVE.sub('', rv, 1)
                        if temp != rv:
                            rv = temp
                            rv = Porter.PARTICIPLE.sub('', rv, 1)
                        else:
                            temp = Porter.VERB.sub('', rv, 1)
                            if temp == rv:
                                rv = Porter.NOUN.sub('', rv, 1)
                            else:
                                rv = temp
                    else:
                        rv = temp

                    rv = Porter.I.sub('', rv, 1)

                    if re.match(Porter.DERIVATIONAL, rv):
                        rv = Porter.DER.sub('', rv, 1)

                    temp = Porter.P.sub('', rv, 1)
                    if temp == rv:
                        rv = Porter.SUPERLATIVE.sub('', rv, 1)
                        rv = Porter.NN.sub(u'н', rv, 1)
                    else:
                        rv = temp
                    word = pre+rv

            changed += word + ' '

        return changed

In [8]:
def features_create(mode, groups_titledata):
    
    if mode == 'train':
        y = []      
    X = []
    groups_train = []
    
    for new_group in groups_titledata:
        docs = groups_titledata[new_group]
        
        for k, info in enumerate(docs):
            
            doc_id = info[0]
            title = info[1]
            
            if mode == 'train':
                target_id = info[2]
                y.append(target_id)
                
            groups_train.append(new_group)
            all_dist = []
            words = set(title.strip().split())
            
            for j in range(0, len(docs)):
                if k == j:
                    continue
                info = docs[j]
                doc_id_j = info[0]
                title_j = info[1]

                words_j = set(title_j.strip().split())
                all_dist.append(len(words.intersection(words_j)))
                
            X.append(sorted(all_dist, reverse=True)[0:25])
            
    X = np.array(X)
    
    if mode == 'train':
        y = np.array(y)
    
    groups_train = np.array(groups_train)

    if mode == 'train':
        print(X.shape, y.shape, groups_train.shape)
        return X, y, groups_train
    else:
        print(X.shape, groups_train.shape)
        return X, groups_train

In [9]:
def title_info_dict(mode, doc_to_title):
    
    data = pd.read_csv('./data/{}_groups.csv'.format(mode))

    titledata = {}
    
    for i in range(len(data)):
        
        new_doc = data.iloc[i]
        doc_group = new_doc['group_id']
        doc_id = new_doc['doc_id']
            
        title = doc_to_title[doc_id]
        
        if doc_group not in titledata:
            titledata[doc_group] = []
            
        if mode == 'train':
            titledata[doc_group].append((doc_id, title, new_doc['target']))
        else:
            titledata[doc_group].append((doc_id, title))
        
    return titledata

In [10]:
def tuple_x(a1, a2):
    return a1, a2

In [11]:
def easy_launch():
    
    doc_to_title = {}
    with open('./data/unversal_table.csv') as f:
        for num_line, line in enumerate(f):
            if num_line == 0:
                continue

            line = line.replace('\t', ',')
            data = line.strip().split(',')

            doc_id = int(data[0])
            if len(data) == 1:
                title = ''
            else:
                title = data[1]
            doc_to_title[doc_id] = title
            
    print('doc titles dict len = {}'.format(len(doc_to_title)))
    
    train_titledata = title_info_dict('train', doc_to_title)
    test_titledata = title_info_dict('test', doc_to_title)
    
    X_train, y_train, groups_train = features_create('train', train_titledata)
    X_test, groups_test = features_create('test', test_titledata)
    
    return X_train, y_train, X_test, groups_train, groups_test

In [12]:
def flatten(x):
    
    result = []
    for elem in x:
        if hasattr(elem, "__iter__") and not isinstance(elem, str):
            result.extend(flatten(elem))
        else:
            result.append(elem)
            
    return result

In [13]:
def list_concat(list1, list2):
    
    len1 = len(list1)
    len2 = len(list2)
    
    return [[list1[i], list2[j]] for i in range(len1) for j in range(len2)]

In [14]:
def combinations(params):
    
    list_ = []
    for value in params.values():
        list_.append(value)

    tmp = reduce(lambda x, y: list_concat(x, y), list_)

    res = []
    for elem in tmp:
        param_list = flatten(elem)
        param_dict = dict(zip(params.keys(), param_list))
        res.append(param_dict)
        
    return res

In [15]:
def frange(start, stop, step):
    i = start
    while i < stop:
        yield i
        i += step

In [16]:
def validation(X_train, train_target, model, params, folds_gen_func, groups_num=10, thresholds=[0.27], **kwargs):
    
    scaler = StandardScaler()
    scaler.fit(X_train) 
    
    main_res = []
    for param_set in combinations(params):
        
        print(param_set)        
        exact_model = model(**param_set) 
        
        fold_generator = folds_gen_func(groups_num)
        
        for th in thresholds:
    #         th = 0.27
            res = []
            for train_index, test_index in fold_generator.split(X_train, train_target, **kwargs):

                exact_model.fit(scaler.transform(X_train[train_index]), train_target[train_index])

                y_pred = [0 if val < th else 1 for val in exact_model.predict_proba(scaler.transform(X_train[test_index]))[:,1]]

                score = metrics.f1_score(train_target[test_index],\
                                                y_pred)
    #               print('threshold = {}, score = {}'.format(th, score))

                res.append(score)
        #                                       exact_model.predict(scaler.transform(X_train[test_index]))))


            mean = sum(res)/len(res)
            print(mean)
#             print('threshold = {}, score = {}'.format(th, mean))
            main_res.append((mean, param_set, th))
    
    best = main_res[np.argmax([res[0] for res in main_res])]
    print('--------max-------')
    print(best)
    
    return best

In [17]:
# Сохраняет решение

def save_submission(y_pred):

    data = pd.read_csv('data/test_groups.csv')
    print('len data = ', len(data))
    data['target'] = y_pred
    
    data = data.drop(['group_id', 'doc_id'], axis=1)

    data.to_csv("submission.csv", index=False)
    
    info = np.unique(data['target'], return_counts=True)
    
    if info[0].shape[0] > 1:
        
        print('0: {}, 1: {}'.format(info[1][0], info[1][1]))
        if info[1][1] > 6000 or info[1][1] < 2500:
            print('Your submisson is shit')
#         elif info[1][1] > 4500:
#             print('Your submisson is probably shit')
    else:
        print('There are only {} in submission'.format(info[0][0]))
        
    return data

In [18]:
def predict(X_train, X_test, train_target, model, scaler=None, **kwargs):
    
    curr_model = model(**kwargs)
    
    if scaler is not None:
        
        your_scaler = scaler()
        your_scaler.fit(X_train)
        X_train = your_scaler.transform(X_train)
        X_test = your_scaler.transform(X_test)
        
    curr_model.fit(X_train, train_target)
#     y_pred = curr_model.predict(X_test)
    th = 0.27
    y_pred = [0 if val < th else 1 for val in curr_model.predict_proba(X_test)[:,1]]
    return y_pred

In [19]:
def stemming_titles(df):
    stemming = Porter
    return df['title'].apply(lambda x: stemming.stem(x))

In [21]:
morph = pymorphy2.MorphAnalyzer()

NameError: name 'pymorphy2' is not defined

In [22]:
# Лемматизация 

def str_parser(words_): 
    global j 
    new_string = '' 

    for i in re.findall(r'\b[а-я]{1,20}\b', words_): 
        new_string += (morph.parse(i)[0].normal_form) + ' ' 

    for i in re.findall(r'\b[a-z]{1,20}\b', words_): 
        new_string += (lemmatizer.lemmatize(i)) + ' ' 

    j += 1 
    if(j % 1000 == 0): 
        print(j, '/28317 loaded') 

    return new_string

In [23]:
def lemmatization(df):
    
    return df['title'].apply(lambda x: str_parser(x))

In [24]:
def titles_extraction(apply_stemming=True, apply_lemmatization=True, table_name=None):
    
    if table_name is None:
        df_train = pd.read_csv('data/core_train.csv')
        df_test = pd.read_csv('data/core_test.csv')
        df_train.fillna(' ', inplace=True)
        df_test.fillna(' ', inplace=True)

        titles_df = pd.concat([df_train, df_test])

        titles = pd.DataFrame({'doc_id': titles_df['doc_id'] ,
                               'group_id': titles_df['group_id'],
                               'title': titles_df['title'] + titles_df['h1']})
    #                            'title': titles_df['h2'] + titles_df['h3'] + titles_df['a']})
    else:
        titles_df = pd.read_csv(table_name)
        titles_df.fillna(' ', inplace=True)
        titles = pd.DataFrame({'doc_id': titles_df['doc_id'] ,
                               'group_id': titles_df['group_id'],
                               'title': titles_df['body']})
    if apply_stemming:
        titles['title'] = stemming_titles(titles)
    if apply_lemmatization:
        titles['title'] = lemmatization(titles)
    
    titles = titles[['title', 'group_id']].groupby('group_id').sum()['title']

    titles = titles.apply(lambda x: cleaner(x))
    titles = titles.values

    return titles

In [25]:
def bad_words(all_titles, X, importance, threshold=0.015, group_number=None, ngram=(1,2)):
    
#     print(all_titles[group_number], group_number)
    if group_number is not None:
        
        print('group_number = ', group_number)
        
        group_titles = importance[group_number]
        group_titles = group_titles[0]
        
        new_dict = {v: k for k, v in X.vocabulary_.items()}
        
        array = np.where(np.bitwise_and(group_titles.toarray()!=0, group_titles.toarray()<threshold))[1]
#         print(len(group_titles.toarray()[np.where(np.bitwise_and(group_titles.toarray()!=0, group_titles.toarray()<threshold))]))
#         print('--------------',len([new_dict[index] for index in array]))
        return [new_dict[index] for index in array]
    
    else:

        group_bad_words = {group: bad_words(all_titles, importance, X, threshold, group, ngram) for group in range(len(all_titles))}

        return group_bad_words

In [26]:
one = all_titles[1]

NameError: name 'all_titles' is not defined

In [686]:
all_titles = titles_extraction(apply_stemming=False, apply_lemmatization=False, table_name='data/CORE.csv')

In [36]:
threshold = [0.007, 0.008]#, 0.011, 0.012, 0.013, 0.014, 0.015, 0.016, 0.02]
for t in threshold:
    print(t)
    X = TfidfVectorizer(min_df=5, max_df=0.7)
    importance = X.fit_transform(all_titles)
    words = bad_words(all_titles, X, importance, t, 1)
    print(words)
    print(np.unique(all_titles[1].split()).shape, np.unique(words).shape)

In [37]:
%%time
X = TfidfVectorizer()
importance = X.fit_transform(all_titles)
words = bad_words(all_titles, importance, X, 0.004)

In [678]:
processed_df = bad_words_cleaner(words)
processed_df

Unnamed: 0,doc_id,title,group_id
0,15731,ваз зам подшипник ступиц нив зам подшипник сту...,1
1,14829,ваз опт соч сравн цен куп потребительск товар ...,1
2,15764,куп ступиц лад трансмисс переходн ступиц цен з...,1
3,17669,классик learn center,1
4,14852,ступиц нив зам подшипник сво рук ступиц нив ка...,1
...,...,...,...
28312,16637,ответ полезн куша творог утр худет поправ обед...,309
28313,16759,творог полезн свойств лечен творог лечебн свой...,309
28314,15358,творог полезн свойств творог творог,309
28315,17287,ответ чем полезн творог чем полезн творог mail,309


In [679]:
df = pd.read_csv('./data/unversal_table_with_bad_words.csv')
df.fillna(' ', inplace=True)

info = pd.DataFrame()

info['processed'] = processed_df['title'].apply(lambda x: len(x.split()))
info['non processed'] = df['title'].apply(lambda x: len(x.split()))
info['differ'] = info['non processed'] - info['processed']

print(info['differ'].max())
info[info.differ == info['differ'].max()]
info

28199


Unnamed: 0,processed,non processed,differ
0,8,8,0
1,12,12,0
2,10,11,1
3,3,3,0
4,12,13,1
...,...,...,...
28312,18,20,2
28313,14,18,4
28314,5,6,1
28315,8,8,0


In [680]:
processed_df.to_csv('./data/unversal_table.csv', index=False)

In [643]:
df_train = pd.read_csv('data/core_train.csv')
df_test = pd.read_csv('data/core_test.csv')
df_train.fillna(' ', inplace=True)
df_test.fillna(' ', inplace=True)
titles_df = pd.concat([df_train, df_test])

In [38]:
titles = pd.DataFrame({'doc_id': titles_df['doc_id'] ,
                       'title': titles_df['title'] + titles_df['h1'],
#                        'title': titles_df['h2'] + titles_df['h3'] + titles_df['a'],
                       'group_id': titles_df['group_id']})

titles['title'] = stemming_titles(titles)
titles['title'] = lemmatization(titles)

# titles

In [646]:
titles.to_csv('./data/unversal_table.csv', index=False)

In [527]:
# processed_df['title'].values

In [484]:
def unique(x):
    unique_words = np.unique(x.split())
    text = str()
    for word in unique_words:
        text += word + ' '
    return text

In [485]:
processed_df['title_unique'] = processed_df['title'].apply(lambda x: unique(x))

In [487]:
processed_df.to_csv('./data/no_bad_words_table.csv', index=False)

In [435]:
X_train, y_train, X_test, groups_train, groups_test = easy_launch()

doc titles dict len = 28026
(11690, 25) (11690,) (11690,)
(16627, 25) (16627,)


In [436]:
params = {'learning_rate': [0.07, 0.08],
#           'n_estimators': [372, 374]}
          'n_estimators': [175, 180, 185]}
groups_train = pd.read_csv('data/train_groups.csv')['group_id']

In [437]:
best = validation(X_train, y_train, GradientBoostingClassifier, params, GroupKFold, groups=groups_train)

{'learning_rate': 0.07, 'n_estimators': 175}
0.6192471631923653
{'learning_rate': 0.07, 'n_estimators': 180}
0.6193823034312785
{'learning_rate': 0.07, 'n_estimators': 185}


KeyboardInterrupt: 

In [329]:
y_pred = predict(X_train, X_test, y_train, GradientBoostingClassifier, StandardScaler, **best[1])
data = save_submission(y_pred)

len data =  16627
0: 10613, 1: 6014
Your submisson is shit


In [521]:
    df = pd.read_csv('./data/unversal_table.csv')
    df.fillna(' ', inplace=True)
    corpus = df['title'].values
    
    vectorizer = TfidfVectorizer(**kwargs)
    X = vectorizer.fit_transform(corpus)
        
    X = X.toarray()
    for 
#     length = 0
    
#     for group_num in np.unique(df['group_id']):
        
#         group_length = len(df[df.group_id==group_num])
#         group_titles = X[length:length + group_length]
#         length += group_length
        
#         features = count_distances(group_titles)

#         np.save('group_features/{}'.format(group_num), features)
#         print('Скачалась группа:', group_num)

In [703]:
def stacking(X_train, X_test, train_target, models, params, folds_gen_func, groups_num=10, thresholds=[0.27], **kwargs):
    
    scaler = StandardScaler()
        
    main_res_train = np.zeros(shape=(train_target.shape, 1))
    main_res_test = np.zeros(shape=(X_test.shape[0], 1))
    
    for model in models:
        
        print(params[model])        
        exact_model = model(params[model]) 
        
        fold_generator = folds_gen_func(groups_num)
        
        res_train = np.zeroes(train_target.shape)
        res_test = np.zeroes(X_test.shape[0])
        
        for train_index, test_index in fold_generator.split(X_train, train_target, **kwargs):
            
            scaler.fit(X_train)
            exact_model.fit(scaler.transform(X_train[train_index]), train_target[train_index])

            y_pred_train = exact_model.predict_proba(scaler.transform(X_train[test_index]))
            res_train[test_index] = y_pred_train 
            
            scaler.fit(X_test)
            y_pred_test = exact_model.predict_proba(scaler.transform(X_test))

            res_test += y_pred_test
            
        res_test /= groups_num
        main_res_train = np.hstack((main_res_train, res_train[:, np.newaxis]))
        main_res_test = np.hstack((main_res_test, res_test[:, np.newaxis]))   

    return main_res_train, main_res_test

In [709]:
models = [GradientBoostingClassifier, RandomForestClassifier, KNeighborsClassifier]

In [710]:
params = {}
params[GradientBoostingClassifier] = {'learning_rate': [0.06],
                                      'n_estimators' : [150]}
params[RandomForestClassifier] = {'n_estimators': [1300], 
                                  'criterion': ['gini'], 
                                  'max_depth': [7], 
                                  'n_jobs': [-1]}
params[KNeighborsClassifier] = {'algorithm': ['ball_tree'],
                                'leaf_size': [10],
                                'n_neighbors': [75],
                                'p': [1],
                                'n_jobs': [-1]}

In [711]:
stacking(X_train, X_test, train_target, models, params, GroupKFold, groups=groups_train)

{'learning_rate': [0.06], 'n_estimators': [150]}

In [698]:
(X_train, y_train, RandomForestClassifier, params, GroupKFold, groups=groups_train

In [699]:
res

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [700]:
a = np.array([1, 2, 3])

In [701]:
res[[5,7,8]] = a

In [702]:
res

array([0., 0., 0., 0., 0., 1., 0., 2., 3., 0.])

In [713]:
a = np.array([[1, 2, 3, 4, 5],
              [6, 7, 8, 9, 0]])

In [715]:
np.mean(a, axis=1)[:, np.newaxis]

array([[3.],
       [6.]])

In [39]:
# np.hstack((a, np.mean(a, axis=1)[:, np.newaxis], np.std(a, axis=1)[:, np.newaxis], np.median(a, axis=1)[:, np.newaxis]))

In [719]:
main_df = pd.read_csv('data/CORE.csv')

In [720]:
body = main_df['body']

In [784]:
word, num = np.unique(body[100].split(), return_counts=True)

In [785]:
sorted(list(zip(word, num)), key=lambda pair: pair[1], reverse=True)[:7]

[('ремонт', 33),
 ('подшипник', 28),
 ('свой', 26),
 ('мост', 25),
 ('шестерня', 24),
 ('ваза', 23),
 ('передний', 23)]

In [786]:
def body_change(text):
    global j
    word, num = np.unique(text.split(), return_counts=True)
    words = sorted(list(zip(word, num)), key=lambda pair: pair[1], reverse=True)[:7]
    text = ' '.join(list(map(lambda x: x[0], words)))
    j+=1 
    print(j)
    return text

In [40]:
# titles_df = pd.DataFrame()
# j=0
# titles_df['body'] = main_df['body'].apply(lambda x: body_change(x))

In [788]:
titles = pd.DataFrame({'doc_id': main_df['doc_id'] ,
                       'title': titles_df['body'],
#                        'title': titles_df['title'] + titles_df['h1'],
#                        'title': titles_df['h2'] + titles_df['h3'] + titles_df['a'],
                       'group_id': main_df['group_id']})
titles.to_csv('./data/unversal_table.csv', index=False)

In [791]:
main_df

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Unnamed: 0.1.1,title,h1,strong,body,group_id,doc_id
0,0,0,0,ваза зам подшипник ступица нива,зам подшипник ступица,разборк передна тормозна механизм проверк регу...,ваза замена подшипник ступица нива автомануал ...,1,15731
1,1,1,1,ваза опт соч сравна цена купа потребительск то...,ваза опт соч,,ваза оптом сочи сравнить цена купить потребите...,1,14829
2,2,2,2,купа ступица лада калин трансмисс переходна ст...,ступица лада калин,лада калин предложный,купить ступица лада калин трансмиссия переходн...,1,15764
3,3,3,3,классик,learn center,номенклатура цена,классика главный продукция интернет магазин до...,1,17669
4,4,4,4,ступица нива зам подшипник сво рука,ступица нива как провест ремонт замена подшипник,нива шеврол,ступица нива замена подшипник свой рука контак...,1,14852
...,...,...,...,...,...,...,...,...,...
28312,28312,28312,28312,ответ полезна куш творог утро есл худо поправт...,полезна куш творог утро есл худо поправть обед...,лучш ответ остальна ответ похожий вопрос такж ...,ответ полезно кушать творог утро если худеть п...,309,16637
28313,28313,28313,28313,творог полезна свойство леченый творог женск с...,лечебн свойство продукт буква творог полезна с...,,творог полезный свойство лечение творог женски...,309,16759
28314,28314,28314,28314,творог полезна опасна свойство творог,творог,,творог полезный опасный свойство творог присое...,309,15358
28315,28315,28315,28315,ответ чем полезть творог mail,чем полезть творог,лучш ответ остальна ответ похожий вопрос такж ...,ответ чем полезный творог проектывсё категория...,309,17287


In [41]:

# j=0
# titles_df['strong'] = main_df['strong'].apply(lambda x: body_change(x))

In [793]:
titles_df['strong']

0        снятой зазор опора передна подшипник проверк р...
1                                                         
2                                    калин лада предложный
3                                        номенклатура цена
4                                              нива шеврол
                               ...                        
28312      ответ вопрос лучш остальна похожий спрашив такж
28313                                                     
28314                                                     
28315      ответ вопрос лучш остальна похожий спрашив такж
28316      годност изготовитеть срок цена сутки плохой сыр
Name: strong, Length: 28317, dtype: object

In [819]:
titles_df['strong len'] = main_df['strong'].apply(lambda x: len(x.split()))

In [42]:
# titles_df['len']

In [187]:
def img_tags_counter(doc_id):
    count = 0
    read_file = open('./content/' + str(doc_id) + '.dat', 'r', encoding = 'utf-8')
    text = read_file.read()
    pattern = re.findall(r'<font color=', text)
    return len(pattern)

def link_tags_counter(doc_id):
    count = 0
    read_file = open('./content/' + str(doc_id) + '.dat', 'r', encoding = 'utf-8')
    text = read_file.read()
    pattern = re.findall(r'<a href=', text)
    return len(pattern)

In [121]:
train = pd.read_csv('./data/train_groups.csv')

In [188]:
train['br_num'] = train['doc_id'].apply(lambda x: img_tags_counter(x))
# train['href_num'] = train['doc_id'].apply(lambda x: link_tags_counter(x))

In [43]:
# np.unique(train[train.target == 1]['br_num'], return_counts=True)

In [190]:
train[train.target == 0]['br_num'].mean()

4.5515668147436665

In [191]:
train[train.target == 1]['br_num'].median()

0.0

In [192]:
train[train.target == 0]['br_num'].median()

0.0

In [44]:
# train

In [799]:
titles_df['doc_id'] = main_df['doc_id']

In [820]:
titles_df_new = pd.merge(titles_df, train[['doc_id', 'target']], on='doc_id', how='inner')

In [821]:
titles_df_new['pic_num'] = titles_df_new['doc_id'].apply(lambda x: )
titles_df_new['pic_num'] = titles_df_new['doc_id'].apply(lambda x: )

Unnamed: 0,body,strong,len,doc_id,strong len,target
0,подшипник ступица кулак поворотный снятие коль...,снятой зазор опора передна подшипник проверк р...,579,15731,26,0
1,отзыв ваза показать номер заказ цена rub,,1535,14829,0,0
2,лада ступица передний апрель сбор оригинал нал...,калин лада предложный,1129,15764,3,0
3,ваза тольятти дааз вис сбор задний левый,номенклатура цена,2582,17669,2,0
4,подшипник ступица нива колесо передний ремонт ...,нива шеврол,684,14852,2,0
...,...,...,...,...,...,...
11926,время дата сообщение ответ что я весь,,15067,26672,0,0
11927,http url bitbon geschrieben von com news,,28082,25838,0,0
11928,что я быть это весь для ты,,15230,25703,0,0
11929,шурыгин диана goo youtube говорят канал пусть,говор диана пустой шурыгин prodeundi весь мате...,6503,27885,13,0


In [842]:
titles_df_new[titles_df_new.target == 1]['len'].median()

855.0

In [843]:
titles_df_new[titles_df_new.target == 0]['len'].median()

2669.0

In [844]:
titles_df_new[titles_df_new.target == 1]['len'].mean()

1201.6268126664693

In [845]:
titles_df_new[titles_df_new.target == 0]['len'].mean()

11022.10874649205

In [827]:
titles_df

Unnamed: 0,body,strong,len,doc_id,strong len
0,подшипник ступица кулак поворотный снятие коль...,снятой зазор опора передна подшипник проверк р...,579,15731,26
1,отзыв ваза показать номер заказ цена rub,,1535,14829,0
2,лада ступица передний апрель сбор оригинал нал...,калин лада предложный,1129,15764,3
3,ваза тольятти дааз вис сбор задний левый,номенклатура цена,2582,17669,2
4,подшипник ступица нива колесо передний ремонт ...,нива шеврол,684,14852,2
...,...,...,...,...,...
28312,год назад творог полезно mail вопрос есть,ответ вопрос лучш остальна похожий спрашив такж,400,16637,8
28313,творог для сайт полезный свойство inmoment весь,,771,16759,0
28314,творог при еда молоко он продукт свойство,,1187,15358,0
28315,творог для кальций полезный год он белка,ответ вопрос лучш остальна похожий спрашив такж,945,17287,8


In [25]:
from bs4 import BeautifulSoup

In [26]:
def meta_tag(doc_id):
    
    result = str()
    file_ = open('./content/' + str(doc_id) + '.dat', 'r', encoding = 'utf-8')
    text_ = file_.read()
    soup = BeautifulSoup(text_, 'html')
    
    for i in soup.find_all('meta', attrs = {'name' : 'Keywords'}):
        if i.find_all('content'):
            if i.attrs['content'] == '':
                
    for i in soup.find_all('meta', attrs = {'name' : 'keywords'}):
        if i.find_all('content'):
            result += i.attrs['content'] + ' '
    
    if result != '':
        return 1
    else:
        return 0

In [33]:
def meta_tag(doc_id):
    global j 
    
    result = str()
    file_ = open('./content/' + str(doc_id) + '.dat', 'r', encoding = 'utf-8')
    text_ = file_.read()
    soup = BeautifulSoup(text_, 'html')
    res = 0
    
    for i in soup.find_all('meta', attrs = {'name' : 'Keywords'}):
        if i.find_all('content'):
            result += i.attrs['content'] + ' '
    for i in soup.find_all('meta', attrs = {'name' : 'keywords'}):
        if i.find_all('content'):
            result += i.attrs['content'] + ' '
    
#     result = result.lower()
#     new_string = str()
    
#     result = steming.stem(result)
    
    if re.search(r'\b[а-я]{1,20}\b', result): 
#         new_string += (morph.parse(i)[0].normal_form) + ' ' 
        res = 1
    
    if re.search(r'\b[a-z]{1,20}\b', result):
#         new_string += (lemmatizer.lemmatize(i)) + ' ' 
        res = 1
    j+=1
    print(j)
    if res:
        return 1
    else:
        return 0

In [27]:
df = pd.read_csv('./data/unversal_table.csv')

In [45]:
# j=0
# df['keyw'] = df['doc_id'].apply(lambda x: meta_tag(x))
# # df['href_num'] = df['doc_id'].apply(lambda x: link_tags_counter(x))

In [861]:
# df = pd.merge(df, titles_df[['doc_id', 'len']], on='doc_id', how='left')

In [862]:
# df['len'] = titles_df['len']

In [66]:
df['pic_num'] = train['pic_num']
df['href_num'] = train['href_num']
df['b_num'] = train['b_num']

In [128]:
df.to_csv('./data/unversal_table.csv', index=False)

In [105]:
df

Unnamed: 0,doc_id,title,group_id,len,pic_num,href_num,b_num,br_num
0,15731,ваз зам подшипник ступиц нив зам подшипник сту...,1,579,32.0,17.0,7,6
1,14829,ваз опт соч сравн цен куп потребительск товар ...,1,1535,56.0,273.0,4,1
2,15764,куп ступиц лад трансмисс переходн ступиц цен з...,1,1129,43.0,92.0,14,13
3,17669,классик learn center,1,2582,6.0,26.0,2,1
4,14852,ступиц нив зам подшипник сво рук ступиц нив ка...,1,684,11.0,67.0,9,6
...,...,...,...,...,...,...,...,...
28312,16637,ответ полезн куша творог утр есл худет поправ ...,309,400,,,53,32
28313,16759,творог полезн свойств лечен творог женск сайт ...,309,771,,,18,12
28314,15358,творог полезн опасн свойств творог творог,309,1187,,,5,4
28315,17287,ответ чем полезн творог чем полезн творог mail,309,945,,,83,63
