In [1]:
import pandas as pd
import numpy as np
import re

In [550]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from scipy.spatial.distance import cdist
from bs4 import BeautifulSoup
from  sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import StandardScaler

In [3]:
def cleaner(text):
    
    text = re.sub(r'\W', ' ', text)
    text = re.sub('\s+', ' ', text)
    text = text.lower()
    
    return text

In [4]:
def titles_extraction():
    
    titles_df = pd.read_csv('./data/docs_titles.tsv/docs_titles.tsv', sep='\t')
    docs_id_test = pd.read_csv('./data/test_groups.csv', sep=',')
    docs_id_train = pd.read_csv('./data/train_groups.csv', sep=',')

    info = pd.concat([docs_id_train, docs_id_test])
    info.reset_index(drop=True)

    titles = pd.merge(titles_df, info[['group_id', 'doc_id', 'target']], on='doc_id', how='inner')
    titles['title'] += ' '
    titles['title'].fillna(' ', inplace=True)

    titles = titles[['title', 'group_id']].groupby('group_id').sum()['title']

    titles = titles.apply(lambda x: cleaner(x))
    titles = titles.values

    return titles

In [65]:
all_titles = titles_extraction()
# all_titles

In [451]:
def bad_words(all_titles, threshold=0.015, group_number=None, ngram=(1,2)):
    
    X = TfidfVectorizer(ngram_range = ngram)
    importance = X.fit_transform(all_titles)
#     print(all_titles[group_number], group_number)
    if group_number is not None:
        
        group_titles = importance[group_number]
        group_titles = group_titles[0]
        
        new_dict = {v: k for k, v in X.vocabulary_.items()}
        
        array = np.where(np.bitwise_and(group_titles.toarray()!=0, group_titles.toarray()<threshold))[1]
#         print(len(group_titles.toarray()[np.where(np.bitwise_and(group_titles.toarray()!=0, group_titles.toarray()<threshold))]))
#         print('--------------',len([new_dict[index] for index in array]))
        return [new_dict[index] for index in array]
    
    else:

        group_bad_words = {group: bad_words(all_titles, threshold, group, ngram) for group in range(len(all_titles))}

        return group_bad_words

In [452]:
threshold = [0.01, 0.011, 0.012, 0.013, 0.014, 0.015, 0.016, 0.02]
for t in threshold:
    print(t)
    print(bad_words(all_titles, t, 1))

0.01
['ru –∫–∞–∫', '–∞—Ä—Ö–∏–≤ —Å—Ç—Ä–∞–Ω–∏—Ü–∞', '–±–µ–∑', '–±–ª–æ–≥', '–∑–∞', '–∑–∞–ø–∏—Å–∏', '–∏–∑', '–∏–ª–∏', '—Å–∞–π—Ç']
0.011
['net', 'ru –∫–∞–∫', '–∞—Ä—Ö–∏–≤ —Å—Ç—Ä–∞–Ω–∏—Ü–∞', '–±–µ–∑', '–±–ª–æ–≥', '–≤—Å—ë', '–≥—Ä—É–ø–ø—ã', '–¥–ª—è', '–∑–∞', '–∑–∞–ø–∏—Å–∏', '–∏–∑', '–∏–ª–∏', '–º–∏—Ä', '–ø—Ä–∞–≤–∏–ª—å–Ω–æ', '–ø—Ä–æ', '—Å–∞–π—Ç']
0.012
['12', 'net', 'ru –∫–∞–∫', '–∞—Ä—Ö–∏–≤ —Å—Ç—Ä–∞–Ω–∏—Ü–∞', '–±–µ–∑', '–±–ª–æ–≥', '–≤—Å—ë', '–≥–ª–∞–≤–∞', '–≥—Ä—É–ø–ø—ã', '–¥–µ—Ç–µ–π', '–¥–ª—è', '–∑–∞', '–∑–∞–ø–∏—Å–∏', '–∑–∞–ø–∏—Å–∏ —Ä—É–±—Ä–∏–∫–µ', '–∏–∑', '–∏–ª–∏', '–∫–∞–∫ –ø—Ä–∞–≤–∏–ª—å–Ω–æ', '–∫–æ–≥–¥–∞', '–º–∏—Ä', '–ø–æ–º–æ—â—å', '–ø—Ä–∞–≤–∏–ª—å–Ω–æ', '–ø—Ä–æ', '—Ä—É–±—Ä–∏–∫–µ', '—Å–∞–π—Ç', '—Å–æ–æ–±—â–µ–Ω–∏—è', '—Ü–µ–Ω—Ç—Ä']
0.013
['12', '14', '20', '2010', 'net', 'ru –∫–∞–∫', '–∞—Ä—Ö–∏–≤ —Å—Ç—Ä–∞–Ω–∏—Ü–∞', '–±–µ–∑', '–±–ª–æ–≥', '–≤—Å—ë', '–≥–∞–∑–µ—Ç–∞', '–≥–ª–∞–≤–∞', '–≥—Ä—É–ø–ø—ã', '–≥—Ä—É–ø–ø—ã –≤–∫–æ–Ω—Ç–∞–∫—Ç–µ', '–¥–µ—Ç–µ–π', '–¥–ª—è', '–∑–∞', '–∑–∞–ø–∏—Å–∏', '–∑–∞–ø–∏—Å–∏ —Ä—É–±—Ä–∏–∫–

In [37]:
corpus = [
        'This is the first document.',
        'This document is the second document.',
        'And this is the third one.',
        'Is this the first document?',
         ]

In [62]:
%%time

bad_words(corpus, 0.35)

CPU times: user 12.7 ms, sys: 2.92 ms, total: 15.6 ms
Wall time: 13.1 ms


{0: ['document', 'is', 'is the', 'the', 'this'],
 1: ['is', 'is the', 'the', 'this'],
 2: ['is', 'is the', 'the', 'this', 'this is'],
 3: ['document', 'first', 'first document', 'is', 'the', 'the first', 'this']}

In [232]:
%%time
words = bad_words(all_titles, 0.013)
# list(all_titles)
# –ø–ª–æ—Ö–∏–µ —Å–ª–æ–≤–∞!!!

CPU times: user 4min 34s, sys: 9.61 s, total: 4min 43s
Wall time: 4min 44s


In [551]:
# for i in range(20):
#     print(len(np.unique(all_titles[i].split())), len(words[i]))

In [552]:
titles_df = pd.read_csv('./data/docs_titles.tsv/docs_titles.tsv', sep='\t')
docs_id_test = pd.read_csv('./data/test_groups.csv', sep=',')
docs_id_train = pd.read_csv('./data/train_groups.csv', sep=',')
print(len(docs_id_test) , len(docs_id_train))
info = pd.concat([docs_id_train, docs_id_test])
info.reset_index(drop=True)
print(len(info), len(titles_df))
titles = pd.merge(titles_df, info[['group_id', 'doc_id', 'target']], on='doc_id', how='right')
titles['title'].fillna(' ', inplace=True)
print(len(titles))
# titles[titles.title==' ']
# titles['group_id'].fillna(-1, inplace=True)
# titles[titles.group_id == -1 ]

16627 11690
28317 27950
28317


In [174]:
def group_titles_list_generator(df):
    
    for i in np.sort(df['group_id'].unique()):
        
        group_df = df[df.group_id == i]
        
        yield group_df

In [175]:
# –†–∞—Å—Å—á–µ—Ç —Ä–∞—Å—Å—Ç–æ—è–Ω–∏–π –ø–æ –º–∞—Ç—Ä–∏—Ü–µ –≤—Å—Ç—Ä–µ—á–∞–µ–º–æ—Å—Ç–∏

def count_distances(docs):
    
    distance = cdist(docs, docs, 'cityblock')
    
    res = np.asarray([np.concatenate((vec[:num],vec[num+1:])).astype(int) for num, vec in enumerate(distance)])
    res = np.sort(res)[:, :20]
#     res = np.flip(res, axis=1)[:, :20]
    
    return res

In [213]:
# –†–∞—Å—Å—á–∏—Ç—ã–≤–∞–µ—Ç —Ä–∞—Å—Å—Ç–æ—è–Ω–∏—è –º–µ–∂–¥—É –¥–æ–∫—É–º–µ–Ω—Ç–∞–º–∏ –±–µ—Ä–µ—Ç 20 –Ω–∞–∏–º–µ–Ω—å—à–∏—Ö –∏ —Å–æ—Ö—Ä–∞–Ω—è–µ—Ç numpy ndarray –≤ —Ñ–∞–π–ª

def features_save(titles, vec_type=1):

    for group_titles in group_titles_list_generator(titles):
        
        group_num = group_titles['group_id'].unique()[0]
        group_titles = list(group_titles['title'])

        if vec_type == 1:
            vectorizer = CountVectorizer()
            X = vectorizer.fit_transform(group_titles)

        elif vec_type == 2:
            vectorizer2 = CountVectorizer()
            X = vectorizer2.fit_transform(group_titles)
        
        features = count_distances(X.toarray())
#         print('group number = {}, differ = {}'.format(group_num, len(group_titles) - features.shape[0]))

        np.save('group_features/{}'.format(group_num), features)
        print('Group number {} saved'.format(group_num))
    return features

In [214]:
def all_group_feature_list(start_group, finish_group):
    
    res = np.load('group_features/{}.npy'.format(start_group))
  
    for group_num in range(start_group + 1, finish_group + 1):
#         res += ndarray_to_list(np.load('group_features/{}.npy'.format(group_num)))
        res = np.vstack((res, np.load('group_features/{}.npy'.format(group_num))))
        
    return res

In [183]:
test = group_titles_list_generator(titles)

In [196]:
titles_df = pd.read_csv('./data/docs_titles.tsv/docs_titles.tsv', sep='\t')
len(titles_df), len(titles)

(27950, 28241)

In [215]:
features_save(titles)

Group number 1 saved
Group number 2 saved
Group number 3 saved
Group number 4 saved
Group number 5 saved
Group number 6 saved
Group number 7 saved
Group number 8 saved
Group number 9 saved
Group number 10 saved
Group number 11 saved
Group number 12 saved
Group number 13 saved
Group number 14 saved
Group number 15 saved
Group number 16 saved
Group number 17 saved
Group number 18 saved
Group number 19 saved
Group number 20 saved
Group number 21 saved
Group number 22 saved
Group number 23 saved
Group number 24 saved
Group number 25 saved
Group number 26 saved
Group number 27 saved
Group number 28 saved
Group number 29 saved
Group number 30 saved
Group number 31 saved
Group number 32 saved
Group number 33 saved
Group number 34 saved
Group number 35 saved
Group number 36 saved
Group number 37 saved
Group number 38 saved
Group number 39 saved
Group number 40 saved
Group number 41 saved
Group number 42 saved
Group number 43 saved
Group number 44 saved
Group number 45 saved
Group number 46 sav

array([[ 4,  4,  4, ...,  6,  6,  6],
       [ 4,  5,  5, ...,  9,  9,  9],
       [ 0,  0,  0, ...,  4,  4,  4],
       ...,
       [ 2,  2,  3, ...,  7,  7,  7],
       [ 0,  0,  2, ...,  7,  7,  7],
       [ 0,  0,  0, ..., 13, 13, 13]])

In [192]:
len(titles)

28241

In [238]:
def get_df():
    
    titles_df = pd.read_csv('./data/docs_titles.tsv/docs_titles.tsv', sep='\t')
    docs_id_test = pd.read_csv('./data/test_groups.csv', sep=',')
    docs_id_train = pd.read_csv('./data/train_groups.csv', sep=',')

    info = pd.concat([docs_id_train, docs_id_test])
    info.reset_index(drop=True)

    titles = pd.merge(titles_df, info[['group_id', 'doc_id', 'target']], on='doc_id', how='right')

    titles['title'].fillna(' ', inplace=True)
    
    return titles

In [239]:
def cleaning(title, group_num, bad_words):
    
    title = title.lower()
    title = re.sub(r'\W', '  ', title)
    
    for i in bad_words[group_num-1]:
            title = title.replace(' ' + i + ' ', ' ')
    title = re.sub(r'\s+', ' ', title)
    
    return title

In [244]:
def bad_words_cleaner(bad_words):
                            #dict: {group_1 : bad_words_1,
                            #       .....................
                            #       group_n : bad_words_n}     
    df = get_df()
    print(len(df))
    df['title'] = df[['title', 'group_id']].apply(lambda x: cleaning(*x, bad_words), axis=1)
    
    return df

In [553]:
# processed_df = bad_words_cleaner(words)
# processed_df[processed_df.target is not NaN]

In [554]:
# features_save(processed_df)

In [563]:
def upload_mised_titles():
    
    df = get_df()
    df['title'].fillna(' ', inplace=True)
    docs_id = df['doc_id'][df.title == ' '].unique()
    df = pd.read_csv('./data/docs_titles.tsv/docs_titles.tsv', sep='\t')
    df
    titles = []
    for i in docs_id:

        file = open('./content/' + str(i) + '.dat', 'r', encoding = 'utf-8')
        text = file.read()
        soup = BeautifulSoup(text, 'html')

        words = ''
        for title in soup.find_all(['title']):
            words += (title.text + ' ')
        titles.append(words)

    new_df = pd.DataFrame({'title': list(df['title']) + titles,
                           'doc_id': list(df['doc_id']) + list(docs_id)})
    
    new_df['title'].fillna(' ', inplace=True)
    shit = new_df['doc_id'][new_df.title == ' '].unique()
    
    docs_id_test = pd.read_csv('./data/test_groups.csv', sep=',')
    docs_id_train = pd.read_csv('./data/train_groups.csv', sep=',')

    info = pd.concat([docs_id_train, docs_id_test])
    info.reset_index(drop=True)

    titles = pd.merge(new_df, info[['group_id', 'doc_id', 'target']], on='doc_id', how='right')
    titles = titles[['doc_id', 'title', 'group_id', 'target']]

    titles['title'] = titles['title'].apply(lambda x: x.replace('\n', ' '))
    
    titles['title'] = stemming_titles(titles)
    
    titles.to_csv('./data/unversal_table.csv', index=False)
    
    return shit, titles

In [564]:
upload_mised_titles()

(array([17939, 27042, 27737, 26800,  4078,  8569,  2073, 11658, 17122,
        17485,  9164, 10590,  2952,  3002, 18840, 13623, 23594, 16795,
        13853, 10364, 15252,  3646,  7853, 12195,  5280,  4734, 20393,
        20572,   589, 14730, 16225, 21529]),
        doc_id                                              title  group_id  \
 0       15731            –≤–∞–∑ 21213 | –∑–∞–º –ø–æ–¥—à–∏–ø–Ω–∏–∫ —Å—Ç—É–ø–∏—Ü | –Ω–∏–≤          1   
 1       14829  –≤–∞–∑ 2107 –æ–ø—Ç –≤ —Å–æ—á–∏. —Å—Ä–∞–≤–Ω —Ü–µ–Ω—ã, –∫—É–ø –ø–æ—Ç—Ä–µ–±–∏—Ç–µ...         1   
 2       15764  –∫—É–ø —Å—Ç—É–ø–∏—Ü –ª–∞–¥ –∫–∞–ª–∏–Ω–∞2. —Ç—Ä–∞–Ω—Å–º–∏—Å—Å - –ø–µ—Ä–µ—Ö–æ–¥–Ω —Å...         1   
 3       17669                             –∫–ª–∞—Å—Å–∏–∫ 21010 - 21074          1   
 4       14852                —Å—Ç—É–ø–∏—Ü –Ω–∏–≤ ‚Äî –∑–∞–º –ø–æ–¥—à–∏–ø–Ω–∏–∫ —Å–≤–æ —Ä—É–∫          1   
 ...       ...                                                ...       ...   
 28344   19232         –±–æ–µ–≤ —Ç–µ—Ö–Ω–∏–∫ - –∂–∏–∑–Ω –∏ —Å

In [None]:
# –ó–∞–≥—Ä—É–∂–∞–µ–º –≥–æ—Ç–æ–≤–æ–µ

In [532]:
doc_to_title = {}
with open('./data/unversal_table.csv') as f:
    for num_line, line in enumerate(f):
        if num_line == 0:
            continue
#         print(type(line))
        line = line.replace('\t', ',')
        
        data = line.strip().split(',')
        print(data)
        doc_id = int(data[0])
        if len(data) == 1:
            title = ''
        else:
            title = data[1]
        doc_to_title[doc_id] = title
print (len(doc_to_title))

# s = Porter

# for i in doc_to_title:
#     doc_to_title[i] = s.stem(doc_to_title[i])
# doc_to_title

['15731', '–í–ê–ó 21213 | –ó–∞–º–µ–Ω–∞ –ø–æ–¥—à–∏–ø–Ω–∏–∫–æ–≤ —Å—Ç—É–ø–∏—Ü—ã | –ù–∏–≤–∞', '1', '0.0']
['14829', '"–í–∞–∑ 2107 –æ–ø—Ç–æ–º –≤ –°–æ—á–∏. –°—Ä–∞–≤–Ω–∏—Ç—å —Ü–µ–Ω—ã', ' –∫—É–ø–∏—Ç—å –ø–æ—Ç—Ä–µ–±–∏—Ç–µ–ª—å—Å–∫–∏–µ —Ç–æ–≤–∞—Ä—ã –Ω–∞ Tiu.ru"', '1', '0.0']
['15764', '"–ö—É–ø–∏—Ç—å —Å—Ç—É–ø–∏—Ü–∞ –õ–∞–¥–∞ –∫–∞–ª–∏–Ω–∞2. –¢—Ä–∞–Ω—Å–º–∏—Å—Å–∏—è - –ø–µ—Ä–µ—Ö–æ–¥–Ω—ã–µ —Å—Ç—É–ø–∏—Ü—ã —Ü–µ–Ω–∞', ' –∑–∞–º–µ–Ω–∞', ' —Ç—é–Ω–∏–Ω–≥."', '1', '0.0']
['17669', '–ö–ª–∞—Å—Å–∏–∫–∞ 21010 - 21074', '1', '0.0']
['14852', '–°—Ç—É–ø–∏—Ü–∞ –ù–∏–≤–∞ ‚Äî –∑–∞–º–µ–Ω–∞ –ø–æ–¥—à–∏–ø–Ω–∏–∫–∞ —Å–≤–æ–∏–º–∏ —Ä—É–∫–∞–º–∏', '1', '0.0']
['15458', '–í–ê–ó 2110', '1', '0.0']
['14899', '"–û–±–∑–æ—Ä –ø–æ–¥—à–∏–ø–Ω–∏–∫–æ–≤ –ø–æ–ª—É–æ—Å–∏ –í–ê–ó 2101-07', ' 2121', '2123"', '1', '0.0']
['16879', '–ö—É–ø–∏—Ç—å –ü–æ–¥—à–∏–ø–Ω–∏–∫–∏ –∏ —Å—Ç—É–ø–∏—Ü—ã FAG (–°—Ç—Ä–∞–Ω–∏—Ü–∞ 23)', '1', '0.0']
['16310', 'HorsePowers ‚Äî –∞–≤—Ç–æ–º–æ–±–∏–ª—å–Ω—ã–π –∏–Ω—Ç–µ—Ä–Ω–µ—Ç –ø–æ—Ä—Ç–∞–ª ¬ª –û—Ç–∑—ã–≤ –≤–ª–∞–¥–µ–ª—å—Ü–∞ –í–ê–ó 2121 –ù–∏–≤

['12242', '–ö–∞–∫ —Å–æ–∑–¥–∞—Ç—å –æ–±—Ä–∞–∑ –¥–∏—Å–∫–∞ —Å –ø–æ–º–æ—â—å—é UltraISO | Complandia. –û–±–∑–æ—Ä –º–∞—Ç—á–µ–π', '21', '0.0']
['11472', 'VMware Workstation –∑–∞–≥—Ä—É–∑–∫–∞ windows —Å —Ñ–ª–µ—à–∫–∏ - –≠–º—É–ª—è—Ç–æ—Ä—ã –∏ –≤–∏—Ä—Ç—É–∞–ª—å–Ω—ã–µ –º–∞—à–∏–Ω—ã - CyberForum.ru', '21', '0.0']
['11703', '–ö–∞–∫ —Å–¥–µ–ª–∞—Ç—å –∑–∞–≥—Ä—É–∑–æ—á–Ω—É—é —Ñ–ª–µ—à–∫—É - YouTube', '21', '1.0']
['11421', '–ß–µ—Ä–µ–∑ –ö–∞–∫—É—é –ü—Ä–æ–≥—Ä–∞–º–º—É –°–¥–µ–ª–∞—Ç—å –ü—Ä–µ–∑–µ–Ω—Ç–∞—Ü–∏—é - YouTube', '21', '0.0']
['14036', '–ù–∞—Å—Ç—Ä–æ–π–∫–∞ –ø—Ä–æ–∫—Å–∏ Android 3.2', '21', '0.0']
['12853', '–ö–∞–∫ –Ω–∞ –Ω–æ—É—Ç–±—É–∫–µ hp –∑–∞–≥—Ä—É–∑–∏—Ç—å—Å—è —Å —Ñ–ª–µ—à–∫–∏', '21', '0.0']
['12184', '"–ö–∞–∫ —Å–¥–µ–ª–∞—Ç—å –∑–∞–≥—Ä—É–∑–æ—á–Ω—É—é —Ñ–ª–µ—à–∫—É —Å Windows 7', '8 !! –ë—ã—Å—Ç—Ä–æ!!!! - YouTube"', '21', '1.0']
['13975', '–ö–∞–∫ –∏–∑–º–µ–Ω–∏—Ç—å —Ñ–æ–Ω –∑–∞–≥—Ä—É–∑—á–∏–∫–∞ —É —Ñ–ª–µ—à–∫–∏? –£—Ä–æ–∫ ‚Ññ3 - YouTube', '21', '0.0']
['12698', '–í–æ—Å—Å—Ç–∞–Ω–æ–≤–ª–µ–Ω–∏–µ Windows —Å USB-—Ñ–ª–µ—à–∫–∏ –∏ DVD

['26489', '–ù–∞—Ä–æ–¥–Ω—ã–µ —Ä–µ—Ü–µ–ø—Ç—ã –¥–ª—è –ø–æ—Ö—É–¥–µ–Ω–∏—è', '42', '1.0']
['26972', '–ù–∞—Ä–æ–¥–Ω—ã–µ –º–µ—Ç–æ–¥—ã –¥–ª—è —Å–∂–∏–≥–∞–Ω–∏—è –∂–∏—Ä–∞ –Ω–∞ –∂–∏–≤–æ—Ç–µ - –ñ–µ–Ω—Å–∫–∏–π —Å–∞–π—Ç ¬´–ö–∞—Ç–µ—Ä–∏–Ω–∞¬ª', '42', '1.0']
['27281', '–ù–∞—Ä–æ–¥–Ω—ã–µ —Å—Ä–µ–¥—Å—Ç–≤–∞ –¥–ª—è –ø–æ—Ö—É–¥–µ–Ω–∏—è', '42', '1.0']
['26419', '–ë—ã—Å—Ç—Ä–æ–µ –ø–æ—Ö—É–¥–µ–Ω–∏–µ –Ω–∞—Ä–æ–¥–Ω—ã–µ —Å—Ä–µ–¥—Å—Ç–≤–∞ ¬ª –í–∞—à –¥–æ–∫—Ç–æ—Ä –ê–π–±–æ–ª–∏—Ç', '42', '1.0']
['26073', '–ö–∞–∫ –±—ã—Å—Ç—Ä–æ –ø–æ—Ö—É–¥–µ—Ç—å –≤ –¥–æ–º–∞—à–Ω–∏—Ö —É—Å–ª–æ–≤–∏—è—Ö - –ñ–µ–Ω—Å–∫–∏–π —Å–∞–π—Ç ¬´–ö–∞—Ç–µ—Ä–∏–Ω–∞¬ª', '42', '1.0']
['26084', '–ù–∞—Ä–æ–¥–Ω—ã–µ —Ä–µ—Ü–µ–ø—Ç—ã –¥–ª—è –ø–æ—Ö—É–¥–µ–Ω–∏—è –≤ –¥–æ–º–∞—à–Ω–∏—Ö —É—Å–ª–æ–≤–∏—è—Ö', '42', '1.0']
['27722', '–ö–∞–∫ –≤—ã–ª–µ—á–∏—Ç—å –∫–∞—à–µ–ª—å –±—ã—Å—Ç—Ä–æ –≤ –¥–æ–º–∞—à–Ω–∏—Ö —É—Å–ª–æ–≤–∏—è—Ö', '42', '0.0']
['25866', '"–ù–∞—Ä–æ–¥–Ω—ã–µ —Å—Ä–µ–¥—Å—Ç–≤–∞ –¥–ª—è –ø–æ—Ö—É–¥–µ–Ω–∏—è', ' —Ö—É–¥–µ–µ–º –Ω–∞—Ä–æ–¥–Ω—ã–º–∏ —Å—Ä–µ–¥—Å—Ç–≤–∞–º–∏"', '42', '1.0']
['26695', '"–î–∏–µ

['23390', '—é—Ä–æ–∫ –ø–µ—Ç—Ä–æ–≤', '64', '0.0']
['22155', '"–í –º–∞–≥–Ω–∏—Ç–æ—Ñ–æ–Ω–µ –∫–∞—Å—Å–µ—Ç–∞ –ß–µ–ª–µ–Ω—Ç–∞–Ω–æ - —Å–ª—É—à–∞—Ç—å –æ–Ω–ª–∞–π–Ω', ' –±—ã—Å—Ç—Ä–æ —Å–∫–∞—á–∞—Ç—å –±–µ–∑ —Ä–µ–≥–∏—Å—Ç—Ä–∞—Ü–∏–∏ –≤ —Ñ–æ—Ä–º–∞—Ç–µ mp3 —Ç–æ–ª—å–∫–æ —É –Ω–∞—Å"', '64', '0.0']
['23753', '–ê–≤—Ç–æ–ø–æ–Ω—Ç—ã... (21/28) [–§–æ—Ä—É–º—ã Balancer.Ru]', '64', '0.0']
['24276', '–ï–ª–µ–Ω–∞ –ü–æ–ø–æ–≤–∞. –ü—Ä–æ–∑–∞ ‚Äî –†—É–±—Ä–∏–∫–∏ –∞–≤—Ç–æ—Ä–∞ ‚Äî –û–º–∏–ª–∏—è', '64', '0.0']
['24572', 'Lib.ru/–°–æ–≤—Ä–µ–º–µ–Ω–Ω–∞—è –ª–∏—Ç–µ—Ä–∞—Ç—É—Ä–∞: –õ–µ—â–∏–Ω—Å–∫–∏–π –õ–µ–æ–Ω–∏–¥ –ê–±—Ä–∞–º–æ–≤–∏—á. –ë—É–Ω—Ç –≤ —É–º–∞—Ö', '64', '0.0']
['22474', '–Ø–ü–ª–∞–∫–∞–ª—ä - –ê—Ä—Ö–∏–≤ –æ—Ç 16.07.2014', '64', '0.0']
['24257', '—Ç—Ä–µ–±–æ–≤–∞–Ω–∏—è | –°–ø–æ—Ä—Ç–∏–≤–Ω—ã–µ —Å—Ç–∞—Ç—å–∏ - Part 11', '64', '0.0']
['22512', '–°–∫–∞—á–∞—Ç—å –∞–π –ª–∞–π–∫ - –º—É–∑—ã–∫—É –≤ mp3 –∏–ª–∏ —Å–ª—É—à–∞—Ç—å –æ–Ω–ª–∞–π–Ω - –Ω–∞–π–¥–µ–Ω–æ 281 –ø–µ—Å–Ω–∏/–ø–µ—Å–µ–Ω –Ω–∞ Zvonko.me!', '64', '0.0']
['24127', '"–∫–∞–∫ —á–µ–ª–µ–Ω—Ç–∞–Ω–æ —Ä–µ–≤–∞

['25959', '–ú–æ–±–∏–ª—å–Ω—ã–π —Ñ–æ—Ä—É–º –û–º—Å–∫–∞. –ö–æ–¥–µ–∫—Å —Ç–æ—Ä–≥–æ–≤–ª–∏. –†–∞–∑–±–æ—Ä —Ç–µ—Ö–Ω–∏–∫–∏ –≤ –û–º—Å–∫–µ. –†–µ–º–æ–Ω—Ç —Å–æ—Ç–æ–≤—ã—Ö –≤ –û–º—Å–∫–µ. –ó–∞–ø—á–∞—Å—Ç–∏ –¥–ª—è —Å–æ—Ç–æ–≤—ã—Ö. –ò–Ω—Ç–µ—Ä–Ω–µ—Ç –¥–æ—Å–∫–∞ —á–∞—Å—Ç–Ω—ã—Ö –±–µ—Å–ø–ª–∞—Ç–Ω—ã—Ö –æ–±—ä—è–≤–ª–µ–Ω–∏–π —ç–ª–µ–∫—Ç—Ä–æ–Ω–∏–∫–∏ –ú–æ–±–∏–ª—å–Ω—ã–π –û–º—Å–∫.', '79', '0.0']
['26842', '–°–∫–∞—á–∞—Ç—å –∫–≤–∞—Ä—Ç–∏—Ä—ã –¥–ª—è —Å–∏–º—Å 3 –±–µ—Å–ø–ª–∞—Ç–Ω–æ', '79', '0.0']
['27662', '–ì–¥–µ —Å–∫–∞—á–∞—Ç—å The Sims 4 –ú–∞–ª—ã—à–∏ –∏ –í–∞–º–ø–∏—Ä—ã? –ù–æ–≤—ã–π —Å–ø–æ—Å–æ–± arkwars.ru', '79', '0.0']
['27391', '–°–∫–∞—á–∞—Ç—å –º–æ–¥—ã –Ω–∞ –ø–ª–∞–Ω—à–µ—Ç –≤ —Å–∏–º—Å 3', '79', '0.0']
['27858', '–í–∏–¥–µ–æ-—É—Ä–æ–∫ ‚Ññ1 –£—á–∏–º—Å—è —Å–∫–∞—á–∏–≤–∞—Ç—å –¥–æ–ø.–º–∞—Ç–µ—Ä–∏–∞–ª—ã –¥–ª—è The sims 3. –£—Ä–æ–∫–∏ –≤—è–∑–∞–Ω–∏—è –Ω–∞ –≤–∏–¥–µ–æ', '79', '0.0']
['842', '"–ö–∞–∫ —É—Å—Ç–∞–Ω–æ–≤–∏—Ç—å Sims 4', ' –∞ —Ç–∞–∫–∂–µ –∫–∞–∫ —É—Å—Ç–∞–Ω–æ–≤–∏—Ç—å –º–æ–¥—ã –∏ –¥–æ–ø–æ–ª–Ω–µ–Ω–∏—è –∫ –Ω–µ–π"', '79', '1.0']
['842', '"–ö–

['9868', '–ö—Ç–æ –ø–µ–ª –ø–µ—Å–Ω—é bad boys', '102', '1.0']
['10518', '–ú–æ–ª–æ–¥–µ–∂–Ω—ã–π —Ö–∏—Ç-–ø–∞—Ä–∞–¥ - –ö–∞—Ä–∞–æ–∫–µ (DVD-5) ¬ª –ù–æ–≤–∞—è –º—É–∑—ã–∫–∞ 2017 –≥–æ–¥–∞ —Å–∫–∞—á–∞—Ç—å –±–µ—Å–ø–ª–∞—Ç–Ω–æ', '102', '0.0']
['11034', 'shaitanAlan  - YouTube', '102', '0.0']
['7698', '"–ë–∏–æ–≥—Ä–∞—Ñ–∏–∏ —Ä—ç–ø–ø–µ—Ä–æ–≤ - –ë–∏–æ–≥—Ä–∞—Ñ–∏–∏ - –°–ª–æ–≤–∞', ' —Ç–µ–∫—Å—Ç—ã –ø–µ—Å–µ–Ω', ' –Ω–æ–≤–æ—Å—Ç–∏', ' –Ω–∞—Ä–µ–∑–∫–∏ –Ω–∞ –∑–≤–æ–Ω–æ–∫ –Ω–∞ - Rap-Content.Ru ¬ª –°—Ç—Ä–∞–Ω–∏—Ü–∞ 3"', '102', '0.0']
['8717', '"–ù–µ–≥—Ä–∏—Ç—è–Ω—Å–∫–∞—è –ø–µ—Å–Ω—è - –±–µ—Å–ø–ª–∞—Ç–Ω–æ —Å–∫–∞—á–∞—Ç—å', ' —Å–ª—É—à–∞—Ç—å –æ–Ω–ª–∞–π–Ω –±–µ–∑ —Ä–µ–≥–∏—Å—Ç—Ä–∞—Ü–∏–∏ –≤ —Ñ–æ—Ä–º–∞—Ç–µ –º–ø3 –Ω–∞ –Ω–∞—à–µ–º –ø–æ–∏—Å–∫–æ–≤–æ–º –ø–æ—Ä—Ç–∞–ª–µ"', '102', '0.0']
['7635', '–ë–æ–± –ú–∞—Ä–ª–∏ - –Ø –ª–µ–≥–µ–Ω–¥–∞ - –±–æ–± –º–∞—Ä–ª–∏ –ø–µ—Å–Ω—è –ª–µ–≥–µ–Ω–¥–∞ —Å–∫–∞—á–∞—Ç—å –∏ —Å–ª—É—à–∞—Ç—å mp3', '102', '0.0']
['10913', '–°—Ü–µ–Ω–∞—Ä–∏–π –¥–ª—è —Ä–∞–∑–ª–∏—á–Ω—ã—Ö –º–µ—Ä–æ–ø—Ä–∏—è—Ç–∏–π ¬ª –°—Ü–µ–Ω–∫–∏ 9 –º–∞—è', '102', '0.0

['7071', '–û—Ç–≤–µ—Ç—ã@Mail.Ru: –ø–æ—á–µ–º—É –≤ –∞–∫–≤–∞—Ä–∏—É–º–µ –∑–µ–ª–µ–Ω–µ–µ—Ç –≤–æ–¥–∞?', '123', '1.0']
['5397', '–ü–æ—á–µ–º—É –≤ –∞–∫–≤–∞—Ä–∏—É–º–µ –∑–µ–ª–µ–Ω–µ–µ—Ç –≤–æ–¥–∞ –∏ –∫–∞–∫ —Å —ç—Ç–∏–º –±–æ—Ä–æ—Ç—å—Å—è', '123', '1.0']
['6384', '–û—Ç–≤–µ—Ç –Ω–∞ –≤–æ–ø—Ä–æ—Å: –∫–∞–∫ –æ—á–∏—Å—Ç–∏—Ç—å –∞–∫–≤–∞—Ä–∏—É–º –æ—Ç –∑–µ–ª–µ–Ω–∏ –ø–æ—á–µ–º—É –∑–µ–ª–µ–Ω–µ–µ—Ç –≤–æ–¥–∞', '123', '0.0']
['5955', '–ü–æ—á–µ–º—É –∑–µ–ª–µ–Ω–µ—é—Ç —Å—Ç–µ–∫–ª–∞ –≤ –∞–∫–≤–∞—Ä–∏—É–º–µ - –∞ —Å—Ç–µ–∫–ª–∞ –∑–µ–ª–µ–Ω–µ—é—Ç –ø–æ—Ç–æ–º—É', '123', '1.0']
['7376', '–ó–µ–ª–µ–Ω–µ–µ—Ç –∞–∫–≤–∞—Ä–∏—É–º. –ß—Ç–æ –¥–µ–ª–∞—Ç—å?', '123', '1.0']
['4042', '–û—Ç–≤–µ—Ç—ã@Mail.Ru: –ü–æ—á–µ–º—É –≤ –∞–∫–≤–∞—Ä–∏—É–º–µ –∑–µ–ª–µ–Ω–µ–µ—Ç –≤–æ–¥–∞?', '123', '1.0']
['6834', "–ó–µ–ª–µ–Ω–µ–µ—Ç –≤–æ–¥–∞ –≤ –∞–∫–≤–∞—Ä–∏—É–º–µ: –∏—â–µ–º –ø—Ä–∏—á–∏–Ω—ã –ø—Ä–æ–±–ª–µ–º—ã –∏ —Å–ø–æ—Å–æ–±—ã –±–æ—Ä—å–±—ã —Å –Ω–µ–π - Woman's Day", '123', '1.0']
['5969', '–ü–æ—á–µ–º—É –∑–µ–ª–µ–Ω–µ–µ—Ç –≤–æ–¥–∞ –≤ –∞–∫–≤–∞—Ä–∏—É–º–µ ? | Hawkish.ru', '123', '1.0']
['6920

['9976', '–°–∫–ª–æ–Ω–Ω–æ—Å—Ç—å –∫ —Ö—Ä–∞–ø—É —Ñ–æ—Ä–º–∏—Ä—É–µ—Ç—Å—è –≤ –¥–µ—Ç—Å—Ç–≤–µ', '145', '']
['8089', '"–í–∏—à–Ω–µ–≤–∞ –¢. - –°—Ç—Ä–µ–ª—å–Ω–∏–∫–æ–≤–∞. –í–∏–ª—É–Ω–∞—Å', ' –ë—É—Ç–µ–π–∫–æ. –õ—É—á—à–∏–µ –¥—ã—Ö–∞—Ç–µ–ª—å–Ω—ã–µ –ø—Ä–∞–∫—Ç–∏–∫–∏ –¥–ª—è –∑–¥–æ—Ä–æ–≤—å—è"', '145', '']
['8988', '–î–∏–∞–≥–Ω–æ—Å—Ç–∏–∫–∞ –¥–µ—Ç–µ–π —Ä–∞–∑–Ω—ã—Ö –≤–æ–∑—Ä–∞—Å—Ç–æ–≤. –î–∏–∞–≥–Ω–æ—Å—Ç–∏–∫–∞ –¥–µ—Ç–µ–π. zhizn-rebenka.ru', '145', '']
['10176', '–ù–∞—Ä—É—à–µ–Ω–∏—è —Å–Ω–∞ –≤ –ø–æ–∑–¥–Ω–µ–º –≤–æ–∑—Ä–∞—Å—Ç–µ. –û—Å–æ–±–µ–Ω–Ω–æ—Å—Ç–∏ –º–µ–¥–∏–∫–∞–º–µ–Ω—Ç–æ–∑–Ω–æ–π –∫–æ—Ä—Ä–µ–∫—Ü–∏–∏. –°—Ç–∞—Ç—å–∏ - –ü–æ—Ä—Ç–∞–ª ¬´–ë–æ–ª—å¬ª', '145', '']
['9843', '–û–°–û–ë–ï–ù–ù–û–°–¢–ò –°–¢–†–û–ï–ù–ò–Ø –ò –§–£–ù–ö–¶–ò–ò –í–ò–°–û–ß–ù–û-–ù–ò–ñ–ù–ï–ß–ï–õ–Æ–°–¢–ù–û–ì–û –°–£–°–¢–ê–í–ê', '145', '']
['10054', '–ú–∏—Ä –ö–∞—Ä–∏–∫–∞. –ü–µ—Ä–≤—ã–π –∏–≥—Ä–æ–∫ - –ú–∏—Ä –ö–∞—Ä–∏–∫–∞. –ü–µ—Ä–≤—ã–π –∏–≥—Ä–æ–∫', '145', '']
['8379', '"–ú–∞—Ç–µ—Ä–∏–Ω—Å—Ç–≤–æ > –û—Å—Ç–∞–≤—å—Ç–µ –µ–≥–æ', ' –ø—É—Å—Ç—å –ø–æ–ø–ª–∞—á–µ—Ç"', '145', '']
['9217', '

['5747', '–í –†–æ—Å—Å–∏–∏ –Ω–∞—á–Ω—É—Ç –ø—Ä–∏–º–µ–Ω—è—Ç—å –Ω–æ–≤—ã–π –≤–∏–¥ —É–≥–æ–ª–æ–≤–Ω–æ–≥–æ –Ω–∞–∫–∞–∑–∞–Ω–∏—è', '166', '']
['6536', '–°–µ—Ä–≥–µ–π –ú–∞—Ä—á–µ–Ω–∫–æ | –í–ö–æ–Ω—Ç–∞–∫—Ç–µ', '166', '']
['6566', '–õ–µ–æ–Ω—Ç—å–µ–≤ –ê. –û –Ω–æ–≤–æ–º –∏–∑–¥–∞–Ω–∏–∏ –ø–µ—Ä–≤–æ–≥–æ —Ç–æ–º–∞ &laquo;–ö–∞–ø–∏—Ç–∞–ª–∞&raquo;', '166', '']
['7360', '"–ö–∞–∫ —Ä–∞—Å—Å—á–∏—Ç–∞—Ç—å ¬´–±–æ–ª—å–Ω–∏—á–Ω—ã–π¬ª: —Ñ–æ—Ä–º—É–ª–∞', ' –ø—Ä–∏–º–µ—Ä—ã"', '166', '']
['6282', '–ö–ù–ò–ì–ê –í–¢–û–†–ê–Ø –°—Ç–∞–Ω–æ–≤–ª–µ–Ω–∏–µ –£—á–∏—Ç–µ–ª—è / –ú–Ω–æ–∂–µ—Å—Ç–≤–µ–Ω–Ω—ã–µ —É–º—ã –ë–∏–ª–ª–∏ –ú–∏–ª–ª–∏–≥–∞–Ω–∞', '166', '']
['6784', 'Setroi - –ê—Ä—Ç–µ—Ñ–∞–∫—Ç–æ—Ä | samread', '166', '']
['6831', '–ö–∞–∫ —Ä–∞—Å—Å—á–∏—Ç–∞—Ç—å –æ—Ç–ø—É—Å–∫–Ω—ã–µ –≤ 2016 –≥–æ–¥—É: –ò–Ω—Å—Ç—Ä—É–∫—Ü–∏—è', '166', '']
['4925', '–ö–∞–∫ —Ä–∞—Å—Å—á–∏—Ç–∞—Ç—å —Å—Ä–µ–¥–Ω–µ–¥–Ω–µ–≤–Ω–æ–π –∑–∞—Ä–∞–±–æ—Ç–æ–∫: –æ—Å–æ–±–µ–Ω–Ω–æ—Å—Ç–∏ –∏ –º–µ—Ç–æ–¥–∏–∫–∞', '166', '']
['6232', '–¥–æ—Ä–æ–¥–æ–≤—ã–µ', '166', '']
['4370', '–ö–∞–∫ —Ä–∞—Å—Å—á–∏—Ç–∞—Ç—å –∫–æ–º–ø–µ–Ω

['22224', '–°—É–ø–µ—Ä–≥–µ—Ä–æ–∏ The CW –≤—ã—à–ª–∏ –Ω–∞ –±–∏—Ç–≤—É —Å –ø—Ä–∏—à–µ–ª—å—Ü–∞–º–∏ –≤ —Ç—Ä–µ–π–ª–µ—Ä–µ –∫—Ä–æ—Å—Å–æ–≤–µ—Ä–∞ | –ù–æ–≤–æ—Å—Ç–∏  | –ö–∞–Ω–æ–±—É', '188', '']
['22886', '"—Å–µ—Ä–∏–∞–ª —Ñ–ª—ç—à 3 —Å–µ—Ä–∏—è - hd', ' –≤ —Ö–æ—Ä–æ—à–µ–º –∫–∞—á–µ—Å—Ç–≤–µ', ' –≤–∏–¥–µ–æ–±–æ–∫—Å.—Ç–≤', ' —Å–º–æ—Ç—Ä–µ—Ç—å –æ–Ω–ª–∞–π–Ω - VideoBox.tv"', '188', '']
['23146', 'MySQL Fatal Error', '188', '']
['23347', '"–°—É–ø–µ—Ä–≥—ë—Ä–ª 3 —Å–µ–∑–æ–Ω –¥–∞—Ç–∞ –≤—ã—Ö–æ–¥–∞ –≤—Å–µ—Ö —Å–µ—Ä–∏–π | –î–∞—Ç–∞ –≤—ã—Ö–æ–¥–∞ —Ñ–∏–ª—å–º–æ–≤', '–∏–≥—Ä –∏ —Å–µ—Ä–∏–∞–ª–æ–≤."', '188', '']
['22779', '–°–º–æ—Ç—Ä–µ—Ç—å –æ–Ω–ª–∞–π–Ω —Ñ–∏–ª—å–º—ã –≤ —Ö–æ—Ä–æ—à–µ–º –∫–∞—á–µ—Å—Ç–≤–µ –≤ 720p hd', '188', '']
['22542', '–°–º–æ—Ç—Ä–µ—Ç—å —Ö–æ—Ä–æ—à–µ–µ –∫–∏–Ω–æ –æ–Ω–ª–∞–π–Ω –Ω–∞ KinoVO.cc ¬ª –°—Ç—Ä–∞–Ω–∏—Ü–∞ 810', '188', '']
['22844', '–°–∫–∞—á–∞—Ç—å –õ—é—Ü–∏—Ñ–µ—Ä 2 —Å–µ–∑–æ–Ω / Lucifer (2016) MP4 –±–µ—Å–ø–ª–∞—Ç–Ω–æ —Å —Ç–æ—Ä—Ä–µ–Ω—Ç–∞ –±–µ–∑ —Ä–µ–≥–∏—Å—Ç—Ä–∞—Ü–∏–∏ - –§–∏–ª—å–º—ã  - TorrentBest.ru - —Å–∫–∞—á–∞—Ç—å –±–µ

['23035', '–û—Ç–≤–µ—Ç—ã@Mail.Ru: –ö–∞–∫ –≥–æ—Ç–æ–≤–∏—Ç—å —Å—Ç–µ—Ä–ª—è–¥—å???', '208', '']
['23711', '"–∫–∞–∫ –ø—Ä–∏–≥–æ—Ç–æ–≤–∏—Ç—å —Å—Ç–µ—Ä–ª—è–¥—å –≤ –¥—É—Ö–æ–≤–∫–µ —Ü–µ–ª–∏–∫–æ–º ¬ª –í–ö–£–°–ù–´–ï –†–ï–¶–ï–ü–¢–´ –° –§–û–¢–û. –ß—Ç–æ –ø—Ä–∏–≥–æ—Ç–æ–≤–∏—Ç—å –Ω–∞ –∑–∞–≤—Ç—Ä–∞–∫', ' –Ω–∞ –æ–±–µ–¥', ' –Ω–∞ —É–∂–∏–Ω. –ö—Ä–∞—Å–∏–≤—ã–µ –∫—É–ª–∏–Ω–∞—Ä–Ω—ã–µ —Ä–µ—Ü–µ–ø—Ç—ã –Ω–∞ –∫–∞–∂–¥—ã–π –¥–µ–Ω—å!"', '208', '']
['22309', '–ö–∞–∫ –≥–æ—Ç–æ–≤–∏—Ç—å —Å—Ç–µ—Ä–ª—è–¥—å –≤ –¥—É—Ö–æ–≤–∫–µ ¬ª –í–ö–£–°–ù–´–ï –†–ï–¶–ï–ü–¢–´ 2017 –° –§–û–¢–û', '208', '']
['23696', '–°—Ç–µ—Ä–ª—è–¥—å –∑–∞–ø–µ—á–µ–Ω–Ω–∞—è - –ø–æ—à–∞–≥–æ–≤—ã–π —Ä–µ—Ü–µ–ø—Ç —Å —Ñ–æ—Ç–æ - –¥–ª—è –¥—É—Ö–æ–≤–∫–∏.', '208', '']
['23822', '–û—Å–µ—Ç—Ä —Ü–µ–ª–∏–∫–æ–º –≤ –¥—É—Ö–æ–≤–∫–µ —Ä–µ—Ü–µ–ø—Ç | –ö–∞–∫ –ø—Ä–∏–≥–æ—Ç–æ–≤–∏—Ç—å –æ—Å–µ—Ç—Ä–∞ –≤ –¥—É—Ö–æ–≤–∫–µ', '208', '']
['23671', '–ó–∞–ø–µ–∫–∞–µ–º –æ—Å–µ—Ç—Ä–∞ –≤ –¥—É—Ö–æ–≤–∫–µ —Ü–µ–ª–∏–∫–æ–º –ø–æ—à–∞–≥–æ–≤—ã–π —Ä–µ—Ü–µ–ø—Ç —Å —Ñ–æ—Ç–æ', '208', '']
['22258', '–ö–∞–∫ –ø—Ä–∏–≥–æ—Ç–æ–≤–∏—Ç—å —Å—Ç–µ—Ä

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)




['7516', '–°–∫–∞—á–∞—Ç—å –°–∫–∞—á–∞—Ç—å –±–µ—Å–ø–ª–∞—Ç–Ω–æ PDF-XChange Viewer PRO 2.5.205 (2012) ML/RUS+–∫–ª—é—á - –¢–µ–∫—Å—Ç–æ–≤—ã–µ —Ä–µ–¥–∞–∫—Ç–æ—Ä—ã  - –ø—Ä–æ–≥—Ä–∞–º–º—ã –Ω–∞ —Ä—É—Å—Å–∫–æ–º —Å–∫–∞—á–∞—Ç—å –Ω–æ–≤—ã–π —Å–æ—Ñ—Ç - RUS SOFT', '304', '']
['4054', '–ñ—É—Ä–Ω–∞–ª ¬´–ö–æ–º–ø—å—é—Ç–µ—Ä—Ä–∞¬ª ‚Ññ 25-26 –æ—Ç 10 –∏—é–ª—è 2007 –≥–æ–¥–∞ (693 –∏ 694 –Ω–æ–º–µ—Ä) (fb2) | –ö—É–ª–õ–∏–± - –ö–ª–∞—Å—Å–Ω–∞—è –±–∏–±–ª–∏–æ—Ç–µ–∫–∞! –°–∫–∞—á–∞—Ç—å –∫–Ω–∏–≥–∏ –±–µ—Å–ø–ª–∞—Ç–Ω–æ', '304', '']
['4473', '–õ–∞–Ω–¥—à–∞—Ñ—Ç–Ω—ã–π –¥–∏–∑–∞–π–Ω: –ü—Ä–æ–µ–∫—Ç–∏—Ä–æ–≤–∞–Ω–∏–µ –¥–æ–º–æ–≤ –∏ –∫–æ—Ç—Ç–µ–¥–∂–µ–π (–ü—Ä–æ–≥—Ä–∞–º–º—ã)', '304', '']
['5114', 'Android —Ç–µ–Ω—å —Ç–µ–∫—Å—Ç–∞', '304', '']
['5756', 'Word 2007 : –ò–Ω—Ñ–æ—Ä–º–∞—Ç–∏–∫–∞ : –î–ò–ü–õ–û–ú–ù–ê–Ø - –ë–ê–ó–ê –î–ò–ü–õ–û–ú–ù–´–• –†–ê–ë–û–¢ –†–û–°–°–ò–ô–°–ö–ò–• –í–£–ó–û–í.', '304', '']
['5501', '–ö–æ–Ω–≤–µ—Ä—Ç–∏—Ä–æ–≤–∞—Ç—å –≤ BMP', '304', '']
['5792', 'PDF –≤ BMP: –∫–æ–Ω–≤–µ—Ä—Ç–∏—Ä–æ–≤–∞—Ç—å –∏–∑ PDF –≤ BMP (–±–∏—Ç–æ–≤—ã–µ –∏–∑–æ–±—Ä–∞–∂–µ–Ω–

In [350]:
def tuple_x(a1, a2):
    return a1, a2

In [557]:
def my_doc_to_title():
    
    processed_df = get_df()
    processed_df['title'] = stemming_titles(processed_df)
    processed_df
    
    processed_df['target'].fillna(-1, inplace=True)
    train_titles = processed_df[processed_df.target!=-1]
    test_titles = processed_df[processed_df.target==-1]
    
    processed_df['tuple'] = processed_df[['doc_id', 'title']].apply(lambda x: tuple_x(*x), axis=1)
    doc_to_title = {}
    for i in processed_df['tuple']:
        doc_to_title[i[0]] = i[1]
        
    return doc_to_title

In [555]:

# train_titles, test_titles

In [558]:
doc_to_title = my_doc_to_title()

In [559]:
import pandas as pd
train_data = pd.read_csv('./data/train_groups.csv')
# train_data = train_titles
traingroups_titledata = {}
for i in range(len(train_data)):
    new_doc = train_data.iloc[i]
    doc_group = new_doc['group_id']
    doc_id = new_doc['doc_id']
    target = new_doc['target']
    title = doc_to_title[doc_id]
    if doc_group not in traingroups_titledata:
        traingroups_titledata[doc_group] = []
    traingroups_titledata[doc_group].append((doc_id, title, target))
# traingroups_titledata

In [535]:
import numpy as np
y_train = []
X_train = []
groups_train = []
for new_group in traingroups_titledata:
    docs = traingroups_titledata[new_group]
    for k, (doc_id, title, target_id) in enumerate(docs):
        y_train.append(target_id)
        groups_train.append(new_group)
        all_dist = []
        words = set(title.strip().split())
        for j in range(0, len(docs)):
            if k == j:
                continue
            doc_id_j, title_j, target_j = docs[j]
            words_j = set(title_j.strip().split())
            all_dist.append(len(words.intersection(words_j)))
        X_train.append(sorted(all_dist, reverse=True)[0:25]    )
X_train = np.array(X_train)
y_train = np.array(y_train)
groups_train = np.array(groups_train)
print (X_train.shape, y_train.shape, groups_train.shape)

(11690, 25) (11690,) (11690,)


In [560]:
import pandas as pd
test_data = pd.read_csv('./data/test_groups.csv')
# test_data = test_titles
traingroups_titledata = {}
for i in range(len(test_data)):
    new_doc = test_data.iloc[i]
    doc_group = new_doc['group_id']
    doc_id = new_doc['doc_id']
    title = doc_to_title[doc_id]
    if doc_group not in traingroups_titledata:
        traingroups_titledata[doc_group] = []
    traingroups_titledata[doc_group].append((doc_id, title))
# traingroups_titledata

In [537]:
# y_train = []
X_test = []
groups_test = []
for new_group in traingroups_titledata:
    docs = traingroups_titledata[new_group]
    for k, (doc_id, title) in enumerate(docs):
#         y_train.append(target_id)
        groups_test.append(new_group)
        all_dist = []
        words = set(title.strip().split())
        for j in range(0, len(docs)):
            if k == j:
                continue
            doc_id_j, title_j = docs[j]
            words_j = set(title_j.strip().split())
            all_dist.append(len(words.intersection(words_j)))
        X_test.append(sorted(all_dist, reverse=True)[0:25]    )
X_test = np.array(X_test)
# y_train = np.array(y_train)
groups_test = np.array(groups_test)
print (X_test.shape, groups_test.shape)

(16627, 25) (16627,)


In [538]:
clf = GradientBoostingClassifier(learning_rate=0.06, n_estimators=150)

In [539]:
def predict(X_train, X_test, train_target, model, scaler=None, **kwargs):
    
    curr_model = model(**kwargs)
    
    if scaler is not None:
        
        your_scaler = scaler()
        your_scaler.fit(X_train)
        X_train = your_scaler.transform(X_train)
        X_test = your_scaler.transform(X_test)
        
    curr_model.fit(X_train, train_target)
#     y_pred = curr_model.predict(X_test)
    th = 0.34
    y_pred = [0 if val < th else 1 for val in curr_model.predict_proba(X_test)[:,1]]
    return y_pred

In [546]:
# –°–æ—Ö—Ä–∞–Ω—è–µ—Ç —Ä–µ—à–µ–Ω–∏–µ

def save_submission(y_pred):

    data = pd.read_csv('data/test_groups.csv')
    print('len data = ', len(data))
    data['target'] = y_pred
    
#     for x in shit:
#         print(data[data.doc_id==x]['target'])
    
    data = data.drop(['group_id', 'doc_id'], axis=1)

    data.to_csv("submission.csv", index=False)
    
    info = np.unique(data['target'], return_counts=True)
    
    if info[0].shape[0] > 1:
        
        print('0: {}, 1: {}'.format(info[1][0], info[1][1]))
        if info[1][1] > 3600 or info[1][1] < 2500:
            print('Your submisson is shit')
    
    else:
        print('There are only {} in submission'.format(info[0][0]))
        
    return data

In [547]:
y_pred = predict(X_train, X_test, y_train, GradientBoostingClassifier, learning_rate=0.06, n_estimators=150)
data = save_submission(y_pred)

len data =  16627
Series([], Name: target, dtype: int64)
Series([], Name: target, dtype: int64)
Series([], Name: target, dtype: int64)
Series([], Name: target, dtype: int64)
Series([], Name: target, dtype: int64)
Series([], Name: target, dtype: int64)
Series([], Name: target, dtype: int64)
Series([], Name: target, dtype: int64)
Series([], Name: target, dtype: int64)
Series([], Name: target, dtype: int64)
Series([], Name: target, dtype: int64)
Series([], Name: target, dtype: int64)
Series([], Name: target, dtype: int64)
Series([], Name: target, dtype: int64)
Series([], Name: target, dtype: int64)
Series([], Name: target, dtype: int64)
671    0
Name: target, dtype: int64
2853    0
Name: target, dtype: int64
3006    0
Name: target, dtype: int64
4046    0
Name: target, dtype: int64
6027    0
Name: target, dtype: int64
6403     0
16320    0
Name: target, dtype: int64
6913    0
Name: target, dtype: int64
9183    0
Name: target, dtype: int64
11467    0
Name: target, dtype: int64
11482    0
Na

In [417]:
import re

class Porter:
    PERFECTIVEGROUND =  re.compile(u"((–∏–≤|–∏–≤—à–∏|–∏–≤—à–∏—Å—å|—ã–≤|—ã–≤—à–∏|—ã–≤—à–∏—Å—å)|((?<=[–∞—è])(–≤|–≤—à–∏|–≤—à–∏—Å—å)))$")
    REFLEXIVE = re.compile(u"(—Å[—è—å])$")
    ADJECTIVE = re.compile(u"(–µ–µ|–∏–µ|—ã–µ|–æ–µ|–∏–º–∏|—ã–º–∏|–µ–π|–∏–π|—ã–π|–æ–π|–µ–º|–∏–º|—ã–º|–æ–º|–µ–≥–æ|–æ–≥–æ|–µ–º—É|–æ–º—É|–∏—Ö|—ã—Ö|—É—é|—é—é|–∞—è|—è—è|–æ—é|–µ—é)$")
    PARTICIPLE = re.compile(u"((–∏–≤—à|—ã–≤—à|—É—é—â)|((?<=[–∞—è])(–µ–º|–Ω–Ω|–≤—à|—é—â|—â)))$")
    VERB = re.compile(u"((–∏–ª–∞|—ã–ª–∞|–µ–Ω–∞|–µ–π—Ç–µ|—É–π—Ç–µ|–∏—Ç–µ|–∏–ª–∏|—ã–ª–∏|–µ–π|—É–π|–∏–ª|—ã–ª|–∏–º|—ã–º|–µ–Ω|–∏–ª–æ|—ã–ª–æ|–µ–Ω–æ|—è—Ç|—É–µ—Ç|—É—é—Ç|–∏—Ç|—ã—Ç|–µ–Ω—ã|–∏—Ç—å|—ã—Ç—å|–∏—à—å|—É—é|—é)|((?<=[–∞—è])(–ª–∞|–Ω–∞|–µ—Ç–µ|–π—Ç–µ|–ª–∏|–π|–ª|–µ–º|–Ω|–ª–æ|–Ω–æ|–µ—Ç|—é—Ç|–Ω—ã|—Ç—å|–µ—à—å|–Ω–Ω–æ)))$")
    NOUN = re.compile(u"(–∞|–µ–≤|–æ–≤|–∏–µ|—å–µ|–µ|–∏—è–º–∏|—è–º–∏|–∞–º–∏|–µ–∏|–∏–∏|–∏|–∏–µ–π|–µ–π|–æ–π|–∏–π|–π|–∏—è–º|—è–º|–∏–µ–º|–µ–º|–∞–º|–æ–º|–æ|—É|–∞—Ö|–∏—è—Ö|—è—Ö|—ã|—å|–∏—é|—å—é|—é|–∏—è|—å—è|—è)$")
    RVRE = re.compile(u"^(.*?[–∞–µ–∏–æ—É—ã—ç—é—è])(.*)$")
    DERIVATIONAL = re.compile(u".*[^–∞–µ–∏–æ—É—ã—ç—é—è]+[–∞–µ–∏–æ—É—ã—ç—é—è].*–æ—Å—Ç—å?$")
    DER = re.compile(u"–æ—Å—Ç—å?$")
    SUPERLATIVE = re.compile(u"(–µ–π—à–µ|–µ–π—à)$")
    I = re.compile(u"–∏$")
    P = re.compile(u"—å$")
    NN = re.compile(u"–Ω–Ω$")

    def stem(string):
        
        changed = ''
        
        string = string.lower()

        for word in string.split():
            if not word.isdigit():
                word = word.replace(u'—ë', u'–µ')
                m = re.match(Porter.RVRE, word)

                if m and m.groups():
                    pre = m.group(1)
                    rv = m.group(2)
                    temp = Porter.PERFECTIVEGROUND.sub('', rv, 1)
                    if temp == rv:
                        rv = Porter.REFLEXIVE.sub('', rv, 1)
                        temp = Porter.ADJECTIVE.sub('', rv, 1)
                        if temp != rv:
                            rv = temp
                            rv = Porter.PARTICIPLE.sub('', rv, 1)
                        else:
                            temp = Porter.VERB.sub('', rv, 1)
                            if temp == rv:
                                rv = Porter.NOUN.sub('', rv, 1)
                            else:
                                rv = temp
                    else:
                        rv = temp

                    rv = Porter.I.sub('', rv, 1)

                    if re.match(Porter.DERIVATIONAL, rv):
                        rv = Porter.DER.sub('', rv, 1)

                    temp = Porter.P.sub('', rv, 1)
                    if temp == rv:
                        rv = Porter.SUPERLATIVE.sub('', rv, 1)
                        rv = Porter.NN.sub(u'–Ω', rv, 1)
                    else:
                        rv = temp
                    word = pre+rv

            changed += word + ' '

        return changed

#     stem=staticmethod(stem)

#     if __name__ == '__main__':
#         print Porter.stem(u'—É—Å—Ç–æ–π—á–∏–≤–æ—Å—Ç—å')

In [414]:
Porter.stem(u'–∏–≥—Ä–æ–∫–∏ –∏–≥—Ä–∞–ª–∏ –≤ —Å–∫–æ—Ä–æ—Å—Ç–Ω—ã–µ –º—è—á–∏')

'–∏–≥—Ä–æ–∫ –∏–≥—Ä–∞ –≤ —Å–∫–æ—Ä–æ—Å—Ç–Ω –º—è—á '

In [415]:
def stemming_titles(df):
    stemming = Porter
    return df['title'].apply(lambda x: stemming.stem(x))

In [419]:
processed_df['title'] = stemming_titles(processed_df)
processed_df

Unnamed: 0,doc_id,title,group_id,target,tuple,new_title
0,15731,–≤–∞–∑ 21213 –∑–∞–º –ø–æ–¥—à–∏–ø–Ω–∏–∫ —Å—Ç—É–ø–∏—Ü –Ω–∏–≤,1,0.0,"(15731, –≤–∞–∑ 21213 –∑–∞–º–µ–Ω–∞ –ø–æ–¥—à–∏–ø–Ω–∏–∫–æ–≤ —Å—Ç—É–ø–∏—Ü—ã –Ω...",–≤–∞–∑ –∑–∞–º –ø–æ–¥—à–∏–ø–Ω–∏–∫ —Å—Ç—É–ø–∏—Ü –Ω–∏–≤
1,14829,–≤–∞–∑ 2107 –≤ —Å–æ—á —Å—Ä–∞–≤–Ω —Ü–µ–Ω –∫—É–ø –ø–æ—Ç—Ä–µ–±–∏—Ç–µ–ª—å—Å–∫ —Ç–æ–≤...,1,0.0,"(14829, –≤–∞–∑ 2107 –≤ —Å–æ—á–∏ —Å—Ä–∞–≤–Ω–∏—Ç—å —Ü–µ–Ω—ã –∫—É–ø–∏—Ç—å –ø...",–≤–∞–∑ —Å–æ—á —Å—Ä–∞–≤–Ω —Ü–µ–Ω –∫—É–ø –ø–æ—Ç—Ä–µ–±–∏—Ç–µ–ª—å—Å–∫ —Ç–æ–≤–∞—Ä –Ω–∞
2,15764,–∫—É–ø —Å—Ç—É–ø–∏—Ü –ª–∞–¥ –∫–∞–ª–∏–Ω–∞2 –ø–µ—Ä–µ—Ö–æ–¥–Ω —Å—Ç—É–ø–∏—Ü —Ü–µ–Ω –∑–∞–º,1,0.0,"(15764, –∫—É–ø–∏—Ç—å —Å—Ç—É–ø–∏—Ü–∞ –ª–∞–¥–∞ –∫–∞–ª–∏–Ω–∞2 –ø–µ—Ä–µ—Ö–æ–¥–Ω—ã–µ...",–∫—É–ø —Å—Ç—É–ø–∏—Ü –ª–∞–¥ –∫–∞–ª–∏–Ω–∞2 –ø–µ—Ä–µ—Ö–æ–¥–Ω —Å—Ç—É–ø–∏—Ü —Ü–µ–Ω –∑–∞–º
3,17669,–∫–ª–∞—Å—Å–∏–∫ 21010 21074,1,0.0,"(17669, –∫–ª–∞—Å—Å–∏–∫–∞ 21010 21074 )",–∫–ª–∞—Å—Å–∏–∫
4,14852,—Å—Ç—É–ø–∏—Ü –Ω–∏–≤ –∑–∞–º –ø–æ–¥—à–∏–ø–Ω–∏–∫ —Å–≤–æ —Ä—É–∫,1,0.0,"(14852, —Å—Ç—É–ø–∏—Ü–∞ –Ω–∏–≤–∞ –∑–∞–º–µ–Ω–∞ –ø–æ–¥—à–∏–ø–Ω–∏–∫–∞ —Å–≤–æ–∏–º–∏ ...",—Å—Ç—É–ø–∏—Ü –Ω–∏–≤ –∑–∞–º –ø–æ–¥—à–∏–ø–Ω–∏–∫ —Å–≤–æ —Ä—É–∫
...,...,...,...,...,...,...
28312,19232,,267,-1.0,"(19232, )",
28313,18594,,267,-1.0,"(18594, )",
28314,20015,,267,-1.0,"(20015, )",
28315,21264,,267,-1.0,"(21264, )",
