In [2]:
import warnings
warnings.simplefilter('ignore')

import os
import re
import gc
import glob
from collections import Counter

import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)
from tqdm.auto import tqdm
tqdm.pandas()

import scipy
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

import nltk
from nltk import ne_chunk, pos_tag
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tokenize.treebank import TreebankWordDetokenizer
import spacy
from textblob import TextBlob
import gensim

STOP_WORDS = set(stopwords.words('english'))
SPACY_NER_MODEL = spacy.load('en_core_web_sm')

from pandarallel import pandarallel
pandarallel.initialize(nb_workers=4, progress_bar=True)

INFO: Pandarallel will run on 4 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [3]:
%%time

train = pd.read_pickle('./data/train_data.pkl')
valid = pd.read_pickle('./data/valid_data.pkl')

# 暂时不用 title
train.drop(['title'], axis=1, inplace=True)
valid.drop(['title'], axis=1, inplace=True)
gc.collect()

display(train)
display(valid)

Unnamed: 0,prompt_answer,target,candidate,id,text,recall_rank
0,What is the formula for calculating the nullit...,1,17458663,3,The nullity of a graph in the mathematical sub...,1
1,What is the formula for calculating the nullit...,0,17458495,3,"In graph theory, a branch of mathematics, the ...",2
2,What is the formula for calculating the nullit...,0,745714,3,"In the mathematical theory of matroids, a grap...",3
3,What is the formula for calculating the nullit...,0,37123010,3,"In the mathematical theory of matroids, the ra...",4
4,What is the formula for calculating the nullit...,0,244321,3,"In combinatorics, a branch of mathematics, a m...",5
...,...,...,...,...,...,...
29999995,What is the Ukrainian Physicists' Tournament f...,0,68001314,68891,Andrey Valeryevich Andreychenko (; born 29 Mar...,996
29999996,What is the Ukrainian Physicists' Tournament f...,0,52096152,68891,The 2016 European championships of internation...,997
29999997,What is the Ukrainian Physicists' Tournament f...,0,53113196,68891,Curling at the 2017 Winter Universiade was hel...,998
29999998,What is the Ukrainian Physicists' Tournament f...,0,57749759,68891,The 2018–19 Ukrainian Football Amateur League ...,999


Unnamed: 0,prompt_answer,target,candidate,id,text,recall_rank
0,What is the definition of a solvent? A substan...,0,37431,0,"thumb|Ethyl acetate, nail polish solvent. A so...",1
1,What is the definition of a solvent? A substan...,0,2740944,0,"Universal solvent may refer to: *Water, descri...",2
2,What is the definition of a solvent? A substan...,1,73305281,0,A solvent is a substance that is liquid at the...,3
3,What is the definition of a solvent? A substan...,0,5048079,0,A solvent dye is a dye soluble in organic solv...,4
4,What is the definition of a solvent? A substan...,0,229643,0,Molality is a measure of the number of moles o...,5
...,...,...,...,...,...,...
2526995,"Where is the Komatsu 960E-1 assembled? Peoria,...",0,25121593,2526,"KOA Corporation (Japanese: コーア株式会社, Kōa kabush...",996
2526996,"Where is the Komatsu 960E-1 assembled? Peoria,...",0,15040535,2526,The Mitsubishi Colt (A20) was one of their fir...,997
2526997,"Where is the Komatsu 960E-1 assembled? Peoria,...",0,62697879,2526,The KTM 250 FRR was a racing motorcycle made b...,998
2526998,"Where is the Komatsu 960E-1 assembled? Peoria,...",0,1054261,2526,The Mitsubishi 6A1 engine is a series of pisto...,999


CPU times: user 29.5 s, sys: 13.7 s, total: 43.1 s
Wall time: 43.1 s


In [4]:
# train 负采样

train_pos = train[train['target'] == 1].copy().reset_index(drop=True)
train_neg = train[train['target'] == 0].copy().reset_index(drop=True)
train_neg = train_neg.sample(n=300000, random_state=42).copy().reset_index(drop=True)
train = pd.concat([train_pos, train_neg]).sort_values(
    ['id', 'recall_rank'], ascending=[True, True]).reset_index(drop=True)
print(train['target'].mean())

0.0908842968444594


In [5]:
display(train)

Unnamed: 0,prompt_answer,target,candidate,id,text,recall_rank
0,What is the formula for calculating the nullit...,1,17458663,3,The nullity of a graph in the mathematical sub...,1
1,What is the formula for calculating the nullit...,0,2472880,3,"In mathematics, a biased graph is a graph with...",68
2,What is the formula for calculating the nullit...,0,59747277,3,thumb|An illustration of the three sets in the...,286
3,What is the formula for calculating the nullit...,0,37868330,3,"In probability theory, a transition rate matri...",292
4,What is the formula for calculating the nullit...,0,48413286,3,"In discrete mathematics, the Bregman–Minc ineq...",344
...,...,...,...,...,...,...
329986,What is the Ukrainian Physicists' Tournament f...,0,49478061,68891,The Ukrainian men's national 3x3 team represen...,394
329987,What is the Ukrainian Physicists' Tournament f...,0,26757624,68891,The 2009–10 Ukrainian League Cup was the only ...,430
329988,What is the Ukrainian Physicists' Tournament f...,0,67123178,68891,This is a comprehensive list of victories of t...,554
329989,What is the Ukrainian Physicists' Tournament f...,0,73925214,68891,FC Shakhta Ukraina Ukrainsk was a football tea...,610


In [9]:
# 输入 datafram 输出特征

def make_features(dataframe):
    
    dataframe['token1'] = dataframe['prompt_answer'].parallel_apply(lambda x: x.split()) 
    dataframe['token2'] = dataframe['text'].parallel_apply(lambda x: x.split())
    
    dataframe.drop(['prompt_answer', 'text'], axis=1, inplace=True)
    gc.collect()
    
    def word_overlap_count(token1, token2): 
        def check_is_stop_word(word):
            return word not in STOP_WORDS
        words1 = list(filter(check_is_stop_word, token1))
        words2 = list(filter(check_is_stop_word, token2))
        return len(set(words1).intersection(set(words2)))
    
    def ngrams(token, n):
        ngrams = zip(*[token[i:] for i in range(n)])
        return [" ".join(ngram) for ngram in ngrams]
    
    def ngram_co_occurrence(token1, token2, n):
        ngrams1 = set(ngrams(token1, n))
        ngrams2 = set(ngrams(token2, n))
        common_ngrams = ngrams1.intersection(ngrams2)
        return len(common_ngrams)
    
    dataframe['words_overlap_count'] = dataframe.parallel_apply(
        lambda row: word_overlap_count(row['token1'], row['token2']), axis=1)
    
    dataframe['2gram_overlap_count'] = dataframe.parallel_apply(
        lambda row: ngram_co_occurrence(row['token1'], row['token2'], 2), axis=1)

    dataframe['3gram_overlap_count'] = dataframe.parallel_apply(
        lambda row: ngram_co_occurrence(row['token1'], row['token2'], 3), axis=1)

    dataframe['4gram_overlap_count'] = dataframe.parallel_apply(
        lambda row: ngram_co_occurrence(row['token1'], row['token2'], 4), axis=1)
    
    dataframe['text_token_length'] = dataframe['token2'].apply(len)
    
    dataframe['words_overlap_ratio'] = dataframe['words_overlap_count'] / (dataframe['text_token_length'] + 1)
    dataframe['2gram_overlap_ratio'] = dataframe['2gram_overlap_count'] / (dataframe['text_token_length'] + 1)
    dataframe['3gram_overlap_ratio'] = dataframe['3gram_overlap_count'] / (dataframe['text_token_length'] + 1)
    dataframe['4gram_overlap_ratio'] = dataframe['4gram_overlap_count'] / (dataframe['text_token_length'] + 1)
    
    dataframe.drop(['token1', 'token2'], axis=1, inplace=True)
    gc.collect()
    
    return dataframe

In [10]:
%%time

bs = 100000
n_splits = len(train) // bs if len(train) % bs == 0 else len(train) // bs + 1

for i in range(n_splits):
    print(f'{i*bs} start')

    if bs*(i+1) < len(train):
        dataframe = train[bs*i:bs*(i+1)].copy().reset_index(drop=True)
    else:
        dataframe = train[bs*i:].copy().reset_index(drop=True)
        
    dataframe = make_features(dataframe)
    display(dataframe)
    dataframe.to_pickle(f'train_features_{i}.pkl')
    del dataframe; gc.collect()

0 start


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=25000), Label(value='0 / 25000')))…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=25000), Label(value='0 / 25000')))…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=25000), Label(value='0 / 25000')))…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=25000), Label(value='0 / 25000')))…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=25000), Label(value='0 / 25000')))…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=25000), Label(value='0 / 25000')))…

Unnamed: 0,target,candidate,id,recall_rank,words_overlap_count,2gram_overlap_count,3gram_overlap_count,4gram_overlap_count,text_token_length,words_overlap_ratio,2gram_overlap_ratio,3gram_overlap_ratio,4gram_overlap_ratio
0,1,17458663,3,1,11,16,11,6,330,0.033233,0.048338,0.033233,0.018127
1,0,2472880,3,68,9,10,4,1,1534,0.005863,0.006515,0.002606,0.000651
2,0,59747277,3,286,5,4,1,0,975,0.005123,0.004098,0.001025,0.000000
3,0,37868330,3,292,3,3,1,0,279,0.010714,0.010714,0.003571,0.000000
4,0,48413286,3,344,5,5,2,0,589,0.008475,0.008475,0.003390,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,0,35184346,20678,690,0,0,0,0,1185,0.000000,0.000000,0.000000,0.000000
99996,0,25233839,20678,749,0,0,0,0,64,0.000000,0.000000,0.000000,0.000000
99997,0,72067116,20678,778,0,0,0,0,108,0.000000,0.000000,0.000000,0.000000
99998,0,27057753,20678,875,0,0,0,0,219,0.000000,0.000000,0.000000,0.000000


100000 start


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=25000), Label(value='0 / 25000')))…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=25000), Label(value='0 / 25000')))…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=25000), Label(value='0 / 25000')))…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=25000), Label(value='0 / 25000')))…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=25000), Label(value='0 / 25000')))…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=25000), Label(value='0 / 25000')))…

Unnamed: 0,target,candidate,id,recall_rank,words_overlap_count,2gram_overlap_count,3gram_overlap_count,4gram_overlap_count,text_token_length,words_overlap_ratio,2gram_overlap_ratio,3gram_overlap_ratio,4gram_overlap_ratio
0,0,58611075,20678,926,3,0,0,0,354,0.008451,0.000000,0.000000,0.0
1,1,4485403,20681,1,7,6,2,0,3756,0.001863,0.001597,0.000532,0.0
2,0,5302845,20681,224,1,1,0,0,1033,0.000967,0.000967,0.000000,0.0
3,0,46863916,20681,225,3,2,0,0,326,0.009174,0.006116,0.000000,0.0
4,0,423832,20681,497,2,2,0,0,359,0.005556,0.005556,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,0,70788340,41754,606,1,1,0,0,406,0.002457,0.002457,0.000000,0.0
99996,0,43072020,41754,624,1,2,0,0,381,0.002618,0.005236,0.000000,0.0
99997,0,57442012,41754,644,0,2,0,0,3362,0.000000,0.000595,0.000000,0.0
99998,0,651485,41754,701,6,4,1,0,1720,0.003486,0.002324,0.000581,0.0


200000 start


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=25000), Label(value='0 / 25000')))…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=25000), Label(value='0 / 25000')))…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=25000), Label(value='0 / 25000')))…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=25000), Label(value='0 / 25000')))…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=25000), Label(value='0 / 25000')))…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=25000), Label(value='0 / 25000')))…

Unnamed: 0,target,candidate,id,recall_rank,words_overlap_count,2gram_overlap_count,3gram_overlap_count,4gram_overlap_count,text_token_length,words_overlap_ratio,2gram_overlap_ratio,3gram_overlap_ratio,4gram_overlap_ratio
0,0,27400699,41754,998,0,1,0,0,202,0.000000,0.004926,0.000000,0.000000
1,1,19874353,41757,1,21,20,7,4,1371,0.015306,0.014577,0.005102,0.002915
2,0,23305394,41757,79,2,0,0,0,61,0.032258,0.000000,0.000000,0.000000
3,0,1884226,41757,179,3,2,0,0,318,0.009404,0.006270,0.000000,0.000000
4,0,1528995,41757,275,1,0,0,0,40,0.024390,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,0,2175813,62695,698,9,3,0,0,2254,0.003991,0.001330,0.000000,0.000000
99996,0,63345354,62695,716,1,0,0,0,155,0.006410,0.000000,0.000000,0.000000
99997,0,32961823,62695,855,1,0,0,0,81,0.012195,0.000000,0.000000,0.000000
99998,0,21152191,62695,874,3,0,0,0,625,0.004792,0.000000,0.000000,0.000000


300000 start


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=7498), Label(value='0 / 7498'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=7498), Label(value='0 / 7498'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=7498), Label(value='0 / 7498'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=7498), Label(value='0 / 7498'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=7498), Label(value='0 / 7498'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=7498), Label(value='0 / 7498'))), …

Unnamed: 0,target,candidate,id,recall_rank,words_overlap_count,2gram_overlap_count,3gram_overlap_count,4gram_overlap_count,text_token_length,words_overlap_ratio,2gram_overlap_ratio,3gram_overlap_ratio,4gram_overlap_ratio
0,0,32566,62698,200,4,3,0,0,1377,0.002903,0.002177,0.0,0.0
1,0,21227565,62698,248,9,4,0,0,2281,0.003944,0.001753,0.0,0.0
2,0,70470456,62698,266,1,0,0,0,17,0.055556,0.000000,0.0,0.0
3,0,13936369,62698,363,3,2,0,0,552,0.005425,0.003617,0.0,0.0
4,0,4320615,62698,390,4,4,0,0,2457,0.001627,0.001627,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
29986,0,49478061,68891,394,3,1,0,0,153,0.019481,0.006494,0.0,0.0
29987,0,26757624,68891,430,5,1,0,0,1817,0.002750,0.000550,0.0,0.0
29988,0,67123178,68891,554,0,0,0,0,887,0.000000,0.000000,0.0,0.0
29989,0,73925214,68891,610,2,1,0,0,146,0.013605,0.006803,0.0,0.0


CPU times: user 5min 11s, sys: 1min 34s, total: 6min 45s
Wall time: 9min 49s


In [11]:
%%time

bs = 100000
n_splits = len(valid) // bs if len(valid) % bs == 0 else len(valid) // bs + 1

for i in range(n_splits):
    print(f'{i*bs} start')
    
    if bs*(i+1) < len(valid):
        dataframe = valid[bs*i:bs*(i+1)].copy().reset_index(drop=True)
    else:
        dataframe = valid[bs*i:].copy().reset_index(drop=True)
        
    dataframe = make_features(dataframe)
    display(dataframe)
    dataframe.to_pickle(f'valid_features_{i}.pkl')
    del dataframe; gc.collect()

0 start


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=25000), Label(value='0 / 25000')))…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=25000), Label(value='0 / 25000')))…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=25000), Label(value='0 / 25000')))…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=25000), Label(value='0 / 25000')))…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=25000), Label(value='0 / 25000')))…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=25000), Label(value='0 / 25000')))…

Unnamed: 0,target,candidate,id,recall_rank,words_overlap_count,2gram_overlap_count,3gram_overlap_count,4gram_overlap_count,text_token_length,words_overlap_ratio,2gram_overlap_ratio,3gram_overlap_ratio,4gram_overlap_ratio
0,0,37431,0,1,4,3,0,0,3710,0.001078,0.000808,0.000000,0.000000
1,0,2740944,0,2,3,0,0,0,36,0.081081,0.000000,0.000000,0.000000
2,1,73305281,0,3,8,12,7,6,4104,0.001949,0.002923,0.001705,0.001462
3,0,5048079,0,4,1,0,0,0,167,0.005952,0.000000,0.000000,0.000000
4,0,229643,0,5,5,4,1,0,2334,0.002141,0.001713,0.000428,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,0,58055739,99,996,0,0,0,0,505,0.000000,0.000000,0.000000,0.000000
99996,0,38776215,99,997,0,0,0,0,458,0.000000,0.000000,0.000000,0.000000
99997,0,43344479,99,998,4,2,1,0,605,0.006601,0.003300,0.001650,0.000000
99998,0,25025517,99,999,4,2,0,0,3397,0.001177,0.000589,0.000000,0.000000


100000 start


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=25000), Label(value='0 / 25000')))…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=25000), Label(value='0 / 25000')))…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=25000), Label(value='0 / 25000')))…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=25000), Label(value='0 / 25000')))…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=25000), Label(value='0 / 25000')))…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=25000), Label(value='0 / 25000')))…

Unnamed: 0,target,candidate,id,recall_rank,words_overlap_count,2gram_overlap_count,3gram_overlap_count,4gram_overlap_count,text_token_length,words_overlap_ratio,2gram_overlap_ratio,3gram_overlap_ratio,4gram_overlap_ratio
0,1,674969,100,1,7,6,4,3,880,0.007946,0.006810,0.004540,0.003405
1,0,43551551,100,2,2,0,0,0,207,0.009615,0.000000,0.000000,0.000000
2,0,27127825,100,3,0,0,0,0,59,0.000000,0.000000,0.000000,0.000000
3,0,65962788,100,4,1,0,0,0,169,0.005882,0.000000,0.000000,0.000000
4,0,6521538,100,5,0,1,0,0,133,0.000000,0.007463,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,0,47190687,199,996,5,2,0,0,344,0.014493,0.005797,0.000000,0.000000
99996,0,28382142,199,997,11,9,5,3,16155,0.000681,0.000557,0.000309,0.000186
99997,0,1475520,199,998,6,2,0,0,796,0.007528,0.002509,0.000000,0.000000
99998,0,72201050,199,999,7,6,2,1,931,0.007511,0.006438,0.002146,0.001073


200000 start


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=25000), Label(value='0 / 25000')))…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=25000), Label(value='0 / 25000')))…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=25000), Label(value='0 / 25000')))…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=25000), Label(value='0 / 25000')))…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=25000), Label(value='0 / 25000')))…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=25000), Label(value='0 / 25000')))…

Unnamed: 0,target,candidate,id,recall_rank,words_overlap_count,2gram_overlap_count,3gram_overlap_count,4gram_overlap_count,text_token_length,words_overlap_ratio,2gram_overlap_ratio,3gram_overlap_ratio,4gram_overlap_ratio
0,1,10181116,200,1,15,11,5,3,1143,0.013112,0.009615,0.004371,0.002622
1,0,2727254,200,2,5,0,0,0,1752,0.002852,0.000000,0.000000,0.000000
2,0,24328441,200,3,5,1,0,0,1212,0.004122,0.000824,0.000000,0.000000
3,0,2741068,200,4,6,1,0,0,1661,0.003610,0.000602,0.000000,0.000000
4,0,392034,200,5,7,2,0,0,1421,0.004923,0.001406,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,0,56193899,299,996,2,1,0,0,290,0.006873,0.003436,0.000000,0.000000
99996,0,43835762,299,997,4,1,0,0,890,0.004489,0.001122,0.000000,0.000000
99997,0,1116577,299,998,0,1,0,0,140,0.000000,0.007092,0.000000,0.000000
99998,0,23984627,299,999,0,1,0,0,167,0.000000,0.005952,0.000000,0.000000


300000 start


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=25000), Label(value='0 / 25000')))…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=25000), Label(value='0 / 25000')))…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=25000), Label(value='0 / 25000')))…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=25000), Label(value='0 / 25000')))…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=25000), Label(value='0 / 25000')))…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=25000), Label(value='0 / 25000')))…

Unnamed: 0,target,candidate,id,recall_rank,words_overlap_count,2gram_overlap_count,3gram_overlap_count,4gram_overlap_count,text_token_length,words_overlap_ratio,2gram_overlap_ratio,3gram_overlap_ratio,4gram_overlap_ratio
0,1,38626463,300,1,7,7,1,0,3957,0.001769,0.001769,0.000253,0.0
1,0,15813371,300,2,1,0,0,0,17,0.055556,0.000000,0.000000,0.0
2,0,71111774,300,3,1,1,0,0,67,0.014706,0.014706,0.000000,0.0
3,0,42101604,300,4,1,0,0,0,180,0.005525,0.000000,0.000000,0.0
4,0,2948142,300,5,1,1,0,0,69,0.014286,0.014286,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,0,2483376,399,996,2,1,0,0,1019,0.001961,0.000980,0.000000,0.0
99996,0,17421000,399,997,0,0,0,0,928,0.000000,0.000000,0.000000,0.0
99997,0,48512293,399,998,1,1,0,0,294,0.003390,0.003390,0.000000,0.0
99998,0,592968,399,999,1,0,0,0,413,0.002415,0.000000,0.000000,0.0


400000 start


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=25000), Label(value='0 / 25000')))…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=25000), Label(value='0 / 25000')))…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=25000), Label(value='0 / 25000')))…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=25000), Label(value='0 / 25000')))…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=25000), Label(value='0 / 25000')))…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=25000), Label(value='0 / 25000')))…

Unnamed: 0,target,candidate,id,recall_rank,words_overlap_count,2gram_overlap_count,3gram_overlap_count,4gram_overlap_count,text_token_length,words_overlap_ratio,2gram_overlap_ratio,3gram_overlap_ratio,4gram_overlap_ratio
0,1,23624028,400,1,10,15,12,9,2397,0.004170,0.006255,0.005004,0.003753
1,0,15650509,400,2,3,2,1,0,124,0.024000,0.016000,0.008000,0.000000
2,0,10722906,400,3,2,1,0,0,798,0.002503,0.001252,0.000000,0.000000
3,0,548115,400,4,9,8,2,0,3871,0.002324,0.002066,0.000517,0.000000
4,0,304907,400,5,3,4,2,0,826,0.003628,0.004837,0.002418,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,0,27676361,499,996,2,0,0,0,210,0.009479,0.000000,0.000000,0.000000
99996,0,69138274,499,997,0,0,0,0,52,0.000000,0.000000,0.000000,0.000000
99997,0,65811617,499,998,0,0,0,0,922,0.000000,0.000000,0.000000,0.000000
99998,0,5092263,499,999,0,0,0,0,893,0.000000,0.000000,0.000000,0.000000


500000 start


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=25000), Label(value='0 / 25000')))…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=25000), Label(value='0 / 25000')))…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=25000), Label(value='0 / 25000')))…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=25000), Label(value='0 / 25000')))…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=25000), Label(value='0 / 25000')))…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=25000), Label(value='0 / 25000')))…

Unnamed: 0,target,candidate,id,recall_rank,words_overlap_count,2gram_overlap_count,3gram_overlap_count,4gram_overlap_count,text_token_length,words_overlap_ratio,2gram_overlap_ratio,3gram_overlap_ratio,4gram_overlap_ratio
0,1,53311744,500,1,22,22,15,12,1769,0.012429,0.012429,0.008475,0.00678
1,0,27146693,500,2,8,2,0,0,255,0.031250,0.007812,0.000000,0.00000
2,0,7121345,500,3,0,0,0,0,1,0.000000,0.000000,0.000000,0.00000
3,0,1250206,500,4,8,1,0,0,897,0.008909,0.001114,0.000000,0.00000
4,0,57461942,500,5,7,3,0,0,612,0.011419,0.004894,0.000000,0.00000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,0,56222182,599,996,2,2,0,0,624,0.003200,0.003200,0.000000,0.00000
99996,0,8546209,599,997,6,3,0,0,1867,0.003212,0.001606,0.000000,0.00000
99997,0,31122096,599,998,3,1,0,0,1644,0.001824,0.000608,0.000000,0.00000
99998,0,20642168,599,999,1,1,0,0,305,0.003268,0.003268,0.000000,0.00000


600000 start


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=25000), Label(value='0 / 25000')))…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=25000), Label(value='0 / 25000')))…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=25000), Label(value='0 / 25000')))…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=25000), Label(value='0 / 25000')))…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=25000), Label(value='0 / 25000')))…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=25000), Label(value='0 / 25000')))…

Unnamed: 0,target,candidate,id,recall_rank,words_overlap_count,2gram_overlap_count,3gram_overlap_count,4gram_overlap_count,text_token_length,words_overlap_ratio,2gram_overlap_ratio,3gram_overlap_ratio,4gram_overlap_ratio
0,0,18973446,600,1,5,4,2,0,5140,0.000973,0.000778,0.000389,0.000000
1,0,6473626,600,2,1,0,0,0,454,0.002198,0.000000,0.000000,0.000000
2,0,38945358,600,3,2,2,0,0,2424,0.000825,0.000825,0.000000,0.000000
3,1,11953,600,4,5,6,2,1,6479,0.000772,0.000926,0.000309,0.000154
4,0,18716923,600,5,4,2,0,0,4952,0.000808,0.000404,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,0,21376680,699,996,0,0,0,0,41,0.000000,0.000000,0.000000,0.000000
99996,0,12805520,699,997,1,2,0,0,307,0.003247,0.006494,0.000000,0.000000
99997,0,5763503,699,998,2,2,0,0,301,0.006623,0.006623,0.000000,0.000000
99998,0,32160047,699,999,1,1,0,0,135,0.007353,0.007353,0.000000,0.000000


700000 start


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=25000), Label(value='0 / 25000')))…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=25000), Label(value='0 / 25000')))…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=25000), Label(value='0 / 25000')))…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=25000), Label(value='0 / 25000')))…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=25000), Label(value='0 / 25000')))…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=25000), Label(value='0 / 25000')))…

Unnamed: 0,target,candidate,id,recall_rank,words_overlap_count,2gram_overlap_count,3gram_overlap_count,4gram_overlap_count,text_token_length,words_overlap_ratio,2gram_overlap_ratio,3gram_overlap_ratio,4gram_overlap_ratio
0,0,1758814,700,1,3,1,0,0,25,0.115385,0.038462,0.000000,0.0
1,1,1758788,700,2,6,3,1,0,977,0.006135,0.003067,0.001022,0.0
2,0,29155639,700,3,1,0,0,0,32,0.030303,0.000000,0.000000,0.0
3,0,1937128,700,4,6,0,0,0,737,0.008130,0.000000,0.000000,0.0
4,0,63812315,700,5,4,2,0,0,114,0.034783,0.017391,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,0,11987570,799,996,2,1,0,0,203,0.009804,0.004902,0.000000,0.0
99996,0,58267,799,997,4,0,0,0,609,0.006557,0.000000,0.000000,0.0
99997,0,32543471,799,998,2,0,0,0,240,0.008299,0.000000,0.000000,0.0
99998,0,43687957,799,999,8,3,0,0,443,0.018018,0.006757,0.000000,0.0


800000 start


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=25000), Label(value='0 / 25000')))…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=25000), Label(value='0 / 25000')))…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=25000), Label(value='0 / 25000')))…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=25000), Label(value='0 / 25000')))…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=25000), Label(value='0 / 25000')))…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=25000), Label(value='0 / 25000')))…

Unnamed: 0,target,candidate,id,recall_rank,words_overlap_count,2gram_overlap_count,3gram_overlap_count,4gram_overlap_count,text_token_length,words_overlap_ratio,2gram_overlap_ratio,3gram_overlap_ratio,4gram_overlap_ratio
0,1,27965042,800,1,12,11,7,4,233,0.051282,0.047009,0.029915,0.017094
1,0,29218677,800,2,1,0,0,0,108,0.009174,0.000000,0.000000,0.000000
2,0,16256107,800,3,6,4,2,1,314,0.019048,0.012698,0.006349,0.003175
3,0,34789445,800,4,3,0,0,0,30,0.096774,0.000000,0.000000,0.000000
4,0,1383324,800,5,0,0,0,0,46,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,0,52738661,899,996,0,0,0,0,138,0.000000,0.000000,0.000000,0.000000
99996,0,2591256,899,997,0,0,0,0,344,0.000000,0.000000,0.000000,0.000000
99997,0,1286163,899,998,3,0,0,0,1154,0.002597,0.000000,0.000000,0.000000
99998,0,47777358,899,999,0,0,0,0,733,0.000000,0.000000,0.000000,0.000000


900000 start


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=25000), Label(value='0 / 25000')))…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=25000), Label(value='0 / 25000')))…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=25000), Label(value='0 / 25000')))…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=25000), Label(value='0 / 25000')))…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=25000), Label(value='0 / 25000')))…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=25000), Label(value='0 / 25000')))…

Unnamed: 0,target,candidate,id,recall_rank,words_overlap_count,2gram_overlap_count,3gram_overlap_count,4gram_overlap_count,text_token_length,words_overlap_ratio,2gram_overlap_ratio,3gram_overlap_ratio,4gram_overlap_ratio
0,1,38279,900,1,19,24,15,10,1278,0.014855,0.018765,0.011728,0.007819
1,0,32103630,900,2,7,4,1,0,467,0.014957,0.008547,0.002137,0.000000
2,0,676632,900,3,13,11,3,2,1100,0.011807,0.009991,0.002725,0.001817
3,0,546183,900,4,4,3,1,0,209,0.019048,0.014286,0.004762,0.000000
4,0,2273418,900,5,12,12,2,0,550,0.021779,0.021779,0.003630,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,0,66324261,999,996,0,0,0,0,117,0.000000,0.000000,0.000000,0.000000
99996,0,10167825,999,997,2,2,0,0,200,0.009950,0.009950,0.000000,0.000000
99997,0,28256450,999,998,0,1,0,0,225,0.000000,0.004425,0.000000,0.000000
99998,0,20279343,999,999,1,0,0,0,524,0.001905,0.000000,0.000000,0.000000


1000000 start


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=25000), Label(value='0 / 25000')))…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=25000), Label(value='0 / 25000')))…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=25000), Label(value='0 / 25000')))…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=25000), Label(value='0 / 25000')))…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=25000), Label(value='0 / 25000')))…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=25000), Label(value='0 / 25000')))…

Unnamed: 0,target,candidate,id,recall_rank,words_overlap_count,2gram_overlap_count,3gram_overlap_count,4gram_overlap_count,text_token_length,words_overlap_ratio,2gram_overlap_ratio,3gram_overlap_ratio,4gram_overlap_ratio
0,1,63778,1000,1,21,27,21,17,3247,0.006466,0.008313,0.006466,0.005234
1,0,1473033,1000,2,9,4,0,0,614,0.014634,0.006504,0.000000,0.000000
2,0,4751128,1000,3,13,9,2,1,1621,0.008015,0.005549,0.001233,0.000617
3,0,24462958,1000,4,22,15,4,0,7999,0.002750,0.001875,0.000500,0.000000
4,0,3069520,1000,5,14,7,0,0,3213,0.004356,0.002178,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,0,66069171,1099,996,6,4,0,0,1152,0.005204,0.003469,0.000000,0.000000
99996,0,69650537,1099,997,3,3,0,0,2884,0.001040,0.001040,0.000000,0.000000
99997,0,65865322,1099,998,2,1,0,0,67,0.029412,0.014706,0.000000,0.000000
99998,0,19150842,1099,999,0,1,0,0,197,0.000000,0.005051,0.000000,0.000000


1100000 start


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=25000), Label(value='0 / 25000')))…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=25000), Label(value='0 / 25000')))…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=25000), Label(value='0 / 25000')))…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=25000), Label(value='0 / 25000')))…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=25000), Label(value='0 / 25000')))…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=25000), Label(value='0 / 25000')))…

Unnamed: 0,target,candidate,id,recall_rank,words_overlap_count,2gram_overlap_count,3gram_overlap_count,4gram_overlap_count,text_token_length,words_overlap_ratio,2gram_overlap_ratio,3gram_overlap_ratio,4gram_overlap_ratio
0,1,1184256,1100,1,15,11,5,2,914,0.016393,0.012022,0.005464,0.002186
1,0,14886108,1100,2,10,5,1,0,2713,0.003685,0.001842,0.000368,0.000000
2,0,61137235,1100,3,4,0,0,0,436,0.009153,0.000000,0.000000,0.000000
3,0,27893,1100,4,7,5,0,0,3056,0.002290,0.001636,0.000000,0.000000
4,0,3906529,1100,5,3,0,0,0,1085,0.002762,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,0,7073120,1199,996,2,1,0,0,1415,0.001412,0.000706,0.000000,0.000000
99996,0,619137,1199,997,5,1,0,0,4438,0.001126,0.000225,0.000000,0.000000
99997,0,28368826,1199,998,0,0,0,0,48,0.000000,0.000000,0.000000,0.000000
99998,0,22178141,1199,999,0,0,0,0,19,0.000000,0.000000,0.000000,0.000000


1200000 start


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=25000), Label(value='0 / 25000')))…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=25000), Label(value='0 / 25000')))…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=25000), Label(value='0 / 25000')))…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=25000), Label(value='0 / 25000')))…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=25000), Label(value='0 / 25000')))…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=25000), Label(value='0 / 25000')))…

Unnamed: 0,target,candidate,id,recall_rank,words_overlap_count,2gram_overlap_count,3gram_overlap_count,4gram_overlap_count,text_token_length,words_overlap_ratio,2gram_overlap_ratio,3gram_overlap_ratio,4gram_overlap_ratio
0,1,37440954,1200,1,22,30,24,20,358,0.061281,0.083565,0.066852,0.05571
1,0,34684267,1200,2,10,5,0,0,899,0.011111,0.005556,0.000000,0.00000
2,0,7717738,1200,3,9,3,0,0,1171,0.007679,0.002560,0.000000,0.00000
3,0,1739001,1200,4,13,8,1,0,4925,0.002639,0.001624,0.000203,0.00000
4,0,22639515,1200,5,4,1,0,0,585,0.006826,0.001706,0.000000,0.00000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,0,53299226,1299,996,0,0,0,0,47,0.000000,0.000000,0.000000,0.00000
99996,0,57216813,1299,997,1,0,0,0,49,0.020000,0.000000,0.000000,0.00000
99997,0,2346998,1299,998,1,0,0,0,1853,0.000539,0.000000,0.000000,0.00000
99998,0,28792470,1299,999,0,0,0,0,1269,0.000000,0.000000,0.000000,0.00000


1300000 start


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=25000), Label(value='0 / 25000')))…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=25000), Label(value='0 / 25000')))…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=25000), Label(value='0 / 25000')))…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=25000), Label(value='0 / 25000')))…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=25000), Label(value='0 / 25000')))…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=25000), Label(value='0 / 25000')))…

Unnamed: 0,target,candidate,id,recall_rank,words_overlap_count,2gram_overlap_count,3gram_overlap_count,4gram_overlap_count,text_token_length,words_overlap_ratio,2gram_overlap_ratio,3gram_overlap_ratio,4gram_overlap_ratio
0,1,1061330,1300,1,7,4,2,1,2006,0.003488,0.001993,0.000997,0.000498
1,0,10538313,1300,2,3,3,2,1,17,0.166667,0.166667,0.111111,0.055556
2,0,4642469,1300,3,0,0,0,0,274,0.000000,0.000000,0.000000,0.000000
3,0,25171268,1300,4,0,0,0,0,29,0.000000,0.000000,0.000000,0.000000
4,0,33010564,1300,5,5,0,0,0,192,0.025907,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,0,591552,1399,996,0,1,0,0,914,0.000000,0.001093,0.000000,0.000000
99996,0,19385468,1399,997,0,1,0,0,72,0.000000,0.013699,0.000000,0.000000
99997,0,41440502,1399,998,0,0,0,0,17,0.000000,0.000000,0.000000,0.000000
99998,0,56081765,1399,999,0,1,0,0,77,0.000000,0.012821,0.000000,0.000000


1400000 start


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=25000), Label(value='0 / 25000')))…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=25000), Label(value='0 / 25000')))…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=25000), Label(value='0 / 25000')))…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=25000), Label(value='0 / 25000')))…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=25000), Label(value='0 / 25000')))…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=25000), Label(value='0 / 25000')))…

Unnamed: 0,target,candidate,id,recall_rank,words_overlap_count,2gram_overlap_count,3gram_overlap_count,4gram_overlap_count,text_token_length,words_overlap_ratio,2gram_overlap_ratio,3gram_overlap_ratio,4gram_overlap_ratio
0,1,51420514,1400,1,11,8,5,3,305,0.035948,0.026144,0.01634,0.009804
1,0,58095,1400,2,4,1,0,0,174,0.022857,0.005714,0.00000,0.000000
2,0,504113,1400,3,6,1,0,0,2145,0.002796,0.000466,0.00000,0.000000
3,0,62899370,1400,4,5,2,0,0,410,0.012165,0.004866,0.00000,0.000000
4,0,27315541,1400,5,2,0,0,0,66,0.029851,0.000000,0.00000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,0,39164445,1499,996,1,1,0,0,577,0.001730,0.001730,0.00000,0.000000
99996,0,51093888,1499,997,0,1,0,0,458,0.000000,0.002179,0.00000,0.000000
99997,0,1461554,1499,998,2,2,0,0,734,0.002721,0.002721,0.00000,0.000000
99998,0,43310556,1499,999,0,0,0,0,56,0.000000,0.000000,0.00000,0.000000


1500000 start


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=25000), Label(value='0 / 25000')))…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=25000), Label(value='0 / 25000')))…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=25000), Label(value='0 / 25000')))…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=25000), Label(value='0 / 25000')))…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=25000), Label(value='0 / 25000')))…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=25000), Label(value='0 / 25000')))…

Unnamed: 0,target,candidate,id,recall_rank,words_overlap_count,2gram_overlap_count,3gram_overlap_count,4gram_overlap_count,text_token_length,words_overlap_ratio,2gram_overlap_ratio,3gram_overlap_ratio,4gram_overlap_ratio
0,1,192266,1500,1,20,30,23,19,2084,0.009592,0.014388,0.011031,0.009113
1,0,2702319,1500,2,15,15,4,2,890,0.016835,0.016835,0.004489,0.002245
2,0,40116145,1500,3,10,8,2,0,282,0.035336,0.028269,0.007067,0.000000
3,0,40083877,1500,4,13,12,1,0,2762,0.004705,0.004343,0.000362,0.000000
4,0,37853173,1500,5,11,6,0,0,2085,0.005273,0.002876,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,0,14874419,1599,996,1,0,0,0,226,0.004405,0.000000,0.000000,0.000000
99996,0,16845570,1599,997,1,0,0,0,53,0.018519,0.000000,0.000000,0.000000
99997,0,15213668,1599,998,0,0,0,0,320,0.000000,0.000000,0.000000,0.000000
99998,0,14776731,1599,999,0,0,0,0,172,0.000000,0.000000,0.000000,0.000000


1600000 start


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=25000), Label(value='0 / 25000')))…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=25000), Label(value='0 / 25000')))…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=25000), Label(value='0 / 25000')))…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=25000), Label(value='0 / 25000')))…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=25000), Label(value='0 / 25000')))…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=25000), Label(value='0 / 25000')))…

Unnamed: 0,target,candidate,id,recall_rank,words_overlap_count,2gram_overlap_count,3gram_overlap_count,4gram_overlap_count,text_token_length,words_overlap_ratio,2gram_overlap_ratio,3gram_overlap_ratio,4gram_overlap_ratio
0,1,15467799,1600,1,11,6,3,2,282,0.038869,0.021201,0.010601,0.007067
1,0,32274761,1600,2,1,0,0,0,101,0.009804,0.000000,0.000000,0.000000
2,0,23433554,1600,3,4,2,0,0,871,0.004587,0.002294,0.000000,0.000000
3,0,4405086,1600,4,3,0,0,0,160,0.018634,0.000000,0.000000,0.000000
4,0,18016986,1600,5,7,1,0,0,3068,0.002281,0.000326,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,0,33731726,1699,996,4,3,0,0,2866,0.001395,0.001046,0.000000,0.000000
99996,0,62700303,1699,997,0,1,0,0,190,0.000000,0.005236,0.000000,0.000000
99997,0,27919032,1699,998,4,2,0,0,2652,0.001508,0.000754,0.000000,0.000000
99998,0,25038063,1699,999,1,2,0,0,885,0.001129,0.002257,0.000000,0.000000


1700000 start


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=25000), Label(value='0 / 25000')))…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=25000), Label(value='0 / 25000')))…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=25000), Label(value='0 / 25000')))…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=25000), Label(value='0 / 25000')))…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=25000), Label(value='0 / 25000')))…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=25000), Label(value='0 / 25000')))…

Unnamed: 0,target,candidate,id,recall_rank,words_overlap_count,2gram_overlap_count,3gram_overlap_count,4gram_overlap_count,text_token_length,words_overlap_ratio,2gram_overlap_ratio,3gram_overlap_ratio,4gram_overlap_ratio
0,1,21447170,1700,1,6,2,1,0,697,0.008596,0.002865,0.001433,0.0
1,0,2169108,1700,2,0,0,0,0,279,0.000000,0.000000,0.000000,0.0
2,0,61357947,1700,3,0,0,0,0,53,0.000000,0.000000,0.000000,0.0
3,0,16589577,1700,4,0,0,0,0,73,0.000000,0.000000,0.000000,0.0
4,0,9072592,1700,5,0,0,0,0,31,0.000000,0.000000,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,0,67401417,1799,996,1,1,0,0,406,0.002457,0.002457,0.000000,0.0
99996,0,59890641,1799,997,3,1,0,0,799,0.003750,0.001250,0.000000,0.0
99997,0,466322,1799,998,1,1,0,0,1107,0.000903,0.000903,0.000000,0.0
99998,0,2971112,1799,999,1,1,0,0,558,0.001789,0.001789,0.000000,0.0


1800000 start


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=25000), Label(value='0 / 25000')))…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=25000), Label(value='0 / 25000')))…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=25000), Label(value='0 / 25000')))…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=25000), Label(value='0 / 25000')))…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=25000), Label(value='0 / 25000')))…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=25000), Label(value='0 / 25000')))…

Unnamed: 0,target,candidate,id,recall_rank,words_overlap_count,2gram_overlap_count,3gram_overlap_count,4gram_overlap_count,text_token_length,words_overlap_ratio,2gram_overlap_ratio,3gram_overlap_ratio,4gram_overlap_ratio
0,1,44119867,1800,1,5,1,0,0,2877,0.001737,0.000347,0.000000,0.0
1,0,481305,1800,2,0,0,0,0,768,0.000000,0.000000,0.000000,0.0
2,0,359729,1800,3,0,0,0,0,540,0.000000,0.000000,0.000000,0.0
3,0,3748028,1800,4,1,0,0,0,975,0.001025,0.000000,0.000000,0.0
4,0,15607026,1800,5,0,0,0,0,25,0.000000,0.000000,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,0,40549986,1899,996,1,0,0,0,105,0.009434,0.000000,0.000000,0.0
99996,0,57493706,1899,997,9,5,1,0,1347,0.006677,0.003709,0.000742,0.0
99997,0,12274261,1899,998,1,3,1,0,1580,0.000633,0.001898,0.000633,0.0
99998,0,1961658,1899,999,1,0,0,0,63,0.015625,0.000000,0.000000,0.0


1900000 start


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=25000), Label(value='0 / 25000')))…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=25000), Label(value='0 / 25000')))…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=25000), Label(value='0 / 25000')))…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=25000), Label(value='0 / 25000')))…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=25000), Label(value='0 / 25000')))…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=25000), Label(value='0 / 25000')))…

Unnamed: 0,target,candidate,id,recall_rank,words_overlap_count,2gram_overlap_count,3gram_overlap_count,4gram_overlap_count,text_token_length,words_overlap_ratio,2gram_overlap_ratio,3gram_overlap_ratio,4gram_overlap_ratio
0,1,6045801,1900,1,11,18,9,5,2424,0.004536,0.007423,0.003711,0.002062
1,0,47143780,1900,2,7,1,0,0,275,0.025362,0.003623,0.000000,0.000000
2,0,58463358,1900,3,9,8,1,0,1209,0.007438,0.006612,0.000826,0.000000
3,0,22377473,1900,4,1,0,0,0,16,0.058824,0.000000,0.000000,0.000000
4,0,1163024,1900,5,9,10,1,0,2566,0.003506,0.003896,0.000390,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,0,42185120,1999,996,3,2,0,0,1584,0.001893,0.001262,0.000000,0.000000
99996,0,7150234,1999,997,1,0,0,0,111,0.008929,0.000000,0.000000,0.000000
99997,0,14851545,1999,998,0,0,0,0,358,0.000000,0.000000,0.000000,0.000000
99998,0,343949,1999,999,5,2,0,0,12560,0.000398,0.000159,0.000000,0.000000


2000000 start


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=25000), Label(value='0 / 25000')))…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=25000), Label(value='0 / 25000')))…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=25000), Label(value='0 / 25000')))…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=25000), Label(value='0 / 25000')))…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=25000), Label(value='0 / 25000')))…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=25000), Label(value='0 / 25000')))…

Unnamed: 0,target,candidate,id,recall_rank,words_overlap_count,2gram_overlap_count,3gram_overlap_count,4gram_overlap_count,text_token_length,words_overlap_ratio,2gram_overlap_ratio,3gram_overlap_ratio,4gram_overlap_ratio
0,1,223183,2000,1,20,16,11,6,423,0.047170,0.037736,0.025943,0.014151
1,0,20281080,2000,2,4,1,0,0,423,0.009434,0.002358,0.000000,0.000000
2,0,49474181,2000,3,4,0,0,0,503,0.007937,0.000000,0.000000,0.000000
3,0,57226979,2000,4,0,0,0,0,136,0.000000,0.000000,0.000000,0.000000
4,0,16669718,2000,5,1,0,0,0,440,0.002268,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,0,62485713,2099,996,1,1,0,0,104,0.009524,0.009524,0.000000,0.000000
99996,0,2428290,2099,997,1,1,0,0,1268,0.000788,0.000788,0.000000,0.000000
99997,0,24812527,2099,998,1,1,0,0,586,0.001704,0.001704,0.000000,0.000000
99998,0,4155527,2099,999,1,0,0,0,1256,0.000796,0.000000,0.000000,0.000000


2100000 start


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=25000), Label(value='0 / 25000')))…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=25000), Label(value='0 / 25000')))…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=25000), Label(value='0 / 25000')))…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=25000), Label(value='0 / 25000')))…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=25000), Label(value='0 / 25000')))…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=25000), Label(value='0 / 25000')))…

Unnamed: 0,target,candidate,id,recall_rank,words_overlap_count,2gram_overlap_count,3gram_overlap_count,4gram_overlap_count,text_token_length,words_overlap_ratio,2gram_overlap_ratio,3gram_overlap_ratio,4gram_overlap_ratio
0,1,41020,2100,1,11,12,6,2,169,0.064706,0.070588,0.035294,0.011765
1,0,3689264,2100,2,7,5,0,0,1576,0.004439,0.003171,0.000000,0.000000
2,0,2466770,2100,3,2,1,0,0,218,0.009132,0.004566,0.000000,0.000000
3,0,14171448,2100,4,9,4,0,0,3587,0.002508,0.001115,0.000000,0.000000
4,0,57477295,2100,5,4,3,0,0,1241,0.003221,0.002415,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,0,18517748,2199,996,0,0,0,0,581,0.000000,0.000000,0.000000,0.000000
99996,0,60083,2199,997,4,3,0,0,1928,0.002074,0.001555,0.000000,0.000000
99997,0,45467085,2199,998,0,0,0,0,587,0.000000,0.000000,0.000000,0.000000
99998,0,72824114,2199,999,1,0,0,0,90,0.010989,0.000000,0.000000,0.000000


2200000 start


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=25000), Label(value='0 / 25000')))…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=25000), Label(value='0 / 25000')))…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=25000), Label(value='0 / 25000')))…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=25000), Label(value='0 / 25000')))…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=25000), Label(value='0 / 25000')))…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=25000), Label(value='0 / 25000')))…

Unnamed: 0,target,candidate,id,recall_rank,words_overlap_count,2gram_overlap_count,3gram_overlap_count,4gram_overlap_count,text_token_length,words_overlap_ratio,2gram_overlap_ratio,3gram_overlap_ratio,4gram_overlap_ratio
0,0,1183199,2200,1,2,2,1,0,57,0.034483,0.034483,0.017241,0.0
1,1,39295868,2200,2,5,4,0,0,259,0.019231,0.015385,0.000000,0.0
2,0,63363244,2200,3,1,1,0,0,169,0.005882,0.005882,0.000000,0.0
3,0,50161579,2200,4,1,1,0,0,100,0.009901,0.009901,0.000000,0.0
4,0,35644552,2200,5,1,0,0,0,25,0.038462,0.000000,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,0,7760747,2299,996,6,1,0,0,1098,0.005460,0.000910,0.000000,0.0
99996,0,54597935,2299,997,4,0,0,0,270,0.014760,0.000000,0.000000,0.0
99997,0,65175699,2299,998,10,3,0,0,1859,0.005376,0.001613,0.000000,0.0
99998,0,19340261,2299,999,2,0,0,0,260,0.007663,0.000000,0.000000,0.0


2300000 start


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=25000), Label(value='0 / 25000')))…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=25000), Label(value='0 / 25000')))…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=25000), Label(value='0 / 25000')))…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=25000), Label(value='0 / 25000')))…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=25000), Label(value='0 / 25000')))…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=25000), Label(value='0 / 25000')))…

Unnamed: 0,target,candidate,id,recall_rank,words_overlap_count,2gram_overlap_count,3gram_overlap_count,4gram_overlap_count,text_token_length,words_overlap_ratio,2gram_overlap_ratio,3gram_overlap_ratio,4gram_overlap_ratio
0,1,50311973,2300,1,26,31,19,13,2960,0.008781,0.010469,0.006417,0.00439
1,0,36431056,2300,2,15,8,2,0,854,0.017544,0.009357,0.002339,0.00000
2,0,18906,2300,3,21,12,1,0,5076,0.004136,0.002364,0.000197,0.00000
3,0,60405154,2300,4,10,9,0,0,1135,0.008803,0.007923,0.000000,0.00000
4,0,24833024,2300,5,7,4,0,0,2908,0.002406,0.001375,0.000000,0.00000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,0,51224953,2399,996,2,1,0,0,822,0.002430,0.001215,0.000000,0.00000
99996,0,33753213,2399,997,0,0,0,0,492,0.000000,0.000000,0.000000,0.00000
99997,0,63031462,2399,998,4,1,0,0,1407,0.002841,0.000710,0.000000,0.00000
99998,0,12199452,2399,999,3,1,0,0,601,0.004983,0.001661,0.000000,0.00000


2400000 start


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=25000), Label(value='0 / 25000')))…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=25000), Label(value='0 / 25000')))…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=25000), Label(value='0 / 25000')))…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=25000), Label(value='0 / 25000')))…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=25000), Label(value='0 / 25000')))…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=25000), Label(value='0 / 25000')))…

Unnamed: 0,target,candidate,id,recall_rank,words_overlap_count,2gram_overlap_count,3gram_overlap_count,4gram_overlap_count,text_token_length,words_overlap_ratio,2gram_overlap_ratio,3gram_overlap_ratio,4gram_overlap_ratio
0,1,1507174,2400,1,4,0,0,0,1668,0.002397,0.000000,0.000000,0.0
1,0,17580436,2400,2,0,0,0,0,291,0.000000,0.000000,0.000000,0.0
2,0,52086631,2400,3,1,0,0,0,315,0.003165,0.000000,0.000000,0.0
3,0,32446613,2400,4,1,0,0,0,268,0.003717,0.000000,0.000000,0.0
4,0,13554259,2400,5,0,0,0,0,257,0.000000,0.000000,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,0,215539,2499,996,9,6,1,0,4951,0.001817,0.001212,0.000202,0.0
99996,0,4713918,2499,997,1,1,0,0,218,0.004566,0.004566,0.000000,0.0
99997,0,46182856,2499,998,0,0,0,0,102,0.000000,0.000000,0.000000,0.0
99998,0,10019499,2499,999,2,0,0,0,992,0.002014,0.000000,0.000000,0.0


2500000 start


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6750), Label(value='0 / 6750'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6750), Label(value='0 / 6750'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6750), Label(value='0 / 6750'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6750), Label(value='0 / 6750'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6750), Label(value='0 / 6750'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6750), Label(value='0 / 6750'))), …

Unnamed: 0,target,candidate,id,recall_rank,words_overlap_count,2gram_overlap_count,3gram_overlap_count,4gram_overlap_count,text_token_length,words_overlap_ratio,2gram_overlap_ratio,3gram_overlap_ratio,4gram_overlap_ratio
0,1,490387,2500,1,2,1,0,0,621,0.003215,0.001608,0.0,0.0
1,0,38298699,2500,2,0,0,0,0,46,0.000000,0.000000,0.0,0.0
2,0,65206136,2500,3,0,1,0,0,432,0.000000,0.002309,0.0,0.0
3,0,45344496,2500,4,0,0,0,0,34,0.000000,0.000000,0.0,0.0
4,0,22130933,2500,5,0,0,0,0,114,0.000000,0.000000,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
26995,0,25121593,2526,996,0,0,0,0,70,0.000000,0.000000,0.0,0.0
26996,0,15040535,2526,997,0,0,0,0,1367,0.000000,0.000000,0.0,0.0
26997,0,62697879,2526,998,0,0,0,0,1037,0.000000,0.000000,0.0,0.0
26998,0,1054261,2526,999,1,0,0,0,531,0.001880,0.000000,0.0,0.0


CPU times: user 35min 24s, sys: 9min 58s, total: 45min 23s
Wall time: 1h 7min 25s


In [12]:
train = []
for filename in glob.glob('train_features_*.pkl'):
    train.append(pd.read_pickle(filename))
train = pd.concat(train).sort_values('id').reset_index(drop=True)
display(train)

Unnamed: 0,target,candidate,id,recall_rank,words_overlap_count,2gram_overlap_count,3gram_overlap_count,4gram_overlap_count,text_token_length,words_overlap_ratio,2gram_overlap_ratio,3gram_overlap_ratio,4gram_overlap_ratio
0,1,17458663,3,1,11,16,11,6,330,0.033233,0.048338,0.033233,0.018127
1,0,3955446,3,852,1,2,0,0,507,0.001969,0.003937,0.000000,0.000000
2,0,67059821,3,670,5,4,0,0,322,0.015480,0.012384,0.000000,0.000000
3,0,18412684,3,655,2,2,1,0,134,0.014815,0.014815,0.007407,0.000000
4,0,20249418,3,653,4,2,0,0,364,0.010959,0.005479,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
329986,0,49478061,68891,394,3,1,0,0,153,0.019481,0.006494,0.000000,0.000000
329987,0,67123178,68891,554,0,0,0,0,887,0.000000,0.000000,0.000000,0.000000
329988,0,73925214,68891,610,2,1,0,0,146,0.013605,0.006803,0.000000,0.000000
329989,0,45228346,68891,856,1,0,0,0,503,0.001984,0.000000,0.000000,0.000000


In [13]:
valid = []
for filename in glob.glob('valid_features_*.pkl'):
    valid.append(pd.read_pickle(filename))
valid = pd.concat(valid).sort_values('id').reset_index(drop=True)
display(valid)

Unnamed: 0,target,candidate,id,recall_rank,words_overlap_count,2gram_overlap_count,3gram_overlap_count,4gram_overlap_count,text_token_length,words_overlap_ratio,2gram_overlap_ratio,3gram_overlap_ratio,4gram_overlap_ratio
0,0,43046101,0,999,0,0,0,0,75,0.000000,0.000000,0.0,0.0
1,0,529609,0,658,2,4,0,0,1522,0.001313,0.002626,0.0,0.0
2,0,5639039,0,659,0,0,0,0,88,0.000000,0.000000,0.0,0.0
3,0,462409,0,660,0,0,0,0,21,0.000000,0.000000,0.0,0.0
4,0,3080690,0,661,1,1,0,0,66,0.014925,0.014925,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2526995,0,17573213,2526,340,0,0,0,0,224,0.000000,0.000000,0.0,0.0
2526996,0,16701385,2526,341,2,0,0,0,654,0.003053,0.000000,0.0,0.0
2526997,0,63269809,2526,342,1,1,0,0,493,0.002024,0.002024,0.0,0.0
2526998,0,6000455,2526,329,1,1,0,0,2629,0.000380,0.000380,0.0,0.0


In [14]:
%%time

train.to_pickle('./data/train.pickle')
valid.to_pickle('./data/valid.pickle')

CPU times: user 94 ms, sys: 103 ms, total: 197 ms
Wall time: 206 ms
