In [7]:
import csv
import spacy
import pickle
from math import log, ceil
from scipy.stats import describe
from scipy.stats.mstats import gmean, hmean
from numpy import mean as amean

# Load the spaCy pipeline once; every question in the cells below is parsed with it.
# NOTE(review): the name loaded is 'en_default' while the inline note mentions
# en_core_web_md — confirm which model 'en_default' resolves to in this environment.
nlp = spacy.load('en_default')  # using en_core_web_md model

# Fallback IDF that get_idf returns for (lemma, POS) keys missing from IDFS;
# ceil(MAXIMUM_IDF) is also the exclusive upper bound of the threshold sweep below.
MAXIMUM_IDF = log(5000000)  # doesn't really matter, this is close to having only one occurrence in the Quora dataset


def idfs():
    """Load and return the object pickled in ``input/idfs.pickle``.

    The rest of this file treats the result as a mapping queried with
    ``(lemma, pos)`` tuples through ``.get``.
    """
    # NOTE(review): pickle.load can execute arbitrary code while unpickling;
    # only point this at a file produced by a trusted source.
    with open('input/idfs.pickle', 'rb') as pickled_table:
        table = pickle.load(pickled_table)
    return table

# IDF table loaded once when this cell runs; get_idf reads from it on every lookup.
IDFS = idfs()


def get_idf(spacy_word):
    """Return the IDF recorded for a token, or ``MAXIMUM_IDF`` if it has none.

    The token is identified in ``IDFS`` by the pair of its ``lemma_`` and
    ``pos_`` attributes.
    """
    lemma, part_of_speech = spacy_word.lemma_, spacy_word.pos_
    return IDFS.get((lemma, part_of_speech), MAXIMUM_IDF)


def train_set(size=404301):
    """Yield at most ``size`` labelled question pairs from ``input/train.csv``.

    Args:
        size: maximum number of data rows to yield, counted from the top of
            the file.

    Yields:
        ``(id, question1, question2, is_duplicate)`` tuples. ``id`` and
        ``is_duplicate`` are converted with ``int``; the two questions are
        passed through as the strings read from the file.
    """
    # newline='' hands raw line endings to the csv module, as its documentation
    # requires; otherwise line breaks embedded in quoted question text are
    # rewritten by universal-newline translation before csv sees them.
    # The encoding is pinned so decoding does not depend on the platform locale
    # (assumes the dataset is UTF-8 — TODO confirm).
    with open('input/train.csv', 'r', newline='', encoding='utf-8') as input_file:
        reader = csv.DictReader(input_file)
        for counter, line in enumerate(reader):
            if counter >= size:
                break
            yield (
                int(line['id']),
                line['question1'],
                line['question2'],
                int(line['is_duplicate']),
            )

In [3]:
def jaccard_index(set1, set2):
    """Return the Jaccard index of two sets: size of intersection over size of union.

    Two empty sets are treated as identical and score 1.0, which also avoids
    dividing by an empty union.
    """
    union = set1 | set2
    if not union:
        return 1.0
    shared = set1 & set2
    return len(shared) / len(union)


def filter_by_minimum_idf(spacy_document, minimum_idf):
    """Return the set of lemmas of tokens whose IDF is at least ``minimum_idf``.

    Each token of ``spacy_document`` is scored with ``get_idf``; tokens scoring
    below the threshold are dropped and the remaining lemmas are de-duplicated.
    """
    kept_lemmas = set()
    for token in spacy_document:
        if get_idf(token) >= minimum_idf:
            kept_lemmas.add(token.lemma_)
    return kept_lemmas

In [8]:
print('\t\tnumber of observations\t(minimum, maximum)\tmean\tvariance\tskewness\tkurtosis')

# Parse every question pair once. The parse does not depend on the IDF threshold,
# so doing it inside the sweep would repeat identical spaCy work for each threshold.
parsed_pairs = [
    (nlp(question1), nlp(question2), is_duplicate)
    for pair_id, question1, question2, is_duplicate in train_set(10000)
]

# Sweep integer IDF thresholds from 0 up to (excluding) ceil(MAXIMUM_IDF). For each
# one, score lemma overlap on the tokens that survive the threshold and summarise
# the scores separately for duplicate and non-duplicate pairs.
for minimum_idf in range(0, ceil(MAXIMUM_IDF)):
    duplicate = list()
    non_duplicate = list()
    for question1_doc, question2_doc, is_duplicate in parsed_pairs:
        overlap_score = jaccard_index(
            filter_by_minimum_idf(question1_doc, minimum_idf),
            filter_by_minimum_idf(question2_doc, minimum_idf)
        )

        if is_duplicate:
            duplicate.append(overlap_score)
        else:
            non_duplicate.append(overlap_score)

    print('MINIMUM:\t%s' % minimum_idf)
    print('DUPLICATE:\t', '\t'.join(map(str, describe(duplicate))))
    print('NOT DUPLICATE:\t', '\t'.join(map(str, describe(non_duplicate))))
    print('\n')

		number of observations	(minimum, maximum)	mean	variance	skewness	kurtosis


MINIMUM:	0
DUPLICATE:	 3711	(0.090909090909090912, 1.0)	0.557678310241	0.0395095764097	0.24153978258070485	-0.7665211347897531
NOT DUPLICATE:	 6289	(0.0, 1.0)	0.383935352073	0.0541890925243	0.6204053483680291	-0.6107482351567355




MINIMUM:	1
DUPLICATE:	 3711	(0.0, 1.0)	0.495165433371	0.0432986956872	0.4676834864115215	-0.5274396949554121
NOT DUPLICATE:	 6289	(0.0, 1.0)	0.316025564064	0.055080911096	0.769225303047541	-0.24186679499440222




MINIMUM:	2
DUPLICATE:	 3711	(0.0, 1.0)	0.512584446013	0.0500473150062	0.5517202839367089	-0.48088797360704394
NOT DUPLICATE:	 6289	(0.0, 1.0)	0.308264384527	0.0571768958037	0.7346095018646495	-0.2763900360758975




MINIMUM:	3
DUPLICATE:	 3711	(0.0, 1.0)	0.52871184858	0.0544358252093	0.5332302065389398	-0.5616588032286276
NOT DUPLICATE:	 6289	(0.0, 1.0)	0.315569254023	0.0605059042796	0.7178273521669866	-0.2571107354806177




MINIMUM:	4
DUPLICATE:	 3711	(0.0, 1.0)	0.562405830165	0.0623080751215	0.3976297409226414	-0.7735759078827118
NOT DUPLICATE:	 6289	(0.0, 1.0)	0.330528381723	0.066666287562	0.6836936696037529	-0.27146562074685043




MINIMUM:	5
DUPLICATE:	 3711	(0.0, 1.0)	0.567903972336	0.0814693274336	0.1318291783596864	-0.8709155135834505
NOT DUPLICATE:	 6289	(0.0, 1.0)	0.330143200929	0.0752177873968	0.7737924324492068	-0.11735439268226155




MINIMUM:	6
DUPLICATE:	 3711	(0.0, 1.0)	0.567219189303	0.118962179164	-0.10435264190690177	-1.203611433767062
NOT DUPLICATE:	 6289	(0.0, 1.0)	0.326328677153	0.0934830679571	0.8444072466343407	-0.20519865678267735




MINIMUM:	7
DUPLICATE:	 3711	(0.0, 1.0)	0.584606438164	0.162577409892	-0.28161211685531296	-1.4924267478332922
NOT DUPLICATE:	 6289	(0.0, 1.0)	0.328213654888	0.126695813133	0.826140899645756	-0.6431497453354695




MINIMUM:	8
DUPLICATE:	 3711	(0.0, 1.0)	0.655943949012	0.18288723856	-0.6319195292252436	-1.3687006024793935
NOT DUPLICATE:	 6289	(0.0, 1.0)	0.362016513148	0.166983838442	0.6326338363553026	-1.2435264487767002




MINIMUM:	9
DUPLICATE:	 3711	(0.0, 1.0)	0.730133066431	0.177141828894	-1.0232420982385257	-0.8226478974587526
NOT DUPLICATE:	 6289	(0.0, 1.0)	0.427639628618	0.210571130929	0.3161748026066476	-1.7521479074951882




MINIMUM:	10
DUPLICATE:	 3711	(0.0, 1.0)	0.803238120902	0.149989249692	-1.5199709594278579	0.3907122106912304
NOT DUPLICATE:	 6289	(0.0, 1.0)	0.524990099298	0.234585813497	-0.09141679810540093	-1.933769496005887




MINIMUM:	11
DUPLICATE:	 3711	(0.0, 1.0)	0.861059783655	0.116484804825	-2.085049250907725	2.4016362027816536
NOT DUPLICATE:	 6289	(0.0, 1.0)	0.630272433349	0.227027533911	-0.536319929792744	-1.6859755586202407




MINIMUM:	12
DUPLICATE:	 3711	(0.0, 1.0)	0.913038457097	0.0780029424052	-2.9297181257822804	6.63845487692738
NOT DUPLICATE:	 6289	(0.0, 1.0)	0.715617215244	0.201334334581	-0.9544428012201216	-1.0762650419729303




MINIMUM:	13
DUPLICATE:	 3711	(0.0, 1.0)	0.946474445343	0.0501893490847	-3.9648690502637978	13.764572614153217
NOT DUPLICATE:	 6289	(0.0, 1.0)	0.798749138708	0.160235571088	-1.4898881796524976	0.22484274107168245




MINIMUM:	14
DUPLICATE:	 3711	(0.0, 1.0)	0.967663702506	0.0311643246094	-5.2875348974998175	25.992575967256805
NOT DUPLICATE:	 6289	(0.0, 1.0)	0.861186198124	0.119404508741	-2.089276879878149	2.367863210669557




MINIMUM:	15
DUPLICATE:	 3711	(0.0, 1.0)	0.975478307734	0.0239268264161	-6.148606175530092	35.80535790176675
NOT DUPLICATE:	 6289	(0.0, 1.0)	0.885832405788	0.101149438193	-2.4265066729025255	3.8879346336404854




In [10]:
def relative_difference(value1, value2):
    """Return how many times larger one magnitude is than the other (always >= 1).

    The result is ``abs(value1 / value2)`` or its reciprocal, whichever is not
    below 1, so the order of the arguments does not matter.

    Args:
        value1: first number.
        value2: second number.

    Returns:
        1.0 when both values are zero (they do not differ), ``float('inf')``
        when exactly one of them is zero, otherwise the ratio described above.
    """
    # Zero inputs would otherwise raise ZeroDivisionError for plain Python
    # numbers: directly when value2 == 0, or via 1 / diff when value1 == 0.
    if value1 == 0 and value2 == 0:
        return 1.0
    if value1 == 0 or value2 == 0:
        return float('inf')

    diff = abs(value1 / value2)
    if diff > 1:
        return diff
    return 1 / diff
    

def absolute_difference(value1, value2):
    """Return the magnitude of ``value1 - value2``."""
    gap = value1 - value2
    return abs(gap)


means = [amean, gmean, hmean]
differences = [relative_difference, absolute_difference]

# Parse each pair and look up its per-token IDFs once. Neither step depends on the
# mean or difference being evaluated, so repeating them for every combination
# would redo the same spaCy work six times.
idf_pairs = [
    (
        list(map(get_idf, nlp(question1))),
        list(map(get_idf, nlp(question2))),
        is_duplicate,
    )
    for pair_id, question1, question2, is_duplicate in train_set(10000)
]

for mean in means:
    # The per-question averages depend only on the mean function, so compute them
    # once and reuse them for both difference measures.
    averaged_pairs = [
        (mean(idfs1), mean(idfs2), is_duplicate)
        for idfs1, idfs2, is_duplicate in idf_pairs
    ]

    for difference in differences:
        duplicate = list()
        non_duplicate = list()
        for average1, average2, is_duplicate in averaged_pairs:
            question_difference = difference(average1, average2)

            if is_duplicate:
                duplicate.append(question_difference)
            else:
                non_duplicate.append(question_difference)

        print('%s, %s' % (mean.__name__, difference.__name__))
        print('DUPLICATE:\t', '\t'.join(map(str, describe(duplicate))))
        print('NOT DUPLICATE:\t', '\t'.join(map(str, describe(non_duplicate))))
        print('\n')

mean, relative_difference
DUPLICATE:	 3711	(1.0, 2.480560455841101)	1.13679802156	0.0172794899647	1.980417603865905	6.610397255471382
NOT DUPLICATE:	 6289	(1.0, 3.3532484125679822)	1.19206982906	0.0382072288189	2.2751843846086377	8.988505024237263




mean, absolute_difference
DUPLICATE:	 3711	(0.0, 3.8704809695412425)	0.438007520793	0.158739825838	1.6751807063727422	4.163405466659966
NOT DUPLICATE:	 6289	(0.0, 5.775112630251785)	0.656837999639	0.39122247172	1.8783217813806519	5.245805497035381




gmean, relative_difference
DUPLICATE:	 3711	(1.0, 4.1806249473146817)	1.26761100305	0.088716299729	2.6305181311718386	11.317481839896546
NOT DUPLICATE:	 6289	(1.0, 11.305244811826743)	1.39217077122	0.217174993901	4.135763550348844	46.497100913174556




gmean, absolute_difference
DUPLICATE:	 3711	(0.0, 2.4539374745942002)	0.28931766826	0.0771543233843	1.8658858760853543	5.310925181554088
NOT DUPLICATE:	 6289	(0.0, 7.4127034230730908)	0.454065044527	0.185509723852	2.1733599065219895	13.834008732230664




hmean, relative_difference
DUPLICATE:	 3711	(1.0, 234.82672212845662)	1.81721344654	90.0276523229	18.83506483129104	375.43473680451757
NOT DUPLICATE:	 6289	(1.0, 1058.1141440857477)	1.93357709496	219.026918703	59.92463827388839	4148.4188289545355




hmean, absolute_difference
DUPLICATE:	 3711	(0.0, 2.7443857128268183)	0.010826826583	0.0150638500945	17.414286544491922	309.1889316022544
NOT DUPLICATE:	 6289	(0.0, 7.9349263681499229)	0.014277222299	0.0266113424569	28.947679706467788	1075.0505947994043


