In [7]:
import csv
import spacy
from collections import defaultdict
from scipy.stats import describe
from spacy.symbols import nsubj, dobj

# Load the spaCy pipeline once at module level so the analysis cells below can reuse it.
# NOTE(review): the trailing comment says 'en_default' resolves to the en_core_web_md
# model — confirm this for the installed spaCy version.
nlp = spacy.load('en_default')  # using en_core_web_md model


def train_set(size=404301):
    """Yield labelled question pairs from ``input/train.csv``.

    Args:
        size: Maximum number of rows to yield, counted from the start of
            the file. A value of zero or less yields nothing.

    Yields:
        Tuples ``(id, question1, question2, is_duplicate)`` where ``id`` and
        ``is_duplicate`` are converted to ``int`` and the two questions are
        the strings read from the CSV.
    """
    # newline='' lets the csv module handle line endings itself, so line
    # breaks embedded in quoted question text are preserved unchanged.
    # The encoding is pinned so non-ASCII questions decode the same way on
    # every platform. NOTE(review): assumes the file is UTF-8 — confirm.
    with open('input/train.csv', 'r', newline='', encoding='utf-8') as input_file:
        reader = csv.DictReader(input_file)
        for counter, line in enumerate(reader):
            if counter >= size:
                break
            yield (
                int(line['id']),
                line['question1'],
                line['question2'],
                int(line['is_duplicate']),
            )


def precalculated_predictions():
    """Load stored model predictions from ``input/trian_predictions_0_324.csv``.

    Returns:
        A dict mapping each row's ``train_id`` (as ``int``) to its
        ``probability`` (as ``float``). If a ``train_id`` appears more than
        once, the last row wins.
    """
    # newline='' is what the csv module expects of file objects it reads,
    # so that it can interpret line endings itself.
    with open('input/trian_predictions_0_324.csv', newline='') as input_file:
        return {
            int(line['train_id']): float(line['probability'])
            for line in csv.DictReader(input_file)
        }

Show examples of wrong predictions (pairs where the predicted probability is off from the true label by more than 0.5)

In [8]:
# Stored model probabilities, keyed by training-pair id.
predictions = precalculated_predictions()

# Among the first 100 training rows, print every pair whose predicted
# probability differs from the true label by more than 0.5.
for pair_id, question1, question2, is_duplicate in train_set(100):
    error = abs(predictions[pair_id] - is_duplicate)
    if error > 0.5:
        report = (question1, question2, error, bool(is_duplicate))
        print('%s\n%s\n%s\n%s\n-------------------' % report)

What is the step by step guide to invest in share market in india?
What is the step by step guide to invest in share market?
0.683279
False
-------------------
Which one dissolve in water quikly sugar, salt, methane and carbon di oxide?
Which fish would survive in salt water?
0.848533
False
-------------------
Should I buy tiago?
What keeps childern active and far from phone and video games?
0.842406
False
-------------------
When do you use シ instead of し?
When do you use "&" instead of "and"?
0.894469
False
-------------------
Method to find separation of slits using fresnel biprism?
What are some of the things technicians can tell about the durability and reliability of Laptops and its components?
0.968832
False
-------------------
How do I read and find my YouTube comments?
How can I see all my Youtube comments?
0.520986
True
-------------------
What was your first sexual experience like?
What was your first sexual experience?
0.974132
True
-------------------
What would a Trump pr

In [9]:
def get_subjects(question):
    """Collect the lemmas of all tokens whose dependency is ``nsubj``.

    Args:
        question: Iterable of tokens exposing ``dep`` and ``lemma_``
            (the notebook passes the result of ``nlp(...)``).

    Returns:
        A set of lemmas; empty when no token has that dependency.
    """
    return {token.lemma_ for token in question if token.dep == nsubj}


def get_objects(question):
    """Collect the lemmas of all tokens whose dependency is ``dobj``.

    Args:
        question: Iterable of tokens exposing ``dep`` and ``lemma_``
            (the notebook passes the result of ``nlp(...)``).

    Returns:
        A set of lemmas; empty when no token has that dependency.
    """
    return {token.lemma_ for token in question if token.dep == dobj}


def get_roots(question):
    """Collect the lemmas of all tokens whose dependency label is ``'ROOT'``.

    Args:
        question: Iterable of tokens exposing ``dep_`` and ``lemma_``
            (the notebook passes the result of ``nlp(...)``).

    Returns:
        A set of lemmas; empty when no token carries the ``'ROOT'`` label.
    """
    return {token.lemma_ for token in question if token.dep_ == 'ROOT'}


def get_heads(question):
    """Collect the lemma of every token's ``head``.

    Args:
        question: Iterable of tokens exposing ``head.lemma_``
            (the notebook passes the result of ``nlp(...)``).

    Returns:
        A set of head lemmas; empty when ``question`` has no tokens.
    """
    return {token.head.lemma_ for token in question}

In [10]:
# One entry per selected pair: (feature set of question1, feature set of question2).
subjects = []
objects = []
roots = []
heads = []

# Keep only non-duplicate pairs, among the first 100 training rows, whose
# predicted probability is off from the true label by more than 0.5.
for pair_id, question1, question2, is_duplicate in train_set(100):
    error = abs(predictions[pair_id] - is_duplicate)
    if is_duplicate or not error > 0.5:
        continue

    question1_doc = nlp(question1)
    question2_doc = nlp(question2)

    subjects.append((get_subjects(question1_doc), get_subjects(question2_doc)))
    objects.append((get_objects(question1_doc), get_objects(question2_doc)))
    roots.append((get_roots(question1_doc), get_roots(question2_doc)))
    heads.append((get_heads(question1_doc), get_heads(question2_doc)))

# Print each feature family under its own heading, one pair per block.
sections = (
    ('SUBJECTS', subjects),
    ('\n\nOBJECTS', objects),
    ('\n\nROOTS', roots),
    ('\n\nHEADS', heads),
)
for heading, pairs in sections:
    print(heading)
    for pair in pairs:
        print('%s\n%s\n-------------------' % pair)

SUBJECTS
{'step'}
{'step'}
-------------------
set()
{'fish'}
-------------------
{'-PRON-'}
{'what'}
-------------------
{'-PRON-'}
{'-PRON-'}
-------------------
set()
{'some', 'technician'}
-------------------
{'tip'}
{'tip'}
-------------------
set()
{'sport'}
-------------------
{'-PRON-', 'boyfriend', 'what', 'girlfriend'}
{'-PRON-', 'girlfriend'}
-------------------
set()
{'aircraft'}
-------------------
{'-PRON-'}
{'-PRON-'}
-------------------
set()
set()
-------------------
{'university', 'rexnord', '-PRON-'}
{'foods', '-PRON-'}
-------------------
{'vader'}
{'quora'}
-------------------
{'who'}
{'boyfriend', '-PRON-'}
-------------------
{'what'}
{'some'}
-------------------
{'-PRON-'}
{'torrent'}
-------------------
{'-PRON-'}
{'-PRON-'}
-------------------
{'-PRON-'}
{'-PRON-'}
-------------------
{'cost'}
{'who'}
-------------------
{'-PRON-'}
{'way'}
-------------------
{'government'}
{'india'}
-------------------
{'procedure', 'someone', '-PRON-'}
{'-PRON-'}
-----------

In [11]:
def jaccard_index(set1, set2):
    """Return the Jaccard similarity ``|set1 & set2| / |set1 | set2|``.

    Args:
        set1: First set.
        set2: Second set.

    Returns:
        A float between 0.0 and 1.0. Two empty sets are treated as
        identical and give 1.0.
    """
    union = set1 | set2
    if not union:
        # Both inputs are empty, so the ratio would be 0 / 0.
        return 1.0
    shared = set1 & set2
    return len(shared) / len(union)

In [12]:
# Jaccard scores per feature family, grouped by the pair's is_duplicate label.
subjects = defaultdict(list)
objects = defaultdict(list)
roots = defaultdict(list)
heads = defaultdict(list)

# Each score store is paired with the extractor that feeds it.
feature_extractors = (
    (subjects, get_subjects),
    (objects, get_objects),
    (roots, get_roots),
    (heads, get_heads),
)

for pair_id, question1, question2, is_duplicate in train_set(100000):
    question1_doc = nlp(question1)
    question2_doc = nlp(question2)

    for scores, extract in feature_extractors:
        similarity = jaccard_index(extract(question1_doc), extract(question2_doc))
        scores[is_duplicate].append(similarity)

# Summary statistics for each feature family, duplicates first.
print('\tnumber of observations\t(minimum, maximum)\tmean\tvariance\tskewness\tkurtosis')
report_sections = (
    ('SUBJECTS', subjects),
    ('\nOBJECTS', objects),
    ('\nROOTS', roots),
    ('\nHEADS', heads),
)
for heading, scores in report_sections:
    print(heading)
    print('DUPLICATE:\t', '\t'.join(map(str, describe(scores[1]))))
    print('NOT DUPLICATE:\t', '\t'.join(map(str, describe(scores[0]))))

	number of observations	(minimum, maximum)	mean	variance	skewness	kurtosis
SUBJECTS
DUPLICATE:	 37254	(0.0, 1.0)	0.520536944579	0.218928498626	-0.07258566702361444	-1.860007775243315
NOT DUPLICATE:	 62746	(0.0, 1.0)	0.370059015967	0.199678019125	0.5534414840760108	-1.5232534084683247

OBJECTS
DUPLICATE:	 37254	(0.0, 1.0)	0.669082131161	0.203801192924	-0.7102806989927335	-1.4021024803067506
NOT DUPLICATE:	 62746	(0.0, 1.0)	0.473461813034	0.236205047528	0.11384966274387981	-1.9352000411457764

ROOTS
DUPLICATE:	 37254	(0.0, 1.0)	0.59802379802	0.225224956912	-0.39604874810104057	-1.7746373433805054
NOT DUPLICATE:	 62746	(0.0, 1.0)	0.442296345604	0.229163241002	0.23996918696869363	-1.868598080707311

HEADS
DUPLICATE:	 37254	(0.0, 1.0)	0.499143042344	0.0781966093312	0.37882622611424516	-0.7513576897489256
NOT DUPLICATE:	 62746	(0.0, 1.0)	0.331432179202	0.0858043118546	0.9322872053274988	-0.07576396293850074
