In [37]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import os
import json
from sklearn.pipeline import make_pipeline, make_union
from sklearn.linear_model import LogisticRegression
from functools import partial
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score
from collections import defaultdict
from sklearn.base import TransformerMixin
from sklearn.preprocessing import PolynomialFeatures, scale, FunctionTransformer
from nltk import stem
from sner import Ner

In [2]:
non_ascii = re.compile(r'[^A-Za-z0-9 ]')
countries = ['usa', 'china', 'europe', 'united states', 'america', 'mexico', 'india', 'hindi',
            'chinese', 'european', 'american', 'mexican', 'spain', 'italy', 'russia', 'italian',
            'great britain', 'british']

In [24]:
def get_ngram_features(row, n_gram=1):
    
    counts = []
    if isinstance(n_gram, int):
        n_grams = [n_gram]
    del n_gram
    for n_gram in n_grams:
        ques = list(map(lambda x: x.split(), row[['question1', 'question2']].tolist()))
        first = [' '.join(ques[0][j:j+n_gram]) for j in range(len(ques[0]) - n_gram +1)]
        second = [' '.join(ques[1][j:j+n_gram]) for j in range(len(ques[1])- n_gram +1)]

        sfirst = set(first)
        ssecond = set(second)
        common = [x for x in first if x in ssecond] + [x for x in second if x in sfirst]

        arr = [len(common), len(first), len(second), len(sfirst), len(ssecond)]

        ag1 = sum(arr[1:3])
        ag2 = sum(arr[3:])
        z = []
        if ag1 == 0:
            z.append(0)
        else:
            z.append(arr[0]/ag1)

        if ag2 == 0:
            z.append(0)
        else:
            z.append(arr[0]/ag2)

        z.append( abs( arr[3] - arr[4] ) )
        z.append( abs( arr[1] - arr[2] ) )
        
        counts += z
        
    return counts

In [4]:
def is_subset(row):
    
    words_1 = set(row['question1'].lower().split())
    words_2 = set(row['question2'].lower().split())
    
    if len(words_1) - len(words_2) == len(words_1 - words_2):
        
        return 1
    
    elif len(words_2) - len(words_1) == len(words_2 - words_1):
        
        return -1
    
    else:
        
        return 0

In [17]:
def get_score_match(row, vect):
    
    q1 = vect.transform([row['question1']])
    q2 = vect.transform([row['question2']])
    
    nonzero1 = set(q1.nonzero()[1])
    nonzero2 = set(q2.nonzero()[1])
    
    numerator = q1[0, list(nonzero2)].sum() + q2[0, list(nonzero1)].sum()
    denominator = q1.sum() + q2.sum()
    if denominator:
        return numerator/denominator
    else:
        return 0

In [6]:
def get_lengths_info(row, stop_words=[]):
    
    if len(stop_words):
        
        row['question1'] = [x for x in row['question1'].split() if x not in stop_words]
        row['question2'] = [x for x in row['question2'].split() if x not in stop_words]
    
    l1 = len(row['question1'])
    l2 = len(row['question2'])
    
    if l1 == 0 or l2 == 0:
        return [0]*5
    
    word_length_1 = list(map(len, row['question1']))
    word_length_2 = list(map(len, row['question2']))
    
    avg_word_length_1 = np.mean(word_length_1) if len(word_length_1) else 0
    avg_word_length_2 = np.mean(word_length_2) if len(word_length_2) else 0
    
    median_wl_1 = np.median(word_length_1) if len(word_length_1) else 0
    median_wl_2 = np.median(word_length_2) if len(word_length_2) else 0
    
    uniq_l1, uniq_cnt_l1 = np.unique(word_length_1, return_counts=True)
    uniq_l2, uniq_cnt_l2 = np.unique(word_length_2, return_counts=True)
    
    var_word_length_1 = np.var(word_length_1) if len(word_length_1) else 0
    var_word_length_2 = np.var(word_length_2) if len(word_length_2) else 0
    
    return [abs(avg_word_length_1-avg_word_length_2), abs(median_wl_1-median_wl_2),
            abs(var_word_length_1-var_word_length_2), abs(l1-l2),
            abs(np.log(l1)-np.log(l2))]

In [7]:
def calculate_vect_cosine(row, vect):
    
    q1 = vect.transform([row['question1']])
    q2 = vect.transform([row['question2']])

    return q1.dot(q2.T).toarray().flatten()[0]


In [8]:
train = pd.read_csv('preprocessed_to_word2vec.csv')

In [9]:
class RowTransformer(TransformerMixin):
    
    def __init__(self, func):
        self.func = func
        
    def translate(self, X):
        d = X.apply(self.func, axis=1).as_matrix()
        try:
            s = len(d[0])
        except:
            s = 1
        z = np.zeros((d.shape[0], s))
        for i in range(d.shape[0]):
            z[i, :] = d[i]
        return z
    
    def fit_transform(self, X, y=None):
        return self.translate(X)
    
    def transform(self, X, y=None):
        return self.translate(X)
    
    def fit(self,X,y=None):
        return self

In [35]:
raw_train = pd.read_csv('train.csv')

In [38]:
test_string = "President Obama took his daughter into White House."
tagger = Ner(host='localhost',port=9199)


def get_entities(row):
    
    q1 = [x for x in tagger.get_entities(row['question1']) if x[1] != 'O']
    q2 = [x for x in tagger.get_entities(row['question2']) if x[1] != 'O']
    
    return [q1, q2]

ent = raw_train.ix[:10000].apply(get_entities, axis=1)

ent.apply(lambda x: True if x == [[], []] else False).mean()

types_of_entities = set()

for row in ent:
    
    types_of_entities = types_of_entities | set([x[1] for x in row[1]]) | set([x[1] for x in row[0]])

types_of_entities

from collections import defaultdict

def get_entities_diff(row):
    
    counts = [defaultdict(list), defaultdict(list)]
    
    for r in range(2):
        
        for ent in row[r]:
            
            counts[r][ent[1]].append(ent[0])
            
    stats = []
            
    for t in ['LOCATION', 'ORGANIZATION', 'PERSON']:
        
        common = [x for x in counts[0][t] if x in counts[1][t]] + [x for x in counts[1][t] if x in counts[0][t]]
        logor = counts[0][t] + counts[1][t]
        prop = len(common)/len(logor) if len(logor) else 0
        
        stats += [prop, len(common), len(logor), len(counts[1][t]), len(counts[0][t]), 
                  abs(len(counts[1][t]) - len(counts[0][t]))]
        
    return stats

In [13]:
base_to_vectorize = np.concatenate([train['question1'], train['question2']], axis=0)

In [14]:
tfidf = TfidfVectorizer(max_features=1024, stop_words='english', ngram_range=(1,3)).fit(base_to_vectorize)
count = CountVectorizer(max_features=1024, stop_words='english', ngram_range=(1,3)).fit(base_to_vectorize)
binary = CountVectorizer(binary=True, max_features=1024, stop_words='english', ngram_range=(1,3)).fit(base_to_vectorize)

In [18]:
tfidf_match = partial(get_score_match, vect=tfidf)
count_match = partial(get_score_match, vect=count)
binary_match = partial(get_score_match, vect=binary)

In [19]:
train.ix[:1].apply(tfidf_match, axis=1)

0    0.910801
1    0.000000
dtype: float64

In [21]:
tfidf_cosine = partial(calculate_vect_cosine, vect=tfidf)

In [25]:
grand_transformer = make_union(*[RowTransformer(x) for x in [tfidf_cosine, tfidf_match, count_match, binary_match,
                                                             is_subset, get_lengths_info, get_ngram_features]])

In [26]:
tent = grand_transformer.transform(train.ix[:10000])

In [34]:
lr = LogisticRegression(C=5)
cross_val_score(lr, tent, train.ix[:len(tent)-1, 'is_duplicate'], scoring='neg_log_loss', cv=10).mean()

-0.57981578889201402

In [40]:
entx = raw_train.ix[:10000].apply(get_entities, axis=1)

In [41]:
entx = np.asarray(entx.apply(get_entities_diff).tolist())

In [49]:
for C in range(1, 11):
    lr = LogisticRegression(C=C)
    print(
        cross_val_score(lr, np.hstack([tent, entx]), train.ix[:len(tent)-1, 'is_duplicate'], 
                        scoring='neg_log_loss', cv=10).mean()
    )

-0.568670262818
-0.568214044144
-0.567968685893
-0.567793623461
-0.567668578216
-0.567632151996
-0.567577609
-0.567581787563
-0.567492825674
-0.567484730666


In [50]:
tent.shape

(10001, 14)