### Features:
* T-SNE and Word2Vec: https://www.kaggle.com/jeffd23/quora-question-pairs/visualizing-word-vectors-with-t-sne
* WMD and Word2Vec: https://github.com/RaRe-Technologies/gensim/blob/develop/docs/notebooks/WMD_tutorial.ipynb



### Processing:
* Replace abbreviations: https://www.kaggle.com/life2short/quora-question-pairs/data-processing-replace-abbreviation-of-word

In [1]:
import numpy as np
import pandas as pd
import gensim
import re
import nltk

import datetime
import operator
from collections import Counter
from nltk.corpus import stopwords
from nltk import word_tokenize, ngrams
import matplotlib.pyplot as plt
%matplotlib inline
from pylab import plot, show, subplot, specgram, imshow, savefig

import csv
from gensim import corpora, models, similarities
import gensim
import math

In [2]:
#https://www.kaggle.com/life2short/data-processing-replace-abbreviation-of-word

# Regex character class of the punctuation that is replaced with ''.
# Raw (triple-quoted) string: '\.' is a regex escape, not a Python string
# escape, and the resulting value is unchanged.
punctuation = r'''["'?,\.]'''

# Regex pattern -> replacement table used by process_data(). Keys are
# lower-case, so only lower-case contractions are expanded.
abbr_dict = {
    "what's": "what is",
    "what're": "what are",
    "who's": "who is",
    "who're": "who are",
    "where's": "where is",
    "where're": "where are",
    "when's": "when is",
    "when're": "when are",
    "how's": "how is",
    "how're": "how are",

    "i'm": "i am",
    "we're": "we are",
    "you're": "you are",
    "they're": "they are",
    "it's": "it is",
    "he's": "he is",
    "she's": "she is",
    "that's": "that is",
    "there's": "there is",
    "there're": "there are",

    "i've": "i have",
    "we've": "we have",
    "you've": "you have",
    "they've": "they have",
    "who've": "who have",
    "would've": "would have",
    "not've": "not have",

    "i'll": "i will",
    "we'll": "we will",
    "you'll": "you will",
    "he'll": "he will",
    "she'll": "she will",
    "it'll": "it will",
    "they'll": "they will",

    "isn't": "is not",
    "wasn't": "was not",
    "aren't": "are not",
    "weren't": "were not",
    "can't": "can not",
    "couldn't": "could not",
    "don't": "do not",
    "didn't": "did not",
    "shouldn't": "should not",
    "wouldn't": "would not",
    "doesn't": "does not",
    "haven't": "have not",
    "hasn't": "has not",
    "hadn't": "had not",
    "won't": "will not",
    punctuation: '',
    r'\s+': ' ',  # replace multi space with one single space
}

def process_data(data):
    """Apply the ``abbr_dict`` replacements to ``data`` and return it.

    Every key of the module-level ``abbr_dict`` is used as a regular
    expression. The replacement happens in place, so the object that is
    returned is the very object that was passed in.
    """
    data.replace(to_replace=abbr_dict, regex=True, inplace=True)
    return data

def basic_cleaning2(string):
    """Lower-case ``string``, blank out digits/punctuation, drop filler words.

    The value is coerced with ``str`` first, so non-string input (numbers,
    ``None``, NaN) is accepted. Returns the remaining tokens joined by single
    spaces.
    """
    # Raw string, so the regex escapes are no longer (invalid) Python string
    # escapes. The matched set is unchanged: ``,-\?`` is a *range* from ','
    # to '?', which is why ':', '<', '=' and '>' are blanked as well, and a
    # backslash is not part of the class.
    junk = r'[0-9\(\)\!\^\%\$\'\"\.;,-\?\{\}\[\]\/]'
    filler = ("a", "and", "of", "the", "to", "on", "in", "at", "is")
    text = re.sub(junk, ' ', str(string).lower())
    # split() + join already collapses every run of whitespace to one space.
    return ' '.join(word for word in text.split() if word not in filler)

def basic_clean2df(df2):
    """Return a copy of ``df2`` with every object-dtype column cleaned.

    Each cell of those columns goes through ``basic_cleaning2`` and is then
    re-joined from its whitespace-separated tokens. ``df2`` itself is left
    untouched.
    """
    cleaned = df2.copy()
    for column in cleaned.select_dtypes(include=['object']).columns.values:
        cleaned[column] = cleaned[column].apply(
            lambda cell: ' '.join(basic_cleaning2(cell).split()))
    return cleaned

In [3]:
# Load the train and test question pairs from CSV
# (presumably the output of an earlier stemming step -- confirm).
df_train = pd.read_csv('df_train_stemmed.csv')
df_test  = pd.read_csv('df_test_stemmed.csv')

In [4]:
# Step 1: regex replacements from abbr_dict (performed in place by process_data).
df_train = process_data(df_train)
df_test  = process_data(df_test)

# Step 2: lower-casing / punctuation / filler-word cleaning of all object columns.
df_train = basic_clean2df(df_train)
df_test = basic_clean2df(df_test)

In [6]:
# https://www.kaggle.com/philschmidt/quora-question-pairs/quora-eda-model-selection-roc-pr-plots

def normalized_word_share(row):
    """Share of distinct words the two questions of ``row`` have in common.

    Both questions are split on single spaces; every piece is lower-cased and
    stripped. The result is the number of common words divided by the sum of
    the two distinct-word counts.
    """
    def distinct_words(text):
        return {piece.lower().strip() for piece in text.split(" ")}

    first = distinct_words(row['question1'])
    second = distinct_words(row['question2'])
    common = first & second
    return 1.0 * len(common) / (len(first) + len(second))
    
def modelselection_features(df2):
    """Return a copy of ``df2`` extended with length and word-share columns.

    Added columns, in this order: ``q1len``, ``q2len`` (character counts),
    ``q1_n_words``, ``q2_n_words`` (pieces after splitting on single spaces)
    and ``word_share`` (``normalized_word_share`` of each row).
    """
    feats = df2.copy()
    for target, source in (('q1len', 'question1'), ('q2len', 'question2')):
        feats[target] = feats[source].str.len()
    for target, source in (('q1_n_words', 'question1'), ('q2_n_words', 'question2')):
        feats[target] = feats[source].apply(lambda text: len(text.split(" ")))
    feats['word_share'] = feats.apply(normalized_word_share, axis=1)
    return feats


# Compute the length / word-share features for both data sets ...
eda_train = modelselection_features(df_train)
eda_test = modelselection_features(df_test)

# ... and save them (original columns included) without the index column.
eda_train.to_csv('train_eda_features.csv', index = False)
eda_test.to_csv('test_eda_features.csv', index = False)

In [8]:
# Stack train and test rows; no ignore_index is passed, so the original
# index labels are kept and therefore repeat between the two parts.
full = pd.concat([df_train, df_test])

In [None]:
# https://www.kaggle.com/puneetsl/quora-question-pairs/unusual-meaning-map

def basic_cleaning(string):
    """Return ``string`` as lower-case text with runs of spaces collapsed.

    The value is coerced with ``str``. A ``decode('unicode-escape')`` is then
    attempted on a best-effort basis: any failure is ignored and the text is
    used as it is.
    """
    text = str(string)
    try:
        text = text.decode('unicode-escape')
    except Exception:
        # Nothing to decode (or decoding failed): keep the text unchanged.
        pass
    return re.sub(' +', ' ', text.lower())

def basic_cleaning2(string):
    """Lower-case ``string``, blank out digits/punctuation, drop filler words.

    The value is coerced with ``str`` first, so non-string input (numbers,
    ``None``, NaN) is accepted. Returns the remaining tokens joined by single
    spaces.
    """
    # Raw string, so the regex escapes are no longer (invalid) Python string
    # escapes. The matched set is unchanged: ``,-\?`` is a *range* from ','
    # to '?', which is why ':', '<', '=' and '>' are blanked as well, and a
    # backslash is not part of the class.
    junk = r'[0-9\(\)\!\^\%\$\'\"\.;,-\?\{\}\[\]\/]'
    filler = ("a", "and", "of", "the", "to", "on", "in", "at", "is")
    text = re.sub(junk, ' ', str(string).lower())
    # split() + join already collapses every run of whitespace to one space.
    return ' '.join(word for word in text.split() if word not in filler)

def idf(word):
    """Weight of ``word``: one minus the square root of its document share.

    Reads the module-level ``docf`` (looked up by word) and ``total_docs``.
    NOTE(review): neither name is defined in the visible part of the file --
    confirm they are created before this is called.
    """
    document_share = docf[word] / total_docs
    return 1 - math.sqrt(document_share)

def w2v_sim(w1, w2):
    """Similarity of ``w1`` and ``w2`` under ``model``, scaled by both idf weights.

    Any exception raised while computing the value results in ``0.0``.
    NOTE(review): the blanket handler also hides failures that are unrelated
    to the two words (e.g. a missing global), not only lookup misses.
    """
    try:
        score = model.similarity(w1, w2)
        score = score * idf(w1)
        score = score * idf(w2)
    except Exception:
        return 0.0
    return score

def img_feature(row):
    """Return the 10x10 word-similarity grid of a question pair, flattened.

    Both questions are cleaned with ``basic_cleaning2`` and split into words.
    The grid has one line per word of question 2 and one entry per word of
    question 1, each being ``w2v_sim`` of the two words. ``np.resize`` then
    forces it into 10x10 (repeating or cutting values as needed).

    Returns a one-element list holding the flattened array of 100 values.
    """
    words1 = basic_cleaning2(row['question1']).split()
    words2 = basic_cleaning2(row['question2']).split()
    grid = []
    for word2 in words2:
        grid.append([w2v_sim(word1, word2) for word1 in words1])
    matrix = np.array(grid, order='C')
    return [np.resize(matrix, (10, 10)).flatten()]

def get_img_features(df):
    """Build the ``img0`` .. ``img99`` similarity-grid columns for ``df``.

    ``img_feature`` is evaluated for every row of ``df``; its 100 values
    become one row of the result.

    Returns a new DataFrame with a default integer index, 100 columns and as
    many rows as ``df``.
    """
    # Rows are visited as labelled Series because img_feature looks fields up
    # by column name. The former ``apply(..., raw=True)`` hands plain ndarrays
    # to the function in current pandas, and ``Series.iteritems`` no longer
    # exists in pandas 2.0.
    pixels = [img_feature(row)[0] for _, row in df.iterrows()]
    columns = ['img' + str(g) for g in range(100)]
    if not pixels:
        return pd.DataFrame(columns=columns)
    return pd.DataFrame(np.vstack(pixels), columns=columns)

def to_sentences(df):
    """Tokenize every question of ``df`` into a list of token lists.

    For each row the tokens of ``question1`` are emitted first, directly
    followed by the tokens of ``question2``. Values are coerced with ``str``
    before tokenizing.

    Returns a new list; no module-level state is read or modified.
    """
    # Iterate the columns positionally: the frame passed in is a
    # concatenation whose index labels repeat, so ``df.loc[i, ...]`` with a
    # running counter does not address single rows.
    sentences = []
    for question1, question2 in zip(df['question1'], df['question2']):
        sentences.append(nltk.word_tokenize(str(question1)))
        sentences.append(nltk.word_tokenize(str(question2)))
    return sentences

# Tokenize all train + test questions as the Word2Vec training corpus.
full = pd.concat([df_train, df_test])
sentences = to_sentences(full)

# `model` is the global that w2v_sim() queries.
# NOTE(review): `size=` is the pre-4.0 gensim keyword (later `vector_size`) -- confirm the installed version.
model = gensim.models.Word2Vec(sentences, size=100, window=5, min_count=5, workers=4)

# 100 similarity-grid features per question pair.
xtr = get_img_features(df_train)
xte = get_img_features(df_test)

xtr.to_csv('train_img_features.csv', index = False)
xte.to_csv('test_img_features.csv', index = False)

In [None]:
# https://www.kaggle.com/sudalairajkumar/quora-question-pairs/simple-exploration-notebook-quora-ques-pair

def get_unigrams(que):
    """Tokens of the lower-cased ``que`` that are not in ``eng_stopwords``."""
    tokens = word_tokenize(que.lower())
    return [token for token in tokens if token not in eng_stopwords]

def get_common_unigrams(row):
    """Number of distinct unigrams present in both questions of ``row``."""
    first = set(row["unigrams_ques1"])
    second = set(row["unigrams_ques2"])
    return len(first & second)

def get_common_unigram_ratio(row):
    """Common-unigram count divided by the number of distinct unigrams.

    The divisor is at least 1, so two empty questions give ``0.0`` whenever
    the stored count is 0.
    """
    common = float(row["unigrams_common_count"])
    distinct = set(row["unigrams_ques1"]) | set(row["unigrams_ques2"])
    return common / max(len(distinct), 1)

def get_bigrams(que):
    """List of the 2-grams that ``ngrams`` produces for the sequence ``que``."""
    return list(ngrams(que, 2))

def get_common_bigrams(row):
    """Number of distinct bigrams present in both questions of ``row``."""
    first = set(row["bigrams_ques1"])
    second = set(row["bigrams_ques2"])
    return len(first & second)

def get_common_bigram_ratio(row):
    """Common-bigram count divided by the number of distinct bigrams.

    The divisor is at least 1, so two questions without bigrams give ``0.0``
    whenever the stored count is 0.
    """
    common = float(row["bigrams_common_count"])
    distinct = set(row["bigrams_ques1"]) | set(row["bigrams_ques2"])
    return common / max(len(distinct), 1)


def feature_extraction(row):
    """Unigram, bigram and trigram overlap features of one question pair.

    Both questions are coerced with ``str``, lower-cased, split on whitespace
    and stripped of the words in ``eng_stopwords``; bigrams and trigrams are
    built from those remaining words.

    Returns ``[uni_count, uni_ratio, bi_count, bi_ratio, tri_count,
    tri_ratio]`` where a count is the number of distinct shared n-grams and a
    ratio is that count divided by the number of distinct n-grams of both
    questions (divisor at least 1).
    """
    def content_words(text):
        return [word for word in str(text).lower().split() if word not in eng_stopwords]

    def overlap(grams1, grams2):
        set1, set2 = set(grams1), set(grams2)
        common = len(set1 & set2)
        return [common, float(common) / max(len(set1 | set2), 1)]

    words1 = content_words(row['question1'])
    words2 = content_words(row['question2'])

    out_list = overlap(words1, words2)
    for size in (2, 3):
        out_list.extend(overlap(ngrams(words1, size), ngrams(words2, size)))
    return out_list


# Stop-word set read by get_unigrams() and feature_extraction().
eng_stopwords = set(stopwords.words('english'))
# One list of six overlap features per row, stacked into an (n_rows, 6) array.
train_X = np.vstack( np.array(df_train.apply(lambda row: feature_extraction(row), axis=1)) ) 
test_X = np.vstack( np.array(df_test.apply(lambda row: feature_extraction(row), axis=1)) )

# Column names follow the order in which feature_extraction() returns its values.
train_X = pd.DataFrame(train_X)
train_X.columns = ['common_unigrams_len', 'common_unigrams_ratio', 
                   'common_bigrams_len', 'common_bigrams_ratio',
                   'common_trigrams_len', 'common_trigrams_ratio']
test_X = pd.DataFrame(test_X)
test_X.columns = ['common_unigrams_len', 'common_unigrams_ratio', 
                   'common_bigrams_len', 'common_bigrams_ratio',
                   'common_trigrams_len', 'common_trigrams_ratio']

train_X.to_csv('train_SRKgrams_features.csv', index = False)
test_X.to_csv('test_SRKgrams_features.csv', index = False)

In [None]:
# https://www.kaggle.com/dasolmar/xgb-with-whq-jaccard
    
# Report the (rows, columns) shape of both input frames before the slow part starts.
print("Original data: X_train: {}, X_test: {}".format(df_train.shape, df_test.shape))
print("Features processing, be patient...")

# If a word appears only once, we ignore it completely (likely a typo)
# Epsilon defines a smoothing constant, which makes the effect of extremely rare words smaller
def get_weight(count, eps=10000, min_count=2):
    return 0 if count < min_count else 1 / (count + eps)

# All training questions (question1 followed by question2) as strings.
train_qs = pd.Series(df_train['question1'].tolist() + df_train['question2'].tolist()).astype(str)
# Word frequencies over the lower-cased, whitespace-split training text.
words = (" ".join(train_qs)).lower().split()
counts = Counter(words)
# word -> weight table read by word_shares().
weights = {word: get_weight(count) for word, count in counts.items()}

# Stop-word set read by word_shares().
stops = set(stopwords.words("english"))
def word_shares(row):
    """Compute eight word-overlap statistics for one question pair.

    Both questions are coerced with ``str``, lower-cased and split on
    whitespace. Reads the module-level ``stops`` (stop-word set) and
    ``weights`` (word -> weight, missing words count as 0).

    Returns a ':'-joined string with, in this order:
      R1            sum of the shared words' weights / sum of the weights of
                    both questions' non-stop words
      R2            shared non-stop words / distinct non-stop words of both
      shared count  number of shared non-stop words
      R31, R32      stop words / non-stop words of question 1 and question 2
      R2gram        shared bigrams / (bigrams of q1 + bigrams of q2)
      Rcosine       dot(shared weights) / (norm of q1 weights * norm of q2 weights)
      hamming       positions holding the same token / length of the longer question
    If either question has no non-stop word, all eight fields are 0.
    """
    no_words = '0:0:0:0:0:0:0:0'

    q1_list = str(row['question1']).lower().split()
    q1 = set(q1_list)
    q1words = q1.difference(stops)
    if len(q1words) == 0:
        return no_words

    q2_list = str(row['question2']).lower().split()
    q2 = set(q2_list)
    q2words = q2.difference(stops)
    if len(q2words) == 0:
        return no_words

    words_hamming = sum(1 for i in zip(q1_list, q2_list) if i[0]==i[1])/max(len(q1_list), len(q2_list))

    q1stops = q1.intersection(stops)
    q2stops = q2.intersection(stops)

    q1_2gram = set(zip(q1_list, q1_list[1:]))
    q2_2gram = set(zip(q2_list, q2_list[1:]))

    shared_2gram = q1_2gram.intersection(q2_2gram)

    shared_words = q1words.intersection(q2words)
    shared_weights = [weights.get(w, 0) for w in shared_words]
    q1_weights = [weights.get(w, 0) for w in q1words]
    q2_weights = [weights.get(w, 0) for w in q2words]
    # Both questions contribute to the total (question 1 used to be counted
    # twice while question 2 was left out).
    total_weights = q1_weights + q2_weights

    # The numpy divisions below yield nan/inf rather than raising when every
    # weight involved is 0.
    R1 = np.sum(shared_weights) / np.sum(total_weights) #tfidf share
    R2 = len(shared_words) / (len(q1words) + len(q2words) - len(shared_words)) #count share
    R31 = len(q1stops) / len(q1words) #stops in q1
    R32 = len(q2stops) / len(q2words) #stops in q2
    Rcosine_denominator = (np.sqrt(np.dot(q1_weights,q1_weights))*np.sqrt(np.dot(q2_weights,q2_weights)))
    Rcosine = np.dot(shared_weights, shared_weights)/Rcosine_denominator
    if len(q1_2gram) + len(q2_2gram) == 0:
        R2gram = 0
    else:
        R2gram = len(shared_2gram) / (len(q1_2gram) + len(q2_2gram))
    return '{}:{}:{}:{}:{}:{}:{}:{}'.format(R1, R2, len(shared_words), R31, R32, R2gram, Rcosine, words_hamming)


# Score every train and test pair (train rows first). Rows are passed as
# labelled Series -- no raw=True -- because word_shares() looks the questions
# up by column name, which a plain ndarray row does not support.
df = pd.concat([df_train, df_test])
df['word_shares'] = df.apply(word_shares, axis=1)

# Feature frame; the first assignment gives it the index of df.
x = pd.DataFrame()

# Unpack the ':'-joined fields produced by word_shares(), by position.
# NOTE(review): field 0 is commented "tfidf share" and field 1 "count share"
# inside word_shares(), yet field 1 is the one named 'tfidf_word_match'
# here -- the two names look swapped; confirm before relying on them.
x['word_match']       = df['word_shares'].apply(lambda x: float(x.split(':')[0]))
x['word_match_2root'] = np.sqrt(x['word_match'])
x['tfidf_word_match'] = df['word_shares'].apply(lambda x: float(x.split(':')[1]))
x['shared_count']     = df['word_shares'].apply(lambda x: float(x.split(':')[2]))

x['stops1_ratio']     = df['word_shares'].apply(lambda x: float(x.split(':')[3]))
x['stops2_ratio']     = df['word_shares'].apply(lambda x: float(x.split(':')[4]))
x['shared_2gram']     = df['word_shares'].apply(lambda x: float(x.split(':')[5]))
x['cosine']           = df['word_shares'].apply(lambda x: float(x.split(':')[6]))
x['words_hamming']    = df['word_shares'].apply(lambda x: float(x.split(':')[7]))
x['diff_stops_r']     = x['stops1_ratio'] - x['stops2_ratio']

# Character lengths of the questions (spaces included) and their difference.
x['len_q1'] = df['question1'].apply(lambda x: len(str(x)))
x['len_q2'] = df['question2'].apply(lambda x: len(str(x)))
x['diff_len'] = x['len_q1'] - x['len_q2']

# Upper-case character counts.
# NOTE(review): if df_train/df_test were lower-cased by an earlier cleaning
# step these are always 0 -- confirm the intended input.
x['caps_count_q1'] = df['question1'].apply(lambda x:sum(1 for i in str(x) if i.isupper()))
x['caps_count_q2'] = df['question2'].apply(lambda x:sum(1 for i in str(x) if i.isupper()))
x['diff_caps'] = x['caps_count_q1'] - x['caps_count_q2']

# Character lengths with the spaces removed.
x['len_char_q1'] = df['question1'].apply(lambda x: len(str(x).replace(' ', '')))
x['len_char_q2'] = df['question2'].apply(lambda x: len(str(x).replace(' ', '')))
x['diff_len_char'] = x['len_char_q1'] - x['len_char_q2']

# Whitespace-separated word counts.
x['len_word_q1'] = df['question1'].apply(lambda x: len(str(x).split()))
x['len_word_q2'] = df['question2'].apply(lambda x: len(str(x).split()))
x['diff_len_word'] = x['len_word_q1'] - x['len_word_q2']

# Average word length; a question without words divides by 0 here.
x['avg_world_len1'] = x['len_char_q1'] / x['len_word_q1']
x['avg_world_len2'] = x['len_char_q2'] / x['len_word_q2']
x['diff_avg_word'] = x['avg_world_len1'] - x['avg_world_len2']

# 1 when both questions are textually identical / when the same
# (question1, question2) pair already occurred in an earlier row.
x['exactly_same'] = (df['question1'] == df['question2']).astype(int)
x['duplicated'] = df.duplicated(['question1','question2']).astype(int)
# One call per question word.
# NOTE(review): add_word_count is not defined in the visible part of the
# file -- presumably it adds columns to x based on df; confirm.
add_word_count(x, df,'how')
add_word_count(x, df,'what')
add_word_count(x, df,'which')
add_word_count(x, df,'who')
add_word_count(x, df,'where')
add_word_count(x, df,'when')
add_word_count(x, df,'why')

print(x.columns)
print(x.describe())

# NOTE(review): create_feature_map is not defined in the visible part of the file either.
feature_names = list(x.columns.values)
create_feature_map(feature_names)
print("Features: {}".format(feature_names))

# Split the combined feature frame back into its train and test parts; the
# train rows come first because df was built as concat([df_train, df_test]).
n_train = df_train.shape[0]
y_train = df_train['is_duplicate'].values

# Columns left out of the saved features. drop() returns new frames, so
# nothing is modified through a slice of x.
unused_cols = ['len_q1', 'len_q2', 'len_word_q1', 'len_word_q2', 'diff_len']
x_train = x[:n_train].drop(unused_cols, axis = 1)
x_test  = x[n_train:].drop(unused_cols, axis = 1)
del x, df_train

x_train.to_csv('train_whq_with_jaccard_feats.csv', index = False)
# The test features get their own file; they used to be written to the
# train file name, overwriting the train features saved just above.
x_test.to_csv('test_whq_with_jaccard_feats.csv', index = False)

In [None]:
# https://www.kaggle.com/antriksh5235/doc2vec-starter

import math
from gensim.models.doc2vec import Doc2Vec
from gensim.models import doc2vec


def clean_sentence(sent):
    """Strip punctuation and stop words from ``sent``.

    Every character that is neither whitespace nor a word character, and
    every underscore, is removed. The text is then lower-cased and split on
    single spaces, the pieces found in ``STOP_WORDS`` are dropped, and the
    rest is joined with single spaces again.

    NOTE(review): ``STOP_WORDS`` is not defined in the visible part of the
    file -- confirm it exists before this is called.
    """
    # Raw string keeps \s and \w as regex escapes instead of invalid Python
    # string escapes.
    stripped = re.sub(r'([^\s\w]|_)+', '', sent).lower()
    # A single filtering pass replaces the repeated list.remove() calls;
    # the surviving words and their order are the same.
    kept = [word for word in stripped.split(" ") if word not in STOP_WORDS]
    return " ".join(kept)

def cosine(v1, v2):
    """Cosine of the angle between the vectors ``v1`` and ``v2``.

    Both arguments are converted to numpy arrays; the result is their dot
    product divided by the product of their Euclidean lengths.
    """
    a = np.array(v1)
    b = np.array(v2)
    length_a = np.sqrt(np.sum(a ** 2))
    length_b = np.sqrt(np.sum(b ** 2))
    return np.dot(a, b) / (length_a * length_b)

def concatenate(data):
    """Stack the ``question1`` and ``question2`` columns of ``data``.

    Returns one Series holding all ``question1`` values followed by all
    ``question2`` values, re-indexed from 0.
    """
    # pd.concat replaces Series.append, which no longer exists in pandas 2.0.
    return pd.concat([data['question1'], data['question2']], ignore_index=True)

class LabeledLineSentence(object):
    """Iterable pairing each document with the label at the same position.

    Iterating yields one ``doc2vec.TaggedDocument`` per document: its words
    are the ``word_tokenize`` tokens of the document and its only tag is the
    matching entry of ``labels_list``. Every ``iter()`` call starts over from
    the first document.
    """

    def __init__(self, doc_list, labels_list):
        self.doc_list = doc_list
        self.labels_list = labels_list

    def __iter__(self):
        for position, text in enumerate(self.doc_list):
            tokens = word_tokenize(text)
            tag = self.labels_list[position]
            yield doc2vec.TaggedDocument(words=tokens, tags=[tag])
            
            
# Keep only the rows without any missing value.
# NOTE(review): an earlier cell runs `del x, df_train`; df_train has to be
# re-created before this cell when the notebook is run top to bottom.
data = df_train.dropna(how="any")
#data = df_test.dropna(how="any")

# Strip punctuation and stop words from both question columns.
for col in ['question1', 'question2']:
    data[col] = data[col].apply(clean_sentence)
    
y = data['is_duplicate']
# NOTE(review): `tts` is not defined in the visible part of the file
# (presumably an alias of sklearn's train_test_split -- confirm).
X_train, X_test, y_train, y_test = tts(data[['id','question1', 'question2']], y, test_size=0.3)

import multiprocessing
cores = multiprocessing.cpu_count()
assert gensim.models.doc2vec.FAST_VERSION > -1

# Training corpus: all question1 texts followed by all question2 texts, each
# tagged 'SENT_<id>_1' or 'SENT_<id>_2' in the same order.
X = concatenate(X_train)
train_ids = X_train['id'].tolist()
labels = ['SENT_%s_1' % label for label in train_ids]
labels += ['SENT_%s_2' % label for label in train_ids]

docs = LabeledLineSentence(X.tolist(), labels)
# Hand gensim the restartable iterable itself. A generator obtained from
# docs.__iter__() can be consumed only once, so every pass after the first
# would see an empty corpus (gensim rejects generator corpora for that
# reason).
model1 = Doc2Vec(docs, size=12, window=8, min_count=5, workers=4)

# NOTE(review): depending on the gensim version, train() may also require
# total_examples= and epochs= -- confirm against the installed release.
for epoch in range(10):
    model1.train(docs)
    model1.alpha -= 0.0002  # decrease the learning rate
    model1.min_alpha = model1.alpha  # fix the learning rate, no decay
    model1.train(docs)

    
# Re-index the hold-out rows 0..n-1 so they can be addressed by position below.
X_test.index = np.arange(0, X_test['question1'].shape[0])
y_test.index = np.arange(0, X_test['question1'].shape[0])
count = 0
# Print the cosine of the two inferred document vectors next to the true
# label; the break below stops the loop after 102 pairs.
for i in range(X_test['question1'].shape[0]):
    doc1 = word_tokenize(X_test['question1'][i])
    doc2 = word_tokenize(X_test['question2'][i])
    docvec1 = model1.infer_vector(doc1)
    docvec2 = model1.infer_vector(doc2)
    print(cosine(docvec1, docvec2), y_test[i])
    if count>100:
        break
    count+=1

In [None]:
import timeit

# Machine-specific absolute path of the raw competition data; adjust when
# running elsewhere.
src = '/media/w/1c392724-ecf3-4615-8f3c-79368ec36380/DS Projects/Kaggle/Quora/data/'

# header=0: the first line of each file holds the column names.
train_orig =  pd.read_csv(src + 'train.csv', header=0)
test_orig =  pd.read_csv(src + 'test.csv', header=0)

tic0=timeit.default_timer()
# One single-column frame per question column of train and test.
df1 = train_orig[['question1']].copy()
df2 = train_orig[['question2']].copy()
df1_test = test_orig[['question1']].copy()
df2_test = test_orig[['question2']].copy()

# Give every frame the same column name so they stack into one column.
df2.rename(columns = {'question2':'question1'},inplace=True)
df2_test.rename(columns = {'question2':'question1'},inplace=True)

# pd.concat replaces the chained DataFrame.append calls (append no longer
# exists in pandas 2.0); the row order is the same.
train_questions = pd.concat([df1, df2, df1_test, df2_test])
train_questions.drop_duplicates(subset = ['question1'],inplace=True)

# After the reset, the row number of a question serves as its hash:
# questions_dict maps question text -> row number.
train_questions.reset_index(inplace=True,drop=True)
questions_dict = pd.Series(train_questions.index.values,index=train_questions.question1.values).to_dict()
train_cp = train_orig.copy()
test_cp = test_orig.copy()
train_cp.drop(['qid1','qid2'],axis=1,inplace=True)

# Give test rows a placeholder label (-1) and the same id column name as
# train, so that both frames can be stacked and told apart again later.
test_cp['is_duplicate'] = -1
test_cp.rename(columns={'test_id':'id'},inplace=True)
comb = pd.concat([train_cp,test_cp])

# Replace each question text by its number from questions_dict.
comb['q1_hash'] = comb['question1'].map(questions_dict)
comb['q2_hash'] = comb['question2'].map(questions_dict)

# How often each question number occurs in the question1 / question2 position.
q1_vc = comb.q1_hash.value_counts().to_dict()
q2_vc = comb.q2_hash.value_counts().to_dict()

def try_apply_dict(x,dict_to_apply):
    """Return ``dict_to_apply[x]``, or 0 when the lookup raises ``KeyError``."""
    try:
        found = dict_to_apply[x]
    except KeyError:
        found = 0
    return found
# Map to frequency space: a question's frequency is the number of times it
# occurs as question1 plus the number of times it occurs as question2.
comb['q1_freq'] = comb['q1_hash'].map(lambda x: try_apply_dict(x,q1_vc) + try_apply_dict(x,q2_vc))
comb['q2_freq'] = comb['q2_hash'].map(lambda x: try_apply_dict(x,q1_vc) + try_apply_dict(x,q2_vc))

# Separate train (label >= 0) from test (placeholder label -1) again.
train_comb = comb[comb['is_duplicate'] >= 0][['id','q1_hash','q2_hash','q1_freq','q2_freq','is_duplicate']]
test_comb = comb[comb['is_duplicate'] < 0][['id','q1_hash','q2_hash','q1_freq','q2_freq']]


In [None]:
# https://www.kaggle.com/jpmiller/which-topics-are-actually-hot

def connected_tuples(pairs):
    """Group the elements linked by ``pairs`` into connected sets.

    ``pairs`` is an iterable of 2-item pairs of hashable elements. Two
    elements end up in the same set when a chain of pairs connects them.

    Returns a set of tuples, one per connected set, each listing its members
    in the order in which they were added.
    """
    # For every element, a reference to the (shared) list it belongs to.
    lists_by_element = {}

    for x, y in pairs:
        x_list = lists_by_element.get(x)
        y_list = lists_by_element.get(y)

        if x_list is None and y_list is None:
            # Neither element is known yet: start a new set.
            lists_by_element[x] = lists_by_element[y] = [x, y]
        elif y_list is None:
            x_list.append(y)
            lists_by_element[y] = x_list
        elif x_list is None:
            y_list.append(x)
            lists_by_element[x] = y_list
        elif x_list is not y_list:
            # Identity check: the two sets are distinct objects and must be
            # merged. No element-by-element list comparison is needed.
            merged = x_list + y_list
            for el in merged:
                lists_by_element[el] = merged

    # Every member points at its set's list, so de-duplicate by identity
    # first and build each tuple only once.
    unique_lists = {id(members): members for members in lists_by_element.values()}
    return set(tuple(members) for members in unique_lists.values())



train = pd.read_csv('df_train_stemmed.csv')

# Question-id pairs that are labelled as duplicates.
trainstay = train.loc[train['is_duplicate'] == 1, ['qid1', 'qid2']]
# Every question id that occurs in at least one duplicate pair.
stays = pd.Series(trainstay.values.ravel()).unique().tolist()
allvals = list(range(1, 537934)) # one larger than our max qid
# Question ids that occur in no duplicate pair.
solos = set(allvals) - set (stays)
qid1 = trainstay['qid1'].tolist()
qid2 = trainstay['qid2'].tolist()
mypairs = list(zip(qid1, qid2))
cpairs =  connected_tuples(mypairs)
# Mixed collection: tuples of qids (connected sets) and bare ints (solo questions).
universe = cpairs.union(solos)

uni2 = list(universe)

# Size of every entry of uni2: a connected set is a tuple of qids, a solo
# question is a bare int and counts as 1. The size is taken from the object
# itself instead of counting commas in its string form.
ctlist = [len(item) if isinstance(item, tuple) else 1 for item in uni2]
print('Number of Questions in all Sets: {}'.format(sum(ctlist)))
print('Lengths of Connected Sets')

# Put it in a dataframe: one row per set; 'qid' holds either a tuple of qids
# (connected set) or a single int (solo question).
qSets = pd.DataFrame(
    {'qid': uni2,
    'set_length': ctlist}
    )
# Largest sets first; set_id is the 1-based rank in that order.
qSets.sort_values('set_length', axis=0, ascending=False, inplace=True)
qSets.reset_index(inplace=True, drop=True)
qSets['set_id'] = qSets.index + 1

# S: single-question sets, L: sets holding more than one question.
qSetsS = qSets.loc[qSets['set_length'] == 1] 
qSetsL = qSets.loc[qSets['set_length'] > 1] 

# Unnest: one row per question id of every multi-question set. Built with a
# plain comprehension instead of a DataFrame.apply that was only run for its
# side effect on a list.
rows = [
    [set_id, set_length, member]
    for set_id, set_length, members in zip(
        qSetsL['set_id'], qSetsL['set_length'], qSetsL['qid'])
    for member in members
]

qRef = pd.DataFrame(rows, columns = ['set_id', 'set_length', 'qid'])

# Add the single-question sets. pd.concat replaces DataFrame.append (no
# longer available in pandas 2.0) and aligns on column names as well.
qRef = pd.concat([qRef, qSetsS])
qRef.sort_values('qid', inplace=True)
qRef.reset_index(inplace=True, drop=True)
qRef.to_csv('qRef.csv', index=False)


# Select columns by position.
# NOTE(review): positions 1/3 and 2/4 are assumed to be (qid1, question1)
# and (qid2, question2) -- confirm against the CSV layout.
q1s = train.iloc[:, [1,3]]
q2s = train.iloc[:, [2,4]]

new_cols = ['qid', 'question']
q1s.columns = new_cols
q2s.columns = new_cols

# qid -> question text table, one row per qid.
lookup = pd.concat([q1s, q2s], ignore_index=True)
lookup.drop_duplicates('qid', inplace=True)

# One representative row per set (the first one in qRef order), joined with its text.
qTop = qRef.drop_duplicates('set_id', keep='first')
j = qTop.merge(lookup, how='left', on='qid')
# Notebook display of the six largest sets.
j.sort_values('set_length', ascending=False).head(6)