### Features:
* T-SNE and Word2Vec: https://www.kaggle.com/jeffd23/quora-question-pairs/visualizing-word-vectors-with-t-sne
* WMD and Word2Vec: https://github.com/RaRe-Technologies/gensim/blob/develop/docs/notebooks/WMD_tutorial.ipynb



### Processing:
* Replace abbreviations: https://www.kaggle.com/life2short/quora-question-pairs/data-processing-replace-abbreviation-of-word

In [6]:
import numpy as np
import pandas as pd
import gensim
import re
import nltk

import datetime
import operator
from collections import Counter
from nltk.corpus import stopwords
from nltk import word_tokenize, ngrams
import matplotlib.pyplot as plt
%matplotlib inline
from pylab import plot, show, subplot, specgram, imshow, savefig

import csv
from gensim import corpora, models, similarities
import gensim
import math

In [2]:
# Directory holding the Kaggle Quora data files (machine-specific absolute path).
input_folder = '/media/w/1c392724-ecf3-4615-8f3c-79368ec36380/DS Projects/Kaggle/Quora/data/'

# Load the raw train/test tables; later cells read the 'question1' and
# 'question2' columns from both frames.
df_train = pd.read_csv(input_folder + 'train.csv')
df_test  = pd.read_csv(input_folder + 'test.csv')

In [None]:
# https://www.kaggle.com/philschmidt/quora-question-pairs/quora-eda-model-selection-roc-pr-plots
def normalized_word_share(row):
    """Return the share of distinct words common to both questions of a row.

    Each question is split on single spaces, and every piece is lower-cased
    and stripped before being collected into a set. The result is
    ``len(common) / (len(words_q1) + len(words_q2))``.

    Question values are passed through ``str()`` first so that a missing
    question (NaN/None) is tokenised like any other text instead of raising
    AttributeError; this matches how the other feature code in this file
    treats question cells.

    Args:
        row: mapping-like object with 'question1' and 'question2' entries.

    Returns:
        float: the normalised overlap.
    """
    def distinct_words(text):
        # split(" ") always yields at least one piece, so the sets are never
        # empty and the division below cannot hit zero.
        return {piece.lower().strip() for piece in str(text).split(" ")}

    w1 = distinct_words(row['question1'])
    w2 = distinct_words(row['question2'])
    return 1.0 * len(w1 & w2) / (len(w1) + len(w2))
    
def modelselection_features(df2):
    """Return a copy of ``df2`` with simple length and overlap features added.

    Added columns, in order: 'q1len', 'q2len' (character lengths via the
    ``.str`` accessor), 'q1_n_words', 'q2_n_words' (number of pieces after
    splitting on single spaces) and 'word_share' (row-wise
    ``normalized_word_share``). The input frame is not modified.

    Word counts coerce each cell with ``str()`` so that a missing question
    does not raise AttributeError on ``.split``; the character-length
    columns still come from ``.str.len()`` and therefore stay NaN for
    missing cells.
    """
    df = df2.copy()
    df['q1len'] = df['question1'].str.len()
    df['q2len'] = df['question2'].str.len()
    df['q1_n_words'] = df['question1'].apply(lambda text: len(str(text).split(" ")))
    df['q2_n_words'] = df['question2'].apply(lambda text: len(str(text).split(" ")))
    df['word_share'] = df.apply(normalized_word_share, axis=1)
    return df

In [3]:
# https://www.kaggle.com/ranlocar/quora-question-pairs/xgb-with-whq
def get_weight(count, eps=10000, min_count=2):
    """Return the inverse-frequency weight for a word seen ``count`` times.

    Words seen fewer than ``min_count`` times get weight 0; all others get
    ``1 / (count + eps)``, where ``eps`` damps the weight of rare words.
    """
    if count < min_count:
        return 0
    return 1 / (count + eps)

def add_word_count(x, df, word):
    """Add 0/1 indicator columns for ``word`` to the feature frame ``x``.

    Writes three columns into ``x`` in place: 'q1_<word>' and 'q2_<word>'
    flag whether ``word`` occurs as a substring of the lower-cased question
    text in ``df``, and '<word>_both' is their product.
    """
    def has_word(text):
        # Substring test on the stringified, lower-cased cell.
        return (word in str(text).lower()) * 1

    q1_col = 'q1_' + word
    q2_col = 'q2_' + word
    x[q1_col] = df['question1'].apply(has_word)
    x[q2_col] = df['question2'].apply(has_word)
    x[word + '_both'] = x[q1_col] * x[q2_col]
    
def word_shares(row):
    """Summarise the word overlap of one question pair as a ':'-joined string.

    Reads the module-level ``stops`` (stop words) and ``weights``
    (word -> weight mapping) at call time. The five fields are, in order:
    weight-based share of the common non-stop words, count-based share of
    the common non-stop words, number of common non-stop words, and the
    stop-word to non-stop-word ratio of question 1 and of question 2.

    Returns '0:0:0:0:0' as soon as either question has no non-stop words.
    """
    no_content = '0:0:0:0:0'

    tokens1 = set(str(row['question1']).lower().split())
    content1 = tokens1.difference(stops)
    if not content1:
        return no_content

    tokens2 = set(str(row['question2']).lower().split())
    content2 = tokens2.difference(stops)
    if not content2:
        return no_content

    common = content1 & content2
    common_weight = np.sum([weights.get(tok, 0) for tok in common])
    all_weight = np.sum(
        [weights.get(tok, 0) for tok in content1]
        + [weights.get(tok, 0) for tok in content2]
    )

    # all_weight is 0 when every word has weight 0; the first field is then nan.
    weighted_share = common_weight / all_weight
    count_share = len(common) / (len(content1) + len(content2))
    stop_ratio1 = len(tokens1.intersection(stops)) / len(content1)
    stop_ratio2 = len(tokens2.intersection(stops)) / len(content2)
    return '{}:{}:{}:{}:{}'.format(
        weighted_share, count_share, len(common), stop_ratio1, stop_ratio2
    )

# Every training question (question1 rows followed by question2 rows) as text.
train_qs = pd.Series(
    df_train['question1'].tolist() + df_train['question2'].tolist()
).astype(str)
# Lower-cased, whitespace-separated tokens of all training questions, in order.
words = [token for question in train_qs for token in question.lower().split()]
counts = Counter(words)
# Per-word weight derived from its corpus frequency (see get_weight).
weights = {}
for word, count in counts.items():
    weights[word] = get_weight(count)
stops = set(stopwords.words("english"))

# Stack train and test so every feature is computed in one pass over all pairs.
df = pd.concat([df_train, df_test])
# Rows are passed as labelled Series (no raw=True): word_shares looks fields up
# by column name, which a raw ndarray row does not support.
df['word_shares'] = df.apply(word_shares, axis=1)
x = pd.DataFrame()
# NOTE(review): field 0 of word_shares is the weight-based share and field 1
# the count-based share, so the 'word_match' / 'tfidf_word_match' labels look
# swapped. Names are kept as-is because saved feature files use them - confirm.
x['word_match']       = df['word_shares'].apply(lambda s: float(s.split(':')[0]))
x['word_match_2root'] = np.sqrt(x['word_match'])
x['tfidf_word_match'] = df['word_shares'].apply(lambda s: float(s.split(':')[1]))
x['shared_count']     = df['word_shares'].apply(lambda s: float(s.split(':')[2]))

# Stop-word ratios of each question and their difference.
x['stops1_ratio']     = df['word_shares'].apply(lambda s: float(s.split(':')[3]))
x['stops2_ratio']     = df['word_shares'].apply(lambda s: float(s.split(':')[4]))
x['diff_stops_r']     = x['stops1_ratio'] - x['stops2_ratio']

# Length of the stringified question, spaces included.
x['len_q1'] = df['question1'].apply(lambda q: len(str(q)))
x['len_q2'] = df['question2'].apply(lambda q: len(str(q)))
x['diff_len'] = x['len_q1'] - x['len_q2']

# Length with space characters removed.
x['len_char_q1'] = df['question1'].apply(lambda q: len(str(q).replace(' ', '')))
x['len_char_q2'] = df['question2'].apply(lambda q: len(str(q).replace(' ', '')))
x['diff_len_char'] = x['len_char_q1'] - x['len_char_q2']

# Number of whitespace-separated tokens.
x['len_word_q1'] = df['question1'].apply(lambda q: len(str(q).split()))
x['len_word_q2'] = df['question2'].apply(lambda q: len(str(q).split()))
x['diff_len_word'] = x['len_word_q1'] - x['len_word_q2']

# Average token length (non-space characters per token).
x['avg_world_len1'] = x['len_char_q1'] / x['len_word_q1']
x['avg_world_len2'] = x['len_char_q2'] / x['len_word_q2']
x['diff_avg_word'] = x['avg_world_len1'] - x['avg_world_len2']

x['exactly_same'] = (df['question1'] == df['question2']).astype(int)
# Flags rows whose (question1, question2) pair already appeared earlier in df.
x['duplicated'] = df.duplicated(['question1','question2']).astype(int)
# Question-word indicators (q1_<w>, q2_<w>, <w>_both) for each interrogative.
add_word_count(x, df,'how')
add_word_count(x, df,'what')
add_word_count(x, df,'which')
add_word_count(x, df,'who')
add_word_count(x, df,'where')
add_word_count(x, df,'when')
add_word_count(x, df,'why')

print(x.columns)
print(x.describe())

#x['q1_how'] = df['question1'].apply(lambda x: ("how" in str(x).lower())*1)
#x['q2_how'] = df['question2'].apply(lambda x: ("how" in str(x).lower())*1)
#x['how_both'] = x['q1_how'] * x['q2_how']
#x['q1_what'] = df['question1'].apply(lambda x: ("what" in str(x).lower())*1)
#x['q2_what'] = df['question2'].apply(lambda x: ("what" in str(x).lower())*1)
#x['what_both'] = x['q1_what'] * x['q2_what']
#x['q1_which'] = df['question1'].apply(lambda x: ("what" in str(x).lower())*1)
#x['q2_which'] = df['question1'].apply(lambda x: ("what" in str(x).lower())*1)

#x['q1_which'] = ("which" in df['question1'].lower()).astype(int)
#x['q2_which'] = ("which" in df['question2'].lower()).astype(int)

#x['q1_where'] = ("where" in df['question1'].lower()).astype(int)
#x['q2_where'] = ("where" in df['question2'].lower()).astype(int)

#x['q1_when'] = ("when" in df['question1'].lower()).astype(int)
#x['q2_when'] = ("when" in df['question2'].lower()).astype(int)

#x.to_csv('Features_fulldata_part1', index = False)



Index(['word_match', 'word_match_2root', 'tfidf_word_match', 'shared_count',
       'stops1_ratio', 'stops2_ratio', 'diff_stops_r', 'len_q1', 'len_q2',
       'diff_len', 'len_char_q1', 'len_char_q2', 'diff_len_char',
       'len_word_q1', 'len_word_q2', 'diff_len_word', 'avg_world_len1',
       'avg_world_len2', 'diff_avg_word', 'exactly_same', 'duplicated',
       'q1_how', 'q2_how', 'how_both', 'q1_what', 'q2_what', 'what_both',
       'q1_which', 'q2_which', 'which_both', 'q1_who', 'q2_who', 'who_both',
       'q1_where', 'q2_where', 'where_both', 'q1_when', 'q2_when', 'when_both',
       'q1_why', 'q2_why', 'why_both'],
      dtype='object')
         word_match  word_match_2root  tfidf_word_match  shared_count  \
count  2.749811e+06      2.749811e+06      2.750086e+06  2.750086e+06   
mean   1.471691e-01      3.163215e-01      1.473250e-01  1.605620e+00   
std    1.279563e-01      2.170479e-01      1.244995e-01  1.548127e+00   
min    0.000000e+00      0.000000e+00      0.000000e+

In [None]:
# Load the combined (train followed by test) text-feature table.
full_feats = pd.read_csv('../../data/other/other_features/textfeatures_fulldata_part1.csv')

# The first df_train.shape[0] rows belong to train, the remainder to test.
# drop() is called without inplace=True so each split is a fresh frame rather
# than an in-place edit of a slice of full_feats (avoids SettingWithCopyWarning).
Xtr_feats = full_feats.iloc[0:df_train.shape[0], :].drop(
    ['len_q1', 'len_q2', 'len_word_q1', 'len_word_q2', 'diff_len'], axis = 1)
Xte_feats = full_feats.iloc[df_train.shape[0]:, :].drop(
    ['len_q1', 'len_q2', 'len_word_q1', 'len_word_q2', 'diff_len'], axis = 1)

Xtr_feats.to_csv('../../data/other/other_features/textfeatures_train.csv', index = False)
Xte_feats.to_csv('../../data/other/other_features/textfeatures_test.csv', index = False)

In [31]:
# https://www.kaggle.com/puneetsl/quora-question-pairs/unusual-meaning-map

def basic_cleaning(string):
    """Lower-case ``string`` and collapse runs of spaces into one space.

    Non-string input is converted with ``str()``. Bytes input is decoded
    first (escape sequences interpreted via 'unicode-escape', falling back
    to UTF-8 with replacement characters if that fails) instead of being
    stringified as ``"b'...'"``.

    Returns:
        str: the cleaned text.
    """
    if isinstance(string, bytes):
        try:
            string = string.decode('unicode-escape')
        except UnicodeDecodeError:
            # Malformed escape sequence: keep the text rather than fail.
            string = string.decode('utf-8', errors='replace')
    else:
        string = str(string)
    string = string.lower()
    return re.sub(' +', ' ', string)

# Characters replaced by a space: digits, brackets, quotes and the punctuation
# , - . / : ; < = > ? ! ^ % $ -- written as a raw string with every member
# listed explicitly, so there are no invalid escape sequences or hidden ranges.
_CLEAN2_STRIP_RE = re.compile(r"""[0-9()!^%$'".;{}\[\],\-/:<=>?]""")
# Short function words removed after tokenisation.
_CLEAN2_DROP_WORDS = frozenset(["a", "and", "of", "the", "to", "on", "in", "at", "is"])


def basic_cleaning2(string):
    """Lower-case, strip digits/punctuation and drop a few function words.

    The input is converted with ``str()``, lower-cased, every character
    matched by ``_CLEAN2_STRIP_RE`` is replaced by a space, and the
    whitespace-separated tokens not in ``_CLEAN2_DROP_WORDS`` are joined
    back with single spaces.

    Returns:
        str: the cleaned text (tokens separated by exactly one space).
    """
    text = _CLEAN2_STRIP_RE.sub(' ', str(string).lower())
    # Joining split() tokens already yields single spaces, so no further
    # space-collapsing pass is needed.
    return ' '.join(tok for tok in text.split() if tok not in _CLEAN2_DROP_WORDS)

def idf(word):
    """Return ``1 - sqrt(docf[word] / total_docs)`` for ``word``.

    ``docf`` and ``total_docs`` are module-level names looked up at call time.
    """
    doc_fraction = docf[word] / total_docs
    return 1 - math.sqrt(doc_fraction)

def w2v_sim(w1, w2):
    """Return the idf-weighted word-vector similarity of ``w1`` and ``w2``.

    The similarity comes from the module-level ``model`` and is multiplied
    by ``idf(w1)`` and ``idf(w2)``. A word missing from the model
    vocabulary or from the document-frequency table (KeyError) yields 0.0.

    Only KeyError is treated as "unknown word"; other errors (for example
    an undefined or incompatible ``model``) propagate instead of silently
    turning every feature into 0.0.
    """
    # Newer gensim exposes similarity on model.wv; older releases also had it
    # on the model itself, so fall back to the model when .wv is absent.
    vectors = getattr(model, 'wv', model)
    try:
        return vectors.similarity(w1, w2) * idf(w1) * idf(w2)
    except KeyError:
        return 0.0

def img_feature(row):
    """Build the 100-value word-similarity "image" for one question pair.

    Both questions are cleaned with ``basic_cleaning2`` and tokenised; a
    grid of ``w2v_sim`` scores is built with one row per question2 token
    and one column per question1 token. ``np.resize`` then repeats or
    truncates the grid to 10x10 before it is flattened.

    Returns:
        list: a single-element list holding the flattened array.
    """
    tokens_q1 = basic_cleaning2(row['question1']).split()
    tokens_q2 = basic_cleaning2(row['question2']).split()

    grid_rows = []
    for word_q2 in tokens_q2:
        grid_rows.append([w2v_sim(word_q1, word_q2) for word_q1 in tokens_q1])

    grid = np.array(grid_rows, order='C')
    return [np.resize(grid, (10, 10)).flatten()]

def get_img_features(df):
    """Return a frame of 'img0'..'img99' similarity features, one row per pair.

    ``img_feature`` is applied to every row of ``df`` (rows passed as
    labelled Series, since it looks questions up by column name) and the
    resulting 100-value arrays are stacked into one DataFrame with a fresh
    integer index.

    Args:
        df: DataFrame with 'question1' and 'question2' columns.

    Returns:
        pandas.DataFrame with columns 'img0' ... 'img99'.
    """
    columns = ['img' + str(g) for g in range(100)]
    if len(df) == 0:
        # Nothing to featurise: keep the column layout, with no rows.
        return pd.DataFrame(columns=columns)

    img = df.apply(img_feature, axis=1)
    # Each cell is the single-element list produced by img_feature.
    pixels = np.array([cell[0] for cell in img])
    return pd.DataFrame(pixels, columns=columns)


In [57]:
# Reload the raw train/test tables from input_folder.
df_train = pd.read_csv(input_folder + 'train.csv')
df_test  = pd.read_csv(input_folder + 'test.csv')
# Train the Word2Vec model that w2v_sim reads through the global `model`.
# NOTE(review): `sentences` is not defined in any cell shown here - confirm
# where the training corpus is built.
# NOTE(review): `size=` looks like the pre-4.0 gensim keyword (renamed to
# `vector_size` in gensim 4) - confirm against the installed version.
model = gensim.models.Word2Vec(sentences, size=100, window=5, min_count=5, workers=4)

# Compute the similarity "image" features for the test set and save them.
xte = get_img_features(df_test)
xte.to_csv('img_features_test.csv', index = False)

In [7]:
# https://www.kaggle.com/sudalairajkumar/quora-question-pairs/simple-exploration-notebook-quora-ques-pair

def get_unigrams(que):
    """Tokenise the lower-cased question and drop tokens in ``eng_stopwords``."""
    tokens = word_tokenize(que.lower())
    kept = []
    for token in tokens:
        if token not in eng_stopwords:
            kept.append(token)
    return kept

def get_common_unigrams(row):
    """Count the distinct unigrams present in both questions of ``row``."""
    first = set(row["unigrams_ques1"])
    second = set(row["unigrams_ques2"])
    return len(first & second)

def get_common_unigram_ratio(row):
    """Return the common-unigram count divided by the size of the unigram union.

    The denominator is floored at 1 so two empty questions give 0.0.
    """
    common_count = float(row["unigrams_common_count"])
    all_unigrams = set(row["unigrams_ques1"]) | set(row["unigrams_ques2"])
    return common_count / max(len(all_unigrams), 1)

def get_bigrams(que):
    """Return the list of 2-grams that ``ngrams`` produces for ``que``."""
    return list(ngrams(que, 2))

def get_common_bigrams(row):
    """Count the distinct bigrams present in both questions of ``row``."""
    first = set(row["bigrams_ques1"])
    second = set(row["bigrams_ques2"])
    return len(first & second)

def get_common_bigram_ratio(row):
    """Return the common-bigram count divided by the size of the bigram union.

    The denominator is floored at 1 so two empty questions give 0.0.
    """
    common_count = float(row["bigrams_common_count"])
    all_bigrams = set(row["bigrams_ques1"]) | set(row["bigrams_ques2"])
    return common_count / max(len(all_bigrams), 1)



def feature_extraction(row):
    """Return n-gram overlap features for one question pair.

    Each question is stringified, lower-cased, split on whitespace and
    filtered against ``eng_stopwords``. For unigrams, bigrams and trigrams
    of the remaining words the function emits the number of distinct
    n-grams shared by both questions and that number divided by the size of
    the n-gram union (floored at 1).

    Returns:
        list: [uni_len, uni_ratio, bi_len, bi_ratio, tri_len, tri_ratio].
    """
    def content_words(text):
        return [tok for tok in text.lower().split() if tok not in eng_stopwords]

    def overlap(grams_a, grams_b):
        # Shared count and shared/union ratio for two n-gram collections.
        set_a, set_b = set(grams_a), set(grams_b)
        n_common = len(set_a & set_b)
        return [n_common, float(n_common) / max(len(set_a | set_b), 1)]

    text_q1 = str(row['question1'])
    text_q2 = str(row['question2'])
    words_q1 = content_words(text_q1)
    words_q2 = content_words(text_q2)

    features = overlap(words_q1, words_q2)
    for order in (2, 3):
        features.extend(overlap(ngrams(words_q1, order), ngrams(words_q2, order)))
    return features


# Stop-word set read by get_unigrams and feature_extraction.
eng_stopwords = set(stopwords.words('english'))
# One row of six n-gram overlap features per question pair.
train_X = np.vstack(np.array(df_train.apply(feature_extraction, axis=1)))
test_X = np.vstack(np.array(df_test.apply(feature_extraction, axis=1)))


In [10]:
# Wrap both feature matrices in DataFrames with the same six column labels.
train_X, test_X = (
    pd.DataFrame(
        features,
        columns=['common_unigrams_len', 'common_unigrams_ratio',
                 'common_bigrams_len', 'common_bigrams_ratio',
                 'common_trigrams_len', 'common_trigrams_ratio'],
    )
    for features in (train_X, test_X)
)

In [12]:
# Write the n-gram overlap features to CSV in the working directory,
# without the index column.
train_X.to_csv('SRK_grams_features_train.csv', index = False)
test_X.to_csv('SRK_grams_features_test.csv', index = False)