In [1]:
import numpy as np
import pandas as pd
import gensim
import re
import nltk
import json
import sys
import datetime
import operator
import matplotlib.pyplot as plt
import math
import csv
import timeit
import os

from collections import Counter
from nltk.corpus import stopwords
from nltk import word_tokenize, ngrams
from gensim import corpora, models, similarities
from gensim.models.doc2vec import Doc2Vec
from gensim.models import doc2vec
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from pylab import plot, show, subplot, specgram, imshow, savefig
from tqdm import tqdm

from kernels_functions import *

In [2]:
# Directory (relative to the notebook's starting directory) that receives
# every feature CSV written by the cells below.
FEATURES_DIR = 'NER_features'

# Anchor the input paths to the directory the notebook was started from.
# Later cells re-read `src_train` / `src_test` *after* the working directory
# has been changed, so bare relative names would point at the wrong place.
base_dir = os.getcwd()
if os.path.basename(base_dir) == FEATURES_DIR:
    # This cell has already been run once and we are inside the output
    # directory; step back out so the paths are not resolved one level deep.
    base_dir = os.path.dirname(base_dir)

src_train = os.path.join(base_dir, 'df_train_NER.csv')
src_test = os.path.join(base_dir, 'df_test_NER.csv')

# Missing cells are replaced with the literal string 'NULL' in every column.
df_train = pd.read_csv(src_train).fillna('NULL')
df_test = pd.read_csv(src_test).fillna('NULL')

# The following cells write their CSVs using bare file names, so switch the
# working directory to the output folder (created on first use).
features_path = os.path.join(base_dir, FEATURES_DIR)
os.makedirs(features_path, exist_ok=True)
os.chdir(features_path)

In [4]:
# Run `modelselection_features` on each split, then write both results to
# CSV (without the index) in the current working directory.
eda_train, eda_test = (modelselection_features(split) for split in (df_train, df_test))

for eda_path, eda_frame in (('train_eda_features.csv', eda_train),
                            ('test_eda_features.csv', eda_test)):
    eda_frame.to_csv(eda_path, index=False)

# Labels for the six columns produced by stacking the per-row results of
# `feature_extraction`; the same labels are used for train and test.
ngram_feature_names = [
    'common_unigrams_len', 'common_unigrams_ratio',
    'common_bigrams_len', 'common_bigrams_ratio',
    'common_trigrams_len', 'common_trigrams_ratio',
]


def _ngram_feature_frame(pairs):
    """Apply `feature_extraction` to every row of `pairs` and return the
    stacked results as a DataFrame labelled with `ngram_feature_names`."""
    per_row = pairs.apply(feature_extraction, axis=1)
    frame = pd.DataFrame(np.vstack(np.array(per_row)))
    frame.columns = list(ngram_feature_names)
    return frame


train_X = _ngram_feature_frame(df_train)
test_X = _ngram_feature_frame(df_test)

train_X.to_csv('train_SRKgrams_features.csv', index=False)
test_X.to_csv('test_SRKgrams_features.csv', index=False)


# --- Word weights -----------------------------------------------------------
# Count every lower-cased, whitespace-separated token across both question
# columns of the training set and turn each count into a weight via
# `get_weight`. `weights` is not referenced again in this cell.
# NOTE(review): presumably consumed by the imported helpers - confirm.
train_qs = pd.Series(df_train['question1'].tolist() + df_train['question2'].tolist()).astype(str)
words = (" ".join(train_qs)).lower().split()
counts = Counter(words)
weights = {word: get_weight(count) for word, count in counts.items()}

# --- Combined frame ---------------------------------------------------------
# Train rows first, then test rows. The original index labels are kept, so
# labels repeat across the two splits; everything below relies on `x` and
# `df` sharing that exact index, and on positional slicing to split again.
df = pd.concat([df_train, df_test])
df['word_shares'] = df.apply(word_shares, axis=1, raw=True)


def _share_field(position):
    """Return field `position` of the ':'-separated `word_shares` strings,
    converted to float, as a Series aligned with `df`."""
    return df['word_shares'].apply(lambda shares: float(shares.split(':')[position]))


def _question_stat(column, stat):
    """Apply `stat` to the string form of every value in `df[column]`."""
    return df[column].apply(lambda question: stat(str(question)))


def _count_upper(text):
    """Number of upper-case characters in `text`."""
    return sum(1 for ch in text if ch.isupper())


def _count_non_space(text):
    """Length of `text` once plain space characters are removed."""
    return len(text.replace(' ', ''))


def _count_words(text):
    """Number of whitespace-separated tokens in `text`."""
    return len(text.split())


# --- Feature table ----------------------------------------------------------
# Column insertion order is significant: it is the column order of the CSVs
# written at the end of the cell.
x = pd.DataFrame()
x['word_match']       = _share_field(0)
x['word_match_2root'] = np.sqrt(x['word_match'])
x['tfidf_word_match'] = _share_field(1)
x['shared_count']     = _share_field(2)
x['stops1_ratio']     = _share_field(3)
x['stops2_ratio']     = _share_field(4)
x['shared_2gram']     = _share_field(5)
x['cosine']           = _share_field(6)
x['words_hamming']    = _share_field(7)
x['diff_stops_r']     = x['stops1_ratio'] - x['stops2_ratio']

# Raw string lengths (spaces included).
x['len_q1'] = _question_stat('question1', len)
x['len_q2'] = _question_stat('question2', len)
x['diff_len'] = x['len_q1'] - x['len_q2']

# Upper-case character counts.
x['caps_count_q1'] = _question_stat('question1', _count_upper)
x['caps_count_q2'] = _question_stat('question2', _count_upper)
x['diff_caps'] = x['caps_count_q1'] - x['caps_count_q2']

# Character counts with spaces stripped.
x['len_char_q1'] = _question_stat('question1', _count_non_space)
x['len_char_q2'] = _question_stat('question2', _count_non_space)
x['diff_len_char'] = x['len_char_q1'] - x['len_char_q2']

# Token counts.
x['len_word_q1'] = _question_stat('question1', _count_words)
x['len_word_q2'] = _question_stat('question2', _count_words)
x['diff_len_word'] = x['len_word_q1'] - x['len_word_q2']

# Average token length; a question with zero tokens yields NaN/inf here
# rather than raising, because this is pandas element-wise division.
x['avg_world_len1'] = x['len_char_q1'] / x['len_word_q1']
x['avg_world_len2'] = x['len_char_q2'] / x['len_word_q2']
x['diff_avg_word'] = x['avg_world_len1'] - x['avg_world_len2']

x['exactly_same'] = (df['question1'] == df['question2']).astype(int)
# Flags repeated (question1, question2) pairs across the combined frame;
# the first occurrence of each pair is marked 0.
x['duplicated'] = df.duplicated(['question1', 'question2']).astype(int)

# One `add_word_count` call per interrogative word, in this order.
for wh_word in ('how', 'what', 'which', 'who', 'where', 'when', 'why'):
    add_word_count(x, df, wh_word)

print(x.columns)
print(x.describe())

# --- Split back into train / test -------------------------------------------
# Positional slices: the first `n_train` rows of `x` are the training rows.
n_train = df_train.shape[0]
x_train = x[:n_train]
x_test  = x[n_train:]
y_train = df_train['is_duplicate'].values
del x, df_train

# Drop the raw-length helper columns by building new frames rather than
# mutating the slices of `x` in place.
dropped_columns = ['len_q1', 'len_q2', 'len_word_q1', 'len_word_q2', 'diff_len']
x_train = x_train.drop(dropped_columns, axis=1)
x_test = x_test.drop(dropped_columns, axis=1)

x_train.to_csv('train_whq_with_jaccard_feats.csv', index=False)
x_test.to_csv('test_whq_with_jaccard_feats.csv', index=False)

  Rcosine = np.dot(shared_weights, shared_weights)/Rcosine_denominator
  R1 = np.sum(shared_weights) / np.sum(total_weights) #tfidf share
  R1 = np.sum(shared_weights) / np.sum(total_weights) #tfidf share


Index(['word_match', 'word_match_2root', 'tfidf_word_match', 'shared_count',
       'stops1_ratio', 'stops2_ratio', 'shared_2gram', 'cosine',
       'words_hamming', 'diff_stops_r', 'len_q1', 'len_q2', 'diff_len',
       'caps_count_q1', 'caps_count_q2', 'diff_caps', 'len_char_q1',
       'len_char_q2', 'diff_len_char', 'len_word_q1', 'len_word_q2',
       'diff_len_word', 'avg_world_len1', 'avg_world_len2', 'diff_avg_word',
       'exactly_same', 'duplicated', 'q1_how', 'q2_how', 'how_both', 'q1_what',
       'q2_what', 'what_both', 'q1_which', 'q2_which', 'which_both', 'q1_who',
       'q2_who', 'who_both', 'q1_where', 'q2_where', 'where_both', 'q1_when',
       'q2_when', 'when_both', 'q1_why', 'q2_why', 'why_both'],
      dtype='object')
         word_match  word_match_2root  tfidf_word_match  shared_count  \
count  2.749217e+06      2.749217e+06      2.750086e+06  2.750086e+06   
mean   2.092401e-01      4.009454e-01      3.026323e-01  2.118248e+00   
std    1.556959e-01      2.20

In [5]:
# https://www.kaggle.com/jturkewitz/magic-features-0-03-gain/

# Location of the raw competition files. Nothing is read from it in this
# cell any more (the frames it used to load were immediately overwritten by
# the NER files below); the name is kept in case other cells refer to it.
src = '/media/w/1c392724-ecf3-4615-8f3c-79368ec36380/DS Projects/Kaggle/Quora/data/'

train_orig = pd.read_csv(src_train)
test_orig = pd.read_csv(src_test)

# Keep only the columns this cell works with.
train_orig = train_orig.loc[:, ['qid1', 'qid2', 'question1', 'question2', 'is_duplicate']]
test_orig = test_orig.loc[:, ['test_id', 'question1', 'question2']]

# Four single-column frames, all labelled 'question1', so that every question
# text from both splits can be stacked into one column.
df1 = train_orig[['question1']].copy()
df2 = train_orig[['question2']].copy().rename(columns={'question2': 'question1'})
df1_test = test_orig[['question1']].copy()
df2_test = test_orig[['question2']].copy().rename(columns={'question2': 'question1'})

# `pd.concat` replaces the chained `DataFrame.append` calls (removed in
# pandas 2.0); row order is the same: train q1, train q2, test q1, test q2.
train_questions = pd.concat([df1, df2, df1_test, df2_test])
train_questions = (
    train_questions
    .drop_duplicates(subset=['question1'])
    .reset_index(drop=True)
)

# Map each distinct question text to its row number in `train_questions`;
# that integer serves as the question's hash.
questions_dict = pd.Series(train_questions.index.values, index=train_questions.question1.values).to_dict()

train_cp = train_orig.drop(['qid1', 'qid2'], axis=1)
test_cp = test_orig.copy()

# Test rows get the sentinel label -1 so they can be separated again later.
test_cp['is_duplicate'] = -1
test_cp = test_cp.rename(columns={'test_id': 'id'})

# NOTE(review): `train_cp` has no 'id' column, so 'id' is NaN for every
# training row of `comb` - confirm that is intended.
comb = pd.concat([train_cp, test_cp])

comb['q1_hash'] = comb['question1'].map(questions_dict)
comb['q2_hash'] = comb['question2'].map(questions_dict)

# How often each hash appears in the question1 / question2 position.
q1_vc = comb.q1_hash.value_counts().to_dict()
q2_vc = comb.q2_hash.value_counts().to_dict()

def try_apply_dict(x, dict_to_apply):
    """Look up `x` in `dict_to_apply`.

    Returns the stored value, or 0 when indexing raises ``KeyError``.
    """
    try:
        looked_up = dict_to_apply[x]
    except KeyError:
        looked_up = 0
    return looked_up
# Map each question hash to frequency space: the number of times it occurs
# as question1 plus the number of times it occurs as question2.
def _hash_frequency(question_hash):
    """Total occurrences of `question_hash` over both question positions."""
    as_q1 = try_apply_dict(question_hash, q1_vc)
    as_q2 = try_apply_dict(question_hash, q2_vc)
    return as_q1 + as_q2


comb['q1_freq'] = comb['q1_hash'].map(_hash_frequency)
comb['q2_freq'] = comb['q2_hash'].map(_hash_frequency)

# Rows labelled >= 0 are training rows; rows labelled < 0 carry the test
# sentinel. Only the training output keeps the label column.
turkewitz_columns = ['id', 'q1_hash', 'q2_hash', 'q1_freq', 'q2_freq']
train_rows = comb[comb['is_duplicate'] >= 0]
test_rows = comb[comb['is_duplicate'] < 0]
train_comb = train_rows[turkewitz_columns + ['is_duplicate']]
test_comb = test_rows[turkewitz_columns]

train_comb.to_csv('train_turkewitz_features.csv', index=False)
test_comb.to_csv('test_turkewitz_features.csv', index=False)