In [12]:
import pandas as pd

from to_feature import sentences2feature, tokens2feature, clean_and_tokenize, clean_txt, ltokenize

In [2]:
# Tiny two-row fixture: identical question pairs plus their whitespace-split
# tokens, used to smoke-test the feature extractors before the real data.
df = pd.DataFrame(
    [
        {
            'question1': "today is friday.",
            'question2': "today is monday.",
            'words1': ["today", "is", "friday."],
            'words2': ["today", "is", "monday."],
        }
        for _ in range(2)
    ],
    columns=['question1', 'question2', 'words1', 'words2'],
)
df

Unnamed: 0,question1,question2,words1,words2
0,today is friday.,today is monday.,"[today, is, friday.]","[today, is, monday.]"
1,today is friday.,today is monday.,"[today, is, friday.]","[today, is, monday.]"


In [3]:
# Smoke-test tokens2feature on the toy frame; with verbose=True it prints a
# "<metric> finished." line per distance (see the recorded output below).
# cleaned=False presumably selects the raw words1/words2 columns rather than
# the *_cleaned ones -- TODO confirm in to_feature.
# NOTE(review): the recorded output shows every distance as 0.0 even though the
# two sentences differ in one token -- confirm this is expected.
tokens2feature(df, cleaned=False, verbose=True)

cosine finished.
cityblock finished.
jaccard finished.
canberra finished.
minkowski finished.
braycurtis finished.


Unnamed: 0,braycurtis_distance,canberra_distance,cityblock_distance,cos_distance,jaccard_distance,minkowski_distance,shared_word_score,words_len1,words_len2
0,0.0,0.0,0.0,0.0,0.0,0.0,0.333333,3,3
1,0.0,0.0,0.0,0.0,0.0,0.0,0.333333,3,3


In [9]:
# Load the training CSV and replace every missing value with an empty string
# so the later per-row text functions always receive a str.
train_data = pd.read_csv("../datasets/train.csv").fillna("")

In [13]:
# Derive four columns on train_data: clean_txt applied to each raw question,
# then ltokenize(..., stem=True) applied to each cleaned question.
# clean_txt / ltokenize are imported from to_feature; Series.apply forwards the
# extra keyword argument (stem=True) to the function on every element.
train_data["q1_cleaned"] = train_data["question1"].apply(clean_txt)
train_data["q2_cleaned"] = train_data["question2"].apply(clean_txt)
train_data["words1_cleaned"] = train_data["q1_cleaned"].apply(ltokenize, stem=True)
train_data["words2_cleaned"] = train_data["q2_cleaned"].apply(ltokenize, stem=True)

In [14]:
# Compute the token-level feature frame for the full training set; verbose=True
# prints one "<metric> finished." line per distance as recorded below.
# cleaned=True presumably makes tokens2feature read the words*_cleaned columns
# added to train_data earlier -- TODO confirm in to_feature.
token_features = tokens2feature(train_data, cleaned=True, verbose=True)

cosine finished.
cityblock finished.
jaccard finished.
canberra finished.
minkowski finished.
braycurtis finished.


In [15]:
# Compute the sentence-level feature frame for the full training set;
# verbose=True prints one "<ratio> finished." line per fuzzy ratio as recorded below.
# cleaned=True presumably makes sentences2feature read the q*_cleaned columns
# added to train_data earlier -- TODO confirm in to_feature.
sentence_features = sentences2feature(train_data, cleaned=True, verbose=True)

qratio finished.
wratio finished.
partial_ratio finished.
partial_token_set_ratio finished.
partial_token_sort_ratio finished.
token_set_ratio finished.
token_sort_ratio finished.


In [16]:
# Preview the first two rows of the token-level features.
token_features.head(2)

Unnamed: 0,braycurtis_distance,canberra_distance,cityblock_distance,cos_distance,jaccard_distance,minkowski_distance,shared_word_score,words_len1,words_len2
0,0.142465,32.635204,24.759953,0.041607,1.0,2.819773,0.307692,7,6
1,0.870533,87.560307,86.794571,0.873567,1.0,9.620695,0.153846,4,9


In [17]:
# Preview the first two rows of the sentence-level features.
sentence_features.head(2)

Unnamed: 0,fuzz_partial_ratio,fuzz_partial_token_set_ratio,fuzz_partial_token_sort_ratio,fuzz_qratio,fuzz_token_set_ratio,fuzz_token_sort_ratio,fuzz_wratio,question_len1,question_len2
0,98,100,89,93,100,93,95,66,57
1,73,100,75,66,86,63,86,51,88


In [18]:
# Place the token-level and sentence-level feature columns side by side.
# axis=1 aligns rows on the index, so both frames are assumed to share the
# same index as train_data -- TODO confirm; misaligned indexes would yield NaNs.
features = pd.concat([token_features, sentence_features], axis=1)

In [19]:
# Preview the first two rows of the combined feature frame.
features.head(2)

Unnamed: 0,braycurtis_distance,canberra_distance,cityblock_distance,cos_distance,jaccard_distance,minkowski_distance,shared_word_score,words_len1,words_len2,fuzz_partial_ratio,fuzz_partial_token_set_ratio,fuzz_partial_token_sort_ratio,fuzz_qratio,fuzz_token_set_ratio,fuzz_token_sort_ratio,fuzz_wratio,question_len1,question_len2
0,0.142465,32.635204,24.759953,0.041607,1.0,2.819773,0.307692,7,6,98,100,89,93,100,93,95,66,57
1,0.870533,87.560307,86.794571,0.873567,1.0,9.620695,0.153846,4,9,73,100,75,66,86,63,86,51,88


In [24]:
# Pin the exported column set and order explicitly: the six distance metrics,
# the shared-word score and token counts, the fuzzy ratios, then question lengths.
features = features.loc[:, [
    u'braycurtis_distance',
    u'canberra_distance',
    u'cityblock_distance',
    u'cos_distance',
    u'jaccard_distance',
    u'minkowski_distance',
    u'shared_word_score',
    u'words_len1',
    u'words_len2',
    u'fuzz_partial_ratio',
    u'fuzz_partial_token_set_ratio',
    u'fuzz_partial_token_sort_ratio',
    u'fuzz_qratio',
    u'fuzz_token_set_ratio',
    u'fuzz_token_sort_ratio',
    u'fuzz_wratio',
    u'question_len1',
    u'question_len2',
]]

In [25]:
# Rename only the two token-count columns (words_len* -> word_len*).
# A label-based rename is used instead of reassigning all 18 column names by
# position: positional assignment would silently mislabel features if the
# upstream column order ever changed, whereas this mapping is order-independent
# and leaves every other column name untouched.
features = features.rename(columns={
    u'words_len1': u'word_len1',
    u'words_len2': u'word_len2',
})

In [26]:
# Persist the combined feature frame next to the input data; index=False keeps
# the row index out of the CSV so the file contains feature columns only.
features.to_csv("../datasets/train_featured.csv", index=False)