# Basic Feature Engineering

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Training pairs are read in a single call.
df = pd.read_csv('input/train.csv')

# The test file is read in 600000-row chunks and the chunks are concatenated
# straight away, so df_test is only ever bound to the final DataFrame.
df_test = pd.concat(
    pd.read_csv("input/test.csv", low_memory=False, iterator=True, chunksize=600000),
    ignore_index=True,
)
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2345796 entries, 0 to 2345795
Data columns (total 3 columns):
test_id      int64
question1    object
question2    object
dtypes: int64(1), object(2)
memory usage: 53.7+ MB


## Text Cleaning
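- **parallelize_dataframe** : split the dataframe into one chunk per core (4) and process the chunks in parallel
- **substitute_thousands** : expand a "k" suffix on numbers, e.g. 10k -> 10000
- **eng_pos_tagger** : append a part-of-speech tag to each token, e.g. Which -> Which/JJ
- **text_cleaning** : normalize the text, then remove stop words and stem the remaining words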

In [3]:
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from string import punctuation
from multiprocessing import Pool
import re

In [4]:
# One-time setup: uncomment to download the NLTK resources used in this notebook.
# import nltk
# nltk.download('stopwords')
# nltk.download('wordnet')
# nltk.download('averaged_perceptron_tagger')
# nltk.download('maxent_treebank_pos_tagger')

# Number of dataframe chunks and worker processes used by parallelize_dataframe.
num_cores = 4

In [5]:
def parallelize_dataframe(df, func):
    """Apply ``func`` to row chunks of ``df`` in worker processes and concatenate the results.

    The frame is cut into ``num_cores`` contiguous row chunks of near-equal
    size, each chunk is handed to a ``multiprocessing.Pool`` worker, and the
    per-chunk results are concatenated in chunk order.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to process.
    func : callable
        Called with one chunk; must return something ``pd.concat`` accepts.
        It is sent to worker processes, so it has to be picklable
        (e.g. a module-level function).

    Returns
    -------
    The ``pd.concat`` of the per-chunk results.
    """
    # Split row positions instead of the frame itself: np.array_split applied
    # directly to a DataFrame goes through DataFrame.swapaxes, which recent
    # pandas versions deprecate. The chunk boundaries are unchanged.
    row_chunks = np.array_split(np.arange(len(df)), num_cores)
    df_split = [df.iloc[rows] for rows in row_chunks]

    pool = Pool(num_cores)
    try:
        results = pool.map(func, df_split)
    finally:
        # Always release the worker processes, even when func raises;
        # previously an exception in pool.map left the pool open.
        pool.close()
        pool.join()
    return pd.concat(results)

In [6]:
def substitute_thousands(text):
    """Expand a "k" suffix on numbers into three zeros, e.g. "15K" -> "15000".

    A suffix is up to two whitespace characters followed by "k" or "K" at a
    word boundary, directly after a run of digits. Everything else in
    ``text`` is copied through unchanged.
    """
    pieces = []
    cursor = 0  # position in ``text`` just past the last suffix consumed
    for hit in re.finditer(r'[0-9]+(?P<thousands>\s{0,2}k\b)', text, flags=re.I):
        suffix_start, suffix_end = hit.span('thousands')
        # Copy everything up to the suffix, then write "000" in its place.
        pieces.append('{}000'.format(text[cursor:suffix_start]))
        cursor = suffix_end
    pieces.append(text[cursor:])
    return ''.join(pieces)

In [7]:
# Spot-check substitute_thousands: upper-case "K", a space before "k",
# a plain number without suffix, and a suffix followed by punctuation.
tests = ['another 15K keys', 'more 60 k keys', 'a 20 keys', '10K.']
for test in tests:
    print(substitute_thousands(test))

another 15000 keys
more 60000 keys
a 20 keys
10000.


In [8]:
def eng_pos_tagger(text):
    """Return ``text`` with a part-of-speech tag appended to every token as "token/TAG".

    Tokens are obtained with a plain whitespace split, so punctuation stays
    attached to its word. Tagging is done by ``nltk.pos_tag``, which needs a
    tagger model to be installed, e.g.
    ``nltk.download('averaged_perceptron_tagger')``.
    """
    tagged_tokens = pos_tag(text.split())
    # Each element is a (token, tag) pair; glue the pair with "/" and the pairs with spaces.
    return ' '.join(map("/".join, tagged_tokens))

In [9]:
# Example: every whitespace-separated token comes back as "token/TAG".
tests = "Which fish would survive in salt water?"
eng_pos_tagger(tests)

'Which/JJ fish/NN would/MD survive/VB in/IN salt/NN water?/NN'

In [10]:
# Ordered (pattern, replacement) pairs applied by text_cleaning after
# lower-casing. Order matters: later rules see the output of earlier ones.
_TEXT_SUBSTITUTIONS = (
    # Replace everything except letters, digits and a few symbols with a space.
    # The hyphen is escaped so "+", "-" and "=" are three literal characters;
    # unescaped, "+-=" was a range that also let ":", ";" and "<" through.
    (r"[^A-Za-z0-9^,!.\/'+\-=]", " "),
    (r"what's", "what is"),
    (r"\'s", " "),
    (r"\'ve", " have "),
    (r"can't", "cannot "),
    (r"n't", " not "),
    (r"i'm", "i am"),
    (r" m ", " am "),
    (r"\'re", " are "),
    (r"\'d", " would "),
    (r"\'ll", " will "),
    (r" e g ", " eg "),
    (r" b g ", " bg "),
    # Was r"\0s": in a regex "\0" is the NUL character, so the rule never
    # matched. NOTE(review): assumed intent is "1990s" -> "1990" - confirm.
    (r"0s\b", "0"),
    # Replacement keeps the surrounding spaces; plain "911" glued the
    # neighbouring words onto the number.
    (r" 9 11 ", " 911 "),
    (r"e-mail", "email"),
    (r"\s{2,}", " "),
    (r"quikly", "quickly"),
    (r" usa ", " america "),
    (r" u s ", " america "),
    # Word boundaries stop this from rewriting "the use" / "the user".
    (r"\bthe us\b", "america"),
    (r" uk ", " england "),
    (r"imrovement", "improvement"),
    (r"intially", "initially"),
    # Leading space restored so the previous word is not glued on.
    (r" dms ", " direct messages "),
    (r"demonitization", "demonetization"),
    (r"actived", "active"),
    (r"kms", " kilometers "),
    (r" cs ", " computer science "),
    (r" upvotes ", " up votes "),
    # Was r"\0rs " (NUL again, never matched). NOTE(review): assumed intent is
    # to detach a currency suffix, "100rs " -> "100 rs " - confirm.
    (r"(?<=[0-9])rs ", " rs "),
    (r"calender", "calendar"),
    (r"programing", "programming"),
    (r"bestfriend", "best friend"),
    (r" j k ", " jk "),
)


def text_cleaning(text, remove_stop_words=True, stem_words=True):
    """Normalize a question string for feature extraction.

    Steps, in order: lower-case, expand "k" number suffixes with
    ``substitute_thousands``, apply the ``_TEXT_SUBSTITUTIONS`` rules, strip
    every character in ``string.punctuation``, then optionally drop NLTK
    English stop words and stem the remaining words with the Snowball stemmer.

    Parameters
    ----------
    text : str
        Raw question text.
    remove_stop_words : bool, default True
        Drop words found in ``stopwords.words('english')``.
    stem_words : bool, default True
        Replace each word by its ``SnowballStemmer('english')`` stem.

    Returns
    -------
    str
        The cleaned text (a single string, not a list of words).
    """
    text = substitute_thousands(text.lower())
    for pattern, replacement in _TEXT_SUBSTITUTIONS:
        text = re.sub(pattern, replacement, text)

    # Remove punctuation from text (characters are deleted, not replaced by spaces)
    text = ''.join(c for c in text if c not in punctuation)

    # Optionally, remove stop words
    if remove_stop_words:
        # Build the lookup once per call; previously the stop-word list was
        # re-read from the corpus for every single word.
        stops = set(stopwords.words('english'))
        text = " ".join(w for w in text.split() if w not in stops)

    # Optionally, shorten words to their stems
    if stem_words:
        stemmer = SnowballStemmer('english')
        text = " ".join(stemmer.stem(word) for word in text.split())

    return text

In [11]:
def preprocessing(df):
    """Clean both question columns of a training frame and keep its label column.

    Returns a new DataFrame with the cleaned ``question1`` and ``question2``
    plus the original ``is_duplicate`` column.
    """
    def clean(value):
        # str() lets non-string cells go through text_cleaning as well
        return text_cleaning(str(value))

    cleaned = {name: df[name].apply(clean) for name in ('question1', 'question2')}
    cleaned['is_duplicate'] = df['is_duplicate']
    return pd.DataFrame(cleaned)

In [41]:
def preprocessing_test(df):
    """Clean both question columns of a test frame.

    Same cleaning as ``preprocessing``, but without an ``is_duplicate``
    column: the result only holds the cleaned ``question1`` and ``question2``.
    """
    cleaned_q1 = df['question1'].apply(lambda raw: text_cleaning(str(raw)))
    cleaned_q2 = df['question2'].apply(lambda raw: text_cleaning(str(raw)))
    columns = {'question1': cleaned_q1, 'question2': cleaned_q2}
    return pd.DataFrame(columns)

In [12]:
# Clean the training questions in num_cores worker processes; %time reports the cost.
%time train = parallelize_dataframe(df, preprocessing)

CPU times: user 420 ms, sys: 252 ms, total: 672 ms
Wall time: 5min 10s


In [13]:
# Preview the first ten cleaned question1 values.
train.question1[:10]

0             step step guid invest share market india
1                      stori kohinoor kohinoor diamond
2               increas speed internet connect use vpn
3                                     mental lone solv
4    one dissolv water quick sugar salt methan carb...
5    astrolog capricorn sun cap moon cap risingwhat...
6                                            buy tiago
7                                       good geologist
8                                          use instead
9      motorola compani hack charter motorolla dcx3400
Name: question1, dtype: object

## Difference in Question length

In [34]:
def word_match_share(row):
    """Share of distinct non-stop-word tokens that the two questions of ``row`` have in common.

    Each question is converted with ``str``, lower-cased, split on whitespace
    and filtered against the NLTK English stop words. Returns 0 when either
    question has no words left, otherwise
    ``2 * |shared| / (|q1 words| + |q2 words|)``.
    """
    stops = set(stopwords.words("english"))

    def distinct_words(question):
        return {token for token in str(question).lower().split() if token not in stops}

    q1words = distinct_words(row['question1'])
    q2words = distinct_words(row['question2'])
    if not q1words or not q2words:
        return 0

    # The shared words are counted once from each side, hence the factor 2.
    shared = q1words & q2words
    return (2 * len(shared)) / (len(q1words) + len(q2words))

In [32]:
def median_normalizer(feature):
    """Centre ``feature`` on its median and divide by the square root of its sum.

    Parameters
    ----------
    feature : pandas.Series
        Numeric values to rescale; the input is not modified.

    Returns
    -------
    pandas.Series
        ``(feature - feature.median()) / sqrt(feature.sum())``, with the
        index of ``feature``.
    """
    median = feature.median()
    # Computed once: the per-element lambda used before re-summed the whole
    # Series for every value, which made the call quadratic in its length.
    scale = np.sqrt(feature.sum())
    return (feature - median) / scale

In [35]:
# Character counts of the raw questions (str() so non-string cells are countable too).
df['len_q1'] = df['question1'].apply(lambda x: len(str(x)))
df['len_q2'] = df['question2'].apply(lambda x: len(str(x)))

# Whitespace-separated word counts. len() always returns an int, so the
# fillna(0) that used to follow had nothing to fill and has been dropped.
df['len_word_q1'] = df['question1'].apply(lambda x: len(str(x).split()))
df['len_word_q2'] = df['question2'].apply(lambda x: len(str(x).split()))

# Average characters per word ("world" in the column names is kept as-is so
# existing references to these columns keep working).
df['avg_world_len1'] = df['len_q1'] / df['len_word_q1']
df['avg_world_len2'] = df['len_q2'] / df['len_word_q2']

# Features are computed on the raw frame and stored on the cleaned one;
# the assignment aligns the two frames on their index.
train['diff_avg_word'] = df['avg_world_len1'] - df['avg_world_len2']
# No raw=True here: it would pass each row as a bare ndarray, while
# word_match_share looks the questions up by column name.
train['word_match'] = df.apply(word_match_share, axis=1)

## Question to TfidfVector

In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

In [19]:
# Stack both cleaned question columns so the vectorizers below are fitted on
# every question and question1/question2 share one vocabulary.
all_questions = pd.concat([train['question1'], train['question2']])

In [20]:
# First TF-IDF model: default word tokens, binary term presence instead of counts.
%time tfidf = TfidfVectorizer(lowercase=True, binary=True).fit(all_questions)

CPU times: user 6.22 s, sys: 24 ms, total: 6.25 s
Wall time: 6.25 s


In [21]:
# Vectorize each question column with the first TF-IDF model (same vocabulary for both).
q1_tfidf1 = tfidf.transform(train['question1'])
q2_tfidf1 = tfidf.transform(train['question2'])

In [24]:
# Second TF-IDF model: word 1- to 3-grams, capped at 100000 features.
# NOTE: this rebinds `tfidf`; q1_tfidf1/q2_tfidf1 came from the previous
# vectorizer, so re-running the earlier transform cell after this one would
# silently produce different vectors.
%time tfidf = TfidfVectorizer(lowercase=True, binary=True, ngram_range=(1,3), analyzer='word', \
                              max_features=100000, max_df=0.5, min_df=30, use_idf=True).fit(all_questions)

CPU times: user 28.4 s, sys: 448 ms, total: 28.9 s
Wall time: 28.9 s


In [25]:
# Vectorize each question column with the n-gram TF-IDF model.
q1_tfidf2 = tfidf.transform(train['question1'])
q2_tfidf2 = tfidf.transform(train['question2'])

In [26]:
# Character-level model: binary presence of character 1- to 10-grams,
# capped at 300000 features.
%time count = CountVectorizer(lowercase=True, binary=True, ngram_range=(1,10), analyzer='char', \
                              max_features=300000, max_df=0.999, min_df=50).fit(all_questions)

CPU times: user 4min 49s, sys: 4.74 s, total: 4min 54s
Wall time: 4min 54s


In [27]:
# Vectorize each question column with the character n-gram model.
q1_count = count.transform(train['question1'])
q2_count = count.transform(train['question2'])

## Calculate Distance, Similarity
- paired cosine distances
- pairwise distances (cosine, euclidean)

In [28]:
from sklearn.metrics.pairwise import paired_cosine_distances
from nltk.metrics.distance import jaccard_distance

In [29]:
%%time
# One cosine distance per question pair: row i of the question1 matrix is
# compared with row i of the question2 matrix, for each of the three models.
train['tf_distance1'] = paired_cosine_distances(q1_tfidf1, q2_tfidf1)
train['tf_distance2'] = paired_cosine_distances(q1_tfidf2, q2_tfidf2)
train['cnt_distance'] = paired_cosine_distances(q1_count, q2_count)

CPU times: user 20.9 s, sys: 1.16 s, total: 22.1 s
Wall time: 22.1 s


In [None]:
# Row-wise (paired) distances between each question1 vector and its matching
# question2 vector. The previous version called pairwise_distances, which this
# notebook never imports and which returns an all-against-all matrix that
# cannot be stored as a single column. paired_distances has no n_jobs option.
# NOTE(review): the three cosine columns repeat tf_distance1, tf_distance2 and
# cnt_distance from the paired_cosine_distances cell - keep only one set if
# that duplication is not wanted.
from sklearn.metrics.pairwise import paired_distances

train['tf_cosine1'] = paired_distances(q1_tfidf1, q2_tfidf1, metric='cosine')
train['tf_cosine2'] = paired_distances(q1_tfidf2, q2_tfidf2, metric='cosine')
train['cnt_cosine'] = paired_distances(q1_count, q2_count, metric='cosine')
train['tf_eucdian1'] = paired_distances(q1_tfidf1, q2_tfidf1, metric='euclidean')
train['tf_eucdian2'] = paired_distances(q1_tfidf2, q2_tfidf2, metric='euclidean')
train['cnt_eucdian'] = paired_distances(q1_count, q2_count, metric='euclidean')

In [None]:
# Jaccard distance between the token sets of the two questions. This uses the
# raw questions in df (not the cleaned text in train), split on single spaces.
train['jaccard_dist'] = df.apply(
    lambda x: jaccard_distance(set(str(x.question1).split(' ')), set(str(x.question2).split(' '))), axis=1
)

## Save as pickle

In [37]:
# Persist the cleaned questions together with the engineered feature columns.
train.to_pickle('input/train.p')

In [38]:
# Reload the file that was just written and list its columns as a sanity check.
# Pickle files can execute code on load: only read pickles you created yourself.
train = pd.read_pickle('input/train.p')
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 404290 entries, 0 to 404289
Data columns (total 9 columns):
is_duplicate     404290 non-null int64
question1        404290 non-null object
question2        404290 non-null object
diff_avg_word    404290 non-null float64
tf_distance1     404290 non-null float64
tf_distance2     404290 non-null float64
cnt_distance     404290 non-null float64
jaccard_dist     404290 non-null float64
word_match       404290 non-null float64
dtypes: float64(6), int64(1), object(2)
memory usage: 30.8+ MB
