# Basic Feature Engineering

In [2]:
import numpy as np
import pandas as pd

In [3]:
# Load the training set in a single call.
df = pd.read_csv('input/train.csv')

# Read the test set in 600,000-row chunks and stitch the chunks into one frame.
# The chunk reader gets its own name so that `df_test` only ever refers to the
# final DataFrame (re-running a later cell can never see the half-built reader).
test_chunks = pd.read_csv("input/test.csv", low_memory=False, iterator=True, chunksize=600000)
df_test = pd.concat(test_chunks, ignore_index=True)
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2345796 entries, 0 to 2345795
Data columns (total 3 columns):
test_id      int64
question1    object
question2    object
dtypes: int64(1), object(2)
memory usage: 53.7+ MB


## Text Cleaning
- **parallelize_dataframe** : Split the dataframe into one chunk per core (4) and process the chunks in parallel
- **substitute_thousands** : 10k -> 10000
- **eng_pos_tagger** : Which -> Which/JJ
- **text_cleaning** : lowercase, expand contractions, fix common typos, strip punctuation, remove stop words, and stem words

In [4]:
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from string import punctuation
from multiprocessing import Pool
import re

In [5]:
# One-time setup: uncomment and run these lines to fetch the NLTK data used
# by the cleaning helpers below (stop word list and POS tagger models).
# import nltk
# nltk.download('stopwords')
# nltk.download('wordnet')
# nltk.download('averaged_perceptron_tagger')
# nltk.download('maxent_treebank_pos_tagger')

# Number of worker processes (and dataframe chunks) used by parallelize_dataframe.
num_cores = 4

In [6]:
def parallelize_dataframe(df, func):
    """Apply ``func`` to ``df`` in parallel and return the concatenated result.

    ``df`` is split into ``num_cores`` pieces with ``np.array_split``. Each
    piece is handed to ``func`` in a worker process via ``Pool.map``, and the
    returned frames are concatenated in the order of the input pieces.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to process.
    func : callable
        Takes one DataFrame chunk and returns a DataFrame. It is sent to
        worker processes, so it must be importable/picklable.

    Returns
    -------
    pandas.DataFrame
        Concatenation of ``func(chunk)`` for every chunk.
    """
    df_split = np.array_split(df, num_cores)
    pool = Pool(num_cores)
    try:
        result = pd.concat(pool.map(func, df_split))
    finally:
        # Release the worker processes even when func raises in a worker;
        # otherwise every failed run leaves num_cores idle processes behind.
        pool.close()
        pool.join()
    return result

In [7]:
def substitute_thousands(text):
    """Expand a ``k`` thousands suffix on integers, e.g. ``10k`` -> ``10000``.

    A run of digits followed by at most two whitespace characters and a
    standalone ``k``/``K`` (i.e. the ``k`` ends at a word boundary) has the
    whitespace and the ``k`` replaced by ``000``. Everything else is left
    untouched.

    Parameters
    ----------
    text : str
        Input text.

    Returns
    -------
    str
        ``text`` with every such suffix expanded.
    """
    # re.sub rewrites all non-overlapping matches in one pass, which replaces
    # the manual slice/offset bookkeeping with the standard-library equivalent.
    return re.sub(r'(?P<number>[0-9]+)\s{0,2}k\b', r'\g<number>000', text, flags=re.I)

In [8]:
# Quick sanity check of substitute_thousands: upper-case K, a space before the
# k, a number with no suffix (must stay unchanged), and a k followed by punctuation.
tests = ['another 15K keys', 'more 60 k keys', 'a 20 keys', '10K.']
for test in tests:
    print(substitute_thousands(test))

another 15000 keys
more 60000 keys
a 20 keys
10000.


In [9]:
def eng_pos_tagger(text):
    """Tag every whitespace-separated token of ``text`` with its part of speech.

    The text is split on whitespace, passed to NLTK's ``pos_tag``, and each
    ``(token, tag)`` pair is rendered as ``token/TAG``. The rendered pairs are
    joined with single spaces.
    """
    # pos_tag needs NLTK tagger data to be installed
    # (e.g. nltk.download('averaged_perceptron_tagger')).
    tagged_tokens = pos_tag(text.split())
    return ' '.join('/'.join(pair) for pair in tagged_tokens)

In [10]:
# Example: tokens are split on whitespace only, so trailing punctuation stays
# attached to its word (note "water?" in the output).
tests = "Which fish would survive in salt water?"
eng_pos_tagger(tests)

'Which/JJ fish/NN would/MD survive/VB in/IN salt/NN water?/NN'

In [11]:
# Ordered (pattern, replacement) pairs applied by text_cleaning. Order matters:
# later rules see the output of earlier ones (e.g. "what's" is expanded before
# the generic "'s" rule strips the remaining possessives).
_TEXT_SUBSTITUTIONS = (
    # Replace every character outside the allowed set with a space.
    # NOTE(review): "+-=" inside the class is a character RANGE ('+' through
    # '='), so ':', ';' and '<' are kept here as well - confirm this is intended.
    (r"[^A-Za-z0-9^,!.\/'+-=]", " "),
    (r"what's", "what is"),
    (r"\'s", " "),
    (r"\'ve", " have "),
    (r"can't", "cannot "),
    (r"n't", " not "),
    (r"i'm", "i am"),
    (r" m ", " am "),
    (r"\'re", " are "),
    (r"\'d", " would "),
    (r"\'ll", " will "),
    (r" e g ", " eg "),
    (r" b g ", " bg "),
    # NOTE(review): in a regex "\0" is the NUL character, and NUL has already
    # been replaced by a space above, so this rule can never match. It was
    # presumably meant to rewrite e.g. "1990s" - confirm the intent before fixing.
    (r"\0s", "0"),
    (r" 9 11 ", "911"),
    (r"e-mail", "email"),
    (r"\s{2,}", " "),
    (r"quikly", "quickly"),
    (r" usa ", " america "),
    (r" u s ", " america "),
    # Word boundaries keep "the use" / "the used" from becoming "americae(d)".
    (r"\bthe us\b", "america"),
    (r" uk ", " england "),
    (r"imrovement", "improvement"),
    (r"intially", "initially"),
    (r" dms ", "direct messages "),
    (r"demonitization", "demonetization"),
    (r"actived", "active"),
    (r"kms", " kilometers "),
    (r" cs ", " computer science "),
    (r" upvotes ", " up votes "),
    # NOTE(review): same dead "\0" (NUL) pattern as above - never matches.
    (r"\0rs ", " rs "),
    (r"calender", "calendar"),
    (r"programing", "programming"),
    (r"bestfriend", "best friend"),
    (r" j k ", " jk "),
)

# Per-process caches: loading the stop word list and building the stemmer once
# instead of once per token / per question.
_STOP_WORDS_EN = None
_STEMMER_EN = None


def _english_stop_words():
    """Return NLTK's English stop words as a frozenset, loaded once per process."""
    global _STOP_WORDS_EN
    if _STOP_WORDS_EN is None:
        _STOP_WORDS_EN = frozenset(stopwords.words('english'))
    return _STOP_WORDS_EN


def _english_stemmer():
    """Return a shared English SnowballStemmer, built once per process."""
    global _STEMMER_EN
    if _STEMMER_EN is None:
        _STEMMER_EN = SnowballStemmer('english')
    return _STEMMER_EN


def text_cleaning(text, remove_stop_words=True, stem_words=True):
    """Normalize a question string for feature extraction.

    Steps, in order: lowercase; expand "10k"-style numbers with
    ``substitute_thousands``; apply the regex substitutions in
    ``_TEXT_SUBSTITUTIONS`` (character filtering, contraction expansion, typo
    and abbreviation fixes); delete every ``string.punctuation`` character;
    optionally drop English stop words; optionally stem each word.

    Parameters
    ----------
    text : str
        Raw question text.
    remove_stop_words : bool, default True
        Drop tokens found in NLTK's English stop word list.
    stem_words : bool, default True
        Replace each token with its Snowball (English) stem.

    Returns
    -------
    str
        The cleaned text. When either option is enabled the tokens are joined
        by single spaces.
    """
    text = text.lower()
    text = substitute_thousands(text)
    for pattern, replacement in _TEXT_SUBSTITUTIONS:
        text = re.sub(pattern, replacement, text)

    # Punctuation is deleted (not replaced by a space), so words separated
    # only by punctuation are glued together.
    text = ''.join(c for c in text if c not in punctuation)

    if remove_stop_words or stem_words:
        words = text.split()
        if remove_stop_words:
            # Set membership against a cached list; the original rebuilt the
            # whole stop word list for every single token.
            stop_words = _english_stop_words()
            words = [w for w in words if w not in stop_words]
        if stem_words:
            stemmer = _english_stemmer()
            words = [stemmer.stem(w) for w in words]
        text = " ".join(words)

    # Return the cleaned text as a single string
    return text

In [14]:
def preprocessing(df):
    """Return a frame with ``question1`` and ``question2`` passed through text_cleaning.

    Missing questions are cleaned as empty strings, so they do not turn into
    the literal token "nan". The ``is_duplicate`` label column is carried over
    when ``df`` has one, which lets the same function handle unlabeled frames
    (such as the test set) as well as the training frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``question1`` and ``question2``; ``is_duplicate`` is optional.

    Returns
    -------
    pandas.DataFrame
        Cleaned ``question1`` / ``question2`` (plus ``is_duplicate`` if present),
        indexed like ``df``.
    """
    cleaned = {
        column: df[column].fillna('').apply(lambda x: text_cleaning(str(x)))
        for column in ('question1', 'question2')
    }
    if 'is_duplicate' in df.columns:
        cleaned['is_duplicate'] = df['is_duplicate']
    return pd.DataFrame(cleaned)

In [15]:
# Clean both question columns of the training frame in worker processes;
# `train` holds the cleaned text plus the is_duplicate label.
%time train = parallelize_dataframe(df, preprocessing)

CPU times: user 512 ms, sys: 196 ms, total: 708 ms
Wall time: 5min 12s


In [17]:
# Peek at the first ten cleaned questions.
train['question1'].iloc[:10]

0             step step guid invest share market india
1                      stori kohinoor kohinoor diamond
2               increas speed internet connect use vpn
3                                     mental lone solv
4    one dissolv water quick sugar salt methan carb...
5    astrolog capricorn sun cap moon cap risingwhat...
6                                            buy tiago
7                                       good geologist
8                                          use instead
9      motorola compani hack charter motorolla dcx3400
Name: question1, dtype: object

## Difference in Question length

In [18]:
# Character counts of the raw questions. str(x) keeps missing values from
# raising; a NaN question is measured as the string "nan".
df['len_q1'] = df['question1'].apply(lambda x: len(str(x)))
df['len_q2'] = df['question2'].apply(lambda x: len(str(x)))

# Whitespace-separated word counts of the raw questions (always integers,
# never NaN, so no fillna is needed).
df['len_word_q1'] = df['question1'].apply(lambda x: len(str(x).split()))
df['len_word_q2'] = df['question2'].apply(lambda x: len(str(x).split()))

# Average characters per word. A whitespace-only question has zero words,
# which would give inf (and inf - inf = NaN in the difference); such rows get
# an average of 0 instead.
df['avg_world_len1'] = (df['len_q1'] / df['len_word_q1'].replace(0, np.nan)).fillna(0.0)
df['avg_world_len2'] = (df['len_q2'] / df['len_word_q2'].replace(0, np.nan)).fillna(0.0)

# The feature is stored on `train`; the assignment aligns on the shared index.
train['diff_avg_word'] = df['avg_world_len1'] - df['avg_world_len2']

In [19]:
def median_normalizer(feature):
    """Center ``feature`` on its median and scale by the square root of its sum.

    Computes ``(x - median(feature)) / sqrt(sum(feature))`` for every element.
    Note that the scale is the square root of the *sum* of the values, not a
    standard deviation.

    Parameters
    ----------
    feature : pandas.Series
        Numeric series. A non-positive sum yields inf/NaN results.

    Returns
    -------
    pandas.Series
        Normalized values with the same index as ``feature``.
    """
    median = feature.median()
    # Computed once up front; evaluating feature.sum() inside a per-element
    # lambda made the original quadratic in the series length.
    scale = np.sqrt(feature.sum())
    return (feature - median) / scale

## Questions to TF-IDF Vectors

In [20]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

In [21]:
# Stack both cleaned question columns into one corpus for fitting the vectorizers.
# pd.concat is used because Series.append was removed in pandas 2.0.
all_questions = pd.concat([train['question1'], train['question2']])

In [31]:
# First vectorizer: word-level TF-IDF over the cleaned questions with the
# default vocabulary settings. lowercase=True has no further effect here
# because text_cleaning already lowercases the text.
tfidf = TfidfVectorizer(lowercase=True, binary=True)
%time tfidf.fit(all_questions)

CPU times: user 6.29 s, sys: 20 ms, total: 6.31 s
Wall time: 6.3 s


TfidfVectorizer(analyzer='word', binary=True, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [32]:
# Vectorize each question column with whichever vectorizer `tfidf` currently
# names. NOTE(review): `tfidf` is rebound by a later cell, so this cell must
# run right after the unigram fit to produce the intended features.
q1_tfidf1 = tfidf.transform(train['question1'])
q2_tfidf1 = tfidf.transform(train['question2'])

In [24]:
# Second vectorizer: word 1- to 3-grams, capped at 100,000 features, ignoring
# terms that occur in more than half of the documents or in fewer than 30.
# NOTE(review): this rebinds `tfidf`, replacing the unigram vectorizer above.
tfidf = TfidfVectorizer(lowercase=True, binary=True, ngram_range=(1,3), analyzer='word',
                        max_features=100000, max_df=0.5, min_df=30, use_idf=True)
%time tfidf.fit(all_questions)

CPU times: user 28.4 s, sys: 328 ms, total: 28.7 s
Wall time: 28.7 s


TfidfVectorizer(analyzer='word', binary=True, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.5, max_features=100000, min_df=30,
        ngram_range=(1, 3), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [25]:
# Vectorize each question column with the n-gram vectorizer currently bound to `tfidf`.
q1_tfidf2 = tfidf.transform(train['question1'])
q2_tfidf2 = tfidf.transform(train['question2'])

In [27]:
# Binary character n-gram presence features: n-grams of 1 to 10 characters,
# capped at 300,000 features, keeping those found in at least 50 documents
# and in at most 99.9% of them. This is the slowest fit in the notebook
# (see the timing below).
count = CountVectorizer(lowercase=True, binary=True, ngram_range=(1,10), analyzer='char', 
                        max_features=300000, max_df=0.999, min_df=50)
%time count.fit(all_questions)

CPU times: user 4min 49s, sys: 4.8 s, total: 4min 54s
Wall time: 4min 53s


CountVectorizer(analyzer='char', binary=True, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.999, max_features=300000, min_df=50,
        ngram_range=(1, 10), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [28]:
# Character n-gram presence matrices for each question column (one row per pair).
q1_count = count.transform(train['question1'])
q2_count = count.transform(train['question2'])

## Dimensionality reduction with LSA

In [18]:
from sklearn.preprocessing import Normalizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline

In [19]:
# LSA pipeline: reduce to 150 components with truncated SVD (seeded for
# reproducibility), then L2-normalize each reduced row in place.
svd = TruncatedSVD(n_components=150, random_state=42)
lsa = make_pipeline(svd, Normalizer(norm='l2', copy=False))

In [59]:
all_vect = df['q1_vect'].append(df['q2_vect'], ignore_index=True)
%time lsa.fit_transform(np.asarray(all_vect))

## Calculate Distance, Similarity

In [37]:
from sklearn.metrics.pairwise import paired_cosine_distances
from nltk.metrics.distance import jaccard_distance

In [42]:
%%time
# Row-wise cosine distance between the two questions of each pair, once for
# each TF-IDF representation (q*_tfidf1 and q*_tfidf2).
train['tf_distance1'] = paired_cosine_distances(q1_tfidf1, q2_tfidf1)
train['tf_distance2'] = paired_cosine_distances(q1_tfidf2, q2_tfidf2)

CPU times: user 308 ms, sys: 28 ms, total: 336 ms
Wall time: 337 ms


In [44]:
%%time
train['cnt_similarity'] = -(q1_count != q2_count).astype(int)
train['jaccard_dist'] = df.apply(lambda x: jaccard_distance(set(str(x.question1).split(' ')), 
                                                            set(str(x.question2).split(' '))), axis=1)

CPU times: user 16.9 s, sys: 512 ms, total: 17.4 s
Wall time: 17.4 s


## Save as pickle

In [45]:
# Persist the cleaned questions and engineered features so later notebooks
# can skip the slow cleaning and vectorizing steps.
train.to_pickle('input/train.p')

In [46]:
# Round-trip check: reload the pickle written above and inspect columns and dtypes.
# Only unpickle files you created yourself; loading a pickle can execute arbitrary code.
train = pd.read_pickle('input/train.p')
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 404290 entries, 0 to 404289
Data columns (total 8 columns):
is_duplicate      404290 non-null int64
question1         404290 non-null object
question2         404290 non-null object
diff_avg_word     404290 non-null float64
tf_distance1      404290 non-null float64
tf_distance2      404290 non-null float64
cnt_similarity    404290 non-null object
jaccard_dist      404290 non-null float64
dtypes: float64(4), int64(1), object(3)
memory usage: 27.8+ MB
