# Basic Feature Engineering

In [1]:
import numpy as np
import pandas as pd

In [2]:
df = pd.read_csv('input/train.csv')
df['question1'] = df['question1'].fillna('')
df['question2'] = df['question2'].fillna('')
df.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


## Text Cleaning
- **parallelize_dataframe** : Split dataframe into number of cores (4)
- **substitute_thousands** : 10k -> 10000
- **eng_pos_tagger** : Which -> Which/JJ
- **text_cleaning** : remove stop words, stemming words

In [3]:
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from string import punctuation
from multiprocessing import Pool
import re

num_cores = 4

In [4]:
def parallelize_dataframe(df, func):
    df_split = np.array_split(df, num_cores)
    pool = Pool(num_cores)
    df = pd.concat(pool.map(func, df_split))
    pool.close()
    pool.join()
    return df

In [5]:
def substitute_thousands(text):
    matches = re.finditer(r'[0-9]+(?P<thousands>\s{0,2}k\b)', text, flags=re.I)
    result = ''
    len_offset = 0
    for match in matches:
        result += '{}000'.format(text[len(result)-len_offset:match.start('thousands')])
        len_offset += 3 - (match.end('thousands') - match.start('thousands'))
    result += text[len(result)-len_offset:]
    return result

In [6]:
tests = ['another 15K keys', 'more 60 k keys', 'a 20 keys', '10K.']
for test in tests:
    print(substitute_thousands(test))

another 15000 keys
more 60000 keys
a 20 keys
10000.


In [7]:
def eng_pos_tagger(text):
    # Using averaged_perceptron_tagger, maxent_treebank_pos_tagger
    # nltk.download('averaged_perceptron_tagger)
    tagger = ["/".join(i) for i in pos_tag(text.split())]
    return ' '.join(tagger)

In [8]:
tests = "Which fish would survive in salt water?"
eng_pos_tagger(tests)

'Which/JJ fish/NN would/MD survive/VB in/IN salt/NN water?/NN'

In [9]:
def text_cleaning(text, remove_stop_words=True, stem_words=False):
    # Clean the text with the option to remove stop_words and to stem words.
    # Clean the text
    text = text.lower()
    text = substitute_thousands(text)
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)
    text = re.sub(r"what's", "what is", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am", text)
    text = re.sub(r" m ", " am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r"\0s", "0", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"e-mail", "email", text)
    text = re.sub(r"\s{2,}", " ", text)
    text = re.sub(r"quikly", "quickly", text)
    text = re.sub(r" usa ", " america ", text)
    text = re.sub(r" u s ", " america ", text)
    text = re.sub(r"the us", "america", text)
    text = re.sub(r" uk ", " england ", text)
    text = re.sub(r"imrovement", "improvement", text)
    text = re.sub(r"intially", "initially", text)
    text = re.sub(r" dms ", "direct messages ", text)  
    text = re.sub(r"demonitization", "demonetization", text) 
    text = re.sub(r"actived", "active", text)
    text = re.sub(r"kms", " kilometers ", text)
    text = re.sub(r" cs ", " computer science ", text) 
    text = re.sub(r" upvotes ", " up votes ", text)
    text = re.sub(r"\0rs ", " rs ", text)
    text = re.sub(r"calender", "calendar", text)
    text = re.sub(r"programing", "programming", text)
    text = re.sub(r"bestfriend", "best friend", text)
    text = re.sub(r" j k ", " jk ", text)
    
    # Remove punctuation from text
    text = ''.join([c for c in text if c not in punctuation])
    
    # Optionally, remove stop words
    if remove_stop_words:
        text = text.split()
        text = [w for w in text if not w in stopwords.words('english')]
        text = " ".join(text)
    
    # Optionally, shorten words to their stems
    if stem_words:
        text = text.split()
        stemmer = SnowballStemmer('english')
        stemmed_words = [stemmer.stem(word) for word in text]
        text = " ".join(stemmed_words)
    
    # Return a list of words
    return(text)

In [10]:
def preprocessing(df):
    df['question1'] = df['question1'].apply(lambda x: text_cleaning(x))
    df['question2'] = df['question2'].apply(lambda x: text_cleaning(x))
    return df

In [11]:
%time df = parallelize_dataframe(df, preprocessing)

CPU times: user 870 ms, sys: 855 ms, total: 1.72 s
Wall time: 9min 49s


In [12]:
df.question1[:10]

0            step step guide invest share market india
1                      story kohinoor kohinoor diamond
2         increase speed internet connection using vpn
3                                mentally lonely solve
4    one dissolve water quickly sugar salt methane ...
5    astrology capricorn sun cap moon cap risingwha...
6                                            buy tiago
7                                       good geologist
8                                          use instead
9      motorola company hack charter motorolla dcx3400
Name: question1, dtype: object

## Difference in Question length

In [13]:
df['len_q1'] = df['question1'].apply(lambda x: len(str(x)))
df['len_q2'] = df['question2'].apply(lambda x: len(str(x)))

df['len_word_q1'] = df['question1'].apply(lambda x: len(str(x).split()))
df['len_word_q2'] = df['question2'].apply(lambda x: len(str(x).split()))
df['len_word_q2'] = df['len_word_q2'].fillna(0)

df['avg_world_len1'] = df['len_q1'] / df['len_word_q1']
df['avg_world_len2'] = df['len_q2'] / df['len_word_q2']
df['diff_avg_word'] = df['avg_world_len1'] - df['avg_world_len2']

In [14]:
def median_normalizer(feature):
    median = feature.median()
    feature = feature.apply(lambda x: (x-median)/np.sqrt(feature.sum()))
    return feature

## Question to TfidfVector

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [16]:
vectorizer = TfidfVectorizer(lowercase=False, ngram_range=(1,2), 
                             max_features=10000, max_df=0.5, min_df=2, use_idf=True)
%time vectorizer.fit(df['question1'].append(df['question2']))

CPU times: user 23.3 s, sys: 640 ms, total: 24 s
Wall time: 24.1 s


TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=False, max_df=0.5, max_features=10000, min_df=2,
        ngram_range=(1, 2), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [17]:
df['q1_vect'] = list(vectorizer.transform(df['question1']))
df['q2_vect'] = list(vectorizer.transform(df['question2']))

## Dimensionality reduction with LSA

In [18]:
from sklearn.preprocessing import Normalizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline

In [19]:
svd = TruncatedSVD(n_components=150, random_state=42)
lsa = make_pipeline(svd, Normalizer(norm='l2', copy=False))

In [59]:
all_vect = df['q1_vect'].append(df['q2_vect'], ignore_index=True)
%time lsa.fit_transform(np.asarray(all_vect))

In [60]:
df['q1_vect'] = list(lsa.transform(df['q1_vect']))
df['q2_vect'] = list(lsa.transform(df['q2_vect']))

## Cosine Similarity

In [62]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import pairwise_distances

In [64]:
def question_cosine_similarity(df):
    return cosine_similarity(df['q1_vect'], df['q2_vect'])[0]

In [65]:
%time df['tf_similarity'] = df.apply(question_cosine_similarity, axis=1)['id']

CPU times: user 5min 7s, sys: 4.54 s, total: 5min 11s
Wall time: 5min 17s


## Save as pickle

In [66]:
df.to_pickle('input/train.p')

In [67]:
df = pd.read_pickle('input/train.p')
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 404290 entries, 0 to 404289
Data columns (total 16 columns):
id                404290 non-null int64
qid1              404290 non-null int64
qid2              404290 non-null int64
question1         404290 non-null object
question2         404290 non-null object
is_duplicate      404290 non-null int64
len_q1            404290 non-null int64
len_q2            404290 non-null int64
len_word_q1       404290 non-null int64
len_word_q2       404290 non-null int64
avg_world_len1    404205 non-null float64
avg_world_len2    404216 non-null float64
diff_avg_word     404151 non-null float64
q1_vect           404290 non-null object
q2_vect           404290 non-null object
tf_similarity     404290 non-null float64
dtypes: float64(4), int64(8), object(4)
memory usage: 52.4+ MB
