# Create a new dataset with advanced features

#### Final dataset = BOW + Custom Features + Token Features + Length Features + Fuzzy Features


### Table of Contents
<ul>
    <li><a href="#start">Let's get started</a></li>
    <li><a href="#gather">Gather</a></li>
    <li><a href="#pp">Preprocess</a></li>
    <li><a href="#bow">Bag of Words</a></li>
    <li><a href="#cf">Custom Features</a></li>
    <li><a href="#tf">Token Features</a></li>
    <li><a href="#lf">Length Based Features</a></li>
    <li><a href="#ff">Fuzzy Features</a></li>
    <li><a href="#sd">Save Dataset</a></li>
</ul>

<a id='start'></a>
### Let's get started

In [1]:
import warnings
warnings.filterwarnings('ignore')
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
import re
from bs4 import BeautifulSoup

<a id='gather'></a>
### Gather

In [3]:
df = pd.read_csv(os.path.join('data', 'preprocessed', 'train.csv'), index_col=0)
print(df.shape)
df.head()

(404287, 6)


Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


<a id='pp'></a>
### Preprocess

In [4]:
def preprocess(q):
    q = str(q).lower().strip()
    
    # Replace certain special characters with their string equivalents
    q = q.replace('%', ' percent')
    q = q.replace('$', ' dollar ')
    q = q.replace('₹', ' rupee ')
    q = q.replace('€', ' euro ')
    q = q.replace('@', ' at ')
    
    # The pattern '[math]' appears around 900 times in the whole dataset.
    q = q.replace('[math]', '')
    
    # Replacing some numbers with string equivalents (not perfect, can be done better to account for more cases)
    q = q.replace(',000,000,000 ', 'b ')
    q = q.replace(',000,000 ', 'm ')
    q = q.replace(',000 ', 'k ')
    q = re.sub(r'([0-9]+)000000000', r'\1b', q)
    q = re.sub(r'([0-9]+)000000', r'\1m', q)
    q = re.sub(r'([0-9]+)000', r'\1k', q)
    
    # Decontracting words
    # https://en.wikipedia.org/wiki/Wikipedia%3aList_of_English_contractions
    # https://stackoverflow.com/a/19794953
    contractions = { 
    "ain't": "am not",
    "aren't": "are not",
    "can't": "can not",
    "can't've": "can not have",
    "'cause": "because",
    "could've": "could have",
    "couldn't": "could not",
    "couldn't've": "could not have",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hadn't've": "had not have",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he would",
    "he'd've": "he would have",
    "he'll": "he will",
    "he'll've": "he will have",
    "he's": "he is",
    "how'd": "how did",
    "how'd'y": "how do you",
    "how'll": "how will",
    "how's": "how is",
    "i'd": "i would",
    "i'd've": "i would have",
    "i'll": "i will",
    "i'll've": "i will have",
    "i'm": "i am",
    "i've": "i have",
    "isn't": "is not",
    "it'd": "it would",
    "it'd've": "it would have",
    "it'll": "it will",
    "it'll've": "it will have",
    "it's": "it is",
    "let's": "let us",
    "ma'am": "madam",
    "mayn't": "may not",
    "might've": "might have",
    "mightn't": "might not",
    "mightn't've": "might not have",
    "must've": "must have",
    "mustn't": "must not",
    "mustn't've": "must not have",
    "needn't": "need not",
    "needn't've": "need not have",
    "o'clock": "of the clock",
    "oughtn't": "ought not",
    "oughtn't've": "ought not have",
    "shan't": "shall not",
    "sha'n't": "shall not",
    "shan't've": "shall not have",
    "she'd": "she would",
    "she'd've": "she would have",
    "she'll": "she will",
    "she'll've": "she will have",
    "she's": "she is",
    "should've": "should have",
    "shouldn't": "should not",
    "shouldn't've": "should not have",
    "so've": "so have",
    "so's": "so as",
    "that'd": "that would",
    "that'd've": "that would have",
    "that's": "that is",
    "there'd": "there would",
    "there'd've": "there would have",
    "there's": "there is",
    "they'd": "they would",
    "they'd've": "they would have",
    "they'll": "they will",
    "they'll've": "they will have",
    "they're": "they are",
    "they've": "they have",
    "to've": "to have",
    "wasn't": "was not",
    "we'd": "we would",
    "we'd've": "we would have",
    "we'll": "we will",
    "we'll've": "we will have",
    "we're": "we are",
    "we've": "we have",
    "weren't": "were not",
    "what'll": "what will",
    "what'll've": "what will have",
    "what're": "what are",
    "what's": "what is",
    "what've": "what have",
    "when's": "when is",
    "when've": "when have",
    "where'd": "where did",
    "where's": "where is",
    "where've": "where have",
    "who'll": "who will",
    "who'll've": "who will have",
    "who's": "who is",
    "who've": "who have",
    "why's": "why is",
    "why've": "why have",
    "will've": "will have",
    "won't": "will not",
    "won't've": "will not have",
    "would've": "would have",
    "wouldn't": "would not",
    "wouldn't've": "would not have",
    "y'all": "you all",
    "y'all'd": "you all would",
    "y'all'd've": "you all would have",
    "y'all're": "you all are",
    "y'all've": "you all have",
    "you'd": "you would",
    "you'd've": "you would have",
    "you'll": "you will",
    "you'll've": "you will have",
    "you're": "you are",
    "you've": "you have"
    }

    q_decontracted = []

    for word in q.split():
        if word in contractions:
            word = contractions[word]

        q_decontracted.append(word)

    q = ' '.join(q_decontracted)
    q = q.replace("'ve", " have")
    q = q.replace("n't", " not")
    q = q.replace("'re", " are")
    q = q.replace("'ll", " will")
    
    # Removing HTML tags
    q = BeautifulSoup(q)
    q = q.get_text()
    
    # Remove punctuations
    pattern = re.compile('\W')
    q = re.sub(pattern, ' ', q).strip()

    
    return q

In [5]:
df['question1'] = df['question1'].apply(lambda x: preprocess(x))
df['question2'] = df['question2'].apply(lambda x: preprocess(x))
print(df.shape)
df.head()

(404287, 6)


Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,what is the step by step guide to invest in sh...,what is the step by step guide to invest in sh...,0
1,1,3,4,what is the story of kohinoor koh i noor dia...,what would happen if the indian government sto...,0
2,2,5,6,how can i increase the speed of my internet co...,how can internet speed be increased by hacking...,0
3,3,7,8,why am i mentally very lonely how can i solve it,find the remainder when 23 24 math is divi...,0
4,4,9,10,which one dissolve in water quikly sugar salt...,which fish would survive in salt water,0


<a id='bow'></a>
### Bag Of Words

In [6]:
questions = df['question1'].tolist() + df['question2'].tolist()
len(questions)

808574

In [7]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=2500)
question1, question2 = np.vsplit(cv.fit_transform(questions).toarray(), 2)

In [8]:
custom_bag_of_words_df = pd.concat([pd.DataFrame(question1), pd.DataFrame(question2)], axis=1)
custom_bag_of_words_df.shape

(404287, 5000)

In [9]:
custom_bag_of_words_df.columns = ['q1_' + str(i) for i in range(2500)] + ['q2_' + str(i) for i in range(2500, 5000)]

In [10]:
custom_bag_of_words_df['is_duplicate'] = df['is_duplicate'].values
custom_bag_of_words_df.head()

Unnamed: 0,q1_0,q1_1,q1_2,q1_3,q1_4,q1_5,q1_6,q1_7,q1_8,q1_9,...,q2_4991,q2_4992,q2_4993,q2_4994,q2_4995,q2_4996,q2_4997,q2_4998,q2_4999,is_duplicate
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


<a id='cf'></a>
### Custom Features
- q1len
- q2len
- q1words
- q2words
- wordscommon
- wordstotal
- wordsshare

In [11]:
custom_bag_of_words_df['q1_len'] = df['question1'].str.len()
custom_bag_of_words_df['q2_len'] = df['question2'].str.len()
custom_bag_of_words_df['q1_word_len'] = df['question1'].apply(lambda x: len(x.split(' ')))
custom_bag_of_words_df['q2_word_len'] = df['question2'].apply(lambda x: len(x.split(' ')))
custom_bag_of_words_df['common_words'] = df.apply(lambda x: len(set(x['question1'].split(' ')).intersection(set(x['question2'].split(' ')))), axis=1)
custom_bag_of_words_df['total_words'] = df.apply(lambda x: len(set(x['question1'].split(' ')).union(set(x['question2'].split(' ')))), axis=1)
custom_bag_of_words_df['common_words_ratio'] = custom_bag_of_words_df['common_words'] / custom_bag_of_words_df['total_words']

In [12]:
custom_bag_of_words_df.head()

Unnamed: 0,q1_0,q1_1,q1_2,q1_3,q1_4,q1_5,q1_6,q1_7,q1_8,q1_9,...,q2_4998,q2_4999,is_duplicate,q1_len,q2_len,q1_word_len,q2_word_len,common_words,total_words,common_words_ratio
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,65.0,56.0,14.0,12.0,11.0,12.0,0.916667
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,50.0,87.0,12.0,17.0,8.0,18.0,0.444444
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,72.0,58.0,14.0,10.0,4.0,20.0,0.2
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,49.0,58.0,12.0,16.0,1.0,21.0,0.047619
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,75.0,38.0,15.0,7.0,4.0,17.0,0.235294


<a id='tf'></a>
### Token Features
- cwc_min - num of common words / min(words(q1), words(q2))
- cwc_max - num of common words / max(words(q1), words(q2))
- csc_min - num of stop words / min(stopwords(q1), stopwords(q2))
- csc_max - num of stop words / max(stopwords(q1), stopwords(q2))
- ctc_min - num of common tokens / min(tokens(q1), tokens(q2))
- ctc_max - num of common tokens / max(tokens(q1), tokens(q2))
- last_word_equal
- first_word_equal

In [13]:
from nltk.corpus import stopwords
# Token Features
def extract_token_features(row):
  q1, q2 = row['question1'], row['question2']
  ret_features = [0.0] * 8
  q1_tokens = q1.split()
  q2_tokens = q2.split()
  stop_words = set(stopwords.words('english'))
  if len(q1_tokens) == 0 or len(q2_tokens) == 0:
    return ret_features
  q1_words = set([word for word in q1_tokens if word not in stop_words])
  q2_words = set([word for word in q2_tokens if word not in stop_words])

  q1_stop_words = set([word for word in q1_tokens if word in stop_words])
  q2_stop_words = set([word for word in q2_tokens if word in stop_words])

  common_word_count = len(q1_words.intersection(q2_words))
  common_stop_word_count = len(q1_stop_words.intersection(q2_stop_words))
  common_token_count = len(set(q1_tokens).intersection(set(q2_tokens)))

  SAFE_DIV = 0.0001
  # cwc_min - num of common words / min(words(q1), words(q2))
  ret_features[0] = common_word_count / (min(len(q1_words), len(q2_words)) + SAFE_DIV)
  # cwc_max - num of common words / max(words(q1), words(q2))
  ret_features[1] = common_word_count / (max(len(q1_words), len(q2_words)) + SAFE_DIV)
  # csc_min - num of stop words / min(stopwords(q1), stopwords(q2))
  ret_features[2] = common_stop_word_count / (min(len(q1_stop_words), len(q2_stop_words)) + SAFE_DIV)
  # csc_max - num of stop words / max(stopwords(q1), stopwords(q2))
  ret_features[3] = common_stop_word_count / (max(len(q1_stop_words), len(q2_stop_words)) + SAFE_DIV)
  # ctc_min - num of common tokens / min(tokens(q1), tokens(q2))
  ret_features[4] = common_token_count / (min(len(q1_tokens), len(q2_tokens)) + SAFE_DIV)
  # ctc_max - num of common tokens / max(tokens(q1), tokens(q2))
  ret_features[5] = common_token_count / (max(len(q1_tokens), len(q2_tokens)) + SAFE_DIV)
  # last_word_equal
  ret_features[6] = int(q1_tokens[-1] == q2_tokens[-1])
  # first_word_equal
  ret_features[7] = int(q1_tokens[0] == q2_tokens[0])

  return ret_features
  

In [14]:
token_features = df.apply(extract_token_features, axis=1)
custom_bag_of_words_df['cwc_min'] = list(map(lambda x: x[0], token_features))
custom_bag_of_words_df['cwc_max'] = list(map(lambda x: x[1], token_features))
custom_bag_of_words_df['csc_min'] = list(map(lambda x: x[2], token_features))
custom_bag_of_words_df['csc_max'] = list(map(lambda x: x[3], token_features))
custom_bag_of_words_df['ctc_min'] = list(map(lambda x: x[4], token_features))
custom_bag_of_words_df['ctc_max'] = list(map(lambda x: x[5], token_features))
custom_bag_of_words_df['last_word_eq'] = list(map(lambda x: x[6], token_features))
custom_bag_of_words_df['first_word_eq'] = list(map(lambda x: x[7], token_features))

print(custom_bag_of_words_df.shape)
custom_bag_of_words_df.head()

(404287, 5016)


Unnamed: 0,q1_0,q1_1,q1_2,q1_3,q1_4,q1_5,q1_6,q1_7,q1_8,q1_9,...,total_words,common_words_ratio,cwc_min,cwc_max,csc_min,csc_max,ctc_min,ctc_max,last_word_eq,first_word_eq
0,0,0,0,0,0,0,0,0,0,0,...,12.0,0.916667,0.99998,0.833319,0.999983,0.999983,0.916659,0.785709,0.0,1.0
1,0,0,0,0,0,0,0,0,0,0,...,18.0,0.444444,0.799984,0.399996,0.749981,0.599988,0.699993,0.466664,0.0,1.0
2,0,0,0,0,0,0,0,0,0,0,...,20.0,0.2,0.399992,0.333328,0.399992,0.249997,0.399996,0.285712,0.0,1.0
3,0,0,0,0,0,0,0,0,0,0,...,21.0,0.047619,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0,0,0,0,0,0,0,0,0,0,...,17.0,0.235294,0.399992,0.199998,0.99995,0.666644,0.57142,0.30769,0.0,1.0


<a id='lf'></a>
### Length Based Features
- mean_len
- abs_len_diff
- longest_substr_ratio

In [15]:
import distance

def extract_length_features(row):
  # Length Based Features
  q1, q2 = row['question1'], row['question2']
  ret_features = [0.0] * 3
  q1_tokens = q1.split()
  q2_tokens = q2.split()
  if len(q1_tokens) == 0 or len(q2_tokens) == 0:
    return ret_features
  # abs_len_diff
  ret_features[0] = abs(len(q1_tokens) - len(q2_tokens))
  # mean_len
  ret_features[1] = abs(len(set(q1_tokens)) + len(set(q2_tokens))) / 2
  # longest_substr_ratio
  strs = list(distance.lcsubstrings(q1, q2))
  ret_features[2] = len(strs[0]) if len(strs) != 0 else 0 / (min(len(q1_tokens), len(q2_tokens)) + 1)
  return ret_features

In [16]:
length_features = df.apply(extract_length_features, axis=1)
custom_bag_of_words_df['abs_len_diff'] = list(map(lambda x: x[0], length_features))
custom_bag_of_words_df['mean_len'] = list(map(lambda x: x[1], length_features))
custom_bag_of_words_df['longest_substr_ratio'] = list(map(lambda x: x[2], length_features))

print(custom_bag_of_words_df.shape)
custom_bag_of_words_df.head()

(404287, 5019)


Unnamed: 0,q1_0,q1_1,q1_2,q1_3,q1_4,q1_5,q1_6,q1_7,q1_8,q1_9,...,cwc_max,csc_min,csc_max,ctc_min,ctc_max,last_word_eq,first_word_eq,abs_len_diff,mean_len,longest_substr_ratio
0,0,0,0,0,0,0,0,0,0,0,...,0.833319,0.999983,0.999983,0.916659,0.785709,0.0,1.0,2.0,11.5,56.0
1,0,0,0,0,0,0,0,0,0,0,...,0.399996,0.749981,0.599988,0.699993,0.466664,0.0,1.0,5.0,12.0,30.0
2,0,0,0,0,0,0,0,0,0,0,...,0.333328,0.399992,0.249997,0.399996,0.285712,0.0,1.0,4.0,12.0,10.0
3,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,10.0,2.0
4,0,0,0,0,0,0,0,0,0,0,...,0.199998,0.99995,0.666644,0.57142,0.30769,0.0,1.0,6.0,10.0,6.0


<a id='ff'></a>
### Fuzzy Features (Fuzzywuzzy)
- fuzz_ratio
- fuzz_partial_ratio
- token_sort_ratio
- token_set_ratio

In [17]:
from fuzzywuzzy import fuzz

def extract_fuzzy_features(row):
  # Fuzzy Features
  q1, q2 = row['question1'], row['question2']
  ret_features = [0.0] * 4
  if q1 is None or q2 is None:
    return ret_features
  q1_tokens = q1.split()
  q2_tokens = q2.split()
  if len(q1_tokens) == 0 or len(q2_tokens) == 0:
    return ret_features
  # fuzz_ratio
  ret_features[0] = fuzz.QRatio(q1, q2)
  # fuzz_partial_ratio
  ret_features[1] = fuzz.partial_ratio(q1, q2)
  # token_sort_ratio
  ret_features[2] = fuzz.token_sort_ratio(q1, q2)
  # token_set_ratio
  ret_features[3] = fuzz.token_set_ratio(q1, q2)
  return ret_features

In [18]:
fuzzy_features = df.apply(extract_fuzzy_features, axis=1)
custom_bag_of_words_df['fuzz_ratio'] = list(map(lambda x: x[0], fuzzy_features))
custom_bag_of_words_df['fuzz_partial_ratio'] = list(map(lambda x: x[1], fuzzy_features))
custom_bag_of_words_df['token_sort_ratio'] = list(map(lambda x: x[2], fuzzy_features))
custom_bag_of_words_df['token_set_ratio'] = list(map(lambda x: x[3], fuzzy_features))

print(custom_bag_of_words_df.shape)
custom_bag_of_words_df.head()

(404287, 5023)


Unnamed: 0,q1_0,q1_1,q1_2,q1_3,q1_4,q1_5,q1_6,q1_7,q1_8,q1_9,...,ctc_max,last_word_eq,first_word_eq,abs_len_diff,mean_len,longest_substr_ratio,fuzz_ratio,fuzz_partial_ratio,token_sort_ratio,token_set_ratio
0,0,0,0,0,0,0,0,0,0,0,...,0.785709,0.0,1.0,2.0,11.5,56.0,93.0,100.0,93.0,100.0
1,0,0,0,0,0,0,0,0,0,0,...,0.466664,0.0,1.0,5.0,12.0,30.0,66.0,74.0,63.0,86.0
2,0,0,0,0,0,0,0,0,0,0,...,0.285712,0.0,1.0,4.0,12.0,10.0,43.0,46.0,63.0,63.0
3,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,1.0,10.0,2.0,9.0,11.0,25.0,28.0
4,0,0,0,0,0,0,0,0,0,0,...,0.30769,0.0,1.0,6.0,10.0,6.0,35.0,55.0,47.0,67.0


<a id='sd'></a>
### Save Dataset

In [19]:
custom_bag_of_words_df.to_csv(os.path.join('data', 'preprocessed', 'custom_train.csv'))