In [1]:
import pandas as pd
import re
import nltk

In [10]:
qs = pd.read_csv("org_ques.csv", usecols=[3, 4, 5]).dropna()
qs = pd.concat([qs[qs["is_duplicate"] == 1].sample(100000), qs[qs["is_duplicate"] == 0].sample(100000)])

In [11]:
qs

Unnamed: 0,question1,question2,is_duplicate
67,Can we ever store energy produced in lightning?,Is it possible to store the energy of lightning?,1
70021,Where can I get flexible horse fence solution ...,Where can I found very flexible horse fence sy...,1
307818,How should I prepare for SBI PO 2017?,How do I prepare for SBI PO 2017?,1
49510,What are the reasons why many black people hav...,Why do some black people have yellow sclera?,1
130681,What were the major effects of the cambodia ea...,What were the major effects of the cambodia ea...,1
...,...,...,...
125614,What laptop is best for deep learning experime...,What is the best laptop I can get to learn dee...,0
329641,Can I ride horses in WorldCraft (Android)?,How can you get a Town Hall 9 account for free?,0
178280,Which anime are a must watch?,What are the best anime series to watch? I hav...,0
403293,What is the highest temperature a human being ...,What's the highest temperature someone has had...,0


In [12]:
y = qs["is_duplicate"]

In [13]:
y.to_csv("duplicates.csv", index=False)

In [14]:
def preprocess(q):

    q = str(q).lower().strip()

    # Replace certain special characters with their string equivalents
    q = q.replace('%', ' percent')
    q = q.replace('$', ' dollar ')
    q = q.replace('₹', ' rupee ')
    q = q.replace('€', ' euro ')
    q = q.replace('@', ' at ')

    # The pattern '[math]' appears around 900 times in the whole dataset.
    q = q.replace('[math]', '')

    # Replacing some numbers with string equivalents (not perfect, can be done better to account for more cases)
    q = q.replace(',000,000,000 ', 'b ')
    q = q.replace(',000,000 ', 'm ')
    q = q.replace(',000 ', 'k ')
    q = re.sub(r'([0-9]+)000000000', r'\1b', q)
    q = re.sub(r'([0-9]+)000000', r'\1m', q)
    q = re.sub(r'([0-9]+)000', r'\1k', q)

    contractions = {
        "ain't": "am not",
        "aren't": "are not",
        "can't": "cannot",
        "can't've": "cannot have",
        "'cause": "because",
        "could've": "could have",
        "couldn't": "could not",
        "couldn't've": "could not have",
        "didn't": "did not",
        "doesn't": "does not",
        "don't": "do not",
        "hadn't": "had not",
        "hadn't've": "had not have",
        "hasn't": "has not",
        "haven't": "have not",
        "he'd": "he would",
        "he'd've": "he would have",
        "he'll": "he will",
        "he'll've": "he will have",
        "he's": "he is",
        "how'd": "how did",
        "how'd'y": "how do you",
        "how'll": "how will",
        "how's": "how does",
        "I'd": "I would",
        "I'd've": "I would have",
        "I'll": "I will",
        "I'll've": "I will have",
        "I'm": "I am",
        "I've": "I have",
        "isn't": "is not",
        "it'd": "it would",
        "it'd've": "it would have",
        "it'll": "it will",
        "it'll've": "it will have",
        "it's": "it is",
        "let's": "let us",
        "ma'am": "madam",
        "mayn't": "may not",
        "might've": "might have",
        "mightn't": "might not",
        "mightn't've": "might not have",
        "must've": "must have",
        "mustn't": "must not",
        "mustn't've": "must not have",
        "needn't": "need not",
        "needn't've": "need not have",
        "o'clock": "of the clock",
        "oughtn't": "ought not",
        "oughtn't've": "ought not have",
        "shan't": "shall not",
        "sha'n't": "shall not",
        "shan't've": "shall not have",
        "she'd": "she would",
        "she'd've": "she would have",
        "she'll": "she will",
        "she'll've": "she will have",
        "she's": "she is",
        "should've": "should have",
        "shouldn't": "should not",
        "shouldn't've": "should not have",
        "so've": "so have",
        "so's": "so is",
        "that'd": "that had",
        "that'd've": "that would have",
        "that's": "that is",
        "there'd": "there would",
        "there'd've": "there would have",
        "there's": "there is",
        "they'd": "they would",
        "they'd've": "they would have",
        "they'll": "they will",
        "they'll've": "they will have",
        "they're": "they are",
        "they've": "they have",
        "to've": "to have",
        "wasn't": "was not",
        "we'd": "we would",
        "we'd've": "we would have",
        "we'll": "we will",
        "we'll've": "we will have",
        "we're": "we are",
        "we've": "we have",
        "weren't": "were not",
        "what'll": "what will",
        "what'll've": "what will have",
        "what're": "what are",
        "what's": "what is",
        "what've": "what have",
        "when's": "when is",
        "when've": "when have",
        "where'd": "where did",
        "where's": "where is",
        "where've": "where have",
        "who'll": "who will",
        "who'll've": "who will have",
        "who's": "who is",
        "who've": "who have",
        "why's": "why is",
        "why've": "why have",
        "will've": "will have",
        "won't": "will not",
        "won't've": "will not have",
        "would've": "would have",
        "wouldn't": "would not",
        "wouldn't've": "would not have",
        "y'all": "you all",
        "y'all'd": "you all would",
        "y'all'd've": "you all would have",
        "y'all're": "you all are",
        "y'all've": "you all have",
        "you'd": "you would",
        "you'd've": "you would have",
        "you'll": "you will",
        "you'll've": "you will have",
        "you're": "you are",
        "you've": "you have"
    }

    q_decontracted = []

    for word in q.split():
        if word in contractions:
            word = contractions[word]
    
        q_decontracted.append(word)
    
    q = ' '.join(q_decontracted)
    q = q.replace("'ve", " have")
    q = q.replace("n't", " not")
    q = q.replace("'re", " are")
    q = q.replace("'ll", " will")
    
    # Removing HTML tags
    # q = BeautifulSoup(q)
    # q = q.get_text()
    # 
    # q = textblob.TextBlob(q).correct().string
    
    # Remove punctuations
    pattern = re.compile("\W")
    q = re.sub(pattern, ' ', q).strip()
    
    return q

  pattern = re.compile("\W")


In [15]:
qs["question1"] = qs["question1"].apply(preprocess)

In [16]:
qs["question2"] = qs["question2"].apply(preprocess)

In [17]:
qs['q1_len'] = qs['question1'].str.len()
qs['q2_len'] = qs['question2'].str.len()
qs['q1_num_words'] = qs['question1'].apply(lambda row: len(row.split(" ")))
qs['q2_num_words'] = qs['question2'].apply(lambda row: len(row.split(" ")))

def common_words(row):
    w1 = set(map(lambda word: word.lower().strip(), row['question1'].split(" ")))
    w2 = set(map(lambda word: word.lower().strip(), row['question2'].split(" ")))
    return len(w1 & w2)

qs['word_common'] = qs.apply(common_words, axis=1)

def total_words(row):
    w1 = set(map(lambda word: word.lower().strip(), row['question1'].split(" ")))
    w2 = set(map(lambda word: word.lower().strip(), row['question2'].split(" ")))
    return len(w1) + len(w2)

qs['word_total'] = qs.apply(total_words, axis=1)
qs['word_share'] = round(qs['word_common']/qs['word_total'],2)

In [18]:
from nltk.corpus import stopwords
nltk.download('stopwords')

def fetch_token_features(row):

    q1 = row['question1']
    q2 = row['question2']

    SAFE_DIV = 0.0001

    STOP_WORDS = stopwords.words("english")

    token_features = [0.0]*8

    # Converting the Sentence into Tokens: 
    q1_tokens = q1.split()
    q2_tokens = q2.split()

    if len(q1_tokens) == 0 or len(q2_tokens) == 0:
        return token_features

    # Get the non-stopwords in Questions
    q1_words = set([word for word in q1_tokens if word not in STOP_WORDS])
    q2_words = set([word for word in q2_tokens if word not in STOP_WORDS])

    #Get the stopwords in Questions
    q1_stops = set([word for word in q1_tokens if word in STOP_WORDS])
    q2_stops = set([word for word in q2_tokens if word in STOP_WORDS])

    # Get the common non-stopwords from Question pair
    common_word_count = len(q1_words.intersection(q2_words))

    # Get the common stopwords from Question pair
    common_stop_count = len(q1_stops.intersection(q2_stops))

    # Get the common Tokens from Question pair
    common_token_count = len(set(q1_tokens).intersection(set(q2_tokens)))


    token_features[0] = common_word_count / (min(len(q1_words), len(q2_words)) + SAFE_DIV)
    token_features[1] = common_word_count / (max(len(q1_words), len(q2_words)) + SAFE_DIV)
    token_features[2] = common_stop_count / (min(len(q1_stops), len(q2_stops)) + SAFE_DIV)
    token_features[3] = common_stop_count / (max(len(q1_stops), len(q2_stops)) + SAFE_DIV)
    token_features[4] = common_token_count / (min(len(q1_tokens), len(q2_tokens)) + SAFE_DIV)
    token_features[5] = common_token_count / (max(len(q1_tokens), len(q2_tokens)) + SAFE_DIV)

    # Last word of both question is same or not
    token_features[6] = int(q1_tokens[-1] == q2_tokens[-1])

    # First word of both question is same or not
    token_features[7] = int(q1_tokens[0] == q2_tokens[0])

    return token_features
token_features = qs.apply(fetch_token_features, axis=1)

qs["cwc_min"]       = list(map(lambda x: x[0], token_features))
qs["cwc_max"]       = list(map(lambda x: x[1], token_features))
qs["csc_min"]       = list(map(lambda x: x[2], token_features))
qs["csc_max"]       = list(map(lambda x: x[3], token_features))
qs["ctc_min"]       = list(map(lambda x: x[4], token_features))
qs["ctc_max"]       = list(map(lambda x: x[5], token_features))
qs["last_word_eq"]  = list(map(lambda x: x[6], token_features))
qs["first_word_eq"] = list(map(lambda x: x[7], token_features))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HC\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [19]:
import distance

def fetch_length_features(row):

    q1 = row['question1']
    q2 = row['question2']

    length_features = [0.0]*3

    # Converting the Sentence into Tokens: 
    q1_tokens = q1.split()
    q2_tokens = q2.split()

    if len(q1_tokens) == 0 or len(q2_tokens) == 0:
        return length_features

    # Absolute length features
    length_features[0] = abs(len(q1_tokens) - len(q2_tokens))

    #Average Token Length of both Questions
    length_features[1] = (len(q1_tokens) + len(q2_tokens))/2

    strs = list(distance.lcsubstrings(q1, q2))
    if len(strs) > 0:
        length_features[2] = len(strs[0]) / (min(len(q1), len(q2)) + 1)
    else:
        length_features[2] = 0

    return length_features

length_features = qs.apply(fetch_length_features, axis=1)

qs['abs_len_diff'] = list(map(lambda x: x[0], length_features))
qs['mean_len'] = list(map(lambda x: x[1], length_features))
qs['longest_substr_ratio'] = list(map(lambda x: x[2], length_features))

In [20]:
from fuzzywuzzy import fuzz

def fetch_fuzzy_features(row):

    q1 = row['question1']
    q2 = row['question2']

    fuzzy_features = [0.0]*4

    # fuzz_ratio
    fuzzy_features[0] = fuzz.QRatio(q1, q2)

    # fuzz_partial_ratio
    fuzzy_features[1] = fuzz.partial_ratio(q1, q2)

    # token_sort_ratio
    fuzzy_features[2] = fuzz.token_sort_ratio(q1, q2)

    # token_set_ratio
    fuzzy_features[3] = fuzz.token_set_ratio(q1, q2)

    return fuzzy_features
fuzzy_features = qs.apply(fetch_fuzzy_features, axis=1)

# Creating new feature columns for fuzzy features
qs['fuzz_ratio'] = list(map(lambda x: x[0], fuzzy_features))
qs['fuzz_partial_ratio'] = list(map(lambda x: x[1], fuzzy_features))
qs['token_sort_ratio'] = list(map(lambda x: x[2], fuzzy_features))
qs['token_set_ratio'] = list(map(lambda x: x[3], fuzzy_features))



In [24]:
qs

Unnamed: 0,question1,question2,is_duplicate,q1_len,q2_len,q1_num_words,q2_num_words,word_common,word_total,word_share,...,ctc_max,last_word_eq,first_word_eq,abs_len_diff,mean_len,longest_substr_ratio,fuzz_ratio,fuzz_partial_ratio,token_sort_ratio,token_set_ratio
67,can we ever store energy produced in lightning,is it possible to store the energy of lightning,1,46,47,8,9,3,17,0.18,...,0.333330,1.0,0.0,1.0,8.5,0.212766,60,65,67,67
70021,where can i get flexible horse fence solution ...,where can i found very flexible horse fence sy...,1,55,68,10,12,7,22,0.32,...,0.583328,1.0,1.0,2.0,11.0,0.410714,73,67,65,83
307818,how should i prepare for sbi po 2017,how do i prepare for sbi po 2017,1,36,32,8,8,7,16,0.44,...,0.874989,1.0,1.0,0.0,8.0,0.787879,91,92,85,95
49510,what are the reasons why many black people hav...,why do some black people have yellow sclera,1,59,43,11,8,5,19,0.26,...,0.454541,0.0,0.0,3.0,9.5,0.590909,63,79,69,79
130681,what were the major effects of the cambodia ea...,what were the major effects of the cambodia ea...,1,124,120,22,22,16,36,0.44,...,0.714282,0.0,1.0,0.0,21.0,0.793388,93,93,92,94
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
125614,what laptop is best for deep learning experiments,what is the best laptop i can get to learn dee...,0,49,66,8,14,6,22,0.27,...,0.428568,0.0,1.0,6.0,11.0,0.300000,63,67,63,80
329641,can i ride horses in worldcraft android,how can you get a town hall 9 account for free,0,40,46,8,11,1,19,0.05,...,0.090908,0.0,0.0,4.0,9.0,0.097561,26,30,31,33
178280,which anime are a must watch,what are the best anime series to watch i hav...,0,28,78,6,17,3,21,0.14,...,0.187499,0.0,0.0,10.0,11.0,0.241379,38,61,46,70
403293,what is the highest temperature a human being ...,what is the highest temperature someone has ha...,0,57,57,10,10,5,20,0.25,...,0.499995,0.0,1.0,0.0,10.0,0.551724,70,70,67,74


In [25]:
qs.to_csv('ques_with_heuristic_feat.csv', index=False)