In [43]:
import pandas as pd
import numpy as np
import re
from bs4 import BeautifulSoup
import matplotlib.pyplot as plt
import seaborn as sns
from fuzzywuzzy import fuzz
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix
import tensorflow as tf



In [44]:
import tensorflow_hub as hub
import tensorflow_text as text



In [45]:
encoder_url = 'https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4'
preprocessing_url = 'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3'

In [46]:
bert_preprocess_model = hub.KerasLayer(preprocessing_url)

In [47]:
bert_encoder = hub.KerasLayer(encoder_url)

In [48]:
df = pd.read_csv('train.csv')


In [49]:
df.isnull().sum() ##Only minor data is missing nut still
df = df.dropna()

In [50]:
n = 1000 #no of samples for training
df = df.sample(n,random_state=2)

## Balancing dataset

In [51]:
df.is_duplicate.value_counts()/df.is_duplicate.count()

0    0.62
1    0.38
Name: is_duplicate, dtype: float64

In [52]:
df_du = df[df.is_duplicate==1]
df_ndu = df[df.is_duplicate==0]

In [53]:
df_ndu = df_ndu.sample(df_du.shape[0])
df = pd.concat([df_du,df_ndu],axis=0)


In [54]:
df.is_duplicate.value_counts()/df.is_duplicate.count()

1    0.5
0    0.5
Name: is_duplicate, dtype: float64

In [55]:
df = df.sample(df.shape[0])


In [56]:
def preprocessing(q):
    q = str(q).lower().strip()
    # Replace certain special characters with their string equivalents
    q = q.replace('%', ' percent')
    q = q.replace('$', ' dollar ')
    q = q.replace('₹', ' rupee ')
    q = q.replace('€', ' euro ')
    q = q.replace('@', ' at ')
    
    # The pattern '[math]' appears around 900 times in the whole dataset.
    q = q.replace('[math]', '')
    
    # Replacing some numbers with string equivalents (not perfect, can be done better to account for more cases)
    q = q.replace(',000,000,000 ', 'b ')
    q = q.replace(',000,000 ', 'm ')
    q = q.replace(',000 ', 'k ')
    q = re.sub(r'([0-9]+)000000000', r'\1b', q)
    q = re.sub(r'([0-9]+)000000', r'\1m', q)
    q = re.sub(r'([0-9]+)000', r'\1k', q)
    # Decontracting words
    # https://en.wikipedia.org/wiki/Wikipedia%3aList_of_English_contractions
    # https://stackoverflow.com/a/19794953
    contractions = { 
    "ain't": "am not",
    "aren't": "are not",
    "can't": "can not",
    "can't've": "can not have",
    "'cause": "because",
    "could've": "could have",
    "couldn't": "could not",
    "couldn't've": "could not have",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hadn't've": "had not have",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he would",
    "he'd've": "he would have",
    "he'll": "he will",
    "he'll've": "he will have",
    "he's": "he is",
    "how'd": "how did",
    "how'd'y": "how do you",
    "how'll": "how will",
    "how's": "how is",
    "i'd": "i would",
    "i'd've": "i would have",
    "i'll": "i will",
    "i'll've": "i will have",
    "i'm": "i am",
    "i've": "i have",
    "isn't": "is not",
    "it'd": "it would","it'd've": "it would have",
    "it'll": "it will",
    "it'll've": "it will have",
    "it's": "it is",
    "let's": "let us",
    "ma'am": "madam",
    "mayn't": "may not",
    "might've": "might have",
    "mightn't": "might not",
    "mightn't've": "might not have",
    "must've": "must have",
    "mustn't": "must not",
    "mustn't've": "must not have",
    "needn't": "need not",
    "needn't've": "need not have",
    "o'clock": "of the clock",
    "oughtn't": "ought not",
    "oughtn't've": "ought not have",
    "shan't": "shall not",
    "sha'n't": "shall not",
    "shan't've": "shall not have",
    "she'd": "she would",
    "she'd've": "she would have",
    "she'll": "she will",
    "she'll've": "she will have",
    "she's": "she is",
    "should've": "should have",
    "shouldn't": "should not",
    "shouldn't've": "should not have",
    "so've": "so have",
    "so's": "so as",
    "that'd": "that would",
    "that'd've": "that would have",
    "that's": "that is",
    "there'd": "there would",
    "there'd've": "there would have",
    "there's": "there is",
    "they'd": "they would",
    "they'd've": "they would have",
    "they'll": "they will",
    "they'll've": "they will have",
    "they're": "they are",
    "they've": "they have",
    "to've": "to have","wasn't": "was not",
    "we'd": "we would",
    "we'd've": "we would have",
    "we'll": "we will",
    "we'll've": "we will have",
    "we're": "we are",
    "we've": "we have",
    "weren't": "were not",
    "what'll": "what will",
    "what'll've": "what will have",
    "what're": "what are",
    "what's": "what is",
    "what've": "what have",
    "when's": "when is",
    "when've": "when have",
    "where'd": "where did",
    "where's": "where is",
    "where've": "where have",
    "who'll": "who will",
    "who'll've": "who will have",
    "who's": "who is",
    "who've": "who have",
    "why's": "why is",
    "why've": "why have",
    "will've": "will have",
    "won't": "will not",
    "won't've": "will not have",
    "would've": "would have",
    "wouldn't": "would not",
    "wouldn't've": "would not have",
    "y'all": "you all",
    "y'all'd": "you all would",
    "y'all'd've": "you all would have",
    "y'all're": "you all are",
    "y'all've": "you all have",
    "you'd": "you would",
    "you'd've": "you would have",
    "you'll": "you will",
    "you'll've": "you will have",
    "you're": "you are",
    "you've": "you have"
    }
    q_decontracted = []

    for word in q.split():
        if word in contractions:
            word = contractions[word]

        q_decontracted.append(word)

    q = ' '.join(q_decontracted)
    q = q.replace("'ve", " have")
    q = q.replace("n't", " not")
    q = q.replace("'re", " are")
    q = q.replace("'ll", " will")
    
    q = BeautifulSoup(q)
    q = q.get_text()
    
    # Remove punctuations
    pattern = re.compile('\W')
    q = re.sub(pattern, ' ', q).strip()
    return q

    
     


    
    
    
    

In [57]:
## Feature Engineering

In [58]:
df_ = pd.concat([df['question1'],df['question2'],df['is_duplicate']],axis=1)
# df_.head(100)

In [59]:
##applying preprocessing
df_['question1']=df_['question1'].apply(preprocessing)

df_['question2']=df_['question2'].apply(preprocessing)

In [60]:


def process_features(row):
    
    q1 = row['question1']
    q2 = row['question2']
    from nltk.corpus import stopwords 
    sw = stopwords.words("english")
    features = [0.0]*10
    q1_tokens = word_tokenize(q1)#tokenization
    q2_tokens = word_tokenize(q2)
    
    if len(q1_tokens) == 0 or len(q2_tokens) == 0:
        return features
    common_token_count = len(list([tk for tk in q1_tokens if tk in q2_tokens]))
    
    q1_words = list([word for word in q1_tokens if word not in sw])##words
    q2_words = list([word for word in q2_tokens if word not in sw])
    
    q1_stopword = list([word for word in q1_tokens if word in sw])##stopwords
    q2_stopword = list([word for word in q2_tokens if word in sw])
    

    common_word_count = len(list([word for word in q1_words if word in q2_words]))
    common_stopword_count = len(list([stopword for stopword in q1_stopword if stopword in q2_stopword]))
    common_token_count = len(list([token for token in q1_tokens if token in q2_tokens]))
    
    features[0] = abs(len(q1_tokens)-len(q2_tokens))#lendiff
    features[1] = (len(q2_tokens)+len(q1_tokens))/2 #meanlen
    
    features[2] = common_word_count / (min(len(q1_words), len(q2_words)) + 0.0001) #mincommonwords
    features[3] = common_word_count / (max(len(q1_words), len(q2_words)) + 0.0001) #maxcommonwords
    features[4] = common_stopword_count / (min(len(q1_stopword), len(q2_stopword)) + 0.0001)#mincommonstopword
    features[5] = common_stopword_count / (max(len(q1_stopword), len(q2_stopword)) + 0.0001)#maxcommonsw
    features[6] = common_token_count / (min(len(q1_tokens), len(q2_tokens)) + 0.0001)#mintk
    features[7] = common_token_count / (max(len(q1_tokens), len(q2_tokens)) + 0.0001)#maxtk
    
    features[8] = int(q1_tokens[-1] == q2_tokens[-1])#lastwordequal
    features[9] = int(q1_tokens[0] == q2_tokens[0]) #firstwordequal
    
    
    return features

In [61]:
features = df_.apply(process_features, axis=1)


In [62]:
df_["len1"]  = list(map(lambda x: x[0], features))
df_["len2"]  = list(map(lambda x: x[1], features))
df_["cwc_min"]  = list(map(lambda x: x[2], features))
df_["cwc_max"]  = list(map(lambda x: x[3], features))
df_["csc_min"]  = list(map(lambda x: x[4], features))
df_["csc_max"]  = list(map(lambda x: x[5], features))
df_["ctc_min"]  = list(map(lambda x: x[6], features))
df_["ctc_max"]  = list(map(lambda x: x[7], features))
df_["last_word_eq"]  = list(map(lambda x: x[8], features))
df_["first_word_eq"]  = list(map(lambda x: x[9], features))


## Fuzzy Features

In [63]:
def process_fuzzy_features(row):
    
    q1 = row['question1']
    q2 = row['question2']
    
    fuzzy_features = [0.0]*7
    
    # fuzz_ratio
    fuzzy_features[0] = fuzz.QRatio(q1, q2)
    
    #W_ratio
    fuzzy_features[1] = fuzz.WRatio(q1,q2)

    # fuzz_partial_ratio
    fuzzy_features[2] = fuzz.partial_ratio(q1, q2)
    
    #partial_token_set_ratio
    fuzzy_features[3] = fuzz.partial_token_set_ratio(q1, q2)
    
    #partial_token_sort_ratio
    fuzzy_features[4]=fuzz.partial_token_sort_ratio(q1,q2)
    

    # token_set_ratio
    fuzzy_features[5] = fuzz.token_set_ratio(q1, q2)

    # token_sort_ratio
    fuzzy_features[6] = fuzz.token_sort_ratio(q1, q2)

    return fuzzy_features

In [64]:
fuzzy_features = df_.apply(process_fuzzy_features, axis=1)

# Creating new feature columns for fuzzy features
df_['fuzz_ratio'] = list(map(lambda x: x[0], fuzzy_features))
df_['w_fuzz_ratio'] = list(map(lambda x: x[1], fuzzy_features))
df_['fuzz_partial_ratio'] = list(map(lambda x: x[2], fuzzy_features))
df_['partial_token_set_ratio'] = list(map(lambda x: x[3], fuzzy_features))
df_['partial_token_sort_ratio'] = list(map(lambda x: x[4], fuzzy_features))


df_['token_set_ratio'] = list(map(lambda x: x[5], fuzzy_features))
df_['token_sort_ratio'] = list(map(lambda x: x[6], fuzzy_features))

## Normalisation

In [65]:
df_.columns

Index(['question1', 'question2', 'is_duplicate', 'len1', 'len2', 'cwc_min',
       'cwc_max', 'csc_min', 'csc_max', 'ctc_min', 'ctc_max', 'last_word_eq',
       'first_word_eq', 'fuzz_ratio', 'w_fuzz_ratio', 'fuzz_partial_ratio',
       'partial_token_set_ratio', 'partial_token_sort_ratio',
       'token_set_ratio', 'token_sort_ratio'],
      dtype='object')

In [66]:
scaler = StandardScaler()
x_scaled = scaler.fit_transform(df_.drop(['question1','question2','is_duplicate'],axis=1))

df_scaled = pd.DataFrame(x_scaled,columns = ['len1', 'len2', 'cwc_min',
       'cwc_max', 'csc_min', 'csc_max', 'ctc_min', 'ctc_max', 'last_word_eq',
       'first_word_eq', 'fuzz_ratio', 'w_fuzz_ratio', 'fuzz_partial_ratio',
       'partial_token_set_ratio', 'partial_token_sort_ratio',
       'token_set_ratio', 'token_sort_ratio'],index=df_.index)

In [67]:
y=df_.is_duplicate.values

## Embedding using BERT

In [68]:

questions = list(df_['question1']) + list(df_['question2'])


In [69]:
def get_sentence_embedding(text_input):
    preprocessed_text = bert_preprocess_model(text_input)
    return bert_encoder(preprocessed_text)['pooled_output']



In [70]:
q1_arr, q2_arr = np.vsplit(get_sentence_embedding(questions).toarray(),2)

ResourceExhaustedError: Exception encountered when calling layer "keras_layer_3" (type KerasLayer).

Graph execution error:

OOM when allocating tensor with shape[4560,12,128,128] and type float on /job:localhost/replica:0/task:0/device:CPU:0 by allocator cpu
	 [[{{node transformer/layer_4/self_attention/einsum/Einsum}}]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info. This isn't available when running in Eager mode.
 [Op:__inference_restored_function_body_106860]

Call arguments received:
  • inputs={'input_mask': 'tf.Tensor(shape=(4560, 128), dtype=int32)', 'input_type_ids': 'tf.Tensor(shape=(4560, 128), dtype=int32)', 'input_word_ids': 'tf.Tensor(shape=(4560, 128), dtype=int32)'}
  • training=False

In [None]:
pd.options.display.max_columns = None
df1 = pd.DataFrame(q1_arr, index= df_.index)
df2 = pd.DataFrame(q2_arr, index= df_.index)
df2.columns=range(3000,6000,1)
df_vectors = pd.concat([df1,df2], axis=1,)





In [None]:
final_x_df = pd.concat([df_vectors,df_scaled],axis=1)