In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
from nltk.tokenize import word_tokenize
import re
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import tensorflow as tf
from tensorflow.keras.layers import *
from tensorflow.keras.models import Model
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

### 1. Imprting Data 

In [3]:
df = pd.read_csv("/content/drive/MyDrive/Deep_Learning/NLP/Duplicate_question/train.csv")

In [4]:
df.sample(5)

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
183882,183882,8461,11397,How do I improve my English speaking?,How can I improve in English?,1
330761,330761,8154,4416,What is an easy way make money online?,What are some easy ways to make done extra mon...,1
60571,60571,105902,23891,are aliens real or are they fake ?,Do aliens exists?,1
153087,153087,156757,240385,Can I pay my personal loan EMI in bulk (1/3) o...,After 3 years I have repaid the written off am...,0
127767,127767,205630,205631,What are some good places to visit in Manhattan?,What are some of the best places to visit in M...,1


### 2. Preprocessing

In [17]:
## Tokenizinf Sequences 
def tokenize(s):
    tokens = []
    tokens = [word_tokenize(str(sentence)) for sentence in s]

    rm1 = []
    for w in tokens:
        sm = re.sub('[^A-Za-z]',' ', str(w))
        x = re.split("\s", sm)
        rm1.append(x)
        
    return rm1


def lower_case(s):
    #Removing whitespaces    
    for sent in s:
        while '' in sent:
            sent.remove('')

    # Lowercasing
    low = []
    for i in s:
        i = [x.lower() for x in i]
        low.append(i)
        
    return low

In [16]:
## Lemmatization : 
def lemmatize(s):
    lemma = []
    wnl = WordNetLemmatizer()
    for doc in s:
        tokens = [wnl.lemmatize(w) for w in doc]
        lemma.append(tokens)

    # Removing Stopwords
    filter_words = []
    Stopwords = set(stopwords.words('english'))

    #ab = spell('nd')
    for sent in lemma:
        tokens = [w for w in sent if w not in Stopwords]
        filter_words.append(tokens)

    space = ' ' 
    sentences = []
    for sentence in filter_words:
        sentences.append(space.join(sentence))
        
    return sentences

In [5]:
## Training Tokenizer on corpus 
MAX_NB_WORDS = 200000
tokenizer = Tokenizer(num_words = MAX_NB_WORDS)
tokenizer.fit_on_texts(list(df['question1'].values.astype(str))+list(df['question2'].values.astype(str)))

In [6]:
word_index = tokenizer.word_index
len(word_index)

95596

In [7]:
from sklearn.model_selection import train_test_split

X_train , X_test , Y_train , Y_test = train_test_split(df.drop(columns=['is_duplicate']),df['is_duplicate'] , test_size=0.20)

# X_train_q1 = tokenizer.texts_to_sequences(np.array(listq1))
X_train_q1 = tokenizer.texts_to_sequences(X_train['question1'].values.astype(str))
X_train_q1 = pad_sequences(X_train_q1, maxlen = 30, padding='post')

# X_train_q2 = tokenizer.texts_to_sequences(np.array(listq2))
X_train_q2 = tokenizer.texts_to_sequences(X_train['question2'].values.astype(str))
X_train_q2 = pad_sequences(X_train_q2, maxlen = 30, padding='post')


X_test_q1 = tokenizer.texts_to_sequences(X_test['question1'].values.astype(str))
X_test_q1 = pad_sequences(X_test_q1,maxlen = 30, padding='post')

X_test_q2 = tokenizer.texts_to_sequences(X_test['question2'].values.astype(str))
X_test_q2 = pad_sequences(X_test_q2, maxlen = 30, padding='post')

### 3. Model Training - Without GLove

In [8]:
# Model for Q1
import tensorflow as tf
from tensorflow.keras.layers import BatchNormalization

model_q1 = tf.keras.Sequential()
model_q1.add(Embedding(input_dim = len(word_index)+1,
                       output_dim = 200,
                      #weights = [embedding_matrix],
                      input_length = 30))
model_q1.add(LSTM(128, activation = 'tanh', return_sequences = True))
model_q1.add(Dropout(0.2))
model_q1.add(LSTM(128, return_sequences = True))
model_q1.add(LSTM(128))
model_q1.add(Dense(60, activation = 'tanh'))
model_q1.add(Dense(2, activation = 'sigmoid'))

In [9]:
# Model for Q2
model_q2 = tf.keras.Sequential()
model_q2.add(Embedding(input_dim = len(word_index)+1,
                       output_dim = 200,
                      #weights = [embedding_matrix],
                      input_length = 30))
model_q2.add(LSTM(128, activation = 'tanh', return_sequences = True))
model_q2.add(Dropout(0.2))
model_q2.add(LSTM(128, return_sequences = True))
model_q2.add(LSTM(128))
model_q2.add(Dense(60, activation = 'tanh'))
model_q2.add(Dense(2, activation = 'sigmoid'))

In [10]:
# Merging the output of the two models,i.e, model_q1 and model_q2
mergedOut = Multiply()([model_q1.output, model_q2.output])

mergedOut = Flatten()(mergedOut)
mergedOut = Dense(100, activation = 'relu')(mergedOut)
mergedOut = Dropout(0.2)(mergedOut)
mergedOut = Dense(50, activation = 'relu')(mergedOut)
mergedOut = Dropout(0.2)(mergedOut)
mergedOut = Dense(2, activation = 'sigmoid')(mergedOut)

In [11]:
new_model = Model([model_q1.input, model_q2.input], mergedOut)
new_model.compile(optimizer = 'adam', loss = 'sparse_categorical_crossentropy',
                 metrics = ['accuracy'])
history = new_model.fit([X_train_q1,X_train_q2],Y_train, batch_size = 2000, epochs = 10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [38]:
y_pred = new_model.predict([X_test_q1, X_test_q2], batch_size=2000, verbose=1)
y_pred += new_model.predict([X_test_q1, X_test_q2], batch_size=2000, verbose=1)
y_pred /= 2



In [39]:
y_pred = np.argmax(y_pred,axis=1)

In [40]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve, matthews_corrcoef

def metrics(y_true, y_pred):
    
    acc = accuracy_score(y_true, y_pred)
    prec = precision_score(y_true, y_pred)
    rec = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    roc_auc = roc_auc_score(y_true, y_pred)
    fpr, tpr, _ = roc_curve(y_true, y_pred)
    mcc = matthews_corrcoef(y_true, y_pred)
    
    return {'Accuracy': acc,
            'Precision': prec,
            'Recall': rec,
            'F1 Score': f1,
            'ROC AUC': roc_auc,
            'True Positive Rate': tpr,
            'False Positive Rate': fpr,
            'MCC': mcc}



In [41]:
metrics(y_true=Y_test.values, y_pred=y_pred)

{'Accuracy': 0.7403720101907046,
 'Precision': 0.6564877317047038,
 'Recall': 0.6177241564726442,
 'F1 Score': 0.6365163189334258,
 'ROC AUC': 0.714756172400934,
 'True Positive Rate': array([0.        , 0.61772416, 1.        ]),
 'False Positive Rate': array([0.        , 0.18821181, 1.        ]),
 'MCC': 0.43536262475890036}

### 3.2 Training - With Glove 

In [45]:
embedding_index = {}
with open('/content/drive/MyDrive/Deep_Learning/NLP/Duplicate_question/glove.6B.100d.txt','r') as f:
    for line in f:
        values = line.split()
        word = values[0]
        vectors = np.asarray(values[1:], 'float32')
        embedding_index[word] = vectors
    f.close()

In [47]:
embedding_matrix = np.random.random((len(word_index)+1, 100))
for word, i in word_index.items():
    embedding_vector = embedding_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [58]:
# Model for Q1
import tensorflow as tf
from tensorflow.keras.layers import BatchNormalization

model_q1 = tf.keras.Sequential()
model_q1.add(Embedding(input_dim = len(word_index)+1,
                       output_dim = 100,
                      weights = [embedding_matrix],
                      input_length = 30))
model_q1.add(LSTM(128, activation = 'tanh', return_sequences = True))
model_q1.add(Dropout(0.2))
model_q1.add(LSTM(128, return_sequences = True))
model_q1.add(LSTM(128))
model_q1.add(Dense(60, activation = 'tanh'))
model_q1.add(Dense(2, activation = 'sigmoid'))

In [59]:
# Model for Q2
model_q2 = tf.keras.Sequential()
model_q2.add(Embedding(input_dim = len(word_index)+1,
                       output_dim = 100,
                      weights = [embedding_matrix],
                      input_length = 30))
model_q2.add(LSTM(128, activation = 'tanh', return_sequences = True))
model_q2.add(Dropout(0.2))
model_q2.add(LSTM(128, return_sequences = True))
model_q2.add(LSTM(128))
model_q2.add(Dense(60, activation = 'tanh'))
model_q2.add(Dense(2, activation = 'sigmoid'))

In [60]:
# Merging the output of the two models,i.e, model_q1 and model_q2
mergedOut = Multiply()([model_q1.output, model_q2.output])

mergedOut = Flatten()(mergedOut)
mergedOut = Dense(100, activation = 'relu')(mergedOut)
mergedOut = Dropout(0.2)(mergedOut)
mergedOut = Dense(50, activation = 'relu')(mergedOut)
mergedOut = Dropout(0.2)(mergedOut)
mergedOut = Dense(2, activation = 'sigmoid')(mergedOut)

In [61]:
new_model = Model([model_q1.input, model_q2.input], mergedOut)
new_model.compile(optimizer = 'adam', loss = 'sparse_categorical_crossentropy',
                 metrics = ['accuracy'])
history = new_model.fit([X_train_q1,X_train_q2],Y_train, batch_size = 2000, epochs = 30,validation_data=([X_test_q1, X_test_q2],Y_test))

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [62]:
y_pred = new_model.predict([X_test_q1, X_test_q2], batch_size=2000, verbose=1)
y_pred += new_model.predict([X_test_q1, X_test_q2], batch_size=2000, verbose=1)
y_pred /= 2



In [63]:
y_pred = np.argmax(y_pred,axis=1)

In [64]:
metrics(y_true=Y_test.values, y_pred=y_pred)

{'Accuracy': 0.7616315021395533,
 'Precision': 0.6755308460044209,
 'Recall': 0.6778464847425729,
 'F1 Score': 0.6766866843358942,
 'ROC AUC': 0.7441324318354953,
 'True Positive Rate': array([0.        , 0.67784648, 1.        ]),
 'False Positive Rate': array([0.        , 0.18958162, 1.        ]),
 'MCC': 0.4879173390047374}