In [1]:
#z import!
import numpy as np 
import pandas as pd 
import os

import tensorflow as tf
from sklearn.feature_extraction.text import CountVectorizer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model
from keras.callbacks import EarlyStopping,ModelCheckpoint,ReduceLROnPlateau


from keras.layers import Dense, Embedding, LSTM, Input, Lambda
from sklearn.model_selection import train_test_split
from keras.utils import to_categorical 

import keras.backend as K
from keras.optimizers import Adadelta
import re

**Loading Training Data**

The training data is loaded from the 'train.csv' file using Pandas. The dataset is then displayed to provide a quick overview of the initial rows.


In [2]:
train_data = pd.read_csv('./train.csv')
pd.set_option('display.max_colwidth',None)
print(f'shape{train_data.shape}')
train_data.head()

shape(16000, 4)


Unnamed: 0,ID,Sentence1,Sentence2,Class
0,0,"But instead of returning to heaven , Annie decided to join Chris in hell forever .","But instead of returning to Heaven , Chris chooses to join Annie forever in Hell .",0
1,1,"In 2016 , Bacardi announced new branding and plans to sell their version of Havana Club nationally , which will be burned in Florida and bottled in Puerto Rico .","In 2016 , Bacardi announced new branding and plans to sell their version of Havana Club nationally , which will be distilled in Puerto Rico and bottled in Florida .",0
2,2,"Kweku Baako Jnr had four children . One of them , Baako is a journalist and editor of the `` New Crusading Guide '' newspaper .","Baako had four children , one of whom was Kweku Baako Jnr , a journalist and editor of the newspaper `` New Crusading Guide '' .",0
3,3,"As with the Navy , the Army has a single-track system , where officers from other Navy communities transfer over to Foreign Area Officer permanently .","Like the army , the Navy has a single-track system , where officers from other Navy communities permanently transfer to Foreign Area Officer .",0
4,4,Sissi units have fewer crew served weapons and more sniper rifles than regular infantry .,Sissi units have more weapons served by the crew and fewer sniper rifles than regular infantry .,0


**Loading Test Data**

The test data is loaded from the 'test.csv' file using Pandas. The dataset is then displayed to offer an initial glimpse of the data structure.

In [3]:
test_data = pd.read_csv('./test.csv')
pd.set_option('display.max_colwidth',None)
print(f'shape{test_data.shape}')
test_data.head()

shape(4700, 3)


Unnamed: 0,ID,Sentence1,Sentence2
0,16000,How do I increase the rank on my website in SEO?,What are some tips on how to increase rank in search engines?
1,16001,I'm a boy. I masturbate. I'm 13. Is it bad to masturbate?,Why do I masturbate when I'm stressed?
2,16002,"The fourth season was premiered on 7 June 2010 , and like the third season was the system of competition for mixed couples .","The third season was premiered on June 7 , 2010 . Like the fourth season the system of the competition was in mixed couples ."
3,16003,"In 2003 , he moved to London and lived there for sixteen months before returning to South Africa in September 2004 .",He moved to South Africa in 2003 and lived there 16 months before returning to London in September 2004 .
4,16004,"Shawn told Shawn that his mother was not dead and his father was still married and on the day of the wedding of Colleen and Santo , Shawn told Colleen .","Stefano told Shawn that his mother was not dead and his father was still married and on the day of Colleen and Santo 's wedding , Shawn told Colleen ."


**Extracting Training Sentences (Column 1)**

The sentences from the first column of the training data are extracted and stored in the 'train_1' list. This list is then displayed using the print function, providing a sample of the sentences in the first column.


In [4]:
train_1 = train_data.iloc[:,1]
train_1 = list(train_1)
print(train_1[:10])

['But instead of returning to heaven , Annie decided to join Chris in hell forever .', 'In 2016 , Bacardi announced new branding and plans to sell their version of Havana Club nationally , which will be burned in Florida and bottled in Puerto Rico .', "Kweku Baako Jnr had four children . One of them , Baako is a journalist and editor of the `` New Crusading Guide '' newspaper .", 'As with the Navy , the Army has a single-track system , where officers from other Navy communities transfer over to Foreign Area Officer permanently .', 'Sissi units have fewer crew served weapons and more sniper rifles than regular infantry .', 'However , the United States later forced other nations and American companies to restrict trade with Cuba with foreign subsidiaries .', 'Deutsche Bahn opened a new underground tunnel to the new railway station Filderstadt on 29 September 2001 .', 'Some Hebrew and Aramaic abbreviations may not be included here ; more may be found in the List of Hebrew abbreviations an

**Extracting Training Sentences (Column 2)**

Similarly, sentences from the second column of the training data are extracted and stored in the 'train_2' list. The content of this list is displayed using the print function, presenting a sample of sentences from the second column.


In [5]:
train_2 = data.iloc[:,2]
train_2 = list(train_2)
print(train_2[:10])

NameError: name 'data' is not defined

In [59]:
full_train = train_1 + train_2
print(full_train[:100])

['But instead of returning to heaven , Annie decided to join Chris in hell forever .', 'In 2016 , Bacardi announced new branding and plans to sell their version of Havana Club nationally , which will be burned in Florida and bottled in Puerto Rico .', "Kweku Baako Jnr had four children . One of them , Baako is a journalist and editor of the `` New Crusading Guide '' newspaper .", 'As with the Navy , the Army has a single-track system , where officers from other Navy communities transfer over to Foreign Area Officer permanently .', 'Sissi units have fewer crew served weapons and more sniper rifles than regular infantry .', 'However , the United States later forced other nations and American companies to restrict trade with Cuba with foreign subsidiaries .', 'Deutsche Bahn opened a new underground tunnel to the new railway station Filderstadt on 29 September 2001 .', 'Some Hebrew and Aramaic abbreviations may not be included here ; more may be found in the List of Hebrew abbreviations an

**Text Tokenization Setup**

A Tokenizer is initialized with a vocabulary size of 5000 words. It is configured to filter out specific characters, convert text to lowercase, and split text based on predefined characters. This tokenizer will be used to convert textual data into numerical sequences for further processing.


In [60]:
num_words = 5000
tokenizer = Tokenizer(num_words=num_words, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
                                   lower=True,split=' ')


**Tokenization on Training Data**

The Tokenizer is fitted on the entire training dataset (`full_train`), extracting unique tokens and building a vocabulary. The number of unique tokens found is printed along with the dictionary mapping words to their respective indices. This information is crucial for the subsequent conversion of text data into sequences of numerical values.


In [62]:
tokenizer.fit_on_texts(full_train)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))
print(word_index) # print the mapping between unique word and index.

Found 29965 unique tokens.


**Text to Sequence Conversion and Padding for Sentence1**

The sentences from the 'Sentence1' column in the training data are converted into sequences of numerical values using the previously fitted tokenizer. The resulting sequences are then padded to a specified maximum length (`maxlen`) to ensure uniform dimensions. The printed output displays the original and padded sequences for the first sentence, providing insight into the preprocessing steps applied.


In [65]:
X_1 = tokenizer.texts_to_sequences(train_data['Sentence1'].values)
print(X_1[0])
maxlen = 60
X_1 = pad_sequences(X_1, maxlen=maxlen)
print("Padded Sequences: ")
print(X_1)
print(X_1[0])

[43, 601, 4, 2280, 7, 2339, 1891, 7, 2018, 557, 2, 4211]
Padded Sequences: 
[[   0    0    0 ...  557    2 4211]
 [   0    0    0 ...    2 1656 2340]
 [   0    0    0 ...   22   15  882]
 ...
 [   0    0    0 ...    4  287   23]
 [   0    0    0 ...  364   75    1]
 [   0    0    0 ... 1035    2 1074]]
[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0   43  601    4 2280    7 2339 1891    7
 2018  557    2 4211]


In [66]:
X_1.shape

(16000, 60)

**Text to Sequence Conversion and Padding for Sentence2**

Similar to 'Sentence1', the sentences from the 'Sentence2' column in the data are converted into sequences of numerical values using the pre-fitted tokenizer. The resulting sequences are then padded to a specified maximum length (`maxlen`) to ensure uniform dimensions. The printed output displays the original and padded sequences for the first sentence, offering insight into the preprocessing steps applied to 'Sentence2'.


In [67]:
X_2 = tokenizer.texts_to_sequences(train_data['Sentence2'].values)
print(X_2[0])
maxlen = 60
X_2 = pad_sequences(X_2, maxlen=maxlen)
print("Padded Sequences: ")
print(X_2)
print(X_2[0])

[43, 601, 4, 2280, 7, 557, 7, 2018, 2339, 4211, 2]
Padded Sequences: 
[[   0    0    0 ... 2339 4211    2]
 [   0    0    0 ...    3    2  463]
 [   0    0    0 ...  882   22   15]
 ...
 [   0    0    0 ...    4  287   23]
 [   0    0    0 ...  364   75    1]
 [   0    0    0 ... 1035    2 1074]]
[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0   43  601    4 2280    7  557    7
 2018 2339 4211    2]


In [68]:
X_2.shape

(16000, 60)

**Training Data Splitting**

The training data is split into training and validation sets using a specified portion (`training_portion`). The labels corresponding to the sentences are extracted from the fourth column of the dataset and stored in the variable 'y'. This step is crucial for training the model and evaluating its performance on unseen data during the training process.


In [69]:
training_portion = 0.8
y = list(train_data.iloc[:,3])

**Text Tokenization and Padding (Test Data - Sentence1)**

For the test data, the sentences from 'Sentence1' are tokenized using the previously fitted tokenizer. The resulting sequences are then padded to ensure uniform length, with a maximum length specified by 'maxlen'. This processing is essential to prepare the test data for input into the trained model, maintaining consistency with the training data format.


In [70]:
X_test1 = tokenizer.texts_to_sequences(test_data['Sentence1'].values)
print(X_t1[0])
maxlen = 60
X_test1 = pad_sequences(X_test1, maxlen=maxlen)
print("Padded Sequences: ")
print(X_test1)
print(X_test1[0])

[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0   36   44   31  852    1 1694
   10  139 1733    2]
Padded Sequences: 
[[   0    0    0 ...  139 1733    2]
 [   0    0    0 ...   17 1098    7]
 [   0    0    0 ...  989   11 1292]
 ...
 [   0    0    0 ...   22   87   48]
 [   0    0    0 ...  552   23  104]
 [   0    0    0 ...    8 2005    7]]
[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0   36   44   31  852    1 1694
   10  139 1733    2]


In [71]:
X_test1.shape

(4700, 60)

**Text Tokenization and Padding (Test Data - Sentence2)**

Similarly, for the test data, the sentences from 'Sentence2' are tokenized using the previously fitted tokenizer. The resulting sequences are then padded to ensure uniform length, with a maximum length specified by 'maxlen'. This preprocessing step ensures that the test data is formatted appropriately for input into the trained model, maintaining consistency with the training data.


In [76]:
X_test2 = tokenizer.texts_to_sequences(test_data['Sentence2'].values)
print(X_test2[0])
maxlen = 60
X_test2 = pad_sequences(X_test2, maxlen=maxlen)
print("Padded Sequences: ")
print(X_test2)
print(X_test2[0])

[24, 18, 89, 10, 36, 7, 852, 1694, 2, 2306]
Padded Sequences: 
[[   0    0    0 ... 1694    2 2306]
 [   0    0    0 ...   31   52 2430]
 [   0    0    0 ...    8    2 1292]
 ...
 [   0    0    0 ...   48    2  210]
 [   0    0    0 ...   10   81  552]
 [   0    0    0 ... 2005    7 2207]]
[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0   24   18   89   10   36    7
  852 1694    2 2306]


In [77]:
X_test2.shape

(4700, 60)

**Train-Validation Data Splitting**

The training dataset is split into training and validation sets to facilitate model training and evaluation. The split is performed based on the specified 'training_portion,' ensuring a portion of the data is reserved for validation. This division allows the model to learn from the training set and assess its performance on unseen data during validation, helping to prevent overfitting and ensure generalization.


In [78]:
training_size = int(len(X_1)*training_portion)

X_train1 = X_1[:training_size,:]
X_train2 = X_2[:training_size,:]
y_train  = y[:training_size]
X_val1   = X_1[training_size:,:]
X_val2   = X_2[training_size:,:]
y_val    = y[training_size:]


In [79]:
print(X_train1.shape)
print(X_train2.shape)
len(y_train)

(12800, 60)
(12800, 60)


12800

**Model Configuration Parameters**

The following parameters are crucial for configuring the Siamese LSTM model:

- `embedding_dim`: The dimensionality of the word embeddings. Adjusting this parameter can impact the model's ability to capture semantic relationships.

- `lstm_out`: The number of LSTM units in the output layer. This parameter determines the complexity of the LSTM layer and influences the model's learning capacity.

- `gradient_clipping_norm`: The normalization value for gradient clipping. This technique helps stabilize training by preventing exploding gradients.

- `batch_size`: The number of samples used in each iteration during training. It affects the model's training speed and memory consumption.

- `n_epoch`: The number of training epochs. An epoch represents one complete pass through the entire training dataset. Adjust this parameter based on training convergence.


In [81]:
embedding_dim = 40 #Change to observe effects
lstm_out = 256
gradient_clipping_norm = 2.50
batch_size = 128
n_epoch = 5


**Callback Configuration**

The code sets up callbacks to monitor the model during training:

- `ReduceLROnPlateau`: This callback dynamically adjusts the learning rate when a monitored metric plateaus. It helps improve convergence and training efficiency.

- `EarlyStopping`: Monitors the validation loss and stops training when the loss stops decreasing, preventing overfitting.

- `ModelCheckpoint`: Saves the model's weights during training based on the best validation loss. The saved model can be used for further analysis or deployment.

These callbacks collectively enhance the training process, ensuring optimal model performance.


In [82]:
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2,
                              patience=5, min_lr=0.001)

earlystop = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=5)

modelcheckpoint = ModelCheckpoint("weights.{epoch:02d}-{val_loss:.2f}.h5", monitor='val_loss', verbose=0, save_best_only=True, save_weights_only=False, mode='auto',  save_freq='epoch')

callbacks = [earlystop,modelcheckpoint,reduce_lr]

**Siamese LSTM Model Overview**

This code defines a Siamese LSTM model for paraphrase detection. It comprises:

- **Inputs**: Two sequences processed by shared embedding and LSTM layers.

- **Outputs**: Manhattan distance measures similarity between LSTM outputs.

- **Compilation**: Adadelta optimizer, mean squared error loss, and accuracy metric.

- **Summary**: Model architecture is summarized for quick reference.

The Siamese LSTM detects paraphrases by learning sentence pair similarity.


In [83]:
def exponent_neg_manhattan_distance(left, right):
    ''' Helper function for the similarity estimate of the LSTMs outputs'''
    return K.exp(-K.sum(K.abs(left-right), axis=1, keepdims=True))



left_input = Input(shape=(maxlen,), dtype='int32')
right_input = Input(shape=(maxlen,), dtype='int32')

embedding_layer = Embedding(num_words, embedding_dim, input_length=maxlen, trainable=False)

# Embedded version of the inputs
encoded_left = embedding_layer(left_input)
encoded_right = embedding_layer(right_input)

# Since this is a siamese network, both sides share the same LSTM
shared_lstm = LSTM(lstm_out)

left_output = shared_lstm(encoded_left)
right_output = shared_lstm(encoded_right)

malstm_distance = Lambda(function=lambda x: exponent_neg_manhattan_distance(x[0], x[1]),output_shape=lambda x: (x[0][0], 1))([left_output, right_output])


malstm = Model([left_input, right_input], [malstm_distance])

# Adadelta optimizer, with gradient clipping by norm
optimizer = Adadelta(clipnorm=gradient_clipping_norm,learning_rate=1.0,rho=0.95)

malstm.compile(loss='mean_squared_error', optimizer=optimizer, metrics=['accuracy'])

# Use tf.compat.v1.executing_eagerly_outside_functions instead of tf.executing_eagerly_outside_functions
# tf.compat.v1.executing_eagerly_outside_functions

print(malstm.summary())

 


Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_1 (InputLayer)        [(None, 60)]                 0         []                            
                                                                                                  
 input_2 (InputLayer)        [(None, 60)]                 0         []                            
                                                                                                  
 embedding (Embedding)       (None, 60, 40)               200000    ['input_1[0][0]',             
                                                                     'input_2[0][0]']             
                                                                                                  
 lstm (LSTM)                 (None, 256)                  304128    ['embedding[0][0]',      

**Training the Siamese LSTM Model**

The code trains the Siamese LSTM model using the fit() function. It takes training inputs (X_train1, X_train2), labels (y_train), and other parameters like batch size, epochs, and validation data.

Callbacks, including early stopping, model checkpointing, and learning rate reduction, are employed during training.

The training progress is stored in the malstm_trained variable.


In [84]:



malstm_trained = malstm.fit([X_train1, X_train2], np.array(y_train), batch_size=batch_size, epochs=n_epoch,
                            validation_data=([X_val1, X_val2], np.array(y_val)), callbacks=callbacks)

Epoch 1/5


Epoch 2/5


  saving_api.save_model(


Epoch 3/5
Epoch 4/5
Epoch 5/5


**Loading Pre-trained Siamese LSTM Model Weights**

The code loads pre-trained weights for a Siamese LSTM model from the file "weights.01-0.26.h5". After successful loading, it prints "Loaded model from disk".


In [85]:
malstm.load_weights("weights.01-0.26.h5")
print("Loaded model from disk")

Loaded model from disk


**Model Evaluation on Validation Data**

The code evaluates the Siamese LSTM model on the validation data ([X_val1, X_val2], np.array(y_val)) using the pre-defined loss function. The batch size for evaluation is set to 'batch_size'. The 'earlystop' callback is used during evaluation.


In [86]:
loss = malstm.evaluate([X_val1,X_val2], np.array(y_val), batch_size = batch_size, callbacks=[earlystop])
print(loss)

[0.26060256361961365, 0.5712500214576721]


**Model Prediction on Test Data**

The code predicts the output using the trained Siamese LSTM model on the test data ([X_test1, X_test2]). The predictions are stored in the 'y_pred' variable.


In [107]:
y_pred = malstm.predict([X_test1,X_test2])
y_pred



array([[0.3727825 ],
       [0.29832506],
       [0.38921458],
       ...,
       [0.39701924],
       [0.5299454 ],
       [0.55466247]], dtype=float32)

**Thresholding Predictions**

The code applies a threshold of 0.5 to the predicted values in 'y_pred', creating binary predictions. The results are stored in 'y_out', where True indicates paraphrase and False indicates non-paraphrase.


In [109]:
y_out = y_pred>0.5
y_out

array([[False],
       [False],
       [False],
       ...,
       [False],
       [ True],
       [ True]])

 **Binary Classification:**
   
   For further analysis, we convert the predicted probabilities into binary classifications using a threshold of 0.5.


In [113]:
y_out = [1 if p>= .5 else 0 for p in y_pred]

y_out

[0,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 0,
 0,
 1,


In [114]:
np.max(y_pred)

1.0

In [112]:
np.min(y_pred)

0.061577827

In [115]:

len(y_pred)

4700

In [116]:
test_data.shape

(4700, 4)

In [117]:



test_data['Class'] = y_out

In [102]:
test_data_2 = test_data[['ID', 'Class']]

In [103]:




test_data_2.index = test_data_2.index + 1

In [104]:
test_data_2.to_csv("submission_1.csv",index=False)

In [106]:
test_data_2.head()

Unnamed: 0,ID,Class
1,16000,0
2,16001,0
3,16002,0
4,16003,1
5,16004,1


In [71]:
from IPython.display import HTML
import pandas as pd
import numpy as np
import base64
def create_download_link(df, title = "Download CSV file", filename = "data.csv"):
  csv = df.to_csv(index=False)
  b64 = base64.b64encode(csv.encode())
  payload = b64.decode()
  html = '<a download="{filename}" href="data:text/csv;base64,{payload}"target="_blank">{title}</a>'
  html = html.format(payload=payload,title=title,filename=filename)
  return HTML(html)
create_download_link(test_data_2)