In [4]:
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd

In [5]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import re

nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## Dataset Parameters

In [6]:
num_train_docs = 287113 + 13368
num_test_docs = 11490

num_neg_ex = 1

docs_per = 0.4

num_train_docs = int(2 * num_neg_ex * num_train_docs * docs_per)
num_test_docs = int(2 * num_neg_ex * num_test_docs * docs_per)

print(f"Number of training documents used:\t{num_train_docs}")
print(f"Number of testing documents used:\t{num_test_docs}")

data_path = '../data'

Number of training documents used:	240384
Number of testing documents used:	9192


In [7]:
embedding_size = 100 

## Loading Dataset

In [8]:
df = pd.read_csv(f"{data_path}/train.csv", nrows=num_train_docs)
train_df, val_df = train_test_split(df, test_size=0.15, random_state=42)

print(f"{len(train_df)},  {len(val_df)}")

# Extract input, summary, and label for training set
train_docs_input = train_df['input'].to_list()
train_docs_summary = train_df['summary'].to_list()
train_y = train_df['label'].to_list()

# Extract input, summary, and label for validation set
val_docs_input = val_df['input'].to_list()
val_docs_summary = val_df['summary'].to_list()
val_y = val_df['label'].to_list()
print(f"{len(train_docs_input)},  {len(val_docs_input)}")

204326,  36058
204326,  36058


### Preprocessing and Tokenization

In [9]:
def preprocess_text(text):
    text = text.lower() # Convert to lowercase
    text = re.sub(r'[^\w\s]', '', text) # Remove punctuation and special characters using regex
    tokens = word_tokenize(text) # Tokenize the text
    stop_words = set(stopwords.words('english')) # Remove stop words
    tokens = [token for token in tokens if token not in stop_words]

    return tokens

In [10]:
tokenized_data_input = [preprocess_text(sentence) for sentence in train_docs_input]
tokenized_data_summary = [preprocess_text(sentence) for sentence in train_docs_summary]

In [11]:
tokenized_data_input_val = [preprocess_text(sentence) for sentence in val_docs_input]
tokenized_data_summary_val = [preprocess_text(sentence) for sentence in val_docs_summary]

### Generate Word Embeddings

In [12]:
all_tokenized_data = tokenized_data_input + tokenized_data_summary

word2vec_model = Word2Vec(sentences=all_tokenized_data, vector_size=embedding_size, window=5, min_count=1, workers=4) # Train the Word2Vec model

### Generate Doc Embeddings

In [13]:
def get_word_vector(word):
    try:
        return word2vec_model.wv[word]
    except KeyError:
        # Handle out-of-vocabulary words
        return np.zeros(embedding_size)

#### Training docs

In [14]:
# Get document embeddings for all documents
document_embeddings_input = []

for doc in tokenized_data_input:
    document_embedding = np.mean([get_word_vector(word) for word in doc], axis=0)
    document_embeddings_input.append(document_embedding)

document_embeddings_summary = []

for doc in tokenized_data_summary:
    document_embedding = np.mean([get_word_vector(word) for word in doc], axis=0)
    document_embeddings_summary.append(document_embedding)

In [15]:
document_embeddings_input = np.array(document_embeddings_input)
document_embeddings_summary = np.array(document_embeddings_summary)

#### Validation docs

In [16]:
# Get document embeddings for all documents
document_embeddings_input_val = []

for doc in tokenized_data_input_val:
    document_embedding = np.mean([get_word_vector(word) for word in doc], axis=0)
    document_embeddings_input_val.append(document_embedding)

document_embeddings_summary_val = []

for doc in tokenized_data_summary_val:
    document_embedding = np.mean([get_word_vector(word) for word in doc], axis=0)
    document_embeddings_summary_val.append(document_embedding)

In [17]:
document_embeddings_input_val = np.array(document_embeddings_input_val)
document_embeddings_summary_val = np.array(document_embeddings_summary_val)

## Siamese network

In [18]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Lambda
from tensorflow.keras.optimizers import Adam
import tensorflow.keras.backend as K
import numpy as np




In [44]:
class SiameseNetwork:
    def __init__(self, embedding_size, hidden_layers, learning_rate, num_of_epochs, batch_size):
        self.embedding_size = embedding_size
        self.hidden_layers = hidden_layers
        self.learning_rate = learning_rate
        self.num_of_epochs = num_of_epochs
        self.batch_size = batch_size
        self.model = self.build_model()

    def euclidean_distance(self, vects):
        x, y = vects
        sum_square = K.sum(K.square(x - y), axis=1, keepdims=True)
        return K.sqrt(K.maximum(sum_square, K.epsilon()))

    def eucl_dist_output_shape(self, shapes):
        shape1, shape2 = shapes
        return (shape1[0], 1)

    def build_model(self):
        input_a = Input(shape=(self.embedding_size,))
        input_b = Input(shape=(self.embedding_size,))
        shared_layer = Dense(self.hidden_layers, activation='relu')
        encoded_a = shared_layer(input_a)
        encoded_b = shared_layer(input_b)
        
        distance = Lambda(self.euclidean_distance, output_shape=self.eucl_dist_output_shape)([encoded_a, encoded_b])
        output = Dense(1, activation='sigmoid')(distance)
        model = Model(inputs=[input_a, input_b], outputs=output)
        model.compile(optimizer=Adam(learning_rate=self.learning_rate), loss='binary_crossentropy', metrics=['accuracy'])
        return model

    def fit(self, document_embeddings_input, document_embeddings_summary, train_y):
        # training
        input_data = [np.array(document_embeddings_input), np.array(document_embeddings_summary)]
        labels_array = np.array(train_y)
        self.model.fit(input_data, labels_array, epochs=self.num_of_epochs, batch_size=self.batch_size)

    def evaluate(self, document_embeddings, summary_embeddings, labels):

        input_data = [np.array(document_embeddings), np.array(summary_embeddings)]
        labels = np.array(labels)

        #predictions = self.model.predict(input_data)
        predictions = self.model.evaluate(input_data, labels)
        return predictions
    
    #def predict(self, document_embeddings, summary_embeddings):

        input_data = [np.array(document_embeddings), np.array(summary_embeddings)]

        predictions = self.mpdel.predict(document_embeddings, summary_embeddings)

        return predictions

## Model Hyperparameters and Training

In [20]:
num_of_epochs = 20
learning_rate = 0.01
batch_size = 40
hidden_layers = 64
# for embedding_size check above  

In [45]:
siamese_net = SiameseNetwork(embedding_size, hidden_layers, learning_rate, num_of_epochs, batch_size)
siamese_net.fit(document_embeddings_input, document_embeddings_summary, train_y)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [23]:
import pickle
with open('siamese_net.pkl', 'wb') as outp:
    pickle.dump(siamese_net, outp, pickle.HIGHEST_PROTOCOL)

## Evaluate the model (val set)

In [24]:
loss, accuracy = siamese_net.evaluate(document_embeddings_input_val, document_embeddings_summary_val, val_y)

print(f'Val Loss: {loss:.4f}')
print(f'Val Accuracy: {accuracy * 100:.2f}%')

Val Loss: 0.1056
Val Accuracy: 96.05%


## Evaluate the model (test set)

In [25]:
test_df = pd.read_csv(f"{data_path}/test.csv", nrows=num_train_docs)

print(f"{len(test_df)}")

# Extract input, summary, and label for testing set
test_docs_input = test_df['input'].to_list()
test_docs_summary = test_df['summary'].to_list()
test_y = test_df['label'].to_list()

22980


In [26]:
tokenized_data_input_test = [preprocess_text(sentence) for sentence in test_docs_input]
tokenized_data_summary_test = [preprocess_text(sentence) for sentence in test_docs_summary]

In [27]:
# Get document embeddings for all documents
document_embeddings_input_test = []

for doc in tokenized_data_input_test:
    document_embedding = np.mean([get_word_vector(word) for word in doc], axis=0)
    document_embeddings_input_test.append(document_embedding)

document_embeddings_summary_test = []

for doc in tokenized_data_summary_test:
    document_embedding = np.mean([get_word_vector(word) for word in doc], axis=0)
    document_embeddings_summary_test.append(document_embedding)

In [28]:
document_embeddings_input_test = np.array(document_embeddings_input_test)
document_embeddings_summary_test = np.array(document_embeddings_summary_test)

In [29]:
loss, accuracy = siamese_net.evaluate(document_embeddings_input_test, document_embeddings_summary_test, test_y)

print(f'Test Loss: {loss:.4f}')
print(f'Test Accuracy: {accuracy * 100:.2f}%')

Test Loss: 0.1230
Test Accuracy: 95.41%


In [47]:
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score

predictions = siamese_net.model.predict([np.array(document_embeddings_input_test), np.array(document_embeddings_summary_test)], 1)
#predictions = siamese_net.predict(document_embeddings_input_test, document_embeddings_summary_test)

# Convert predicted probabilities to binary predictions (0 or 1)
binary_predictions = (predictions > 0.5).astype(int)

# Calculate F1 score
f1 = f1_score(test_y, binary_predictions)
ac = accuracy_score(test_y, binary_predictions)
precision = precision_score(test_y, binary_predictions)
recall = recall_score(test_y, binary_predictions)

print(f'Loss: {loss}, Accuracy: {ac}, F1 Score: {f1}')
print(f'precision: {precision}, recall: {recall}')

Loss: 0.12302155047655106, Accuracy: 0.9604873803307223, F1 Score: 0.9610367318915207
precision: 0.9478584729981379, recall: 0.9745865970409051
