In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Lambda, RepeatVector, TimeDistributed
from tensorflow.keras.layers import Concatenate
from tensorflow.keras.models import Model
import tensorflow.keras as keras
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import pad_sequences
import string



# Process and load the dataset

In [3]:
# Load the training CSV. The three text columns used throughout the notebook
# are complex_sentence, simple_sentence_1 and simple_sentence_2.
df = pd.read_csv("/kaggle/input/unveiling-complex-text-relations-through-splitti/train.csv")
# test_df = pd.read_csv("/kaggle/input/unveiling-complex-text-relations-through-splitti/test.csv")

# Cleanse data of punctuation.
# A single deletion table built from string.punctuation is applied to every
# text column: str.translate removes the characters in one pass per string,
# and looping over the column names keeps the three columns from drifting
# apart through copy-paste edits.
_PUNCT_TABLE = str.maketrans("", "", string.punctuation)
for _col in ("complex_sentence", "simple_sentence_1", "simple_sentence_2"):
    df[_col] = df[_col].apply(lambda text: text.translate(_PUNCT_TABLE))

In [4]:
# Pull the three cleaned text columns out of the frame as plain Python lists.
complex_list, simpl_sent1_list, simpl_sent2_list = (
    df[column].values.tolist()
    for column in ('complex_sentence', 'simple_sentence_1', 'simple_sentence_2')
)

# Longest sentence in each list, measured as the number of pieces obtained by
# splitting on a single space character.
max_len_complex = max(len(sentence.split(" ")) for sentence in complex_list)
max_len_simpl_sent1 = max(len(sentence.split(" ")) for sentence in simpl_sent1_list)
max_len_simpl_sent2 = max(len(sentence.split(" ")) for sentence in simpl_sent2_list)

# One common length, large enough for the longest sentence of any kind.
max_tok_size = max(max_len_complex, max_len_simpl_sent1, max_len_simpl_sent2)

In [5]:
# Build one string per row by joining the complex sentence and its two simple
# sentences with single spaces, so the tokenizer vocabulary covers all three.
new_list = [
    " ".join(row_sentences)
    for row_sentences in zip(complex_list, simpl_sent1_list, simpl_sent2_list)
]
fit_text = new_list

# Fit the word-index vocabulary on the combined text.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(fit_text)

In [6]:
# Define the Siamese neural network architecture
def create_siamese_network(max_sequence_length, embedding_dim, tokenizer_len):
    """Build and compile a two-input LSTM encoder/decoder model.

    Both inputs are sequences of shape ``(max_sequence_length, 1)``. Each one
    is encoded by its own ``LSTM(64)`` layer, the two encodings are
    concatenated, repeated ``max_sequence_length`` times, and decoded by an
    LSTM followed by a per-timestep ``Dense(1)``.

    NOTE(review): despite the function name, the two encoders are separate
    layer instances and therefore do not share weights.

    Args:
        max_sequence_length: Number of timesteps of both inputs and of the
            output sequence. It is also used as the unit count of the
            decoder LSTM.
        embedding_dim: Currently unused (no embedding layer is built); kept
            so existing callers keep working.
        tokenizer_len: Currently unused; kept so existing callers keep
            working.

    Returns:
        The compiled Keras ``Model`` (MSE loss, Adam optimizer) taking
        ``[input_a, input_b]`` and producing a sequence of shape
        ``(max_sequence_length, 1)`` per sample.
    """
    # One scalar feature per timestep for each of the two input sentences.
    input_a = Input(shape=(max_sequence_length, 1), name='input_a')
    input_b = Input(shape=(max_sequence_length, 1), name='input_b')

    # Encoder part: each input is reduced to a 64-dimensional vector.
    encoder_a = LSTM(64, activation='relu')(input_a)
    encoder_b = LSTM(64, activation='relu')(input_b)

    # Join the two encodings side by side into a single vector.
    merged_layer = Concatenate(axis=1, name='encoder_ab_output')([encoder_a, encoder_b])

    # Decoder part: feed the merged encoding to every timestep, then emit one
    # value per timestep.
    decoder1 = RepeatVector(max_sequence_length)(merged_layer)
    decoder1 = LSTM(max_sequence_length, activation='relu', return_sequences=True)(decoder1)
    output_layer = TimeDistributed(Dense(1))(decoder1)

    # Create the Siamese style autoencoder model
    siamese_model = Model(inputs=[input_a, input_b], outputs=output_layer)
    siamese_model.compile(loss='mse', optimizer='adam')

    # summary() prints the table itself and returns None, so wrapping it in
    # print() would add a stray "None" line to the output.
    siamese_model.summary()
    return siamese_model

In [7]:
# Build the model for sequences of max_tok_size timesteps; the size of the
# fitted vocabulary is passed through as tokenizer_len.
siamese_model = create_siamese_network(
    max_sequence_length=max_tok_size,
    embedding_dim=100,
    tokenizer_len=len(tokenizer.word_index),
)

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_a (InputLayer)        [(None, 51, 1)]              0         []                            
                                                                                                  
 input_b (InputLayer)        [(None, 51, 1)]              0         []                            
                                                                                                  
 lstm (LSTM)                 (None, 64)                   16896     ['input_a[0][0]']             
                                                                                                  
 lstm_1 (LSTM)               (None, 64)                   16896     ['input_b[0][0]']             
                                                                                              

In [8]:
# Turn every sentence into a list of integer word indices using the fitted
# tokenizer, keeping the three collections in the same order as their sources.
tok_complex_list, tok_simpl_sent1_list, tok_simpl_sent2_list = (
    tokenizer.texts_to_sequences(sentences)
    for sentences in (complex_list, simpl_sent1_list, simpl_sent2_list)
)

In [9]:
# Bring every index sequence to the common length max_tok_size so the three
# collections become rectangular arrays.
tok_complex_list, tok_simpl_sent1_list, tok_simpl_sent2_list = [
    pad_sequences(sequences, maxlen=max_tok_size)
    for sequences in (tok_complex_list, tok_simpl_sent1_list, tok_simpl_sent2_list)
]

In [10]:
# Sanity check: show the container type of the padded sequences.
print(type(tok_complex_list))

<class 'numpy.ndarray'>


In [11]:
# Inspect the vocabulary size (printed twice, once via the mapping and once via
# its keys) and preview the first padded sentence scaled by that size.
_index_to_word = tokenizer.index_word
print(len(_index_to_word))
print(len(_index_to_word.keys()))
print([token_id / len(_index_to_word) for token_id in tok_complex_list[0]])

598614
598614
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.83757813883404e-05, 4.009261393819724e-05, 0.017811143742044122, 3.1739986034406146e-05, 5.8468395326537636e-05, 0.0007433838834374071, 1.002315348454931e-05, 0.0004243134975125874, 4.009261393819724e-05, 0.002804812450093048, 1.002315348454931e-05, 0.0011342868693348302, 1.6705255807582181e-06, 0.008591513061839516, 0.04204211729094207, 6.6821023230328725e-06, 0.006130828881382661, 5.011576742274655e-06, 0.0003090472324402704, 3.3410511615164364e-05, 0.00015201782784899785, 0.02171182097311456, 1.002315348454931e-05, 1.6705255807582181e-06, 0.00022719147898311767, 5.011576742274655e-06, 0.0010741479484275343, 4.343366509971367e-05, 0.007465578820408477, 6.6821023230328725e-06, 0.0023387358130615055, 0.00041763139518955454, 1.002315348454931e-05, 7.35031255533616e-05, 0.019862549155215212, 0.019984497522610563, 5.011576742274655e-06, 6.849154881108694e-05, 0.009473550568479856, 0.002243515854958287]


In [12]:
# Scale the word indices by the number of entries in the tokenizer's
# index-to-word mapping.
# NOTE: the arrays are rebound to the same names, so running this cell a second
# time divides the already-scaled values again.
tok_max = len(tokenizer.index_word)
tok_complex_list, tok_simpl_sent1_list, tok_simpl_sent2_list = [
    padded / tok_max
    for padded in (tok_complex_list, tok_simpl_sent1_list, tok_simpl_sent2_list)
]

In [13]:
def _with_feature_axis(padded):
    """Reshape an array to (rows, padded.shape[1], 1), adding a size-1 last axis."""
    return padded.reshape(-1, padded.shape[1], 1)


# The model inputs are declared with one feature per timestep, so give each
# array a trailing axis of size 1.
tok_complex_list = _with_feature_axis(tok_complex_list)
tok_simpl_sent1_list = _with_feature_axis(tok_simpl_sent1_list)
tok_simpl_sent2_list = _with_feature_axis(tok_simpl_sent2_list)

In [14]:
# Ordered 80/20 split (no shuffling): val_per is the index of the first
# held-out row, tot the total number of rows.
tot = len(tok_complex_list)
val_per = int(.8 * tot)

_train_rows = slice(None, val_per)
_test_rows = slice(val_per, None)

tok_complex_list_train = tok_complex_list[_train_rows]
tok_complex_list_test = tok_complex_list[_test_rows]

tok_simpl_sent1_list_train = tok_simpl_sent1_list[_train_rows]
tok_simpl_sent1_list_test = tok_simpl_sent1_list[_test_rows]

tok_simpl_sent2_list_train = tok_simpl_sent2_list[_train_rows]
tok_simpl_sent2_list_test = tok_simpl_sent2_list[_test_rows]

In [15]:
# Train on the two simple sentences as inputs with the complex sentence as the
# target; 20% of the training rows are held back by Keras for validation.
siamese_model.fit(
    x=[tok_simpl_sent1_list_train, tok_simpl_sent2_list_train],
    y=tok_complex_list_train,
    epochs=1,
    batch_size=32,
    validation_split=0.2,
)



<keras.src.callbacks.History at 0x7b508a799ae0>

In [16]:
# siamese_model.evaluate([tok_simpl_sent1_list_test, tok_simpl_sent2_list_test], tok_complex_list_test, batch_size=32)

In [17]:
# Run the model over the full (train + held-out) set of simple-sentence pairs.
predictions = siamese_model.predict(
    x=[tok_simpl_sent1_list, tok_simpl_sent2_list],
)



In [18]:
# Show the shape of the prediction array followed by its (abbreviated) values.
print(predictions.shape, predictions, sep="\n")

(989944, 51, 1)
[[[ 0.00142158]
  [-0.00136959]
  [-0.0014879 ]
  ...
  [ 0.00382957]
  [ 0.00347358]
  [ 0.00312072]]

 [[ 0.00145989]
  [-0.00132661]
  [-0.00144749]
  ...
  [ 0.00397272]
  [ 0.00361276]
  [ 0.00325778]]

 [[ 0.00118401]
  [-0.00165574]
  [-0.00183484]
  ...
  [ 0.00206076]
  [ 0.0017772 ]
  [ 0.00149084]]

 ...

 [[ 0.00173203]
  [-0.00099463]
  [-0.00113886]
  ...
  [ 0.0037113 ]
  [ 0.00336881]
  [ 0.00304229]]

 [[ 0.00110889]
  [-0.00200516]
  [-0.00240583]
  ...
  [ 0.00387932]
  [ 0.00348037]
  [ 0.00309482]]

 [[ 0.00478513]
  [ 0.00237383]
  [ 0.00226031]
  ...
  [ 0.0073313 ]
  [ 0.0073956 ]
  [ 0.007451  ]]]


In [19]:
# Persist the trained model to a single file in the working directory.
siamese_model.save('merge_sentence.keras')

In [20]:
from tensorflow.keras.models import Sequential, load_model, model_from_json

# Export the model a second way, as two separate artifacts:
#   1. the architecture of `siamese_model`, serialized to a JSON string;
#   2. the trained weights, written to an HDF5 file.
model_json = siamese_model.to_json()

with open("model_merge.json", "w") as json_file:
    json_file.write(model_json)

siamese_model.save_weights("model_merge.h5")

In [21]:
# Undo the earlier division by tok_max so the predictions are back on the
# word-index scale.
# NOTE: `predictions` is rebound to the scaled result, so running this cell a
# second time multiplies by tok_max again.
predictions = predictions*tok_max

In [22]:
# Drop the trailing feature axis: (rows, timesteps, 1) -> (rows, timesteps).
predictions = predictions.reshape(-1, predictions.shape[1])

# Snap each predicted value to the NEAREST word index. A bare astype(np.int64)
# truncates toward zero, which would turn e.g. 818.9 into 818 and bias every
# recovered index downward.
predictions = np.rint(predictions).astype(np.int64)

# Map index sequences back to text using the fitted tokenizer.
output_texts = tokenizer.sequences_to_texts(predictions)

In [23]:
# Assemble the result table: the two input sentences next to the decoded
# prediction for each row.
dicty = dict(sent1=simpl_sent1_list, sent2=simpl_sent2_list, predict=output_texts)
df_out = pd.DataFrame(dicty)

In [24]:
# Write the result table to the working directory. index=False is not passed,
# so the DataFrame index is written as the first (unnamed) column.
df_out.to_csv("submission.csv")