In [1]:
!pip install tensorflow-text

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting tensorflow-text
  Downloading tensorflow_text-2.12.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.0/6.0 MB[0m [31m33.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tensorflow-text
Successfully installed tensorflow-text-2.12.0


In [2]:
import csv
import pandas as pd
from sklearn.model_selection import train_test_split
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
import numpy as np
from sklearn.metrics import confusion_matrix, classification_report


In [3]:
def extract_data(path):
  return pd.read_csv(path, sep = '\t', quoting=csv.QUOTE_NONE) 

In [4]:
def data_bal(path_train, path_test):
  df = extract_data(path_train)
  dft = extract_data(path_test)

  df_full = df.append(dft)

  data_1 = df_full[df_full['Quality']==1]
  data_2 = df_full[df_full['Quality']==0]

  df_bal = data_2.append(data_1[:1900])

  df_bal = df_bal.sample(frac = 1)
  df_bal.reset_index(inplace=True)

  return df_bal


In [5]:
def model_pipe():
  
  bert_preprocess = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3")
  bert_encoder = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4")

  input_text1 = tf.keras.layers.Input(shape=(), dtype=tf.string, name='input_text1')
  input_text2 = tf.keras.layers.Input(shape=(), dtype=tf.string, name='input_text2')
  preprocessed_text1 = bert_preprocess(input_text1)
  preprocessed_text2 = bert_preprocess(input_text2)
  out1 = bert_encoder(preprocessed_text1)
  out2 = bert_encoder(preprocessed_text2)
  concatenated = tf.keras.layers.Concatenate(axis=-1)([out1['pooled_output'], out2['pooled_output']])
  dense1 = tf.keras.layers.Dense(256, activation='relu')(concatenated)
  dense2 = tf.keras.layers.Dense(128, activation='relu')(dense1)
  dropout1 = tf.keras.layers.Dropout(0.2)(dense2)
  output = tf.keras.layers.Dense(1, activation='sigmoid')(dropout1)

  model = tf.keras.Model(inputs=[input_text1,input_text2], outputs=output)

  METRICS = [
      tf.keras.metrics.BinaryAccuracy(name='accuracy'),
      tf.keras.metrics.Precision(name='precision'),
      tf.keras.metrics.Recall(name='recall')
  ]

  model.compile(optimizer=tf.keras.optimizers.Adam(),
              loss='binary_crossentropy',
              metrics=METRICS)

  return model

In [6]:
def model_train(path_train, path_test):

  df_bal = data_bal(path_train, path_test)

  X_train, X_test, y_train, y_test = train_test_split(df_bal[['#1 String','#2 String']],df_bal['Quality'], stratify= df_bal['Quality'],test_size = 0.15)

  model = model_pipe()
  callback = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=3)
  history = model.fit([X_train['#1 String'], X_train['#2 String']], y_train, epochs=7, batch_size=32,callbacks=[callback], validation_split=0.15)

  return history,model,X_test,y_test

  

In [7]:
history,model,X_test,y_test = model_train('/content/drive/MyDrive/msrp_project/MSRParaphraseCorpus/msr_paraphrase_train.txt',
                      '/content/drive/MyDrive/msrp_project/MSRParaphraseCorpus/msr_paraphrase_test.txt')

  df_full = df.append(dft)
  df_bal = data_2.append(data_1[:1900])


Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7


In [9]:
y_predicted = model.predict([X_test['#1 String'], X_test['#2 String']])
y_predicted = y_predicted.flatten()
import numpy as np
y_predicted = np.where(y_predicted > 0.5, 1, 0)

from sklearn.metrics import confusion_matrix, classification_report

cm = confusion_matrix(y_test, y_predicted)
print(cm)
print(classification_report(y_test, y_predicted)) 

[[170 116]
 [122 163]]
              precision    recall  f1-score   support

           0       0.58      0.59      0.59       286
           1       0.58      0.57      0.58       285

    accuracy                           0.58       571
   macro avg       0.58      0.58      0.58       571
weighted avg       0.58      0.58      0.58       571

