In [1]:
import csv
import pandas as pd
import sklearn
from sklearn.model_selection import train_test_split
import tensorflow as tf
import numpy as np
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.feature_extraction.text import CountVectorizer
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
nltk.download('omw-1.4')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [2]:
def extract_data(path):
  return pd.read_csv(path, sep = '\t', quoting=csv.QUOTE_NONE) 

In [3]:
def data_bal(path_train, path_test):
  df = extract_data(path_train)
  dft = extract_data(path_test)

  df_full = df.append(dft)

  data_1 = df_full[df_full['Quality']==1]
  data_2 = df_full[df_full['Quality']==0]

  df_bal = data_2.append(data_1[:1900])

  df_bal = df_bal.sample(frac = 1)
  df_bal["merge"] = df_bal[["#1 String", "#2 String"]].apply("-".join, axis=1)
  df_bal.reset_index(inplace=True)

  return df_bal


In [4]:
def preproc(data_col):
  import re

  lemm = WordNetLemmatizer()
  corpus = []
  for i in range(len(data_col)):
    review = re.sub("[^a-zA-Z0-9]"," ",data_col[i]).lower().split()
    review = [lemm.lemmatize(word) for word in review if word not in set(stopwords.words('english'))]
    corpus.append(" ".join(review))
  return corpus


In [5]:
def cv_data(df_bal):
  string_merge = preproc(df_bal['merge'])

  cv = CountVectorizer(max_features=2500,binary=True)
  X = cv.fit_transform(string_merge).toarray()
  y = df_bal['Quality']
  
  from sklearn.model_selection import train_test_split
  X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.15, random_state=42)

  return X_train, X_test, y_train, y_test
    
  



In [6]:
def model_pipe(inp_shape):
  input_text = tf.keras.layers.Input(shape=(inp_shape), name='input_text')
  dense1 = tf.keras.layers.Dense(256, activation='relu')(input_text)
  dropout1 = tf.keras.layers.Dropout(0.3)(dense1)
  dense2 = tf.keras.layers.Dense(128, activation='relu')(dropout1)
  dropout2 = tf.keras.layers.Dropout(0.3)(dense2)
  dense3 = tf.keras.layers.Dense(64, activation='relu')(dropout2)
  dropout3 = tf.keras.layers.Dropout(0.3)(dense3)
  dense4 = tf.keras.layers.Dense(32, activation='relu')(dropout3)
  dropout4 = tf.keras.layers.Dropout(0.3)(dense4)
  dense5 = tf.keras.layers.Dense(16, activation='relu')(dropout4)
  dropout5 = tf.keras.layers.Dropout(0.3)(dense5)

  output = tf.keras.layers.Dense(1, activation='sigmoid')(dropout5)
  model_nn = tf.keras.Model(inputs=[input_text], outputs=output)



  METRICS = [
        tf.keras.metrics.BinaryAccuracy(name='accuracy'),
        tf.keras.metrics.Precision(name='precision'),
        tf.keras.metrics.Recall(name='recall')
  ]

  model_nn.compile(optimizer=tf.keras.optimizers.Adam(),
                loss='binary_crossentropy',
                metrics=METRICS)

  return model_nn

In [7]:
def model_train(train_path,test_path,model_save_path):
  df_bal = data_bal(train_path, test_path)
  X_train, X_test, y_train, y_test = cv_data(df_bal)
  model = model_pipe(X_train.shape[1])

  callback = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=3)

  history = model.fit(X_train, y_train, epochs=5, batch_size=32,callbacks=[callback],
                       validation_split=0.15)
  print(X_train.shape)
  model.save('model_save_path')

  return history,model,X_test,y_test


In [8]:
history,model,X_test,y_test = model_train('/content/drive/MyDrive/msrp_project/MSRParaphraseCorpus/msr_paraphrase_train.txt',
                      '/content/drive/MyDrive/msrp_project/MSRParaphraseCorpus/msr_paraphrase_test.txt',
                      '/content/drive/MyDrive/msrp_project/MSRParaphraseCorpus/saved_model_nn')

  df_full = df.append(dft)
  df_bal = data_2.append(data_1[:1900])


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
(3230, 2500)


In [9]:
y_predicted = model.predict(X_test)
y_predicted = y_predicted.flatten()
y_predicted = np.where(y_predicted > 0.5, 1, 0)

print(confusion_matrix(y_test, y_predicted))
print(classification_report(y_test, y_predicted)) 

[[173 115]
 [104 179]]
              precision    recall  f1-score   support

           0       0.62      0.60      0.61       288
           1       0.61      0.63      0.62       283

    accuracy                           0.62       571
   macro avg       0.62      0.62      0.62       571
weighted avg       0.62      0.62      0.62       571

