# Mount Drive

In [None]:
from google.colab import drive
drive.mount('/content/drive')

# Install Hazm

In [None]:
!mkdir resources
!wget -q "https://github.com/sobhe/hazm/releases/download/v0.5/resources-0.5.zip" -P resources
!unzip -qq resources/resources-0.5.zip -d resources

!rm -rf /content/4ccae468eb73bf6c4f4de3075ddb5336
!rm -rf /content/preproc
!rm preprocessing.py utils.py
!mkdir -p /content/preproc
!git clone https://gist.github.com/4ccae468eb73bf6c4f4de3075ddb5336.git /content/preproc/
!mv /content/preproc/* /content/
!rm -rf /content/preproc

!pip install hazm

# Import

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from __future__ import unicode_literals
from hazm import *
from sklearn.preprocessing import MultiLabelBinarizer, LabelEncoder
from keras.utils import np_utils
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from keras import Sequential, Model
from keras.layers import Embedding, GlobalAveragePooling1D, Dense, concatenate, Input, Flatten, LSTM, SimpleRNN, GRU, Dropout, MaxPooling1D, Conv1D
from keras.utils.vis_utils import plot_model
from sklearn.model_selection import train_test_split
import re
import tensorflow as tf
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score




# Ready Dataset

In [None]:
path = '/content/drive/MyDrive/DL/Project 4/Dataset/PerSICK.csv'
df = pd.read_csv(path)

In [None]:
# Drop incomplete rows and renumber the remaining ones 0..n-1 (later cells
# index sentence1[i] / sentence2[i] with i in range(len(...))).
# drop=True discards the old index instead of inserting it as an extra
# 'index' column, which also keeps this cell safe to re-run.
df = df.dropna().reset_index(drop=True)

In [None]:
df.head(10)

In [None]:

# Round the score column to whole numbers; it is cast to int and used as a
# class label when the target `y` is built further down.
df['score'] = df['score'].round()

# The two sentence columns of each pair, used throughout the notebook.
sentence1 = df['sentence1']
sentence2 = df['sentence2']


In [None]:
print(len(sentence1))

# Extract Subject

In [None]:
word_tokenizer = WordTokenizer()
sentence_tokenizer = SentenceTokenizer()

# Split every sentence of both columns into its list of word tokens.
lexical_tokens_sent1 = list(map(word_tokenizer.tokenize, sentence1))
lexical_tokens_sent2 = list(map(word_tokenizer.tokenize, sentence2))


In [None]:
print(lexical_tokens_sent1[0])

In [None]:
print(lexical_tokens_sent2[0])

In [None]:
# Load the POS tagger model. Fail immediately with a clear message: the
# previous bare `except:` only printed a note and carried on, so the next
# line crashed with an unrelated NameError on `tagger`.
try:
    tagger = POSTagger(model='resources/postagger.model')
except Exception as exc:
    raise RuntimeError(
        'Instantiating POSTagger failed; check that resources/postagger.model '
        'was downloaded and unzipped by the install cell.'
    ) from exc

# Tag every tokenised sentence. Downstream code reads item[0] of each tagged
# token as the word and item[1] as its tag.
pos_tagged_sent1 = [tagger.tag(lexical_token) for lexical_token in lexical_tokens_sent1]
pos_tagged_sent2 = [tagger.tag(lexical_token) for lexical_token in lexical_tokens_sent2]



In [None]:
pos_tagg_sent1 = []
pos_tagg_sent2 = []
# Convert each tagged sentence into a string made of its POS tags only
def convert_to_tag(tagged_sentencs):
  """Collapse each tagged sentence into one string of POS tags.

  Args:
    tagged_sentencs: list of sentences, each a list of (word, tag) items
      (only index 1, the tag, is read).

  Returns:
    A list with one string per sentence: the sentence's tags joined in order.
    Tags listed in `change_tag` are replaced by their one-letter code; every
    other tag is copied unchanged.

  Note:
    Unmapped tags longer than one character occupy several characters of the
    result, so a character offset in the returned string equals a token
    position only while all preceding tags were emitted as single characters.
  """
  change_tag = {'AJ': 'A', 'PRO': 'R', 'Ne': 'N', 'AJe': 'A', 'NUM': 'U', 'CONJ': 'C', 'DET': 'D'}
  # The former debug print of len(tagged_sentencs[0]) was removed: it raised
  # IndexError for an empty input list.
  return [
      ''.join(change_tag.get(each[1], each[1]) for each in tagged_sentence)
      for tagged_sentence in tagged_sentencs
  ]

pos_tagg_sent1 = convert_to_tag(pos_tagged_sent1)
pos_tagg_sent2 = convert_to_tag(pos_tagged_sent2)



In [None]:
print(len(pos_tagg_sent1[7]))
print(len(pos_tagged_sent1[7]))
print(pos_tagg_sent1[7])
print(pos_tagged_sent1[7])


In [None]:
subj_pos_sent1 = []
subj_pos_sent2 = []
SBJ_pattern = '^(?!P)(P|U(P(A)?)?)?((N((A)|(N)+)*(A|N|R))|(N(A)?)|(R))(C)?'
# Find the subject span in each tags-only sentence string

def find_pos(pattern, tag_sentence):
  """Run `re.search(pattern, ...)` over every tag string.

  Returns a list aligned with `tag_sentence` whose entries are the match
  object for that string, or None where the pattern did not match.
  """
  return [re.search(pattern, tags) for tags in tag_sentence]

subj_pos_sent1 = find_pos(SBJ_pattern, pos_tagg_sent1)
subj_pos_sent2 = find_pos(SBJ_pattern, pos_tagg_sent2)

In [None]:
print(subj_pos_sent1[9])

In [None]:
subj_index_sent1 = []
subj_index_sent2 = []

def find_pos_index(subj_pos):
  """Turn a list of match objects into their (start, end) spans.

  Entries that are None (no match) stay None, so the result remains aligned
  with the input list.
  """
  return [None if match is None else match.span() for match in subj_pos]

subj_index_sent1 = find_pos_index(subj_pos_sent1)
subj_index_sent2 = find_pos_index(subj_pos_sent2)

print(len(subj_index_sent1))

In [None]:
print(pos_tagged_sent1[4])

In [None]:
subj_sent1 = []
subj_sent2 = []
# Collect the subject words of each sentence from their index spans
def find_words(words_index, tag_sentences):
  """Pick the words covered by each (start, end) span.

  Args:
    words_index: one (start, end) pair or None per sentence.
    tag_sentences: the tagged sentences; item[0] of each token is its word.

  Returns:
    One list of words per sentence: the words at positions start..end-1, or
    ['NA'] where the span is None.
  """
  words = []
  for i, span in enumerate(words_index):
    if span is None:
      words.append(['NA'])
    else:
      tagged = tag_sentences[i]
      words.append([tagged[j][0] for j in range(span[0], span[1])])
  return words

subj_sent1 = find_words(subj_index_sent1, pos_tagged_sent1)
subj_sent2 = find_words(subj_index_sent2, pos_tagged_sent2)

print(len(subj_sent1))
print(len(subj_sent2))

In [None]:
print(subj_sent1[2])
print(subj_sent2[2])


In [None]:
str_subj_sent1 = []
str_subj_sent2 = []
#convert the subject word to string
def convert_to_string(sentence):
  """Join each list of words into one string.

  Every word is prefixed with a single space, so a non-empty result starts
  with a space and an empty word list yields ''.
  """
  return [''.join(' ' + word for word in words) for words in sentence]

str_subj_sent1 = convert_to_string(subj_sent1)
str_subj_sent2 = convert_to_string(subj_sent2)


In [None]:
print(len(str_subj_sent1))
print(len(str_subj_sent2))

In [None]:
print(str_subj_sent1[7])
print(str_subj_sent1[7])

In [None]:

subj_class_sent1 = []
subj_class_sent2 = []
#classify every sentence based on their subject
def find_subject_class(sentence):
  """Assign a subject class id to every subject string.

  The keyword groups are tried in order and the first group with a keyword
  occurring anywhere in the string (substring test) decides the class:
  6, 7, 8, 9, or 11 for the 'NA' placeholder. Strings matching no group
  get class 10.

  NOTE(review): the original condition for class 7 tested the same keyword
  'زن' twice; the duplicate was redundant and may have been meant to be a
  different word - confirm the intended keyword list.
  """
  keyword_groups = (
      (('پسر', 'مرد'), 6),
      (('زن',), 7),
      (('کودک',), 8),
      (('سگ', 'گربه'), 9),
      (('NA',), 11),
  )
  subj_class = []
  for text in sentence:
    label = 10
    for keywords, class_id in keyword_groups:
      if any(keyword in text for keyword in keywords):
        label = class_id
        break
    subj_class.append(label)
  return subj_class

subj_class_sent1 = find_subject_class(str_subj_sent1)
subj_class_sent2 = find_subject_class(str_subj_sent2)


In [None]:
print(subj_class_sent1[:50])
print(subj_class_sent2[:50])

In [None]:
# One subject class per sentence pair: the shared class when both sentences
# have the same one, otherwise 11.
subj_class_sent = [
    class1 if class1 == class2 else 11
    for class1, class2 in zip(subj_class_sent1, subj_class_sent2)
]



In [None]:
print(subj_class_sent)

# Preprocess Y

In [None]:
#output of each row
y = [[int(y1), y2] for y1, y2 in zip(df['score'], subj_class_sent)]


In [None]:
print(len(y))

3101


In [None]:
print(y[7])
print(sentence1[7])
print(sentence2[7])

In [None]:
# Encode each [score, subject class] pair as a multi-hot row:
# one column per distinct label value seen in y.
mlb = MultiLabelBinarizer().fit(y)
y_OH = mlb.transform(y)
print(y_OH[7])

# Implement with one hot encoding

In [None]:
# Build the word vocabulary (the 1500 most frequent words, everything else
# mapped to 'UNK') from BOTH sentence columns. It was previously fitted on
# sentence1 only, although the same tokenizer also encodes sentence2, so
# words occurring only in sentence2 could never get their own index.
tk = Tokenizer(
    num_words=1500,
    filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
    split=' ',
    oov_token='UNK'
)
tk.fit_on_texts(list(sentence1) + list(sentence2))

In [None]:
#map each word to its vocabulary index
X_cop1 = tk.texts_to_sequences(sentence1)
X_cop2 = tk.texts_to_sequences(sentence2)
X_cop1[0]
X_cop2[0]

In [None]:
max_len = max([len(sentence.split()) for sentence in sentence1])
max_len

In [None]:
# Fix every index sequence to exactly 20 entries, padding at the end.
# NOTE(review): 20 is hard-coded; the max_len computed in the previous cell
# is not used here. Confirm no sentence exceeds 20 tokens, otherwise it is
# cut down to 20.
X_pad1 = pad_sequences(X_cop1, maxlen=20, padding='post')
X_pad2 = pad_sequences(X_cop2, maxlen=20, padding='post')

X_pad1.shape

In [None]:
#convert sentences to one hot encode
X_OH1 = to_categorical(X_pad1, num_classes=1500)
X_OH2 = to_categorical(X_pad2, num_classes=1500)

X_OH1.shape

In [None]:
X = [[s1, s2] for s1, s2 in zip(X_OH1, X_OH2)]


## Split Data

In [None]:
# `shuffle` is a boolean switch, not a seed: shuffle=10 is rejected by
# scikit-learn versions that validate parameter types and, where accepted,
# gives a different split on every run. The seed belongs in random_state.
X_train, X_test, y_train, y_test = train_test_split(X, y_OH, test_size=0.2, shuffle=True, random_state=10)


In [None]:
# Split the paired samples back into one array per sentence column:
# element 0 of each pair is the first sentence, element 1 the second.
X_train1 = np.array([pair[0] for pair in X_train])
X_train2 = np.array([pair[1] for pair in X_train])
X_test1 = np.array([pair[0] for pair in X_test])
X_test2 = np.array([pair[1] for pair in X_test])

X_test1.shape
X_train1.shape

## RNN Model

In [None]:
# Two-branch classifier over the one-hot inputs: each sentence of a pair is
# read by its own SimpleRNN(50); the two outputs are concatenated and passed
# through dense layers to 11 sigmoid outputs.
# This cell assigns `model`; the GRU and LSTM cells below assign the same
# name, so the training cell fits whichever of the three was built last.
model1_in = Input(shape=X_train1.shape[1:])
model1_out = SimpleRNN(50)(model1_in)  

# NOTE(review): model1 and model2 are not referenced again in this cell; the
# final model is assembled from the input/output tensors directly.
model1 = Model(model1_in, model1_out)

model2_in = Input(shape=X_train2.shape[1:])
model2_out = SimpleRNN(50)(model2_in) 

model2 = Model(model2_in, model2_out)

concatenated = concatenate([model1_out, model2_out])
# NOTE(review): presumably a no-op, assuming the concatenated tensor is
# already (batch, features) - confirm.
layer = Flatten()(concatenated)

layer = Dense(36, activation='relu')(layer)
layer = Dense(28, activation='relu')(layer)
out = Dense(11, activation='sigmoid', name='output_layer')(layer)

model = Model([model1_in, model2_in], out)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['binary_accuracy'])


plot_model(model)


## GRU Model

In [None]:
# Two-branch classifier over the one-hot inputs, with a GRU(50) reading each
# sentence of the pair; the two outputs are concatenated and passed through
# dense layers to 11 sigmoid outputs.
# This cell reassigns `model`, replacing any model built by an earlier cell.
model1_in = Input(shape=X_train1.shape[1:])

model1_out = GRU(50)(model1_in)              
# NOTE(review): model1 and model2 are not referenced again in this cell.
model1 = Model(model1_in, model1_out)

model2_in = Input(shape=X_train2.shape[1:])
model2_out = GRU(50)(model2_in) 

model2 = Model(model2_in, model2_out)


concatenated = concatenate([model1_out, model2_out])

layer = Dense(30, activation='relu')(concatenated)
layer = Dense(26, activation='relu')(layer)

out = Dense(11, activation='sigmoid', name='output_layer')(layer)

model = Model([model1_in, model2_in], out)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['binary_accuracy'])


plot_model(model)


## LSTM Model

In [None]:
# Two-branch classifier over the one-hot inputs, with an LSTM(50) reading
# each sentence of the pair; the two outputs are concatenated and passed
# through dense layers to 11 sigmoid outputs.
# This cell reassigns `model`, replacing any model built by an earlier cell.
model1_in = Input(shape=X_train1.shape[1:])

model1_out = LSTM(50)(model1_in)              
# NOTE(review): model1 and model2 are not referenced again in this cell.
model1 = Model(model1_in, model1_out)

model2_in = Input(shape=X_train2.shape[1:])
model2_out = LSTM(50)(model2_in) 

model2 = Model(model2_in, model2_out)


concatenated = concatenate([model1_out, model2_out])

layer = Dense(32, activation='relu')(concatenated)
layer = Dense(25, activation='relu')(layer)

out = Dense(11, activation='sigmoid', name='output_layer')(layer)

model = Model([model1_in, model2_in], out)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['binary_accuracy'])


plot_model(model)


## Train Model

In [None]:
history = model.fit([X_train1, X_train2], y=y_train, epochs=150,
             validation_split=0.2)

## Evaluate train and test set 

In [None]:
# Score the two-input model on the training split. The second value is the
# 'binary_accuracy' metric the model was compiled with, printed as a percentage.
loss, accuracy = model.evaluate([X_train1, X_train2], y_train)
print('Accuracy of train set: %.2f' % (accuracy*100))
print('Loss of train set: %.3f' % (loss))

In [None]:
# Score the two-input model on the held-out test split. The second value is
# the 'binary_accuracy' metric the model was compiled with, printed as a percentage.
loss, accuracy = model.evaluate([X_test1, X_test2], y_test)
print('Accuracy of test set: %.2f' % (accuracy*100))
print('Loss of test set: %.3f' % (loss))

In [None]:
X_test = [X_test1, X_test2]

# Implement with word embedding

In [None]:
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential

In [None]:
# Join the two sentences of every pair into one string, separated by a space.
sentences = [sentence1[i] + ' ' + sentence2[i] for i in range(len(sentence1))]

In [None]:
print(sentences[0])

In [None]:
max_len = max([len(sentence.split()) for sentence in sentences])
max_len

In [None]:
words_len = 40
voc_size = 2000

In [None]:
onehot_repr=[one_hot(words, voc_size) for words in sentences] 
pad_sent = pad_sequences(onehot_repr,padding='pre', maxlen=words_len)


In [None]:
print(onehot_repr[0])
print(pad_sent)

## Split Data

In [None]:
# `shuffle` is a boolean switch, not a seed: shuffle=10 is rejected by
# scikit-learn versions that validate parameter types and, where accepted,
# gives a different split on every run. The seed belongs in random_state.
X_train, X_test, y_train, y_test = train_test_split(pad_sent, y_OH, test_size=0.2, shuffle=True, random_state=10)


In [None]:
X_train.shape

## RNN Model

In [None]:

# Stacked SimpleRNN classifier over learned 100-dimensional word embeddings.
# This cell assigns `model`; the GRU and LSTM cells below assign the same
# name, so the training cell fits whichever of them was built last.
opt = tf.keras.optimizers.Adam(learning_rate=0.0001)

model = Sequential()
model.add(Embedding(voc_size, 100, input_length=words_len))
# The input_shape arguments were dropped from the recurrent layers: they are
# not the first layer, so their input shape comes from the layer before them.
model.add(SimpleRNN(50, return_sequences=True))
model.add(SimpleRNN(30, return_sequences=True))
model.add(SimpleRNN(20))

model.add(Dense(35, activation='relu'))
model.add(Dense(28, activation='relu'))
model.add(Dense(28, activation='relu'))
model.add(Dense(28, activation='relu'))

model.add(Dense(11, activation='sigmoid', name='output_layer'))

# Pass the configured optimizer. Previously `opt` was created but compile()
# received the string 'adam', so the 0.0001 learning rate was never applied.
model.compile(loss='binary_crossentropy', optimizer=opt, metrics=['binary_accuracy'])

plot_model(model)

## GRU Model

In [None]:

# Stacked GRU classifier over learned 100-dimensional word embeddings.
# This cell reassigns `model`, replacing any model built by an earlier cell.
opt = tf.keras.optimizers.Adam(learning_rate=0.0001)

model = Sequential()
model.add(Embedding(voc_size, 100, input_length=words_len))
# The input_shape arguments were dropped from the recurrent layers: they are
# not the first layer, so their input shape comes from the layer before them.
model.add(GRU(80, return_sequences=True))
model.add(GRU(50, return_sequences=True))
# NOTE(review): the last recurrent layer is a SimpleRNN although this is the
# GRU model - confirm whether GRU(20) was intended.
model.add(SimpleRNN(20))

model.add(Dense(35, activation='relu'))
model.add(Dense(28, activation='relu'))
model.add(Dense(28, activation='relu'))
model.add(Dense(25, activation='relu'))

model.add(Dense(11, activation='sigmoid', name='output_layer'))

# Pass the configured optimizer. Previously `opt` was created but compile()
# received the string 'adam', so the 0.0001 learning rate was never applied.
model.compile(loss='binary_crossentropy', optimizer=opt, metrics=['binary_accuracy'])

plot_model(model)

## LSTM Model

In [None]:

# Single-layer LSTM classifier over learned 100-dimensional word embeddings.
# This cell reassigns `model`, replacing any model built by an earlier cell.
opt = tf.keras.optimizers.Adam(learning_rate=0.0001)

model = Sequential()
model.add(Embedding(voc_size, 100, input_length=words_len))
model.add(LSTM(30))

model.add(Dense(35, activation='relu'))
model.add(Dense(30, activation='relu'))
model.add(Dense(28, activation='relu'))
model.add(Dense(25, activation='relu'))

model.add(Dense(11, activation='sigmoid', name='output_layer'))

# Pass the configured optimizer. Previously `opt` was created but compile()
# received the string 'adam', so the 0.0001 learning rate was never applied.
model.compile(loss='binary_crossentropy', optimizer=opt, metrics=['binary_accuracy'])

plot_model(model)

## Train Model

In [None]:
history = model.fit(X_train, y=y_train, epochs=150,
             validation_split=0.2)

## Evaluate train and test set 

In [None]:
yhat = model.predict(X_train)
yhat = yhat.round()
print(y_train[1800])
print(yhat[1800])


In [None]:
# Score the embedding model on the training split. The second value is the
# 'binary_accuracy' metric the model was compiled with, printed as a percentage.
loss, accuracy = model.evaluate(X_train, y_train)
print('Accuracy of train set: %.2f' % (accuracy*100))
print('Loss of train set: %.3f' % (loss))

In [None]:
# Score the embedding model on the held-out test split. The second value is
# the 'binary_accuracy' metric the model was compiled with, printed as a percentage.
loss, accuracy = model.evaluate(X_test, y_test)
print('Accuracy of test set: %.2f' % (accuracy*100))
print('Loss of test set: %.3f' % (loss))

# Plot

In [None]:
# Keep only the per-epoch metrics dict of the training run.
# getattr makes the cell idempotent: on a re-run `history` is already the
# dict (which has no `.history` attribute) and is left unchanged instead of
# raising AttributeError.
history = getattr(history, 'history', history)

In [None]:
# Training curves, one figure each: accuracy (default colour), then loss (green).
for metric_key, y_label, line_format in (
    ('binary_accuracy', 'accuracy', ()),
    ('loss', 'loss', ('green',)),
):
    plt.plot(history[metric_key], *line_format)
    plt.xlabel('Epoch')
    plt.ylabel(y_label)
    plt.show()

In [None]:
# Validation curves, one figure each: accuracy (default colour), then loss (green).
for metric_key, y_label, line_format in (
    ('val_binary_accuracy', 'accuracy', ()),
    ('val_loss', 'loss', ('green',)),
):
    plt.plot(history[metric_key], *line_format)
    plt.xlabel('Epoch')
    plt.ylabel(y_label)
    plt.show()

# Test

In [None]:

# Per-label precision/recall on the test split; predictions are the sigmoid
# outputs rounded to 0/1. Report rows are named '1' .. '11'.
labels = [str(label_id) for label_id in range(1, 12)]
y_predict = model.predict(X_test).round()

print(classification_report(y_test, y_predict, target_names=labels))

In [None]:
m = tf.keras.metrics.BinaryAccuracy()
m.update_state([[1], [1], [0], [0]], [[0.98], [1], [0.8], [0.8]])
m.result().numpy()
