## Exercise 3: Part 1

*In this task, we define semantic textual similarity (STS) as a supervised regression task in which the semantic similarity
of two pieces of text (typically sentences) should be determined.*

In [11]:
import numpy as np
import os

*We build a reader function for the labeled data sets, which for a given filename returns a list of scores, a list of
first sentences, and a list of second sentences.*

In [12]:
def dataset_reader(file_path):
    """
    Reads the input vectors and the corresponding labels from a file
    @param file_path: String / path to files
    @return: two numpy arrays / 1. input vectors, 2. labels
    """
    vectors = []
    sentences1 = []
    sentences2 = []
    with open(file_path, "r",encoding='utf8') as f:
        for line in f:
            lines = line.replace('\t','\n').split('\n')
            
            sentences1.append(lines[1])
            sentences2.append(lines[2])
            # Convert string values to float and add bias
            vector = np.array(lines[0].split(), dtype='float')
            vectors.append(vector)
        
    return np.array(vectors,dtype=float), sentences1, sentences2

In [13]:
scores_dev, sentences1_dev, sentences2_dev = dataset_reader('data-dev.txt')
scores_train, sentences1_train, sentences2_train = dataset_reader('data-train.txt')

In [14]:
print('first sentence pair: {} {} '.format(sentences1_train[0],sentences2_train[0]))
print('similarity score of first sentence pair: {}'.format(scores_train[0]))

first sentence pair: A brown dog is running through the field. a brown dog with his tongue wagging as he runs through a field 
similarity score of first sentence pair: [0.8]


*We build a function to read in the first 40000 lines of the 300-dim vector embeddings*

In [16]:
def load_vectors(fname):
    fin = open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore')
    n, d = map(int, fin.readline().split())
    data = {}
    count=0
    for line in fin:
        count+=1
        if count==40001:
            break
        tokens = line.rstrip().split(' ')
        data[tokens[0]] = list(map(float, tokens[1:]))
    return data

In [17]:
wiki_vector = load_vectors('wiki-news-300d-1M.vec')

*We tokenize every sentence with the word_tokenize function of nltk package*

In [18]:
from nltk.tokenize import word_tokenize

In [19]:
def tokenize(sentences):
    token_sent = []
    for sentence in sentences:
        token_sent.append(word_tokenize(sentence))
    
    return token_sent
    

In [20]:
token_sent1_train = tokenize(sentences1_train)
token_sent2_train = tokenize(sentences2_train)

In [21]:
token_sent1_dev = tokenize(sentences1_dev)
token_sent2_dev = tokenize(sentences2_dev)

In [22]:
print('1st sentence:{}'.format(token_sent1_train[0]))

1st sentence:['A', 'brown', 'dog', 'is', 'running', 'through', 'the', 'field', '.']


In [23]:
len(wiki_vector)

40000

*We map every tokens with the 300-dim vector from the embeddings*

In [24]:
def map_tokens(tokenized_sent):
    embedding_vectors = {}
    for tokenized_sent in tokenized_sent:
        for tokens in tokenized_sent:
            if tokens in wiki_vector.keys():
                embedding_vectors[tokens] = wiki_vector.get(tokens)
            else:
                embedding_vectors[tokens] = np.zeros((300))
    return embedding_vectors

In [25]:
embedding_vectors_sent1 = map_tokens(token_sent1_train)
embedding_vectors_sent2 = map_tokens(token_sent2_train)

In [26]:
embedding_sent1_dev = map_tokens(token_sent1_dev)
embedding_sent2_dev = map_tokens(token_sent2_dev)

*We average the embeddings to make every line item to be of the same size*

In [27]:
def avg_embeddings(tokenized_sentences,embedding_vectors):
    avg_embeddings = []
    for tokenized_sent in tokenized_sentences:
        
        sum_vec = np.zeros((300),dtype=float)
        for tokens in tokenized_sent:
            sum_vec =  sum_vec + embedding_vectors.get(tokens)
            
        avg_embeddings.append(sum_vec/len(tokenized_sent))
    
    return np.array(avg_embeddings)
    

In [28]:
avg_vectors_sent1 = avg_embeddings(token_sent1_train,embedding_vectors_sent1)
avg_vectors_sent2 = avg_embeddings(token_sent2_train,embedding_vectors_sent2)

In [29]:
avg_vectors_sent1_dev = avg_embeddings(token_sent1_dev,embedding_sent1_dev)
avg_vectors_sent2_dev = avg_embeddings(token_sent2_dev,embedding_sent2_dev)

In [31]:
print('averaged embeddings of 1st sequence:{}'.format(avg_vectors_sent1[0][:20]))

averaged embeddings of 1st sequence:[-0.04035556  0.04834444 -0.02134444  0.05437778 -0.04855556 -0.02447778
 -0.02275556 -0.00113333  0.00976667  0.01081111  0.03703333  0.01817778
  0.05385556  0.05241111  0.03544444 -0.00903333  0.02505556  0.04105556
 -0.02614444  0.01518889]


*We build a neural network with two hidden layers and dropout layer using the keras library and print the mean squared error*

In [35]:
from tensorflow.keras import Sequential, Model
from tensorflow.keras.layers import Dense, Dropout, Input, Concatenate

input1 = Input(shape=(300,))
input2 = Input(shape=(300,))
merged = Concatenate(axis=1)([input1, input2])
dense1 = Dense(300, input_dim=2, activation='relu', use_bias=True)(merged)
dropout = Dropout(0.3)(dense1)
output = Dense(1, activation='sigmoid')(dropout)
model = Model(inputs=[input1,input2], outputs=output)

In [36]:
model.summary()

Model: "functional_3"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_3 (InputLayer)            [(None, 300)]        0                                            
__________________________________________________________________________________________________
input_4 (InputLayer)            [(None, 300)]        0                                            
__________________________________________________________________________________________________
concatenate_1 (Concatenate)     (None, 600)          0           input_3[0][0]                    
                                                                 input_4[0][0]                    
__________________________________________________________________________________________________
dense_2 (Dense)                 (None, 300)          180300      concatenate_1[0][0]   

In [37]:
model.compile(loss='MeanSquaredError',
            optimizer='adam', metrics=['MeanSquaredError'])

model.fit([avg_vectors_sent1,avg_vectors_sent2],scores_train,epochs=300,batch_size=100,verbose=2)

#model.evaluate([avg_vectors_sent1,avg_vectors_sent2],scores_train)

Epoch 1/300
30/30 - 0s - loss: 0.0901 - mean_squared_error: 0.0901
Epoch 2/300
30/30 - 0s - loss: 0.0858 - mean_squared_error: 0.0858
Epoch 3/300
30/30 - 0s - loss: 0.0821 - mean_squared_error: 0.0821
Epoch 4/300
30/30 - 0s - loss: 0.0796 - mean_squared_error: 0.0796
Epoch 5/300
30/30 - 0s - loss: 0.0768 - mean_squared_error: 0.0768
Epoch 6/300
30/30 - 0s - loss: 0.0747 - mean_squared_error: 0.0747
Epoch 7/300
30/30 - 0s - loss: 0.0716 - mean_squared_error: 0.0716
Epoch 8/300
30/30 - 0s - loss: 0.0695 - mean_squared_error: 0.0695
Epoch 9/300
30/30 - 0s - loss: 0.0670 - mean_squared_error: 0.0670
Epoch 10/300
30/30 - 0s - loss: 0.0643 - mean_squared_error: 0.0643
Epoch 11/300
30/30 - 0s - loss: 0.0611 - mean_squared_error: 0.0611
Epoch 12/300
30/30 - 0s - loss: 0.0585 - mean_squared_error: 0.0585
Epoch 13/300
30/30 - 0s - loss: 0.0570 - mean_squared_error: 0.0570
Epoch 14/300
30/30 - 0s - loss: 0.0538 - mean_squared_error: 0.0538
Epoch 15/300
30/30 - 0s - loss: 0.0516 - mean_squared_err

Epoch 122/300
30/30 - 0s - loss: 0.0070 - mean_squared_error: 0.0070
Epoch 123/300
30/30 - 0s - loss: 0.0070 - mean_squared_error: 0.0070
Epoch 124/300
30/30 - 0s - loss: 0.0070 - mean_squared_error: 0.0070
Epoch 125/300
30/30 - 0s - loss: 0.0070 - mean_squared_error: 0.0070
Epoch 126/300
30/30 - 0s - loss: 0.0069 - mean_squared_error: 0.0069
Epoch 127/300
30/30 - 0s - loss: 0.0069 - mean_squared_error: 0.0069
Epoch 128/300
30/30 - 0s - loss: 0.0069 - mean_squared_error: 0.0069
Epoch 129/300
30/30 - 0s - loss: 0.0069 - mean_squared_error: 0.0069
Epoch 130/300
30/30 - 0s - loss: 0.0070 - mean_squared_error: 0.0070
Epoch 131/300
30/30 - 0s - loss: 0.0070 - mean_squared_error: 0.0070
Epoch 132/300
30/30 - 0s - loss: 0.0069 - mean_squared_error: 0.0069
Epoch 133/300
30/30 - 0s - loss: 0.0069 - mean_squared_error: 0.0069
Epoch 134/300
30/30 - 0s - loss: 0.0069 - mean_squared_error: 0.0069
Epoch 135/300
30/30 - 0s - loss: 0.0068 - mean_squared_error: 0.0068
Epoch 136/300
30/30 - 0s - loss: 0

Epoch 241/300
30/30 - 0s - loss: 0.0057 - mean_squared_error: 0.0057
Epoch 242/300
30/30 - 0s - loss: 0.0058 - mean_squared_error: 0.0058
Epoch 243/300
30/30 - 0s - loss: 0.0058 - mean_squared_error: 0.0058
Epoch 244/300
30/30 - 0s - loss: 0.0057 - mean_squared_error: 0.0057
Epoch 245/300
30/30 - 0s - loss: 0.0057 - mean_squared_error: 0.0057
Epoch 246/300
30/30 - 0s - loss: 0.0058 - mean_squared_error: 0.0058
Epoch 247/300
30/30 - 0s - loss: 0.0059 - mean_squared_error: 0.0059
Epoch 248/300
30/30 - 0s - loss: 0.0057 - mean_squared_error: 0.0057
Epoch 249/300
30/30 - 0s - loss: 0.0058 - mean_squared_error: 0.0058
Epoch 250/300
30/30 - 0s - loss: 0.0057 - mean_squared_error: 0.0057
Epoch 251/300
30/30 - 0s - loss: 0.0058 - mean_squared_error: 0.0058
Epoch 252/300
30/30 - 0s - loss: 0.0057 - mean_squared_error: 0.0057
Epoch 253/300
30/30 - 0s - loss: 0.0056 - mean_squared_error: 0.0056
Epoch 254/300
30/30 - 0s - loss: 0.0057 - mean_squared_error: 0.0057
Epoch 255/300
30/30 - 0s - loss: 0

<tensorflow.python.keras.callbacks.History at 0x212c3b4e700>

In [38]:
print('Mean squared error on develpment set is: {}'.format(model.evaluate([avg_vectors_sent1_dev,avg_vectors_sent2_dev],scores_dev)[1]))


 1/32 [..............................] - ETA: 0s - loss: 0.0703 - mean_squared_error: 0.0703
Mean squared error on develpment set is: 0.07066523283720016
