## Installing Sentence Transformer and other models/frameworks

In [None]:
# Environment setup: installs the sentence_transformers package (no version is pinned here).
!pip install sentence_transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


## Importing necessary libraries

In [None]:
import pandas as pd
import numpy as np
import spacy
from scipy import stats
from sklearn import linear_model

from sentence_transformers import SentenceTransformer, losses, models, util
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from sentence_transformers.readers import InputExample

import torch
from torch.utils.data import DataLoader

import tarfile             # to extract tar.gz file
from csv import QUOTE_NONE # Instructs writer objects to never quote fields
import nltk
from nltk.tokenize import word_tokenize
from sklearn.pipeline import make_pipeline
from gensim.models.doc2vec import Doc2Vec, TaggedDocument  # non-contextual embedding
from sklearn.preprocessing import PolynomialFeatures       # for model 1
import pickle

In [None]:
# Download the 'punkt' tokenizer data; word_tokenize (used in get_feature_model1) presumably relies on it.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## Load dataset
Downloaded and unzipped the dataset from this link: http://ixa2.si.ehu.es/stswiki/images/4/48/Stsbenchmark.tar.gz

In [None]:
# Mount Google Drive at /content/gdrive so the dataset and pickle paths below can point into it.
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


**(1) UNZIP THE DATASET**

In [None]:
# Placeholder: replace with the folder that contains Stsbenchmark.tar.gz.
# The value is concatenated with the file name below, so it must end with a path separator.
DATA_PATH = "<specify drive path to the downloaded zipped dataset"

# The context manager guarantees the archive handle is closed even if extraction fails.
with tarfile.open(DATA_PATH + 'Stsbenchmark.tar.gz') as archive:
    # NOTE(review): extractall trusts the member paths stored in the archive;
    # only run this on the archive downloaded from the official link above.
    archive.extractall(DATA_PATH)

**(2) Complete the code in `read_sts_csv()`.**

In [None]:
# Placeholder: folder holding the extracted sts-*.csv files.
# read_sts_csv builds file names as INPUT_PATH + "sts-<split>.csv", so the value must end with a path separator.
INPUT_PATH = "<path to unzipped folder"

In [None]:
def read_sts_csv(dataset_type="train", columns=['source', 'type', 'year', 'id', 'score', 'sent_a', 'sent_b']):
    """
    Read one split of the STS benchmark and return it as a dataframe.

    Parameters
    ----------
    dataset_type : str
        Split name; the file read is INPUT_PATH + "sts-" + dataset_type + ".csv".
    columns : list of str
        Column names assigned to the file's fields (the file is read without a header row).
        The default list is never mutated here.

    Returns
    -------
    pandas.DataFrame
        One row per line of the tab-separated file.
    """
    path = INPUT_PATH + "sts-" + dataset_type + ".csv"
    # QUOTE_NONE: quote characters inside sentences are kept as literal text
    # instead of being interpreted as field delimiters.
    return pd.read_csv(path, sep="\t", names=columns, quoting=QUOTE_NONE)

**(3) Create 3 dataframes, one each for train, dev and test, and print their final shapes.**

In [None]:
# Build one dataframe per benchmark split, in the order train / dev / test.
df_train, df_dev, df_test = (
    read_sts_csv(split_name) for split_name in ("train", "dev", "test")
)

In [None]:
# Preview the first rows of the training split.
df_train.head()

Unnamed: 0,source,type,year,id,score,sent_a,sent_b
0,main-captions,MSRvid,2012test,1,5.0,A plane is taking off.,An air plane is taking off.
1,main-captions,MSRvid,2012test,4,3.8,A man is playing a large flute.,A man is playing a flute.
2,main-captions,MSRvid,2012test,5,3.8,A man is spreading shreded cheese on a pizza.,A man is spreading shredded cheese on an uncoo...
3,main-captions,MSRvid,2012test,6,2.6,Three men are playing chess.,Two men are playing chess.
4,main-captions,MSRvid,2012test,9,4.25,A man is playing the cello.,A man seated is playing the cello.


In [None]:
# Preview the first rows of the dev split.
df_dev.head()

Unnamed: 0,source,type,year,id,score,sent_a,sent_b
0,main-captions,MSRvid,2012test,0,5.0,A man with a hard hat is dancing.,A man wearing a hard hat is dancing.
1,main-captions,MSRvid,2012test,2,4.75,A young child is riding a horse.,A child is riding a horse.
2,main-captions,MSRvid,2012test,3,5.0,A man is feeding a mouse to a snake.,The man is feeding a mouse to the snake.
3,main-captions,MSRvid,2012test,7,2.4,A woman is playing the guitar.,A man is playing guitar.
4,main-captions,MSRvid,2012test,8,2.75,A woman is playing the flute.,A man is playing a flute.


In [None]:
# Preview the first rows of the test split.
df_test.head()

Unnamed: 0,source,type,year,id,score,sent_a,sent_b
0,main-captions,MSRvid,2012test,24,2.5,A girl is styling her hair.,A girl is brushing her hair.
1,main-captions,MSRvid,2012test,33,3.6,A group of men play soccer on the beach.,A group of boys are playing soccer on the beach.
2,main-captions,MSRvid,2012test,45,5.0,One woman is measuring another woman's ankle.,A woman measures another woman's ankle.
3,main-captions,MSRvid,2012test,63,4.2,A man is cutting up a cucumber.,A man is slicing a cucumber.
4,main-captions,MSRvid,2012test,66,1.5,A man is playing a harp.,A man is playing a keyboard.


**Final shapes of all datasets**

In [None]:
# Report the (rows, columns) shape of every split.
for shape_label, split_frame in (
    ("Shape of train dataset : ", df_train),
    ("Shape of dev dataset   : ", df_dev),
    ("Shape of test dataset  : ", df_test),
):
    print(shape_label, split_frame.shape)

Shape of train dataset :  (5749, 7)
Shape of dev dataset   :  (1500, 7)
Shape of test dataset  :  (1379, 7)


**PATH FOR PICKLED FILES**

In [None]:
# Folder where every pickle in this notebook is written and read back.
# File names are appended directly (PKL_PATH + name), so the trailing "/" is required.
# NOTE(review): this is an absolute, user-specific Drive path; adjust it before running elsewhere.
PKL_PATH = "/content/gdrive/MyDrive/Semester 3 IIITD/NLP/NLP Assignments/Assignment 03/A3a_Tarini_Simran/Pickled_Files/"

## Hyperparameters

In [None]:
NON_CONTEXTUAL_MODEL_TYPE = Doc2Vec            # model class for configuration 1 (bound to non_cont_model1)
CONTEXTUAL_MODEL_TYPE = SentenceTransformer    # model class for configuration 2
HUGGING_FACE_SENTENCE_TRANSFORMER_MODEL = 'nli-roberta-base-v2'  # checkpoint name used by configurations 2 and 3
# NOTE(review): INPUT_PATH is already assigned in an earlier cell; this assignment overwrites that value
# with a different placeholder, so any later call to read_sts_csv uses this one.
INPUT_PATH = "<drive path to unzipped dataset>"
BATCH_SIZE = 16      # batch size of the DataLoader built in form_data
OUT_DIM_DENSE = 16   # used as the Doc2Vec vector size in configuration 1; the Dense layer of configuration 3 does not read it
NUM_EPOCHS = 2       # number of epochs passed to model3.fit

## CONFIGURATION 1: Non-contextual Embeddings + ML Regression
1 Load the non-contextual embedding model

2 Get features for the sentences using the embedding model loaded before

3 Using features as X and score as Y, train a ML based regression model

4 Print the correlation scores on the dev and test set predictions using trained model



In [None]:
def _tokenize_sentence_pairs(data_frame):
    """Lower-case and word-tokenize the 'sent_a' and 'sent_b' columns; returns two lists of token lists."""
    tokens_a = [word_tokenize(sent.lower()) for sent in data_frame['sent_a']]
    tokens_b = [word_tokenize(sent.lower()) for sent in data_frame['sent_b']]
    return tokens_a, tokens_b


def _train_doc2vec(tokenized_sentences):
    """Train a model of type ``non_cont_model1`` (Doc2Vec) on the given list of token lists."""
    # Doc2Vec consumes TaggedDocument objects; each sentence gets its position as a unique tag.
    tagged_data = [TaggedDocument(tokens, [tag]) for tag, tokens in enumerate(tokenized_sentences)]
    return non_cont_model1(tagged_data, vector_size=OUT_DIM_DENSE, window=3, min_count=0, epochs=100)


def fit_doc2vec_model1(data_frame):
    """Train the non-contextual embedding model on every sentence (both columns) of ``data_frame``."""
    tokens_a, tokens_b = _tokenize_sentence_pairs(data_frame)
    return _train_doc2vec(tokens_a + tokens_b)


def get_feature_model1(data_frame, model=None):
    """
    Return the embedding vectors of both sentence columns of ``data_frame``.

    Parameters
    ----------
    data_frame : pandas.DataFrame
        Must contain the string columns 'sent_a' and 'sent_b'.
    model : optional
        An already trained Doc2Vec model used to infer the vectors. When omitted,
        a new model is trained on the sentences of ``data_frame`` itself (the
        original behaviour). Pass the model fitted on the training split when
        embedding dev/test data, otherwise every split lives in its own,
        unrelated embedding space and a regressor fitted on one split cannot
        be meaningfully applied to another.

    Returns
    -------
    (matrix_a, matrix_b) : two arrays of shape (#_samples, embedding_dim),
        row i holding the inferred vectors of data_frame's i-th sentence pair.
    """
    tokens_a, tokens_b = _tokenize_sentence_pairs(data_frame)

    if model is None:
        model = _train_doc2vec(tokens_a + tokens_b)

    num_samples = data_frame.shape[0]       # number of samples
    vec_embedding_dim = model.vector_size   # embedding vector dimension of the model actually used

    matrix_a = np.zeros(shape=(num_samples, vec_embedding_dim))  # vectors for sent_a
    matrix_b = np.zeros(shape=(num_samples, vec_embedding_dim))  # vectors for sent_b

    # filling values for the matrices
    for idx, sent in enumerate(tokens_a):
        matrix_a[idx] = model.infer_vector(sent)

    for idx, sent in enumerate(tokens_b):
        matrix_b[idx] = model.infer_vector(sent)

    return matrix_a, matrix_b



non_cont_model1 = NON_CONTEXTUAL_MODEL_TYPE

# Fit the embedding model once, on the training split only, and reuse it for all
# splits so that train/dev/test features share one embedding space (and no
# dev/test sentences are seen while training the embeddings).
doc2vec_model1 = fit_doc2vec_model1(df_train)

feature_1_train, feature_2_train = get_feature_model1(df_train, doc2vec_model1)
feature_1_test, feature_2_test = get_feature_model1(df_test, doc2vec_model1)
feature_1_dev, feature_2_dev = get_feature_model1(df_dev, doc2vec_model1)

# Each regression input row is the concatenation [vector(sent_a), vector(sent_b)].
X_train, Y_train = np.concatenate((feature_1_train, feature_2_train), axis = 1), df_train['score']
X_test, Y_test = np.concatenate((feature_1_test, feature_2_test), axis = 1), df_test['score']
X_dev, Y_dev = np.concatenate((feature_1_dev, feature_2_dev), axis = 1), df_dev['score']

**Saving pickle files**

In [None]:
# TRAINING
# Persist the configuration-1 features and targets, one pickle file per array.
conf1_outputs = {
    # TRAINING
    'X_train_conf1': X_train,
    'Y_train_conf1': Y_train,
    # DEVELOPMENT
    'X_dev_conf1': X_dev,
    'Y_dev_conf1': Y_dev,
    # TEST
    'X_test_conf1': X_test,
    'Y_test_conf1': Y_test,
}
for file_name, payload in conf1_outputs.items():
    with open(PKL_PATH + file_name, 'wb') as files:
        pickle.dump(payload, files)

**Load pickle files**

In [None]:
# TRAINING
# Read the configuration-1 pickles back, in the order train / dev / test (X then Y).
conf1_file_names = (
    'X_train_conf1', 'Y_train_conf1',   # TRAINING
    'X_dev_conf1', 'Y_dev_conf1',       # DEVELOPMENT
    'X_test_conf1', 'Y_test_conf1',     # TEST
)
conf1_loaded = []
for file_name in conf1_file_names:
    with open(PKL_PATH + file_name, 'rb') as f:
        conf1_loaded.append(pickle.load(f))

X_train, Y_train, X_dev, Y_dev, X_test, Y_test = conf1_loaded

**ML Regression**

In [None]:
# Initiate a regression model and train it
# Configuration-1 regressor: polynomial feature expansion followed by ordinary least squares.
degree = 2
polynomial_expansion = PolynomialFeatures(degree)  # extra polynomial terms capture some non-linear relationships
model1 = make_pipeline(polynomial_expansion, linear_model.LinearRegression())
model1.fit(X_train, Y_train)

Y_train_pred, Y_test_pred, Y_dev_pred = (
    model1.predict(features) for features in (X_train, X_test, X_dev)
)

# Spearman rank correlation between gold scores and predictions, per split.
for report_label, gold_scores, predicted_scores in (
    ("Spearmanr correlation for train set  : ", Y_train, Y_train_pred),
    ("Spearmanr correlation for dev set    : ", Y_dev, Y_dev_pred),
    ("Spearmanr correlation for test set   : ", Y_test, Y_test_pred),
):
    print(report_label, stats.spearmanr(gold_scores, predicted_scores).correlation)

Spearmanr correlation for train set  :  0.5043283420520559
Spearmanr correlation for dev set    :  0.41813180814455864
Spearmanr correlation for test set   :  0.3041898614546014


## CONFIGURATION 2: Contextual Embeddings + ML Regression
1 Load the contextual embedding model

2 Get feature for the sentences using the embedding model loaded before

3 Using features as X and score as Y, train a ML based regression model

4 Print the correlation scores on the dev and test set predictions using trained model

In [None]:
def get_feature_model2(data_frame):
    """
    Encode both sentence columns of ``data_frame`` with the sentence-embedding model.

    The model is read from the module-level name ``non_cont_model2``, which must
    be assigned before this function is called.

    Parameters
    ----------
    data_frame : pandas.DataFrame
        Must contain the string columns 'sent_a' and 'sent_b'.

    Returns
    -------
    (matrix_a, matrix_b) : the outputs of ``non_cont_model2.encode`` for the
        'sent_a' and 'sent_b' columns respectively, row-aligned with ``data_frame``.
    """
    # Hand encode() plain Python lists instead of pandas Series, so the result
    # never depends on the dataframe's index labels (e.g. after filtering or shuffling).
    sentences_a = data_frame['sent_a'].tolist()   # first sentence of each pair
    sentences_b = data_frame['sent_b'].tolist()   # second sentence of each pair

    matrix_a = non_cont_model2.encode(sentences_a)
    matrix_b = non_cont_model2.encode(sentences_b)

    return matrix_a, matrix_b


# Instantiate the contextual sentence-embedding model.
# NOTE(review): despite the "non_cont" prefix, this name holds the contextual model
# (CONTEXTUAL_MODEL_TYPE); get_feature_model2 reads it as a global.
non_cont_model2 = CONTEXTUAL_MODEL_TYPE(HUGGING_FACE_SENTENCE_TRANSFORMER_MODEL)

# Embed both sentence columns of every split.
feature_1_train, feature_2_train = get_feature_model2(df_train)
feature_1_test, feature_2_test = get_feature_model2(df_test)
feature_1_dev, feature_2_dev = get_feature_model2(df_dev)

# Each regression input row is the concatenation [embedding(sent_a), embedding(sent_b)].
# These assignments overwrite the X_*/Y_* variables produced for configuration 1.
X_train, Y_train = np.concatenate((feature_1_train, feature_2_train), axis = 1), df_train['score']
X_test, Y_test = np.concatenate((feature_1_test, feature_2_test), axis = 1), df_test['score']
X_dev, Y_dev = np.concatenate((feature_1_dev, feature_2_dev), axis = 1), df_dev['score']

**Saving pickle files**

In [None]:
# TRAINING
# Persist the configuration-2 features and targets, one pickle file per array.
conf2_outputs = {
    # TRAINING
    'X_train_conf2': X_train,
    'Y_train_conf2': Y_train,
    # DEVELOPMENT
    'X_dev_conf2': X_dev,
    'Y_dev_conf2': Y_dev,
    # TEST
    'X_test_conf2': X_test,
    'Y_test_conf2': Y_test,
}
for file_name, payload in conf2_outputs.items():
    with open(PKL_PATH + file_name, 'wb') as files:
        pickle.dump(payload, files)

**Loading pickle files**

In [None]:
# TRAINING
# Read the configuration-2 pickles back, in the order train / dev / test (X then Y).
conf2_file_names = (
    'X_train_conf2', 'Y_train_conf2',   # TRAINING
    'X_dev_conf2', 'Y_dev_conf2',       # DEVELOPMENT
    'X_test_conf2', 'Y_test_conf2',     # TEST
)
conf2_loaded = []
for file_name in conf2_file_names:
    with open(PKL_PATH + file_name, 'rb') as f:
        conf2_loaded.append(pickle.load(f))

X_train, Y_train, X_dev, Y_dev, X_test, Y_test = conf2_loaded

In [None]:
# Initiate a regression model and train it
# Configuration-2 regressor: Bayesian ridge regression on the concatenated sentence embeddings.
model2 = linear_model.BayesianRidge()
model2.fit(X_train, Y_train)

Y_train_pred, Y_test_pred, Y_dev_pred = (
    model2.predict(features) for features in (X_train, X_test, X_dev)
)

# Spearman rank correlation between gold scores and predictions, per split.
for report_label, gold_scores, predicted_scores in (
    ("Spearmanr correlation for train set  : ", Y_train, Y_train_pred),
    ("Spearmanr correlation for dev set    : ", Y_dev, Y_dev_pred),
    ("Spearmanr correlation for test set   : ", Y_test, Y_test_pred),
):
    print(report_label, stats.spearmanr(gold_scores, predicted_scores).correlation)

Spearmanr correlation for train set  :  0.3915587559424884
Spearmanr correlation for dev set    :  0.15747591992373486
Spearmanr correlation for test set   :  0.27193815587431336


## CONFIGURATION 3: Fine-Tune a Contextual Embeddings Model
1 Prepare data samples for the DL model to consume

2 Create the data loader, one each for train/dev/test data_input sample set

3 Initialize model consisting of the following 3 components - `base_LM`, a `pooling_layer` and a `dense_layer`

4 Define loss function

5 Fit the model

6 Print the correlation scores on the dev and test set predictions

In [None]:
# -------------------------------------------
# Prepare data samples to be for the DL model to consume
# -------------------------------------------
def form_data(data_frame, score_scale=5.0):
    """
    Convert a dataframe of scored sentence pairs into a shuffled DataLoader.

    Parameters
    ----------
    data_frame : pandas.DataFrame
        Must contain the columns 'sent_a', 'sent_b' and 'score'.
    score_scale : float
        Divisor applied to every score to obtain the training label. The loss
        used in this notebook compares the label with a cosine similarity, so
        raw STS scores (0 to 5) have to be brought into the range a cosine
        similarity can actually reach; 5.0 maps them onto [0, 1].

    Returns
    -------
    torch.utils.data.DataLoader
        Yields batches of BATCH_SIZE InputExample objects in shuffled order.
    """
    # 1. convert to InputExample
    # Iterate over the column values positionally (zip) rather than indexing the
    # Series by label, so dataframes with a non-default index are handled too.
    dataSamples = [
        InputExample(texts=[sent_a, sent_b], label=float(score) / score_scale)
        for sent_a, sent_b, score in zip(data_frame['sent_a'], data_frame['sent_b'], data_frame['score'])
    ]

    # 2. convert to DataLoader
    dataloader = DataLoader(dataSamples, shuffle=True, batch_size=BATCH_SIZE)

    return dataloader

# -------------------------------------------
# Obtain predicted scores for input sentence pairs
# -------------------------------------------
def get_model_predicts(data_type, trained_model):
    """
    Evaluate ``trained_model`` on one dataset split.

    Parameters
    ----------
    data_type : str
        One of "train", "dev" or "test"; selects the module-level dataframe
        df_train, df_dev or df_test.
    trained_model :
        A model exposing ``evaluate(evaluator)`` (model3 in this notebook).

    Returns
    -------
    The value returned by ``trained_model.evaluate`` for an
    EmbeddingSimilarityEvaluator built from the split's sentence pairs and gold
    scores (an aggregate score, not a per-pair list of similarities).
    For any other ``data_type`` a message is printed and None is returned.
    """

    if data_type == "train":
        df = df_train
    elif data_type == "dev":
        df = df_dev
    elif data_type == "test":
        df = df_test
    else:
        print("wrong input value for data_type")
        return None

    # Pass plain lists rather than pandas Series so the evaluator does not depend on index labels.
    evaluater = EmbeddingSimilarityEvaluator(
        df['sent_a'].tolist(), df['sent_b'].tolist(), df['score'].tolist()
    )
    return trained_model.evaluate(evaluater)

In [None]:
# -------------------------------------------
# Create the data loader, one each for train/dev/test data_input sample set
# -------------------------------------------
dataloader_train = form_data(df_train)
# NOTE(review): the dev and test loaders are built here but not referenced again in the visible cells.
dataloader_dev = form_data(df_dev)
dataloader_test = form_data(df_test)

# -------------------------------------------
# Initialize model
# -------------------------------------------
# Three stacked modules: transformer LM -> pooling -> dense layer.
base_model = models.Transformer("sentence-transformers/"+HUGGING_FACE_SENTENCE_TRANSFORMER_MODEL)
layer_pooling = models.Pooling(base_model.get_word_embedding_dimension())
# The dense layer keeps the dimensionality unchanged (in_features == out_features) and applies ReLU;
# the OUT_DIM_DENSE hyperparameter is not used here.
layer_dense = models.Dense(in_features=layer_pooling.get_sentence_embedding_dimension(), out_features=layer_pooling.get_sentence_embedding_dimension(), activation_function=torch.nn.ReLU())
model3 = SentenceTransformer(modules=[base_model, layer_pooling, layer_dense])

# -------------------------------------------
# Define the loss function
# -------------------------------------------
loss = losses.CosineSimilarityLoss(model3)

# Freeze all layers of LM except last
# The loop disables gradients for the first 165 parameter tensors yielded by base_model.parameters().
# NOTE(review): 166 is a magic number tied to this particular checkpoint; presumably it leaves only
# the last encoder layer trainable — TODO confirm by inspecting base_model.named_parameters().
count = 0
for param in base_model.parameters():
  count += 1
  if count<166:
    param.requires_grad = False

# -------------------------------------------
# Fit the model
# -------------------------------------------
# EmbeddingSimilarityEvaluator embeds the sent_a / sent_b samples with model3 and compares the
# embeddings according to a similarity metric (cosine by default, per the original author's note).
# That yields two columns: predicted similarity scores and ground-truth scores. Pearson and
# Spearman correlation coefficients between these columns indicate positive/negative/no correlation
# between predictions and ground truth.
# The ground-truth scores handed to the evaluator are not rescaled, since only the correlation/trend is measured.
evaluator = EmbeddingSimilarityEvaluator(df_dev['sent_a'], df_dev['sent_b'], df_dev['score'])
# Train for NUM_EPOCHS epochs, evaluating on the dev split every 300 steps.
model3.fit(train_objectives=[(dataloader_train, loss)], epochs=NUM_EPOCHS, warmup_steps=25, evaluator=evaluator, evaluation_steps=300)

**Saving pickle file for model3**

In [None]:
# Serialize the whole fine-tuned model object to PKL_PATH/model3.
# NOTE(review): pickling ties the file to the installed library versions; the library's own
# save/load mechanism may be more portable — confirm before sharing the file.
with open(PKL_PATH+'model3', 'wb') as files:
    pickle.dump(model3, files)

**Loading pickle file for model3**

In [None]:
# Restore the fine-tuned model written by the save cell above.
# pickle.load can execute arbitrary code: only load files this notebook produced itself.
with open(PKL_PATH+'model3' , 'rb') as f:
    model3 = pickle.load(f)

**Correlation scores**

In [None]:
# Evaluate the fine-tuned model on each split, in the order train / dev / test.
score_train, score_dev, score_test = (
    get_model_predicts(split_name, model3) for split_name in ("train", "dev", "test")
)

**Saving correlation scores**

In [None]:
# Persist the three configuration-3 evaluation scores, one pickle file each.
conf3_scores = {
    'score_train_conf3': score_train,
    'score_dev_conf3': score_dev,
    'score_test_conf3': score_test,
}
for file_name, payload in conf3_scores.items():
    with open(PKL_PATH + file_name, 'wb') as files:
        pickle.dump(payload, files)

**Loading correlation scores**

In [None]:
# Read the configuration-3 evaluation scores back, in the order train / dev / test.
conf3_loaded = []
for file_name in ('score_train_conf3', 'score_dev_conf3', 'score_test_conf3'):
    with open(PKL_PATH + file_name, 'rb') as f:
        conf3_loaded.append(pickle.load(f))

score_train, score_dev, score_test = conf3_loaded

**Printing Results**

In [None]:
# -------------------------------------------
# Get the correlation scores on the dev and test set predictions
# -------------------------------------------
# Print the stored evaluation score of every split.
for report_label, split_score in (
    ("Spearmanr correlation for train set  : ", score_train),
    ("Spearmanr correlation for dev set    : ", score_dev),
    ("Spearmanr correlation for test set   : ", score_test),
):
    print(report_label, split_score)

Spearmanr correlation for train set  :  0.8090177334258251
Spearmanr correlation for dev set    :  0.8318641423390898
Spearmanr correlation for test set   :  0.8155045187853697
