## Installing Sentence Transformer and other models/frameworks

In [164]:
# !pip3 install sentence_transformers
# !pip3 install gensim
# Kindly add all your installations and versions if any in this cell.

## Importing necessary libraries. 
In the final version, all imports should be strictly listed here.

In [165]:
import csv
import os
import pickle
import re

import gensim.downloader
import numpy as np
import pandas as pd
import spacy
import torch
from scipy import stats
from sentence_transformers import SentenceTransformer, losses, models, util
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from sentence_transformers.readers import InputExample
from sklearn.ensemble import GradientBoostingRegressor
from torch.utils.data import DataLoader

## Load dataset: 7 marks
1 Download and unzip the dataset from this link http://ixa2.si.ehu.es/stswiki/images/4/48/Stsbenchmark.tar.gz  **1 mark**

2 Complete the code in `read_sts_csv()`. **4.5 marks**

3 Create 3 dataframes one each for train, test and val and print their final shapes. **1.5 marks**

In [166]:
def preprocess(sentence):
    """
    Normalise a sentence: lower-case it, drop punctuation, collapse repeated spaces.

    Punctuation is stripped *before* runs of spaces are collapsed; doing it the
    other way round leaves a double space wherever a punctuation mark stood
    between two spaces (e.g. "a , b" -> "a  b").

    Parameters
    ----------
    sentence : str
        Raw sentence text.

    Returns
    -------
    str
        The lower-cased sentence containing only word characters and whitespace.
    """
    sentence = sentence.lower()
    # Raw strings: '\w' and '\s' are not valid escapes in ordinary string literals.
    sentence = re.sub(r'[^\w\s]', '', sentence)  # keep only words and spaces
    sentence = re.sub(r' +', ' ', sentence)  # remove extra whitespaces
    return sentence

In [167]:
INPUT_PATH='content/'
def read_sts_csv(dataset_type="train", columns=['source', 'type', 'year', 'id', 'score', 'sent_a', 'sent_b'],preprocessing=False,verbose=True):
  """
  Read one STS split from INPUT_PATH and return it as a pandas DataFrame.

  The file read is INPUT_PATH + "sts-" + dataset_type + ".csv". It is parsed as
  tab-separated text with quoting disabled (csv.QUOTE_NONE), so quote characters
  inside the sentences are kept as ordinary text.

  Parameters
  ----------
  dataset_type : str
      Split name inserted into the file name (the notebook uses "train", "dev", "test").
  columns : list of str
      Column names assigned to the parsed fields.
  preprocessing : bool
      When truthy, apply preprocess() to the 'sent_a' and 'sent_b' columns.
  verbose : bool
      When truthy, print the path being read.

  Returns
  -------
  pandas.DataFrame
      The parsed split with the given column names.
  """
  path = INPUT_PATH + "sts-"+ dataset_type + ".csv"
  if verbose:
    print(path)

  df = pd.read_csv(path, sep='\t', quoting=csv.QUOTE_NONE, names=columns)
  if preprocessing:
    for sentence_column in ('sent_a', 'sent_b'):
      df[sentence_column] = df[sentence_column].apply(preprocess)
  return df

# Load the three splits without text preprocessing; each call prints the path it reads.
df_train = read_sts_csv(dataset_type="train", preprocessing=False)
df_dev = read_sts_csv(dataset_type="dev", preprocessing=False)
df_test = read_sts_csv(dataset_type="test", preprocessing=False)

content/sts-train.csv
content/sts-dev.csv
content/sts-test.csv


## Hyperparameters: 5 Marks
Update this cell with your chosen parameters, except `NUM_EPOCHS`.

In [168]:
# Model name passed to gensim.downloader.load() for the non-contextual embeddings.
NON_CONEXTUAL_MODEL_TYPE = "glove-wiki-gigaword-50"
# Model name passed to SentenceTransformer() for the contextual embeddings.
CONEXTUAL_MODEL_TYPE = "sentence-transformers/stsb-distilbert-base"
# TODO: still an empty placeholder; not referenced by any visible cell.
HUGGING_FACE_SENTENCE_TRANSFORMER_MODEL = "" # USE THE HUGGING FACE VERSION OF SENTENCE_TRANSFORMER_TYPE
# Directory prefix that read_sts_csv() prepends to the "sts-<split>.csv" file names.
INPUT_PATH = "content/"

# TODO: BATCH_SIZE and OUT_DIM_DENSE are empty-string placeholders; they presumably
# need integer values before the fine-tuning configuration can use them.
BATCH_SIZE = ""
OUT_DIM_DENSE = ""
NUM_EPOCHS = 2 ## THIS IS FIXED DO NOT CHANGE

# You are free to add your own hyperparameters as well.

## CONFIGURATION 1: Non-contextual Embeddings + ML Regression: 8 marks
1 Load the non-contextual embedding model in variable `non_cont_model1`. **1 marks**

2 Get feature for the sentences using the LM model loaded before. Add the code in the `get_feature_model1()` **2 marks**

2 Using features as X and score as Y, train a ML based regression model (`model1`). You are free to choose any sklearn based regression method, and its hyperparameters. **3.5 marks**

3 Print the correlation scores on the dev and test set predictions using trained `model1`. **1.5 mark**



In [169]:
# Load the pretrained word-embedding model named by NON_CONEXTUAL_MODEL_TYPE through
# gensim's downloader; get_sentence_vector() looks words up in `glove` one at a time.
glove = gensim.downloader.load(NON_CONEXTUAL_MODEL_TYPE)

def get_sentence_vector(sentence, embeddings=None):
    """
    Average the word vectors of every whitespace-separated token found in the vocabulary.

    Parameters
    ----------
    sentence : str
        Sentence to embed; it is tokenised with str.split().
    embeddings : mapping-like, optional
        Object supporting ``word in embeddings``, ``embeddings[word]`` and a
        ``vector_size`` attribute. Defaults to the module-level ``glove`` model.

    Returns
    -------
    numpy.ndarray
        The mean of the known tokens' vectors. If no token is in the vocabulary
        (including the empty sentence), a zero vector of length
        ``embeddings.vector_size`` is returned, so every sentence yields a vector
        of the same shape and the results can be stacked into one matrix.
    """
    if embeddings is None:
        embeddings = glove

    known_vectors = [embeddings[word] for word in sentence.split() if word in embeddings]
    if not known_vectors:
        # np.mean([]) would give a scalar NaN (plus a RuntimeWarning), which breaks
        # the 2-D feature matrix built downstream.
        return np.zeros(embeddings.vector_size, dtype=np.float32)
    return np.mean(known_vectors, axis=0)


def get_sentence_vectors(sentences):
    """Embed each sentence with get_sentence_vector and pack the results into one NumPy array."""
    vectors = []
    for text in sentences:
        vectors.append(get_sentence_vector(text))
    return np.array(vectors)

In [170]:
# non_cont_model1: the sentence encoder that averages GloVe word vectors.
non_cont_model1 = get_sentence_vectors

def get_feature_model1(data_frame):
  """
  Embed both sentence columns of `data_frame` with non_cont_model1.

  Returns a pair of matrices: the embeddings of 'sent_a' followed by the
  embeddings of 'sent_b', one row per sample.
  """
  embeddings_a = non_cont_model1(data_frame['sent_a'])
  embeddings_b = non_cont_model1(data_frame['sent_b'])
  return embeddings_a, embeddings_b


# Re-read every split, this time with text preprocessing switched on.
df_train = read_sts_csv(dataset_type="train", preprocessing=True, verbose=False)
df_dev = read_sts_csv(dataset_type="dev", preprocessing=True, verbose=False)
df_test = read_sts_csv(dataset_type="test", preprocessing=True, verbose=False)

# feature_1_<dataset_type>, feature_2_<dataset_type> = get_feature_model1(data_frame)
feature_1_train, feature_2_train = get_feature_model1(df_train)
feature_1_dev, feature_2_dev = get_feature_model1(df_dev)
feature_1_test, feature_2_test = get_feature_model1(df_test)

# Each sample's input is the two sentence embeddings placed side by side.
X_train = np.concatenate([feature_1_train, feature_2_train], axis=1)
X_dev = np.concatenate([feature_1_dev, feature_2_dev], axis=1)
X_test = np.concatenate([feature_1_test, feature_2_test], axis=1)

# Regression targets: the similarity scores of each split.
y_train = df_train['score'].to_numpy()
y_dev = df_dev['score'].to_numpy()
y_test = df_test['score'].to_numpy()


# Build the gradient-boosting regressor and fit it on the training split.
print("TRAINING :")
reg = GradientBoostingRegressor(random_state=0, verbose=True).fit(X_train, y_train)


# Report the Spearman rank correlation between gold scores and predictions.
print("\nEVALUATION :")
print("Spearman Correlation on Dev Set : ", stats.spearmanr(y_dev, reg.predict(X_dev)).correlation)
print("Spearman Correlation on Test Set : ", stats.spearmanr(y_test, reg.predict(X_test)).correlation)


TRAINING :
      Iter       Train Loss   Remaining Time 
         1           2.1139           13.83s
         2           2.0857           14.47s
         3           2.0590           13.21s
         4           2.0363           12.56s
         5           2.0069           12.15s
         6           1.9891           11.83s
         7           1.9689           11.53s
         8           1.9455           11.28s
         9           1.9282           11.07s
        10           1.9089           10.93s
        20           1.7569            9.67s
        30           1.6532            8.55s
        40           1.5750            7.41s
        50           1.5004            6.18s
        60           1.4485            4.94s
        70           1.3838            3.71s
        80           1.3436            2.47s
        90           1.2942            1.23s
       100           1.2597            0.00s

EVALUATION :
Spearman Correlation on Dev Set :  0.43416547183416593
Spearman Correlatio

## CONFIGURATION 2: Contextual Embeddings + ML Regression: 7 marks
1 Load the contextual embedding model in variable `non_cont_model2`. **1 marks**

2 Get feature for the sentences using the LM model loaded before. Add the code in the `get_feature_model2()` **2 marks**

2 Using features as X and score as Y, train a ML based regression model (`model2`). You are free to choose any sklearn based regression method, and its hyperparameters. **3.5 marks**

3 Print the correlation scores on the dev and test set predictions using trained `model2`. **1.5 mark**

Useful references: https://www.sbert.net/docs/usage/semantic_textual_similarity.html

In [171]:
# non_cont_model2: the sentence-transformer named by CONEXTUAL_MODEL_TYPE.
non_cont_model2 = SentenceTransformer(CONEXTUAL_MODEL_TYPE)

def get_feature_model2(df):
    """
    Embed both sentence columns of `df` with non_cont_model2.

    The columns are converted to plain Python lists before encoding. Integer
    indexing into a pandas Series is label-based, so handing the Series itself
    to encode() only behaves like a list when the frame has a default 0..n-1
    index; a list is safe for any index (e.g. after filtering rows).

    Returns a pair: the embeddings of 'sent_a' followed by the embeddings of
    'sent_b', one row per sample.
    """
    sentences_a = df['sent_a'].tolist()
    sentences_b = df['sent_b'].tolist()
    return non_cont_model2.encode(sentences_a), non_cont_model2.encode(sentences_b)


# Re-read every split with text preprocessing switched on.
df_train = read_sts_csv(dataset_type="train", preprocessing=True, verbose=False)
df_dev = read_sts_csv(dataset_type="dev", preprocessing=True, verbose=False)
df_test = read_sts_csv(dataset_type="test", preprocessing=True, verbose=False)


# feature_1_<dataset_type>, feature_2_<dataset_type> = get_feature_model2(data_frame)
feature_1_train, feature_2_train = get_feature_model2(df_train)
feature_1_dev, feature_2_dev = get_feature_model2(df_dev)
feature_1_test, feature_2_test = get_feature_model2(df_test)


# Each sample's input is the two sentence embeddings placed side by side.
X_train = np.concatenate([feature_1_train, feature_2_train], axis=1)
X_dev = np.concatenate([feature_1_dev, feature_2_dev], axis=1)
X_test = np.concatenate([feature_1_test, feature_2_test], axis=1)

# Regression targets: the similarity scores of each split.
y_train = df_train['score'].to_numpy()
y_dev = df_dev['score'].to_numpy()
y_test = df_test['score'].to_numpy()


# Build the gradient-boosting regressor and fit it on the training split.
print("TRAINING :")
reg2 = GradientBoostingRegressor(random_state=0, verbose=True).fit(X_train, y_train)

# Report the Spearman rank correlation between gold scores and predictions.
print("\nEVALUATION :")
print("Spearman Correlation on Dev Set : ", stats.spearmanr(y_dev, reg2.predict(X_dev)).correlation)
print("Spearman Correlation on Test Set : ", stats.spearmanr(y_test, reg2.predict(X_test)).correlation)

TRAINING :
      Iter       Train Loss   Remaining Time 
         1           2.1042            3.00m
         2           2.0682            2.99m
         3           2.0262            2.96m
         4           1.9845            2.93m
         5           1.9397            2.94m
         6           1.8976            2.91m
         7           1.8495            2.88m
         8           1.8155            2.84m
         9           1.7879            2.81m
        10           1.7494            2.78m
        20           1.4535            2.50m
        30           1.2247            2.16m
        40           1.0729            1.85m
        50           0.9364            1.54m
        60           0.8515            1.24m
        70           0.7770           55.81s
        80           0.7146           37.28s
        90           0.6606           18.66s
       100           0.6025            0.00s

EVALUATION :
Spearman Correlation on Dev Set :  0.7321888818647269
Spearman Correlation

### SAVING ML BASED REGRESSOR MODELS

In [None]:
# Make sure the output directory exists first: open(..., 'wb') does not create
# missing parent directories and would raise FileNotFoundError.
os.makedirs('models', exist_ok=True)

# Save the fitted regressors of configurations 1 and 2.
with open('models/reg.pkl', 'wb') as f:
    pickle.dump(reg, f)

with open('models/reg2.pkl', 'wb') as f:
    pickle.dump(reg2, f)

## CONFIGURATION 3: Fine-Tune a Contextual Embeddings Model: 18 marks
1 Prepare data samples for the DL model to consume. Add the code in `form_data()`. **4 marks**

3 Create the data loader, one each for train/dev/test data_input sample set obtained from `form_input_example()`. **1.5 marks**

4 Initiate `model3` consisting of **at least** the following 3 components - `base_LM`, a `pooling_layer` and a `dense_layer`. Use an appropriate activation function in the dense layer. **At least** one layer of `base_LM` should be set to trainable. **5 marks**

6 Initiate the `loss`. **0.5 marks**

7 Fit the `model3`. Use `NUM_EPOCHS = 2`. **MAX_NUM_EPOCHS allowed will be 3**. **2 marks** 

8 Complete the `get_model_predicts()` to obtain predicted scores for input sentence pairs. **3.5 marks** 

9 Print the correlation scores on the dev and test set predictions. **1.5 mark**

Useful References: https://huggingface.co/blog/how-to-train-sentence-transformers 

In [172]:
def form_data(data_frame):
  """
  Input a data frame and return the dataloader.

  NOTE: not implemented yet. The function has no body beyond this docstring,
  so calling it currently returns None.
  """

def get_model_predicts(data_type, trained_model):
  """
  Input the dataset list and return a list of cosine similarity scores.
  Use the fitted `trained_model` for obtaining encodings.

  NOTE: not implemented yet. The function has no body beyond this docstring,
  so calling it currently returns None. `data_type` is presumably the dataset
  list mentioned above; confirm when implementing.
  """

# dataloader_<dataset_type> = form_data(data_frame)
# base_model = 
# layer_pooling = 
# layer_dense = 
# model3 = 
# loss =

# Fit the model3.
# Print spearman correlation on the predicted output of the dev and test sets.