## 0. Set the environment

Set up spark session

In [6]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://dlcdn.apache.org/spark/spark-3.2.0/spark-3.2.0-bin-hadoop3.2.tgz
!tar xf spark-3.2.0-bin-hadoop3.2.tgz
!pip install -q findspark
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.2.0-bin-hadoop3.2"
!pip install pyspark




Install sentence-transformers

In [7]:
!pip install sentence-transformers



In [123]:
import findspark
findspark.init()
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType
spark = SparkSession.builder.getOrCreate()

## 1. Regression

### Prepare the path and the model

In [4]:
#sts = spark.read.format("csv").option("header", "true").option("inferSchema", "false").load("/content/drive/MyDrive/Colab Notebooks/datasets/sts-dev.csv")

In [9]:
# Load the Drive helper and mount
from google.colab import drive

# This will prompt for authorization.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [10]:
model_name = 'bert-base-uncased'
columns = ["genre", "file", "years", "No", "score", "sentence1", "sentence2"]

In [119]:
## Set the save path
from datetime import datetime
model_save_path = 'MyDrive/output/training_stsbenchmark_'+model_name.replace("/", "-")+'-'+datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
model_save_path_classification = 'MyDrive/output/training_snli_'+model_name.replace("/", "-")+'-'+datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
model_save_path_combination = 'MyDrive/output/training_combi_'+model_name.replace("/", "-")+'-'+datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

In [12]:
# Use Huggingface/transformers model for mapping tokens to embeddings
from sentence_transformers import SentenceTransformer,  LoggingHandler, losses, models, util
word_embedding_model = models.Transformer(model_name)

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

###Load the train, dev and test data

In [129]:
train_path = "/content/drive/MyDrive/Colab Notebooks/datasets/sts-train.csv"
dev_path = '/content/drive/MyDrive/Colab Notebooks/datasets/sts-dev.csv'
test_path = '/content/drive/MyDrive/Colab Notebooks/datasets/sts-test.csv'

###Data cleaning

In [138]:
schema = StructType([
    StructField("genre", StringType(), True),
    StructField("file", StringType(), True),
    StructField("years", StringType(), True),
    StructField("No", IntegerType(), True),
    StructField("score", FloatType(), True),
    StructField("sentence1", StringType(), True),
    StructField("sentence2", StringType(), True)])
train = spark.read.csv(train_path, sep ='\t', header=False, schema=schema)
test = spark.read.csv(dev_path, sep ='\t', header=False, schema=schema)
dev = spark.read.csv(test_path, sep ='\t', header=False, schema=schema)
for i in train:
  print(i, train.filter(i.isNull()).count())

Column<'genre'> 0
Column<'file'> 0
Column<'years'> 0
Column<'No'> 0
Column<'score'> 0
Column<'sentence1'> 0
Column<'sentence2'> 6


In [140]:
train.filter(train['Sentence2'].isNull()).show()

+----------+----------+---------+---+-----+--------------------+---------+
|     genre|      file|    years| No|score|           sentence1|sentence2|
+----------+----------+---------+---+-----+--------------------+---------+
|main-forum|deft-forum|     2014|  1|  0.8|Then the captain ...|     null|
|main-forum|deft-forum|     2014|103|  1.8|Oh, you're such a...|     null|
| main-news|    MSRpar| 2012test|252|  2.2|"And about eight ...|     null|
| main-news|    MSRpar| 2012test|684|  4.0|"Right from the b...|     null|
| main-news|    MSRpar|2012train|454|  3.6|Unlike many early...|     null|
| main-news|    MSRpar|2012train|531|  4.0|It was a final te...|     null|
+----------+----------+---------+---+-----+--------------------+---------+



In [134]:
for i in dev:
  print(i, dev.filter(i.isNull()).count())

Column<'genre'> 0
Column<'file'> 0
Column<'years'> 0
Column<'No'> 0
Column<'score'> 0
Column<'sentence1'> 0
Column<'sentence2'> 3


In [135]:
for i in test:
  print(i, test.filter(i.isNull()).count())

Column<'genre'> 0
Column<'file'> 0
Column<'years'> 0
Column<'No'> 0
Column<'score'> 0
Column<'sentence1'> 0
Column<'sentence2'> 3


####Drop the null value of the dataset. 

In [155]:
from pyspark.sql import SparkSession
train.na.drop()
dev.na.drop()
test.na.drop()

DataFrame[genre: string, file: string, years: string, No: int, score: float, sentence1: string, sentence2: string]

###Load the dataset

In [158]:
pd_train = train.select("sentence1","sentence2","score").toPandas()
pd_dev = dev.select("sentence1","sentence2","score").toPandas()
pd_test = test.select("sentence1","sentence2","score").toPandas()

In [13]:
import csv
from sentence_transformers import SentenceTransformer, InputExample

train_path = "/content/drive/MyDrive/Colab Notebooks/datasets/sts-train.csv"
os.path.isfile(train_path)

train_samples = []

with open(train_path, newline='') as train:
    sts_train = csv.DictReader(train, delimiter='\t', fieldnames=columns, quoting=csv.QUOTE_NONE)
    for row in sts_train:
        score = float(row['score']) / 2.5 - 1 # range -1 ... 1
        input_example = InputExample(texts=[row['sentence1'], row['sentence2']], label=score)
        train_samples.append(input_example)



In [127]:
dev_path = '/content/drive/MyDrive/Colab Notebooks/datasets/sts-dev.csv'
os.path.isfile(dev_path)

dev_samples = []

with open(dev_path, newline='') as dev:
    sts_dev = csv.DictReader(dev, delimiter='\t', fieldnames=columns, quoting=csv.QUOTE_NONE)
    for row in sts_dev:
        score = float(row['score']) / 2.5 - 1 # range -1 ... 1
        input_example = InputExample(texts=[row['sentence1'], row['sentence2']], label=score)
        dev_samples.append(input_example)


for x in range(5):
    print (dev_samples[x]),

<InputExample> label: 1.0, texts: A man with a hard hat is dancing.; A man wearing a hard hat is dancing.
<InputExample> label: 0.8999999999999999, texts: A young child is riding a horse.; A child is riding a horse.
<InputExample> label: 1.0, texts: A man is feeding a mouse to a snake.; The man is feeding a mouse to the snake.
<InputExample> label: -0.040000000000000036, texts: A woman is playing the guitar.; A man is playing guitar.
<InputExample> label: 0.10000000000000009, texts: A woman is playing the flute.; A man is playing a flute.


In [14]:
for x in range(5):
    print (train_samples[x]),

<InputExample> label: 1.0, texts: A plane is taking off.; An air plane is taking off.
<InputExample> label: 0.52, texts: A man is playing a large flute.; A man is playing a flute.
<InputExample> label: 0.52, texts: A man is spreading shreded cheese on a pizza.; A man is spreading shredded cheese on an uncooked pizza.
<InputExample> label: 0.040000000000000036, texts: Three men are playing chess.; Two men are playing chess.
<InputExample> label: 0.7, texts: A man is playing the cello.; A man seated is playing the cello.


In [16]:
test_path = '/content/drive/MyDrive/Colab Notebooks/datasets/sts-test.csv'
os.path.isfile(test_path)

test_samples = []

with open(test_path, newline='') as test:
    sts_test = csv.DictReader(test, delimiter='\t', fieldnames=columns, quoting=csv.QUOTE_NONE)
    for row in sts_test:
        score = float(row['score']) / 2.5 - 1 # range from -1 to 1
        input_example = InputExample(texts=[row['sentence1'], row['sentence2']], label=score)
        test_samples.append(input_example)


for x in range(5):
    print (test_samples[x])

<InputExample> label: 0.0, texts: A girl is styling her hair.; A girl is brushing her hair.
<InputExample> label: 0.43999999999999995, texts: A group of men play soccer on the beach.; A group of boys are playing soccer on the beach.
<InputExample> label: 1.0, texts: One woman is measuring another woman's ankle.; A woman measures another woman's ankle.
<InputExample> label: 0.6800000000000002, texts: A man is cutting up a cucumber.; A man is slicing a cucumber.
<InputExample> label: -0.4, texts: A man is playing a harp.; A man is playing a keyboard.


In [17]:
train_batch_size = 16
num_epochs = 4

# Apply mean pooling to get one fixed sized sentence vector
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(),
                               pooling_mode_mean_tokens=True,
                               pooling_mode_cls_token=False,
                               pooling_mode_max_tokens=False)

model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

###Define the parameters of the model

In [18]:
from torch.utils.data import DataLoader
train_dataloader = DataLoader(train_samples, shuffle=True, batch_size=train_batch_size)
train_loss = losses.CosineSimilarityLoss(model=model)


In [19]:
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
evaluator = EmbeddingSimilarityEvaluator.from_input_examples(dev_samples, name='sts-dev')

In [20]:
import math
warmup_steps = math.ceil(len(train_dataloader) * num_epochs  * 0.1) #10% of train data for warm-up

In [21]:
model.fit(train_objectives=[(train_dataloader, train_loss)],
          evaluator=evaluator,
          epochs=num_epochs,
          evaluation_steps=1000,
          warmup_steps=warmup_steps,
          output_path=model_save_path)

Epoch:   0%|          | 0/4 [00:00<?, ?it/s]

Iteration:   0%|          | 0/360 [00:00<?, ?it/s]

Iteration:   0%|          | 0/360 [00:00<?, ?it/s]

Iteration:   0%|          | 0/360 [00:00<?, ?it/s]

Iteration:   0%|          | 0/360 [00:00<?, ?it/s]

###Evaluate on the STS dataset. 

In [22]:
model = SentenceTransformer(model_save_path)
test_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(test_samples, name='sts-test')
model.evaluate(test_evaluator)

0.68563446632948

0.68563446632948

Spearman correlation

In [161]:
rg_test = pd.read_csv(test_path, sep='\t', header=None, error_bad_lines=False, quoting=csv.QUOTE_NONE)
rg_test.columns = columns
rg_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1095 entries, 0 to 1094
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   genre      1095 non-null   object 
 1   file       1095 non-null   object 
 2   years      1095 non-null   object 
 3   No         1095 non-null   int64  
 4   score      1095 non-null   float64
 5   sentence1  1095 non-null   object 
 6   sentence2  1095 non-null   object 
dtypes: float64(1), int64(1), object(5)
memory usage: 60.0+ KB


b'Skipping line 626: expected 7 fields, saw 9\nSkipping line 627: expected 7 fields, saw 9\nSkipping line 628: expected 7 fields, saw 9\nSkipping line 629: expected 7 fields, saw 9\nSkipping line 630: expected 7 fields, saw 9\nSkipping line 631: expected 7 fields, saw 9\nSkipping line 632: expected 7 fields, saw 9\nSkipping line 633: expected 7 fields, saw 9\nSkipping line 634: expected 7 fields, saw 9\nSkipping line 635: expected 7 fields, saw 9\nSkipping line 636: expected 7 fields, saw 9\nSkipping line 637: expected 7 fields, saw 9\nSkipping line 638: expected 7 fields, saw 9\nSkipping line 639: expected 7 fields, saw 9\nSkipping line 640: expected 7 fields, saw 9\nSkipping line 641: expected 7 fields, saw 9\nSkipping line 642: expected 7 fields, saw 9\nSkipping line 643: expected 7 fields, saw 9\nSkipping line 644: expected 7 fields, saw 9\nSkipping line 645: expected 7 fields, saw 9\nSkipping line 646: expected 7 fields, saw 9\nSkipping line 647: expected 7 fields, saw 9\nSkipping

In [163]:
sentence1_embeded = model.encode(rg_test['sentence1'], convert_to_numpy=True, batch_size=train_batch_size)
sentence2_embeded = model.encode(rg_test['sentence2'], convert_to_numpy=True, batch_size=train_batch_size)

In [172]:
import sklearn
cos_similarity = 1 - sklearn.metrics.pairwise.paired_cosine_distances(sentence1_embeded, sentence2_embeded)
print(cos_similarity)

[0.82460696 0.97656184 0.9657241  ... 0.9431007  0.9777431  0.9825696 ]


In [173]:
from scipy import stats
Spear = stats.spearmanr(cos_similarity, rg_test["score"])
print(Spear)

SpearmanrResult(correlation=0.7622869769469252, pvalue=9.861815796866046e-209)


##2. Classification

###Load the train, dev and test data

In [24]:
import pandas as pd
import os
train_path_classification = "/content/drive/MyDrive/Colab Notebooks/datasets/snli_1.0_train.jsonl"
os.path.isfile(train_path_classification)
snli_train = pd.read_json(train_path_classification, lines=True)

In [174]:
snli_train.info


<bound method DataFrame.info of        annotator_labels  ...                                    sentence2_parse
0             [neutral]  ...  (ROOT (S (NP (DT A) (NN person)) (VP (VBZ is) ...
1       [contradiction]  ...  (ROOT (S (NP (DT A) (NN person)) (VP (VBZ is) ...
2          [entailment]  ...  (ROOT (S (NP (DT A) (NN person)) (VP (VBZ is) ...
3             [neutral]  ...  (ROOT (S (NP (PRP They)) (VP (VBP are) (VP (VB...
4          [entailment]  ...  (ROOT (S (NP (EX There)) (VP (VBP are) (NP (NN...
...                 ...  ...                                                ...
550147  [contradiction]  ...  (ROOT (S (NP (CD four) (NNS kids)) (VP (VBD wo...
550148        [neutral]  ...  (ROOT (S (S (NP (CD four) (JJ homeless) (NNS c...
550149        [neutral]  ...  (ROOT (S (NP (NP (DT A) (NN man)) (PP (IN in) ...
550150  [contradiction]  ...  (ROOT (S (NP (NP (DT A) (NN man)) (PP (IN in) ...
550151     [entailment]  ...  (ROOT (S (PP (IN On) (NP (DT the) (JJ beautifu...

[550152

In [175]:
snli_train.isnull().sum().sum()

0

In [28]:
train_samples_classification = []
for i in range(len(snli_train)):
    if snli_train['gold_label'][i] == 'entailment':
        label = 0
    elif snli_train['gold_label'][i] == 'neutral':
        label = 1
    elif snli_train['gold_label'][i] == 'contradiction':
        label = 2
    input_example = InputExample(texts=[snli_train['sentence1'][i], snli_train['sentence2'][i]], label=label)
    train_samples_classification.append(input_example)

In [29]:
for x in range(6):
    print (train_samples_classification[x]),

<InputExample> label: 1, texts: A person on a horse jumps over a broken down airplane.; A person is training his horse for a competition.
<InputExample> label: 2, texts: A person on a horse jumps over a broken down airplane.; A person is at a diner, ordering an omelette.
<InputExample> label: 0, texts: A person on a horse jumps over a broken down airplane.; A person is outdoors, on a horse.
<InputExample> label: 1, texts: Children smiling and waving at camera; They are smiling at their parents
<InputExample> label: 0, texts: Children smiling and waving at camera; There are children present
<InputExample> label: 2, texts: Children smiling and waving at camera; The kids are frowning


In [30]:
dev_path = "/content/drive/MyDrive/Colab Notebooks/datasets/snli_1.0_dev.jsonl"
os.path.isfile(dev_path)
snli_dev = pd.read_json(dev_path, lines=True)

In [176]:
snli_dev.isnull().sum().sum()

0

In [31]:
dev_samples_classification = []
for i in range(len(snli_dev)):
    if snli_dev['gold_label'][i] == 'entailment':
        label = 0
    elif snli_dev['gold_label'][i] == 'neutral':
        label = 1
    elif snli_dev['gold_label'][i] == 'contradiction':
        label = 2
    input_example = InputExample(texts=[snli_dev['sentence1'][i], snli_dev['sentence2'][i]], label=label)
    dev_samples_classification.append(input_example)

In [32]:
for x in range(6):
    print (dev_samples_classification[x]),

<InputExample> label: 1, texts: Two women are embracing while holding to go packages.; The sisters are hugging goodbye while holding to go packages after just eating lunch.
<InputExample> label: 0, texts: Two women are embracing while holding to go packages.; Two woman are holding packages.
<InputExample> label: 2, texts: Two women are embracing while holding to go packages.; The men are fighting outside a deli.
<InputExample> label: 0, texts: Two young children in blue jerseys, one with the number 9 and one with the number 2 are standing on wooden steps in a bathroom and washing their hands in a sink.; Two kids in numbered jerseys wash their hands.
<InputExample> label: 1, texts: Two young children in blue jerseys, one with the number 9 and one with the number 2 are standing on wooden steps in a bathroom and washing their hands in a sink.; Two kids at a ballgame wash their hands.
<InputExample> label: 2, texts: Two young children in blue jerseys, one with the number 9 and one with the

In [33]:
test_path = "/content/drive/MyDrive/Colab Notebooks/datasets/snli_1.0_test.jsonl"
os.path.isfile(test_path)
snli_test = pd.read_json(test_path, lines=True)
snli_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 10 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   annotator_labels        10000 non-null  object
 1   captionID               10000 non-null  object
 2   gold_label              10000 non-null  object
 3   pairID                  10000 non-null  object
 4   sentence1               10000 non-null  object
 5   sentence1_binary_parse  10000 non-null  object
 6   sentence1_parse         10000 non-null  object
 7   sentence2               10000 non-null  object
 8   sentence2_binary_parse  10000 non-null  object
 9   sentence2_parse         10000 non-null  object
dtypes: object(10)
memory usage: 781.4+ KB


In [34]:
test_samples_classification = []
for i in range(1500):
    if snli_test['gold_label'][i] == 'entailment':
        label = 0
    elif snli_test['gold_label'][i] == 'neutral':
        label = 1
    elif snli_test['gold_label'][i] == 'contradiction':
        label = 2
    input_example = InputExample(texts=[snli_test['sentence1'][i], snli_test['sentence2'][i]], label=label)
    test_samples_classification.append(input_example)

In [35]:
print(len(train_samples_classification))

550152


In [36]:
for x in range(6):
    print (test_samples_classification[x]),

<InputExample> label: 1, texts: This church choir sings to the masses as they sing joyous songs from the book at a church.; The church has cracks in the ceiling.
<InputExample> label: 0, texts: This church choir sings to the masses as they sing joyous songs from the book at a church.; The church is filled with song.
<InputExample> label: 2, texts: This church choir sings to the masses as they sing joyous songs from the book at a church.; A choir singing at a baseball game.
<InputExample> label: 1, texts: A woman with a green headscarf, blue shirt and a very big grin.; The woman is young.
<InputExample> label: 0, texts: A woman with a green headscarf, blue shirt and a very big grin.; The woman is very happy.
<InputExample> label: 2, texts: A woman with a green headscarf, blue shirt and a very big grin.; The woman has been shot.


As there is no NaN value, we don't need to clean the datasets.

###Define the parameters of the model

In [43]:
train_batch_size = 16
num_epochs = 2

# Apply mean pooling to get one fixed sized sentence vector
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(),
                               pooling_mode_mean_tokens=True,
                               pooling_mode_cls_token=False,
                               pooling_mode_max_tokens=False)

model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

In [38]:
from torch.utils.data import DataLoader
train_dataloader_classification = DataLoader(train_samples_classification, shuffle=True, batch_size=train_batch_size)
train_loss_classification = losses.SoftmaxLoss(model=model,sentence_embedding_dimension=model.get_sentence_embedding_dimension(),num_labels=3)

In [39]:
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
evaluator_classification = EmbeddingSimilarityEvaluator.from_input_examples(dev_samples_classification, name='snli_dev')

In [40]:
import math
warmup_steps_classification = math.ceil(len(train_dataloader_classification) * num_epochs  * 0.1) #10% of train data for warm-up

In [44]:
model.fit(train_objectives=[(train_dataloader_classification, train_loss_classification)],
          evaluator=evaluator_classification,
          epochs=num_epochs,
          evaluation_steps=1000,
          warmup_steps=warmup_steps_classification,
          output_path=model_save_path_classification)

Epoch:   0%|          | 0/2 [00:00<?, ?it/s]

Iteration:   0%|          | 0/34385 [00:00<?, ?it/s]

Iteration:   0%|          | 0/34385 [00:00<?, ?it/s]

###Evaluation on the STS dataset

In [49]:
model_classification = SentenceTransformer(model_save_path_classification)
test_evaluator_classification = EmbeddingSimilarityEvaluator.from_input_examples(test_samples, batch_size=train_batch_size, name='snli-test')
model_classification.evaluate(test_evaluator_classification)

0.6368268906555097

0.6368268906555097

In [183]:
sentence1_embeded_classification = model_classification.encode(rg_test['sentence1'], convert_to_numpy=True, batch_size=train_batch_size)
sentence2_embeded_classification  = model_classification.encode(rg_test['sentence2'], convert_to_numpy=True, batch_size=train_batch_size)

In [184]:
cos_sim_snli = 1 - sklearn.metrics.pairwise.paired_cosine_distances(sentence1_embeded_classification, sentence2_embeded_classification)
print(cos_sim_snli)

[0.64237565 0.91269153 0.7664648  ... 0.6861202  0.7428557  0.9462644 ]


In [182]:
print(rg_test["score"])

0       2.5
1       3.6
2       5.0
3       4.2
4       1.5
       ... 
1090    0.4
1091    1.4
1092    4.8
1093    4.4
1094    5.0
Name: score, Length: 1095, dtype: float64


In [186]:
spearman_classfication = stats.spearmanr(cos_sim_snli, rg_test["score"])
print(spearman_classfication)

SpearmanrResult(correlation=0.6427276700539001, pvalue=1.2436257590791855e-128)


SpearmanrResult(correlation=0.6427276700539001, pvalue=1.2436257590791855e-128)

## 3.Combination

In this section, we use the previous model and tune it with the STS dataset.  

In [50]:
model_combination = SentenceTransformer(model_save_path_classification)

In [51]:
train_dataloader_combination = DataLoader(train_samples_classification, shuffle=True, batch_size=train_batch_size)
train_loss_combination = losses.SoftmaxLoss(model=model_combination,sentence_embedding_dimension=model_combination.get_sentence_embedding_dimension(),num_labels=3)
evaluator_combination = EmbeddingSimilarityEvaluator.from_input_examples(dev_samples, name='sts_dev')

In [52]:
model_combination.fit(train_objectives=[(train_dataloader_combination, train_loss_combination)],
          evaluator=evaluator_combination,
          epochs=1,
          evaluation_steps=1000,
          warmup_steps=warmup_steps_classification,
          output_path=model_save_path_combination)

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/34385 [00:00<?, ?it/s]

In [53]:
test_evaluator_combination = EmbeddingSimilarityEvaluator.from_input_examples(test_samples, batch_size=train_batch_size, name='combi-test')
model_combination.evaluate(test_evaluator_combination)

0.7268743676596611

In [187]:
sentence1_embeded_combination = model_combination.encode(rg_test['sentence1'], convert_to_numpy=True, batch_size=train_batch_size)
sentence2_embeded_combination  = model_combination.encode(rg_test['sentence2'], convert_to_numpy=True, batch_size=train_batch_size)

In [188]:
cos_sim_combi = 1 - sklearn.metrics.pairwise.paired_cosine_distances(sentence1_embeded_combination, sentence2_embeded_combination)
print(cos_sim_combi)

[0.9311626  0.9666035  0.9671594  ... 0.91042995 0.9818618  0.9823825 ]


In [189]:
spearman_combination = stats.spearmanr(cos_sim_combi, rg_test["score"])
print(spearman_combination)

SpearmanrResult(correlation=0.7656690827081282, pvalue=1.1117872348766913e-211)


As we can see, the evaluation improves from 0.6368268906555097 to 0.7268743676596611, and Spearman correlation improves from 0.6427276700539001 to 0.7656690827081282. So we prove the hypothesis. 

## 4. Semantic Search

In [54]:
news_train_path = "/content/drive/MyDrive/Colab Notebooks/datasets/News_Category_Dataset_v2.json"
os.path.isfile(news_train_path)

news_trainsets = pd.read_json(news_train_path, lines=True)


In [55]:
news_trainsets.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200853 entries, 0 to 200852
Data columns (total 6 columns):
 #   Column             Non-Null Count   Dtype         
---  ------             --------------   -----         
 0   category           200853 non-null  object        
 1   headline           200853 non-null  object        
 2   authors            200853 non-null  object        
 3   link               200853 non-null  object        
 4   short_description  200853 non-null  object        
 5   date               200853 non-null  datetime64[ns]
dtypes: datetime64[ns](1), object(5)
memory usage: 9.2+ MB


In [56]:
headlines = news_trainsets["headline"]

In [57]:
print(news_trainsets)

             category  ...       date
0               CRIME  ... 2018-05-26
1       ENTERTAINMENT  ... 2018-05-26
2       ENTERTAINMENT  ... 2018-05-26
3       ENTERTAINMENT  ... 2018-05-26
4       ENTERTAINMENT  ... 2018-05-26
...               ...  ...        ...
200848           TECH  ... 2012-01-28
200849         SPORTS  ... 2012-01-28
200850         SPORTS  ... 2012-01-28
200851         SPORTS  ... 2012-01-28
200852         SPORTS  ... 2012-01-28

[200853 rows x 6 columns]


In [58]:
headlines[0]

'There Were 2 Mass Shootings In Texas Last Week, But Only 1 On TV'

In [65]:
embedder = SentenceTransformer(model_save_path_combination)
news_embeddings = embedder.encode(headlines, convert_to_tensor=True, show_progress_bar=True)

Batches:   0%|          | 0/6277 [00:00<?, ?it/s]

In [115]:
keyword = input("Please input your keyword\n")

Please input your keyword
Happy New Year


In [116]:
keyword_embedding = embedder.encode(keyword, convert_to_tensor=True)
#print(keyword_embedding)

In [117]:
import torch
from sentence_transformers import util
util.pytorch_cos_sim
# We use cosine-similarity and torch.topk to find the highest 5 scores
#for k in range(5):
cos_score =  util.pytorch_cos_sim(keyword_embedding, news_embeddings)[0]
top_results = torch.topk(cos_score, k=5)

print("\n\n======================\n\n")
print("Keyword:", keyword)
print("\nTop 5 most similar sentences in corpus:")
print(top_results[1])







Keyword: Happy New Year

Top 5 most similar sentences in corpus:
tensor([169486, 134960, 102600, 102660, 134652], device='cuda:0')


In [118]:
hits = util.semantic_search(keyword_embedding, news_embeddings, top_k=5)
hits = hits[0]      #Get the hits for the first query
for hit in hits:
    print(headlines[hit['corpus_id']], "(Score: {:.4f})".format(hit['score']))
    print(hit['corpus_id'])

Happy New Year! (Score: 0.9920)
169486
Welcome Happiness This New Year (Score: 0.9645)
134960
Happy New Year, Here I Come... (Score: 0.9488)
102600
Happy Economic New Year! (Score: 0.9475)
102660
A VERY Prosperous New Year! (Score: 0.9384)
134652


We input "Merry Christmas" and we can find top 5 most similar sentences from the corpus. 