<a href="https://colab.research.google.com/github/TING-JHEN/SimCSE_chinese/blob/main/sts_Unsupervised_SimCSE.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

data:  https://github.com/pluto-junzeng/CNSD

In [None]:
#!pip install sentence_transformers

In [None]:
from torch.utils.data import DataLoader
import math
from sentence_transformers import models, losses
from sentence_transformers import LoggingHandler, SentenceTransformer, util, InputExample
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
import logging
import datetime
import os
import gzip
import csv

#### Just some code to print debug information to stdout
logging.basicConfig(format='%(asctime)s - %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S',
                    level=print,
                    handlers=[LoggingHandler()])
#### /print debug information to stdout

# Training parameters
model_name = 'bert-base-chinese'
train_batch_size = 16
num_epochs = 3
max_seq_length = 32

In [None]:
# Save path to store our model
now = datetime.datetime.now(tz=datetime.timezone(datetime.timedelta(hours=8))).strftime("%Y-%m-%d_%H-%M-%S")
model_save_path = 'output/chinese_stsb_simcse-{}-{}-{}'.format(model_name, train_batch_size, now)

# Check if dataset exsist. If not, download and extract it
# sts_dataset_path = 'data/stsbenchmark.tsv.gz'
train_dataset_path = '/content/STS-B.train.data'
dev_dataset_path = '/content/STS-B.valid.data'
test_dataset_path = '/content/STS-B.test.data'

# train_samples is a list of InputExample objects where we pass the same sentence twice to texts, i.e. texts=[sent, sent]
train_samples = []
with open(train_dataset_path, 'r', encoding='utf8') as fIn:
    for line in fIn:
        line = line.strip().split('\t')
        if len(line) == 3:
            train_samples.append(InputExample(texts=[line[0], line[0]]))
            train_samples.append(InputExample(texts=[line[1], line[1]]))

dev_samples = []
with open(dev_dataset_path, 'r', encoding='utf8') as fIn:
    for line in fIn:
        line = line.strip().split('\t')
        score = float(line[2]) / 5.0  # Normalize score to range 0 ... 1
        if len(line) == 3:
            dev_samples.append(InputExample(texts=[line[0], line[1]],label=score))

test_samples = []
with open(test_dataset_path, 'r', encoding='utf8') as fIn:
    for line in fIn:
        line = line.strip().split('\t')
        score = float(line[2]) / 5.0  # Normalize score to range 0 ... 1
        if len(line) == 3:
            test_samples.append(InputExample(texts=[line[0], line[1]],label=score))

i = 87
print('len(train_samples)=',len(train_samples),train_samples[i])
print('len(dev_samples)=',len(dev_samples),dev_samples[i])
print('len(test_samples)=',len(test_samples),test_samples[i])

len(train_samples)= 10462 <InputExample> label: 0, texts: 一个人在弹吉他。; 一个人在弹吉他。
len(dev_samples)= 1458 <InputExample> label: 0.2, texts: 一个女人正在剥一些鱼。; 一个女人正在煎锅上倒一种黄色的混合物。
len(test_samples)= 1361 <InputExample> label: 0.6, texts: 熊猫躺在圆木上。; 一只熊猫躺着。


In [None]:
dev_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(dev_samples, batch_size=train_batch_size, name='sts-dev')
test_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(test_samples, batch_size=train_batch_size, name='sts-test')

In [None]:
# Here we define our SentenceTransformer model
word_embedding_model = models.Transformer(model_name, max_seq_length=max_seq_length)
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
print(model)

# We train our model using the MultipleNegativesRankingLoss
train_dataloader = DataLoader(train_samples, shuffle=True, batch_size=train_batch_size, drop_last=True)
train_loss = losses.MultipleNegativesRankingLoss(model)

warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1)  # 10% of train data for warm-up
evaluation_steps = int(len(train_dataloader) * 0.1) #Evaluate every 10% of the data

print("Training sentences: {}".format(len(train_samples)))
print('evaluation_steps:',evaluation_steps)
print("Warmup-steps: {}".format(warmup_steps))

Some weights of the model checkpoint at bert-base-chinese were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


SentenceTransformer(
  (0): Transformer({'max_seq_length': 32, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
)
Training sentences: 10462
evaluation_steps: 65
Warmup-steps: 196


In [None]:
#print("Performance before training")
#print('dev_eval=',dev_evaluator(model))
#print('test_eval=',test_evaluator(model))

#dev_eval= 0.6437529042748259
#test_eval= 0.5415038524595274

In [None]:
# Train the model
for i in range(num_epochs):
    model.fit(
        train_objectives=[(train_dataloader, train_loss)],
        evaluator=dev_evaluator,
        epochs=1,
        evaluation_steps=evaluation_steps,
        warmup_steps=warmup_steps,
        output_path=model_save_path,
        optimizer_params={'lr': 5e-5},
        use_amp=True         #Set to True, if your GPU supports FP16 cores
    )
    print('epochs=',i+1)
    print('dev_eval=',dev_evaluator(model))
    print('test_eval=',test_evaluator(model))

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/653 [00:00<?, ?it/s]

epochs= 1
dev_eval= 0.7231538061537403
test_eval= 0.6787762067574322


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/653 [00:00<?, ?it/s]

epochs= 2
dev_eval= 0.7398828073038063
test_eval= 0.689681852141984


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/653 [00:00<?, ?it/s]

epochs= 3
dev_eval= 0.7256450940070871
test_eval= 0.691419616829035


In [None]:
##############################################################################
#
# Load the stored model and evaluate its performance on STS benchmark dataset
#
##############################################################################


#model = SentenceTransformer(model_save_path)
#test_eval = test_evaluator(model, output_path=model_save_path)
#test_eval

In [None]:
#余弦相似性
import numpy as np
embeddings1 = model.encode("熊猫躺在圆木上。")
embeddings2 = model.encode("一只熊猫躺着。")

cos_sim = embeddings1.dot(embeddings2) / (np.linalg.norm(embeddings1) * np.linalg.norm(embeddings2))
print(cos_sim)

0.7320684
