<a href="https://colab.research.google.com/github/NirantK/Hinglish/blob/sentence-transformer/SentenceTransformers.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# !pip install sentence-transformers
# !pip install nlpaug

In [0]:
# from google.colab import files
# uploaded = files.upload()

Saving train.json to train.json


In [0]:
# import nltk
# nltk.download('wordnet')
# nltk.download('averaged_perceptron_tagger')

In [0]:
# from sklearn.model_selection import train_test_split
# df = pd.read_json('train.json')
# df = df.sample(frac=1)
# train, dev = train_test_split(df, test_size=0.10, random_state=1)
# train.to_json("train.json")
# dev.to_json("dev.json")

# Hinglish Data Reader

In [0]:
from sentence_transformers.readers import InputExample
import pandas as pd
from pathlib import Path
import nlpaug.augmenter.word as naw
from nlpaug.util import Action

class HinglishReader(object):
    """
    Data reader for the Hinglish Twitter sentiment dataset.

    Every row of a JSON data file becomes one ``InputExample`` whose two texts
    are the original tweet and a synonym-augmented copy of it, and whose label
    is the integer-encoded sentiment.
    """

    # Integer class id assigned to each sentiment string found in the data.
    LABEL_MAP = {'neutral': 0, 'negative': 1, 'positive': 2}

    def __init__(self, dataset_folder):
        """
        :param dataset_folder: directory that contains the JSON data files.
        """
        self.dataset_folder = Path(dataset_folder)
        # Word-level synonym replacement backed by WordNet.
        self.aug = naw.SynonymAug(aug_src='wordnet')

    def _augment(self, text):
        """Return a single augmented variant of ``text`` as a string."""
        augmented = self.aug.augment(text)
        # Depending on the installed nlpaug release, augment() yields either a
        # string or a list of strings; normalise to one string so that both
        # entries of InputExample.texts have the same type.
        if isinstance(augmented, (list, tuple)):
            return augmented[0] if augmented else text
        return augmented

    def get_examples(self, filename, max_examples=0):
        """
        Convert the rows of a JSON file into InputExamples.

        :param filename: name of the JSON file inside ``dataset_folder``; it
            must provide the columns ``uid``, ``text`` and ``sentiment``.
        :param max_examples: when greater than 0, only the first
            ``max_examples`` rows are converted; 0 (the default) converts all.
        :return: list of InputExample, one per row, in file order.
        """
        df = pd.read_json(self.dataset_folder / filename)
        if max_examples > 0:
            # Truncate before augmenting so no work is spent on dropped rows.
            df = df.head(max_examples)
        # Sentiment strings missing from LABEL_MAP map to NaN, as before.
        labels = df['sentiment'].map(self.LABEL_MAP)
        return [
            InputExample(guid=uid, texts=[text, self._augment(text)], label=label)
            for uid, text, label in zip(df['uid'], df['text'], labels)
        ]


# Sentence Transformer

In [0]:
"""
Fine-tunes BERT on the Hinglish Twitter sentiment dataset (train.json)
with a softmax loss function. The model is evaluated on the dev split (dev.json)
every `evaluation_steps` training steps (1000 below) and after each epoch.
"""
from torch.utils.data import DataLoader
import math
from sentence_transformers import models, losses
from sentence_transformers import SentencesDataset, LoggingHandler, SentenceTransformer
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator,LabelAccuracyEvaluator
from sentence_transformers.readers import *
import logging
from datetime import datetime

In [0]:
# Emit INFO-level (and above) log records, timestamped, through the
# sentence-transformers LoggingHandler so progress shows up in the cell output.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
    handlers=[LoggingHandler()],
)

In [0]:
# Run configuration: base checkpoint, mini-batch size, and a timestamped
# directory the trained model is written to.
model_name = 'bert-base-uncased'
batch_size = 16
model_save_path = 'output/training_nli_{}-{}'.format(
    model_name, datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
)

In [0]:
# The JSON splits are expected in the current working directory.
hinglish_reader = HinglishReader(dataset_folder='./')

In [0]:
# Token-embedding module: wraps the BERT checkpoint named by `model_name`
# and maps each input token to an embedding.
word_embedding_model = models.BERT(model_name)

2020-01-07 18:02:02 - loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-config.json from cache at /root/.cache/torch/transformers/4dad0251492946e18ac39290fcfe91b89d370fee250efe9521476438fe8ca185.bf3b9ea126d8c0001ee8a1e8b92229871d06d36d8808208cc2449280da87785c
2020-01-07 18:02:02 - Model config {
  "attention_probs_dropout_prob": 0.1,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "is_decoder": false,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "num_labels": 2,
  "output_attentions": false,
  "output_hidden_states": false,
  "output_past": true,
  "pruned_heads": {},
  "torchscript": false,
  "type_vocab_size": 2,
  "use_bfloat16": false,
  "vocab_size": 30522
}

2020-01-07 18:02:02 - loading weights file https://s3.amazonaws.com/models.huggingface.co/b

In [0]:
# Collapse the per-token embeddings into one fixed-size sentence vector.
# Only mean pooling is enabled; CLS-token and max pooling are switched off.
pooling_model = models.Pooling(
    word_embedding_model.get_word_embedding_dimension(),
    pooling_mode_cls_token=False,
    pooling_mode_max_tokens=False,
    pooling_mode_mean_tokens=True,
)

In [0]:
# Chain the modules into one sentence encoder: token embeddings first, then pooling.
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

2020-01-07 18:02:04 - Use pytorch device: cuda


In [0]:
# Tokenise the training split; each example pairs a tweet with its augmented copy.
train_data = SentencesDataset(
    examples=hinglish_reader.get_examples('train.json'),
    model=model,
)

Convert dataset: 100%|██████████| 11860/11860 [00:11<00:00, 1023.95it/s]

2020-01-07 18:02:34 - Num sentences: 11860
2020-01-07 18:02:34 - Sentences 0 longer than max_seqence_length: 0
2020-01-07 18:02:34 - Sentences 1 longer than max_seqence_length: 0





In [0]:
# Shuffled mini-batches for training.
train_dataloader = DataLoader(train_data, batch_size=batch_size, shuffle=True)

# Softmax classification head on top of the sentence embeddings.
train_loss = losses.SoftmaxLoss(
    model=model,
    sentence_embedding_dimension=model.get_sentence_embedding_dimension(),
    num_labels=3,  # neutral / negative / positive
)

2020-01-07 18:02:34 - Softmax loss: #Vectors concatenated: 3


In [0]:
# Training schedule
num_epochs = 1

# Warm the learning rate up over the first 10% of all training steps.
warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1)
logging.info("Warmup-steps: %s", warmup_steps)

2020-01-07 18:02:34 - Warmup-steps: 75


In [0]:
# Build the dev-split loader and the evaluator that is run during training.
# NOTE(review): EmbeddingSimilarityEvaluator reports correlations between
# embedding similarity and the labels, which here are sentiment class ids;
# LabelAccuracyEvaluator (already imported) may be the intended metric - confirm.
dev_data = SentencesDataset(hinglish_reader.get_examples('dev.json'), model=model)
dev_dataloader = DataLoader(dev_data, batch_size=batch_size, shuffle=False)
evaluator = EmbeddingSimilarityEvaluator(dev_dataloader)



Convert dataset: 100%|██████████| 1318/1318 [00:01<00:00, 1011.08it/s]


2020-01-07 18:02:37 - Num sentences: 1318
2020-01-07 18:02:37 - Sentences 0 longer than max_seqence_length: 0
2020-01-07 18:02:37 - Sentences 1 longer than max_seqence_length: 0


In [0]:
# Fine-tune the encoder; the evaluator runs every `evaluation_steps` steps and
# the model is written to `model_save_path`.
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=num_epochs,
    warmup_steps=warmup_steps,
    evaluator=evaluator,
    evaluation_steps=1000,
    output_path=model_save_path,
)

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]
Iteration:   0%|          | 0/742 [00:00<?, ?it/s][A
Iteration:   0%|          | 1/742 [00:00<03:45,  3.29it/s][A
Iteration:   0%|          | 2/742 [00:00<03:34,  3.44it/s][A
Iteration:   0%|          | 3/742 [00:00<03:26,  3.59it/s][A
Iteration:   1%|          | 4/742 [00:01<03:21,  3.65it/s][A
Iteration:   1%|          | 5/742 [00:01<03:23,  3.62it/s][A
Iteration:   1%|          | 6/742 [00:01<03:19,  3.68it/s][A
Iteration:   1%|          | 7/742 [00:01<03:17,  3.72it/s][A
Iteration:   1%|          | 8/742 [00:02<03:13,  3.79it/s][A
Iteration:   1%|          | 9/742 [00:02<03:16,  3.72it/s][A
Iteration:   1%|▏         | 10/742 [00:02<03:15,  3.75it/s][A
Iteration:   1%|▏         | 11/742 [00:02<03:17,  3.71it/s][A
Iteration:   2%|▏         | 12/742 [00:03<03:13,  3.78it/s][A
Iteration:   2%|▏         | 13/742 [00:03<03:09,  3.84it/s][A
Iteration:   2%|▏         | 14/742 [00:03<03:11,  3.80it/s][A
Iteration:   2%|▏         | 

2020-01-07 18:05:57 - Evaluation the model on  dataset after epoch 0:



Convert Evaluating:   5%|▍         | 4/83 [00:00<00:06, 12.98it/s][A
Convert Evaluating:   7%|▋         | 6/83 [00:00<00:05, 12.90it/s][A
Convert Evaluating:  10%|▉         | 8/83 [00:00<00:05, 12.73it/s][A
Convert Evaluating:  12%|█▏        | 10/83 [00:00<00:05, 12.86it/s][A
Convert Evaluating:  14%|█▍        | 12/83 [00:00<00:05, 12.75it/s][A
Convert Evaluating:  17%|█▋        | 14/83 [00:01<00:05, 12.65it/s][A
Convert Evaluating:  19%|█▉        | 16/83 [00:01<00:05, 12.83it/s][A
Convert Evaluating:  22%|██▏       | 18/83 [00:01<00:05, 12.86it/s][A
Convert Evaluating:  24%|██▍       | 20/83 [00:01<00:04, 12.87it/s][A
Convert Evaluating:  27%|██▋       | 22/83 [00:01<00:04, 12.85it/s][A
Convert Evaluating:  29%|██▉       | 24/83 [00:01<00:04, 12.90it/s][A
Convert Evaluating:  31%|███▏      | 26/83 [00:02<00:04, 12.77it/s][A
Convert Evaluating:  34%|███▎      | 28/83 [00:02<00:04, 12.66it/s][A
Convert Evaluating:  36%|███▌      | 30/83 [00:02<00:04, 12.70it/s][A
Convert 

2020-01-07 18:06:04 - Cosine-Similarity :	Pearson: -0.0252	Spearman: 0.0365
2020-01-07 18:06:04 - Manhattan-Distance:	Pearson: -0.0784	Spearman: -0.0541
2020-01-07 18:06:04 - Euclidean-Distance:	Pearson: -0.0803	Spearman: -0.0563
2020-01-07 18:06:04 - Dot-Product-Similarity:	Pearson: 0.3813	Spearman: 0.3411
2020-01-07 18:06:04 - Save model to output/training_nli_bert-base-uncased-2020-01-07_18-02-02
2020-01-07 18:06:04 - Configuration saved in output/training_nli_bert-base-uncased-2020-01-07_18-02-02/0_BERT/config.json


Epoch: 100%|██████████| 1/1 [03:27<00:00, 207.33s/it]

2020-01-07 18:06:05 - Model weights saved in output/training_nli_bert-base-uncased-2020-01-07_18-02-02/0_BERT/pytorch_model.bin





In [0]:
# ---------------------------------------------------------------------------
# Reload the model saved by the training run and score it on the Hinglish
# test split. Note that `model` and `evaluator` are rebound here.
# ---------------------------------------------------------------------------

model = SentenceTransformer(model_save_path)

test_data = SentencesDataset(hinglish_reader.get_examples("test.json"), model=model)
test_dataloader = DataLoader(test_data, batch_size=batch_size, shuffle=False)
evaluator = EmbeddingSimilarityEvaluator(test_dataloader)

# Last expression of the cell, so the returned score is displayed.
model.evaluate(evaluator)

2020-01-07 18:06:05 - Load pretrained SentenceTransformer: output/training_nli_bert-base-uncased-2020-01-07_18-02-02
2020-01-07 18:06:05 - Load SentenceTransformer from folder: output/training_nli_bert-base-uncased-2020-01-07_18-02-02
2020-01-07 18:06:05 - loading configuration file output/training_nli_bert-base-uncased-2020-01-07_18-02-02/0_BERT/config.json
2020-01-07 18:06:05 - Model config {
  "attention_probs_dropout_prob": 0.1,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "is_decoder": false,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "num_labels": 2,
  "output_attentions": false,
  "output_hidden_states": false,
  "output_past": true,
  "pruned_heads": {},
  "torchscript": false,
  "type_vocab_size": 2,
  "use_bfloat16": false,
  "vocab_size": 30522
}

2020-01-07 18:06:05 - loading weights

Convert dataset: 100%|██████████| 3295/3295 [00:03<00:00, 1042.32it/s]
Convert Evaluating:   1%|          | 2/206 [00:00<00:17, 11.45it/s]

2020-01-07 18:06:15 - Num sentences: 3295
2020-01-07 18:06:15 - Sentences 0 longer than max_seqence_length: 0
2020-01-07 18:06:15 - Sentences 1 longer than max_seqence_length: 0
2020-01-07 18:06:15 - Evaluation the model on  dataset:


Convert Evaluating: 100%|██████████| 206/206 [00:16<00:00, 12.99it/s]


2020-01-07 18:06:31 - Cosine-Similarity :	Pearson: -0.0221	Spearman: 0.0428
2020-01-07 18:06:31 - Manhattan-Distance:	Pearson: -0.0691	Spearman: -0.0471
2020-01-07 18:06:31 - Euclidean-Distance:	Pearson: -0.0711	Spearman: -0.0489
2020-01-07 18:06:31 - Dot-Product-Similarity:	Pearson: 0.3755	Spearman: 0.3359


0.3358669790344215

In [0]:
!tar -zcvf model_files.tar.gz output/training_nli_bert-base-uncased-2020-01-07_18-02-02

In [0]:
from google.colab import files

# Hand the model archive to the browser as a download (Colab only).
# Uncomment the lines below to fetch the data splits as well:
# files.download('train.json')
# files.download('dev.json')
files.download('model_files.tar.gz')