In [1]:
import pandas as pd
import numpy as np

from tqdm import tqdm

import seaborn as sns
from matplotlib import pyplot as plt
import plotly.express as px

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.manifold import TSNE

import torch

!pip install -U sentence-transformers
from sentence_transformers import SentenceTransformer, util



Collecting sentence-transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l- \ done
Building wheels for collected packages: sentence-transformers
  Building wheel for sentence-transformers (setup.py) ... [?25l- \ | done
[?25h  Created wheel for sentence-transformers: filename=sentence_transformers-2.2.2-py3-none-any.whl size=125938 sha256=6e9565dd93cb072b877176c8979e2b7f6b7e7a2beece7382a4a566e95df5e86c
  Stored in directory: /root/.cache/pip/wheels/62/f2/10/1e606fd5f02395388f74e7462910fe851042f97238cbbd902f
Successfully built sentence-transformers
Installing collected packages: sentence-transformers
Successfully installed sentence-transformers-2.2.2
[0m

In [2]:
# Read the CSV and pull out the utterance column (X) and the label column (y)
dataset = pd.read_csv('../input/reduced-edos/EDOS 1M - 1.csv')
X = dataset["uttr"]
y = dataset["eb+_emot"]

# Preview each series: first five rows, then the container type
for column in (X, y):
    print(column.head(5))
    print(type(column), end="\n\n")

# Sorted array of the distinct label values
classes = np.sort(y.unique())
classes

0                   Damn ! What the hell is going on ?
1                                       Fucking pigs .
2               We 'll soon leave . I 'm also fed up .
3    Oh , what the fuck ? ! Is this another dream ?...
4    Are you mad ? We can hardly breathe . We 'll b...
Name: uttr, dtype: object
<class 'pandas.core.series.Series'>

0    angry
1    angry
2    angry
3    angry
4    angry
Name: eb+_emot, dtype: object
<class 'pandas.core.series.Series'>



array(['acknowledging', 'afraid', 'agreeing', 'angry', 'annoyed',
       'anticipating', 'anxious', 'apprehensive', 'ashamed', 'caring',
       'confident', 'consoling', 'content', 'devastated', 'disappointed',
       'disgusted', 'embarrassed', 'encouraging', 'excited', 'faithful',
       'furious', 'grateful', 'guilty', 'hopeful', 'impressed', 'jealous',
       'joyful', 'lonely', 'neutral', 'nostalgic', 'prepared', 'proud',
       'questioning', 'sad', 'sentimental', 'suggesting', 'surprised',
       'sympathizing', 'terrified', 'trusting', 'wishing'], dtype=object)

# Preparation

In [3]:
# train / validation split, stratified on the label; the fixed seed makes the
# split identical after every kernel restart
train_X, valid_X, train_y, valid_y = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)

# redefine valid_y in order to contain integers: each label becomes its
# position in the sorted `classes` array. Looking the label up in `classes`
# (instead of one-hot encoding valid_y on its own) keeps the integer codes
# aligned with `classes` even if some class were missing from the validation part.
valid_y = np.searchsorted(classes, valid_y.to_numpy())

print("Training size: ", len(train_X))
print("validation size: ", len(valid_X))

Training size:  246188
validation size:  105510


In [4]:
# support function to compute top-k accuracy
def top_k_accuracy(true_y, pred_y, k):
    """Return a printable top-k accuracy for a matrix of per-class scores.

    Parameters
    ----------
    true_y : array-like of int, shape (n_samples,)
        Ground-truth class index of each sample.
    pred_y : array-like, shape (n_samples, n_classes)
        Score of every class for each sample; a higher score ranks higher.
    k : int
        Number of best-scoring classes per sample that count as a hit.
        Must be at least 1.

    Returns
    -------
    str
        The text ``"Top-{k} accuracy: "`` followed by the fraction of samples
        whose true class is among their k highest-scoring classes.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1.
    """
    if k < 1:
        # Without this guard the slice [:, -0:] would keep every column and
        # report a meaningless accuracy instead of failing.
        raise ValueError(f"k must be a positive integer, got {k}")

    # argsort is ascending, so the last k columns hold the k best class indices
    top_k_indices = np.argsort(pred_y, axis=1)[:, -k:]

    # Column vector of true labels so it broadcasts against the (n_samples, k) indices
    true_labels = np.asarray(true_y)
    true_column = true_labels.reshape(true_labels.shape[0], 1)

    # A sample is a hit when any of its top-k indices equals its true label
    hits = np.any(top_k_indices == true_column, axis=1)
    accuracy = np.mean(hits)
    return f"Top-{k} accuracy: " + str(accuracy)

In [5]:
# support function to print TSNE representation
# fixed seed so the 3D projections are the same from one run to the next
tsne = TSNE(n_components=3, n_iter=250, random_state=42)

# One distinct colour per class: "husl" spreads len(classes) hues evenly, whereas
# asking the default palette for more colours than it holds makes it cycle, so
# several classes would end up sharing a colour.
palette = sns.color_palette("husl", len(classes))
colors = [f'rgb({int(r * 255)},{int(g * 255)},{int(b * 255)})' for r, g, b in palette]
color_discrete_map = dict(zip(classes, colors))

def plot_tsne(tsne_embedding, title=None):
    """Display a 3D scatter plot of a t-SNE projection, one point per class.

    tsne_embedding is transposed and unpacked into three coordinate arrays, so
    it must have exactly three columns. Points are labelled and coloured with
    the module-level `classes` and `color_discrete_map`. The figure is shown
    and nothing is returned.
    """
    xs, ys, zs = np.transpose(tsne_embedding)
    marker_style = dict(size=3, line=dict(width=2))
    fig = px.scatter_3d(
        x=xs,
        y=ys,
        z=zs,
        title=title,
        text=classes,
        color=classes,
        color_discrete_map=color_discrete_map,
    )
    fig.update_traces(marker=marker_style)
    fig.show()

# Assessing semantic similarity through sentence embeddings

In [6]:
# Pretrained sentence encoder
model = SentenceTransformer('all-distilroberta-v1')

# get validation embeddings, then move the resulting tensor to the CPU
validation_embeddings = model.encode(
    valid_X.to_numpy(),
    convert_to_tensor=True,
)
validation_embeddings = validation_embeddings.cpu()
validation_embeddings.size()

Downloading (…)87e68/.gitattributes:   0%|          | 0.00/737 [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)5afc487e68/README.md:   0%|          | 0.00/10.3k [00:00<?, ?B/s]

Downloading (…)fc487e68/config.json:   0%|          | 0.00/653 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading (…)e68/data_config.json:   0%|          | 0.00/15.7k [00:00<?, ?B/s]

Downloading (…)afc487e68/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/329M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading (…)87e68/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/333 [00:00<?, ?B/s]

Downloading (…)7e68/train_script.py:   0%|          | 0.00/13.1k [00:00<?, ?B/s]

Downloading (…)afc487e68/vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading (…)c487e68/modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

Batches:   0%|          | 0/3298 [00:00<?, ?it/s]

torch.Size([105510, 768])

## Using Classes embeddings

In [7]:
# get classes embeddings: the label names themselves are passed to the encoder
classes_embeddings = model.encode(classes, convert_to_tensor=True)
classes_embeddings = classes_embeddings.cpu()
classes_embeddings.size()

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

torch.Size([41, 768])

In [8]:
# get tsne representation of classes (t-SNE is fitted on a NumPy copy of the tensor)
tsne_embedding = tsne.fit_transform(
    classes_embeddings.cpu().numpy()
)
plot_tsne(tsne_embedding=tsne_embedding, title="Classes embeddings")

In [9]:
# compute predictions: cosine similarity between every validation embedding
# (rows) and every class embedding (columns). util.cos_sim returns the full
# pairwise matrix, so one call does the work of a per-sentence Python loop.
similarity_matrix = util.cos_sim(validation_embeddings, classes_embeddings)
similarity_matrix.shape

torch.Size([105510, 41])

In [10]:
# validate the model: the predicted class is the column with the highest similarity
validation_preds = similarity_matrix.cpu().numpy()
predicted_classes = np.argmax(validation_preds, axis=1)

print('Results for Semantic Similarity classification with Classes embeddings:')
report = classification_report(valid_y, predicted_classes, target_names=classes)
print(report)
print(top_k_accuracy(valid_y, validation_preds, 3))

Results for Semantic Similarity classification with Classes embeddings:
               precision    recall  f1-score   support

acknowledging       0.02      0.01      0.01      2574
       afraid       0.30      0.33      0.32      2574
     agreeing       0.21      0.08      0.12      2573
        angry       0.16      0.15      0.15      2573
      annoyed       0.25      0.13      0.17      2573
 anticipating       0.20      0.20      0.20      2573
      anxious       0.26      0.40      0.31      2573
 apprehensive       0.02      0.00      0.00      2573
      ashamed       0.20      0.14      0.16      2574
       caring       0.18      0.30      0.22      2574
    confident       0.12      0.04      0.06      2574
    consoling       0.01      0.00      0.00      2573
      content       0.01      0.00      0.01      2573
   devastated       0.22      0.29      0.25      2574
 disappointed       0.30      0.13      0.18      2573
    disgusted       0.38      0.19      0.25   

## Using Labels embeddings

In [11]:
# get training embeddings (returned as a single tensor)
training_embeddings = model.encode(
    train_X.to_numpy(),
    convert_to_tensor=True,
)
training_embeddings.size()

Batches:   0%|          | 0/7694 [00:00<?, ?it/s]

torch.Size([246188, 768])

In [12]:
# get mean embedding per label
# list(...) accepts both the pandas Series produced by the split and an
# already-converted list, so this cell can be re-run without restarting
# (a Series-only .tolist() call fails on the second run).
train_y = list(train_y)
train_labels = np.asarray(train_y)

# One transfer/conversion for the whole matrix instead of one per row
training_embeddings_np = training_embeddings.cpu().numpy()

label_embeddings = []
for label in tqdm(classes):
    # Rows of the training matrix annotated with the current label
    members = training_embeddings_np[train_labels == label]
    # TODO: weight for label confidence
    # Accumulate in float64 to limit rounding error when averaging many rows
    label_embeddings.append(members.mean(axis=0, dtype=np.float64))

100%|██████████| 41/41 [00:13<00:00,  3.09it/s]


In [13]:
# Stack the per-label mean vectors into one float32 matrix (one row per label)
label_embeddings = np.asarray(label_embeddings, dtype=np.float32)
label_embeddings.shape

(41, 768)

In [14]:
# get tsne representation of label embeddings and plot it
tsne_embedding = tsne.fit_transform(
    label_embeddings
)
plot_tsne(tsne_embedding=tsne_embedding, title="Labels embeddings")

In [15]:
# compute predictions: cosine similarity between every validation embedding
# (rows) and every mean label embedding (columns). util.cos_sim returns the
# full pairwise matrix, so one call does the work of a per-sentence Python loop.
similarity_matrix = util.cos_sim(validation_embeddings, label_embeddings)
similarity_matrix.shape

torch.Size([105510, 41])

In [16]:
# validate the model: the predicted class is the column with the highest similarity
validation_preds = similarity_matrix.cpu().numpy()
predicted_classes = np.argmax(validation_preds, axis=1)

print('Results for Semantic Similarity classification with Labels embeddings:')
report = classification_report(valid_y, predicted_classes, target_names=classes)
print(report)
print(top_k_accuracy(valid_y, validation_preds, 3))

Results for Semantic Similarity classification with Labels embeddings:
               precision    recall  f1-score   support

acknowledging       0.30      0.30      0.30      2574
       afraid       0.54      0.47      0.50      2574
     agreeing       0.39      0.33      0.35      2573
        angry       0.48      0.40      0.43      2573
      annoyed       0.37      0.42      0.39      2573
 anticipating       0.36      0.48      0.41      2573
      anxious       0.55      0.52      0.53      2573
 apprehensive       0.41      0.25      0.31      2573
      ashamed       0.43      0.38      0.40      2574
       caring       0.32      0.40      0.36      2574
    confident       0.33      0.31      0.32      2574
    consoling       0.47      0.46      0.47      2573
      content       0.49      0.40      0.44      2573
   devastated       0.44      0.49      0.46      2574
 disappointed       0.40      0.37      0.38      2573
    disgusted       0.52      0.67      0.58    