In [1]:
import pandas as pd
import numpy as np

from tqdm import tqdm

import seaborn as sns
from matplotlib import pyplot as plt
import plotly.express as px

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.manifold import TSNE

import torch

!pip install -U sentence-transformers
from sentence_transformers import SentenceTransformer, util



Collecting sentence-transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: sentence-transformers
  Building wheel for sentence-transformers (setup.py) ... [?25ldone
[?25h  Created wheel for sentence-transformers: filename=sentence_transformers-2.2.2-py3-none-any.whl size=125938 sha256=4bb208d03b02188882c2baa0967a3297016a001249ce412f109eb9f04ee43aa8
  Stored in directory: /root/.cache/pip/wheels/62/f2/10/1e606fd5f02395388f74e7462910fe851042f97238cbbd902f
Successfully built sentence-transformers
Installing collected packages: sentence-transformers
Successfully installed sentence-transformers-2.2.2
[0m

In [2]:
dataset = pd.read_csv('../input/edos-1m/EDOS 1M.csv')
#dataset = pd.read_csv('../input/reduced-edos/EDOS 1M - 1.csv')  # BEST PERFORMANCES
X = dataset["uttr"]
y = dataset["eb+_emot"]

print(X.head(5))
print(type(X), end="\n\n")
print(y.head(5))
print(type(y), end="\n\n")

classes = np.sort(y.unique())
classes

0    You moron ! What fool washes diapers by the we...
1                                   You useless fool !
2                                 How dare you sleep !
3                        Up ! Go and clean the house .
4                                  Clean the kitchen .
Name: uttr, dtype: object
<class 'pandas.core.series.Series'>

0       angry
1     furious
2     furious
3    prepared
4    prepared
Name: eb+_emot, dtype: object
<class 'pandas.core.series.Series'>



array(['acknowledging', 'afraid', 'agreeing', 'angry', 'annoyed',
       'anticipating', 'anxious', 'apprehensive', 'ashamed', 'caring',
       'confident', 'consoling', 'content', 'devastated', 'disappointed',
       'disgusted', 'embarrassed', 'encouraging', 'excited', 'faithful',
       'furious', 'grateful', 'guilty', 'hopeful', 'impressed', 'jealous',
       'joyful', 'lonely', 'neutral', 'nostalgic', 'prepared', 'proud',
       'questioning', 'sad', 'sentimental', 'suggesting', 'surprised',
       'sympathizing', 'terrified', 'trusting', 'wishing'], dtype=object)

# Preparation

In [3]:
# train, validation and test split
indexes_mask = np.arange(len(X))
train_ind, valid_ind = train_test_split(indexes_mask, test_size=0.3, stratify=y)
train_X = X[train_ind]
train_y = y[train_ind].tolist()
valid_X = X[valid_ind]
valid_y = y[valid_ind]

# redifine valid_y in order to contain integers
valid_y = np.argmax(pd.get_dummies(valid_y).to_numpy(), axis=1)

print("Training size: ", len(train_X))
print("validation size: ", len(valid_X))

Training size:  1980598
validation size:  848828


In [4]:
# support function to compute top-k accuracy
def top_k_accuracy(true_y, pred_y, k):
    # Get the top k predictions for each sample in the validation dataset
    pred_y_top_k = np.argsort(pred_y, axis=1)[:, -k:]

    # Get the ground truth labels for the validation dataset
    true_labels = np.asarray(true_y)

    # Compute top k accuracy
    top_k_accuracy = np.mean(np.any(pred_y_top_k == true_labels.reshape(true_labels.shape[0], 1), axis=1))
    return f"Top-{k} accuracy: " + str(top_k_accuracy)

In [5]:
# support function to print TSNE representation
tsne = TSNE(n_components=3, n_iter=250)
colors = [ f'rgb({int(color[0]*255)},{int(color[1]*255)},{int(color[2]*255)})' for color in sns.color_palette(None, len(classes))]
color_discrete_map = dict(zip(classes, colors))

def plot_tsne(tsne_embedding, title=None):
    x, y, z = np.transpose(tsne_embedding)
    fig = px.scatter_3d(x=x, y=y, z=z, title=title, text=classes, color=classes, color_discrete_map=color_discrete_map)
    fig.update_traces(marker=dict(size=3,line=dict(width=2)))
    fig.show()

# Assessing semantic similarity through sentences embeddings

In [6]:
model = SentenceTransformer('all-distilroberta-v1')

# get validation embeddings
validation_embeddings = model.encode(valid_X.to_numpy(), convert_to_tensor=True).cpu()
validation_embeddings.size()

Downloading (…)87e68/.gitattributes:   0%|          | 0.00/737 [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)5afc487e68/README.md:   0%|          | 0.00/10.3k [00:00<?, ?B/s]

Downloading (…)fc487e68/config.json:   0%|          | 0.00/653 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading (…)e68/data_config.json:   0%|          | 0.00/15.7k [00:00<?, ?B/s]

Downloading (…)afc487e68/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/329M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading (…)87e68/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/333 [00:00<?, ?B/s]

Downloading (…)7e68/train_script.py:   0%|          | 0.00/13.1k [00:00<?, ?B/s]

Downloading (…)afc487e68/vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading (…)c487e68/modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

Batches:   0%|          | 0/26526 [00:00<?, ?it/s]

torch.Size([848828, 768])

## Using Classes embeddings

In [7]:
# get classes embeddings
classes_embeddings = model.encode(classes, convert_to_tensor=True).cpu()
classes_embeddings.size()

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

torch.Size([41, 768])

In [8]:
# get tsne representation of classes
tsne_embedding = tsne.fit_transform(classes_embeddings.cpu().numpy())
plot_tsne(tsne_embedding, title="Classes embeddings")

In [9]:
# compute predictions
similarity_matrix = torch.vstack(
    [util.cos_sim(embedding, classes_embeddings)[0] for embedding in validation_embeddings]
)
similarity_matrix.shape

torch.Size([848828, 41])

In [10]:
# validate the model
validation_preds = similarity_matrix.cpu().numpy()

print('Results for Semantic Similarity classification with Classes embeddings:')
print(classification_report(valid_y, np.argmax(validation_preds, axis=1), target_names=classes))
print(top_k_accuracy(valid_y, validation_preds, 3))

Results for Semantic Similarity classification with Classes embeddings:
               precision    recall  f1-score   support

acknowledging       0.05      0.01      0.02     41449
       afraid       0.24      0.32      0.28     14785
     agreeing       0.25      0.09      0.13     28969
        angry       0.08      0.14      0.10     10477
      annoyed       0.11      0.13      0.12      9022
 anticipating       0.28      0.19      0.23     30130
      anxious       0.04      0.40      0.07      2573
 apprehensive       0.01      0.00      0.00     14018
      ashamed       0.06      0.13      0.09      4439
       caring       0.10      0.30      0.15     12842
    confident       0.12      0.05      0.07     25475
    consoling       0.00      0.01      0.00      5177
      content       0.01      0.00      0.01     19371
   devastated       0.07      0.28      0.11      5262
 disappointed       0.11      0.13      0.11      5489
    disgusted       0.12      0.18      0.14   

## Using Labels embeddings

In [11]:
# get training embeddings
training_embeddings = model.encode(train_X.to_numpy(), convert_to_tensor=True)
training_embeddings.size()

Batches:   0%|          | 0/61894 [00:00<?, ?it/s]

torch.Size([1980598, 768])

In [12]:
# get mean embedding per label
label_embeddings = []
for i in tqdm(range(len(classes))):
    tmp = np.zeros((training_embeddings.shape[1]))
    for j in range(len(training_embeddings)):
        if train_y[j] == classes[i]:
            tmp = np.add(training_embeddings[j].cpu().numpy(), tmp)
    label_embeddings.append(tmp)

100%|██████████| 41/41 [01:46<00:00,  2.59s/it]


In [13]:
label_embeddings = np.array(label_embeddings, dtype=np.float32)
label_embeddings.shape

(41, 768)

In [14]:
# get tsne representation of label embeddings
tsne_embedding = tsne.fit_transform(label_embeddings)
plot_tsne(tsne_embedding, title="Labels embeddings")

In [15]:
# compute predictions
similarity_matrix = torch.vstack(
    [util.cos_sim(embedding, label_embeddings)[0] for embedding in validation_embeddings]
)
similarity_matrix.shape

torch.Size([848828, 41])

In [16]:
# validate the model
validation_preds = similarity_matrix.cpu().numpy()

print('Results for Semantic Similarity classification with Labels embeddings:')
print(classification_report(valid_y, np.argmax(validation_preds, axis=1), target_names=classes))
print(top_k_accuracy(valid_y, validation_preds, 3))

Results for Semantic Similarity classification with Labels embeddings:
               precision    recall  f1-score   support

acknowledging       0.41      0.30      0.34     41449
       afraid       0.43      0.45      0.44     14785
     agreeing       0.32      0.30      0.31     28969
        angry       0.27      0.39      0.32     10477
      annoyed       0.18      0.41      0.25      9022
 anticipating       0.40      0.48      0.43     30130
      anxious       0.14      0.53      0.22      2573
 apprehensive       0.27      0.26      0.27     14018
      ashamed       0.13      0.37      0.19      4439
       caring       0.22      0.40      0.28     12842
    confident       0.36      0.31      0.33     25475
    consoling       0.17      0.48      0.25      5177
      content       0.37      0.41      0.39     19371
   devastated       0.17      0.49      0.25      5262
 disappointed       0.14      0.38      0.21      5489
    disgusted       0.18      0.66      0.28    

## Using Weighted Labels embeddings
Since the dataset provides labels confidence we can try to weight our embeddings basing on those values to obtain a weighted average

In [17]:
confidences = dataset["label_confidence"]
confidences_X = confidences[train_ind].tolist()

In [18]:
# get mean embedding per label, weighted on their confidence
label_embeddings = []
for i in tqdm(range(len(classes))):
    tmp = np.zeros((training_embeddings.shape[1]))
    for j in range(len(training_embeddings)):
        if train_y[j] == classes[i]:
            tmp = np.add(training_embeddings[j].cpu().numpy()*confidences_X[i], tmp)
    label_embeddings.append(tmp)

100%|██████████| 41/41 [01:59<00:00,  2.92s/it]


In [19]:
label_embeddings = np.array(label_embeddings, dtype=np.float32)
label_embeddings.shape

(41, 768)

In [20]:
# get tsne representation of label embeddings
tsne_embedding = tsne.fit_transform(label_embeddings)
plot_tsne(tsne_embedding, title="Weighted Labels embeddings")

In [21]:
# compute predictions
similarity_matrix = torch.vstack(
    [util.cos_sim(embedding, label_embeddings)[0] for embedding in validation_embeddings]
)
similarity_matrix.shape

torch.Size([848828, 41])

In [22]:
# validate the model
validation_preds = similarity_matrix.cpu().numpy()

print('Results for Semantic Similarity classification with Weighted Labels embeddings:')
print(classification_report(valid_y, np.argmax(validation_preds, axis=1), target_names=classes))
print(top_k_accuracy(valid_y, validation_preds, 3))

Results for Semantic Similarity classification with Weighted Labels embeddings:
               precision    recall  f1-score   support

acknowledging       0.41      0.30      0.34     41449
       afraid       0.43      0.45      0.44     14785
     agreeing       0.32      0.30      0.31     28969
        angry       0.27      0.39      0.32     10477
      annoyed       0.18      0.41      0.25      9022
 anticipating       0.40      0.48      0.43     30130
      anxious       0.14      0.53      0.22      2573
 apprehensive       0.27      0.26      0.27     14018
      ashamed       0.13      0.37      0.19      4439
       caring       0.22      0.40      0.28     12842
    confident       0.36      0.31      0.33     25475
    consoling       0.17      0.48      0.25      5177
      content       0.37      0.41      0.39     19371
   devastated       0.17      0.49      0.25      5262
 disappointed       0.14      0.38      0.21      5489
    disgusted       0.18      0.66     