In [146]:
# files
import utils

# packages
import numpy as np 
import pandas as pd 
import os
import torch
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
import plotly.express as px
import plotly
import sklearn.cluster as sk
from transformers import BertConfig, BertForSequenceClassification, BertTokenizer
from mpl_toolkits.mplot3d import Axes3D 

In [2]:
# Read the question-pair training table and the per-question sentence table.
# Both go through the same pipeline: parse the CSV, then turn every missing
# cell into an empty string.
df_train, sentences = (
    pd.read_csv(csv_path).fillna("")
    for csv_path in ("data/train.csv", "data/sentences.csv")
)

In [8]:
# Stack the left-hand and right-hand side of every pair into one long Series,
# so each question occurrence in the training set gets exactly one entry.
tr_qids = pd.Series([*df_train['qid1'], *df_train['qid2']])
qs_tr = pd.Series([*df_train['question1'], *df_train['question2']]).astype(str)

# Registry of supported architectures: (model class, tokenizer class, checkpoint name).
MODEL_CLASSES = {
    'bert': (BertForSequenceClassification, BertTokenizer, 'bert-base-uncased'),
}
model_class = MODEL_CLASSES['bert'][0]
pretrained_model = MODEL_CLASSES['bert'][2]
# `tokenizer` ends up holding the instantiated tokenizer, not the class.
tokenizer = MODEL_CLASSES['bert'][1].from_pretrained(pretrained_model)

# Occurrence count of every question id (a Series indexed by qid, most frequent first).
unique_dic = tr_qids.value_counts()

In [9]:
# Build the question classes from the (qid1, qid2, is_duplicate) triples of the
# training set, passed to utils.clusters as a plain NumPy array.
# NOTE(review): utils.clusters lives outside this notebook — presumably it groups
# questions linked by duplicate pairs and returns a class -> nodes mapping plus the
# inverse node -> class mapping; confirm against utils.py.
(class_to_nodes, node_to_class) = utils.clusters(np.array(df_train[["qid1", "qid2","is_duplicate"]]))
# Report how many classes were produced.
print("There are", len(class_to_nodes), "different question classes")

There are 448743 different question classes


In [101]:
def list_from_string(emb):
    """Parse the textual form of an integer list back into a list of ints.

    Parameters
    ----------
    emb : str
        Text such as ``"[2129, 2064, 4274]"``. Surrounding whitespace, a
        missing space after a comma, an empty list (``"[]"``) and an empty
        string are all accepted.

    Returns
    -------
    list of int
        The parsed integers in their original order; ``[]`` when the text
        holds no values.

    Raises
    ------
    ValueError
        If any non-empty item is not a valid integer literal.
    """
    body = emb.strip()
    # Drop the enclosing brackets only when they are actually there.
    if body.startswith('[') and body.endswith(']'):
        body = body[1:-1]
    # int() tolerates surrounding whitespace, so splitting on a bare comma is
    # enough; blank items (empty list, trailing comma) are skipped.
    return [int(item) for item in body.split(',') if item.strip()]

[2129, 2064, 4274, 3177, 2022, 3445, 2011, 23707, 2083, 1040, 3619, 1029]

In [157]:
# Sample a small, balanced set of question classes for visualisation:
# N_classes classes for each class size in 2 .. N_sizes + 1.
N_sizes = 10
N_classes = 2
# Fixed token-id length of every row. The original code truncated long
# sequences to 34 but padded short ones to 70, producing ragged rows; both
# paths now yield exactly EMB_LEN values.
EMB_LEN = 34
# count[size - 2] is the number of classes of that size still to be picked.
count = [N_classes] * N_sizes

rows = []
for key, value in class_to_nodes.items():
    size = len(value)
    if not (2 <= size < N_sizes + 2 and count[size - 2] > 0):
        continue
    count[size - 2] -= 1
    for val in value:
        # NOTE(review): assumes question ids are 1-based and that row (id - 1)
        # of `sentences` holds that question's text and token ids — confirm.
        question = sentences.iloc[val - 1, 0]
        embedding = list_from_string(sentences.iloc[val - 1, 1])
        # Zero-pad on the right, then cut to the fixed length.
        embedding = (embedding + [0] * EMB_LEN)[:EMB_LEN]
        rows.append([val, key, question] + embedding)
    # Every quota is filled: no later class can contribute, so stop scanning.
    if not any(count):
        break

# Build the frame once instead of appending row by row (DataFrame.append was
# removed in pandas 2.0 and is quadratic). An empty selection yields an empty
# frame with the expected columns instead of leaving `res` undefined.
res = pd.DataFrame(rows, columns=["id", "class", "question"] + list(range(EMB_LEN)))
# The original row-wise construction left "id" and "class" as object columns;
# keep that so downstream plotting sees the same column types.
res = res.astype({"id": object, "class": object})

In [161]:
def draw_plot (data, method, num_clusters, kind):
    """Project the embedding columns to 2 or 3 dimensions and plot them.

    Columns 3..36 of ``data`` (by position) are taken as the embedding
    matrix, reduced with t-SNE or PCA, and drawn as a plotly scatter plot
    coloured by the "class" column. The figure is handed to
    ``plotly.offline.plot`` with the file name ``<kind>_<method>.html``
    in lower case (e.g. ``2d_tsne.html``).

    Parameters
    ----------
    data : pandas.DataFrame
        Must provide "id", "class" and "question" columns (used for hover
        and colour) and the embedding values at column positions 3..36.
        The frame is modified in place: the projected coordinates are
        stored in columns "x", "y" (and "z" for 3D).
    method : str
        "TSNE" or "PCA". Any other value prints a hint and returns.
    num_clusters : any
        Unused; kept so existing calls keep working.
    kind : str
        "2D" or "3D". Any other value prints a hint and returns
        (previously it was silently ignored).

    Returns
    -------
    None
    """
    if method not in ("TSNE", "PCA"):
        print("Choose between TSNE or PCA")
        return
    n_components = {"2D": 2, "3D": 3}.get(kind)
    if n_components is None:
        print("Choose between 2D or 3D")
        return

    embeddings = np.array(data.iloc[:,3:37])

    if method == "TSNE":
        # Newer scikit-learn releases renamed TSNE's `n_iter` to `max_iter`;
        # pick whichever keyword the installed version accepts.
        import inspect
        tsne_params = inspect.signature(TSNE.__init__).parameters
        iter_kw = "max_iter" if "max_iter" in tsne_params else "n_iter"
        reducer = TSNE(n_components=n_components, verbose=1, perplexity=40,
                       random_state=510, **{iter_kw: 300})
    else:
        reducer = PCA(n_components=n_components)
    coords = reducer.fit_transform(embeddings)

    # Store the projected coordinates on the caller's frame.
    axis_names = ["x", "y", "z"][:n_components]
    for position, axis in enumerate(axis_names):
        data[axis] = coords[:, position]

    title = "Question embeddings " + kind
    hover = ["id","class", "question"]
    if n_components == 2:
        fig = px.scatter(data, x="x", y="y", title=title, hover_data=hover, color="class")
    else:
        fig = px.scatter_3d(data, x="x", y="y", z="z", title=title, hover_data=hover, color="class")
    plotly.offline.plot(fig, filename=kind.lower() + "_" + method.lower() + ".html")

In [165]:
# `num_clusters` is not assigned in any visible cell, so a fresh kernel would
# raise NameError here. Derive it from the sampled frame: one cluster per
# distinct question class. (draw_plot currently ignores this argument.)
num_clusters = res["class"].nunique()

# Render every projection/dimension combination, in the same order as before:
# TSNE 2D, TSNE 3D, PCA 2D, PCA 3D.
for method in ("TSNE", "PCA"):
    for kind in ("2D", "3D"):
        draw_plot(res, method, num_clusters, kind)

[t-SNE] Computing 121 nearest neighbors...
[t-SNE] Indexed 130 samples in 0.001s...
[t-SNE] Computed neighbors for 130 samples in 0.018s...
[t-SNE] Computed conditional probabilities for sample 130 / 130
[t-SNE] Mean sigma: 5891.869992
[t-SNE] KL divergence after 250 iterations with early exaggeration: 54.211716
[t-SNE] KL divergence after 300 iterations: 0.565273
