In [1]:
import sys
sys.path.append('../') #everything is build from main folder

# files
import utils

# packages
import numpy as np 
import pandas as pd 
import os
import torch
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
import plotly.express as px
import plotly
import sklearn.cluster as sk
from transformers import BertConfig, BertForSequenceClassification, BertTokenizer
from mpl_toolkits.mplot3d import Axes3D

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [8]:
# Read the two CSV inputs; missing values are replaced by empty strings.
df_train = pd.read_csv("../data/train.csv")
df_train = df_train.fillna("")

sentences = pd.read_csv("../data/sentences.csv")
sentences = sentences.fillna("")

In [9]:
# Stack both question columns into single Series: all qid1/question1 rows
# first, followed by all qid2/question2 rows.
tr_qids = pd.Series([*df_train['qid1'].tolist(), *df_train['qid2'].tolist()])
question_texts = [*df_train['question1'].tolist(), *df_train['question2'].tolist()]
qs_tr = pd.Series(question_texts).astype(str)

# Registry: model key -> (model class, tokenizer class, pretrained checkpoint name)
MODEL_CLASSES = {
    'bert': (BertForSequenceClassification, BertTokenizer, 'bert-base-uncased'),
}
model_class, tokenizer_class, pretrained_model = MODEL_CLASSES['bert']
tokenizer = tokenizer_class.from_pretrained(pretrained_model)

# Number of occurrences of every question id
# (value_counts returns a Series, despite the variable name)
unique_dic = tr_qids.value_counts()

In [10]:
# utils.clusters receives the (qid1, qid2, is_duplicate) rows as an array and
# returns two mappings; class_to_nodes is keyed by class, with the ids
# belonging to that class as value (it is iterated that way further down).
pair_rows = np.array(df_train[["qid1", "qid2", "is_duplicate"]])
class_to_nodes, node_to_class = utils.clusters(pair_rows)

n_question_classes = len(class_to_nodes)
print("There are", n_question_classes, "different question classes")

There are 448743 different question classes


In [11]:
N_sizes = 10     # number of distinct class sizes to sample (sizes 2 .. N_sizes+1)
N_classes = 2    # how many classes to keep for each of those sizes
count = [N_classes]*N_sizes   # remaining quota per size; index = size - 2

# Every embedding is truncated / zero-padded to this single width so that all
# rows have the same number of columns. 34 matches the slice read by
# draw_plot (data.iloc[:, 3:37]). The previous code truncated to 34 but padded
# to 70, which produced ragged rows and NaN-filled columns after concatenation.
EMB_LEN = 34

# Collect plain Python rows and build the DataFrame once at the end:
# growing a frame row-by-row is quadratic and DataFrame.append no longer
# exists in pandas >= 2.0.
meta_rows = []
emb_rows = []
for key, value in class_to_nodes.items():
    size = len(value)
    # Only keep classes whose size is in range and whose quota is not used up
    if not (2 <= size < N_sizes + 2) or count[size-2] <= 0:
        continue
    count[size-2] -= 1

    for val in value:
        # ids are looked up at row (id - 1) of the sentences table
        question = sentences.iloc[val-1, 0]
        embedding = utils.list_from_string(sentences.iloc[val-1, 1])

        # Truncate to EMB_LEN, then right-pad with zeros up to EMB_LEN
        embedding = list(embedding[:EMB_LEN])
        embedding.extend([0] * (EMB_LEN - len(embedding)))

        meta_rows.append([val, key, question])
        emb_rows.append(embedding)

# dtype=object keeps "id"/"class" as object columns, exactly as the original
# transpose-based construction did (so plotting code that inspects the dtype
# of "class" sees the same thing). `res` is always defined, even when no
# class qualified (it is then an empty frame with the expected columns).
res = pd.concat(
    [
        pd.DataFrame(meta_rows, columns=["id", "class", "question"], dtype=object),
        pd.DataFrame(emb_rows, columns=range(EMB_LEN)),
    ],
    axis=1,
)

In [12]:
def draw_plot(data, method, kind, filename):
    '''
    Plot dimensionality-reduced embeddings in order to show the class distribution.

    - input:    data: dataframe whose first three columns are id, class, question
                      and whose columns 3..36 (34 columns) hold the embedding.
                      NOTE: the computed coordinates are written back into
                      `data` in place as columns 'x', 'y' (and 'z' for 3D).
                method: "TSNE" or "PCA"
                kind: "2D" or "3D"
                filename: name of the HTML file, written under ../plots/
    - output:   None. The plot is written to ../plots/<filename>; if `method`
                or `kind` is not supported a message is printed and nothing
                else happens.
    '''
    components_by_kind = {"2D": 2, "3D": 3}

    # Validate both arguments before doing any work. Previously an unknown
    # `kind` combined with a valid `method` silently produced nothing.
    if method not in ("TSNE", "PCA"):
        print("Choose between TSNE or PCA")
        return
    if kind not in components_by_kind:
        print("Choose between 2D or 3D")
        return

    n_components = components_by_kind[kind]
    embeddings = np.array(data.iloc[:,3:37])

    print("Computing " + method + " " + kind + " plot...")
    if method == "TSNE":
        # NOTE(review): recent scikit-learn releases rename `n_iter` to
        # `max_iter` -- confirm against the installed version.
        reducer = TSNE(n_components=n_components, verbose=0, perplexity=40, n_iter=300, random_state=510)
    else:
        reducer = PCA(n_components=n_components)
    coords = reducer.fit_transform(embeddings)

    # Store one coordinate column per reduced dimension on the caller's frame
    for position, axis_name in enumerate(["x", "y", "z"][:n_components]):
        data[axis_name] = coords[:, position]

    title = method + ": Question embeddings " + kind
    hover_columns = ["id", "class", "question"]
    if kind == "2D":
        # Constant marker size (15) for every point in the 2D scatter
        marker_sizes = [15] * len(embeddings)
        fig = px.scatter(data, x="x", y="y", title=title, hover_data=hover_columns, color="class", size=marker_sizes)
    else:
        fig = px.scatter_3d(data, x="x", y="y", z="z", title=title, hover_data=hover_columns, color="class")

    # Make sure the target directory exists before plotly writes into it
    os.makedirs("../plots", exist_ok=True)
    plotly.offline.plot(fig, filename="../plots/"+filename)
    print("-> Output created: " + "plots/"+filename + "\n")

In [13]:
# One HTML plot per (method, kind) combination, produced in this order.
plot_jobs = [
    ("TSNE", "2D", '2d_tsne.html'),
    ("TSNE", "3D", '3d_tsne.html'),
    ("PCA",  "2D", '2d_pca.html'),
    ("PCA",  "3D", '3d_pca.html'),
]
for plot_job in plot_jobs:
    draw_plot(res, *plot_job)

Computing TSNE 2D plot...
-> Output created: plots/2d_tsne.html

Computing TSNE 3D plot...
-> Output created: plots/3d_tsne.html

Computing PCA 2D plot...
-> Output created: plots/2d_pca.html

Computing PCA 3D plot...
-> Output created: plots/3d_pca.html

