In [None]:
from sklearn.manifold import MDS, SpectralEmbedding, TSNE, LocallyLinearEmbedding
from sklearn.decomposition import PCA, KernelPCA
import matplotlib.pyplot as plt
def dr(X, method="MDS", random_state=None):
    """Reduce ``X`` to two dimensions with the selected scikit-learn estimator.

    Parameters
    ----------
    X : array-like
        Data handed unchanged to the estimator's ``fit_transform``.
    method : str, default "MDS"
        One of ``"MDS"``, ``"spectral"``, ``"tsne"``, ``"locallylinear"``,
        ``"pca"`` or ``"kernelpca"`` (case-sensitive).
    random_state : int, RandomState instance or None, default None
        Forwarded to the estimator's ``random_state`` argument. ``None`` keeps
        each estimator's default behaviour; pass an int to make the randomly
        initialised methods repeatable between runs.

    Returns
    -------
    The value returned by ``fit_transform(X)`` of the chosen estimator, which
    is built with ``n_components=2``.

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported names.
    """
    # Zero-argument factories: only the requested estimator is ever constructed.
    factories = {
        "MDS": lambda: MDS(n_components=2, random_state=random_state),
        "spectral": lambda: SpectralEmbedding(n_components=2, random_state=random_state),
        "tsne": lambda: TSNE(
            n_components=2,
            learning_rate='auto',
            init='random',
            perplexity=3,
            random_state=random_state,
        ),
        "locallylinear": lambda: LocallyLinearEmbedding(n_components=2, random_state=random_state),
        "pca": lambda: PCA(n_components=2, random_state=random_state),
        "kernelpca": lambda: KernelPCA(n_components=2, kernel='sigmoid', random_state=random_state),
    }
    try:
        build_embedding = factories[method]
    except KeyError:
        # Previously an unknown name fell through every branch and failed later
        # with an UnboundLocalError on `embedding`; fail early and say why.
        raise ValueError(
            f"Unknown dimensionality reduction method {method!r}; "
            f"expected one of {sorted(factories)}"
        ) from None
    embedding = build_embedding()
    return embedding.fit_transform(X)

def plot(XY):
    """Show a scatter plot of the first two columns of ``XY``.

    ``XY`` is indexed as ``XY[:, 0]`` (horizontal axis) and ``XY[:, 1]``
    (vertical axis), so it must support 2-D slicing of that form.
    The figure is 8x6 inches and is displayed immediately; nothing is returned.
    """
    _, axes = plt.subplots(figsize=(8, 6))
    horizontal = XY[:, 0]
    vertical = XY[:, 1]
    axes.scatter(horizontal, vertical)
    axes.set_title('Scatterplot')
    axes.set_xlabel('Dimension 1')
    axes.set_ylabel('Dimension 2')
    plt.show()


In [None]:
import json
from pprint import pprint
import numpy as np
# Read the embedding records; the context manager closes the file handle
# instead of leaving it to the garbage collector.
with open('../data/result/chunk_embeddings/all.json', encoding='utf-8') as fp:
    dataset = json.load(fp)

# Records whose 'embedding' is the placeholder string 'No content' carry no
# vector, so they are dropped before the array is built.
data_embeddings = np.array([
    data['embedding']
    for data in dataset.values()
    if data['embedding'] != 'No content'
])
print(data_embeddings.shape)

# Project the remaining embeddings to 2-D with kernel PCA and show them.
dr_result = dr(data_embeddings, 'kernelpca')
plot(dr_result)

In [2]:
def save_json(data, filepath=r'new_data.json'):
    """Write ``data`` to ``filepath`` as UTF-8 JSON indented by four spaces.

    Parameters
    ----------
    data
        Any object ``json.dumps`` can serialise.
    filepath : str, default 'new_data.json'
        Destination file; it is created or overwritten.

    Raises
    ------
    TypeError, ValueError
        Propagated from ``json.dumps`` when ``data`` cannot be serialised. In
        that case ``filepath`` is left untouched.
    """
    # Serialise before opening: opening with 'w' truncates the target, so doing
    # it first would destroy an existing file (and leave partial JSON behind)
    # whenever serialisation fails part-way through.
    payload = json.dumps(data, indent=4)
    with open(filepath, 'w', encoding='utf-8') as fp:
        fp.write(payload)

In [3]:
from collections import defaultdict
import json
# Read the message records; the context manager closes the file handle
# instead of leaving it to the garbage collector.
with open('../data/result/chunk_embeddings/1029/all_messages.json', encoding='utf-8') as fp:
    dataset = json.load(fp)

# Group messages under a "<interview_id>_<chunk_index>" key.
chunk_messages = defaultdict(list)
for message in dataset:
    chunk_id = message['interview_id'] + "_" + str(message['chunk_index'])
    chunk_messages[chunk_id].append(message)

In [3]:
from pprint import pprint
import itertools
from numpy.linalg import norm
import numpy as np
import json

def similarity(v1, v2):
    """Return the cosine similarity of ``v1`` and ``v2``.

    Both arguments are converted to NumPy arrays and the result is
    ``dot(v1, v2) / (norm(v1) * norm(v2))``.

    If either vector has zero norm the quotient is undefined; ``0.0`` is
    returned in that case instead of ``nan`` (which would also be written out
    as the non-standard token ``NaN`` when the result is dumped to JSON).
    """
    v1 = np.asarray(v1)
    v2 = np.asarray(v2)
    denominator = norm(v1) * norm(v2)
    if denominator == 0:
        # Zero-length vector: avoid the 0/0 division and its RuntimeWarning.
        return np.float64(0.0)
    return np.dot(v1, v2) / denominator
# Read the chunk records; the context manager closes the file handle
# instead of leaving it to the garbage collector.
with open('../data/result/chunk_embeddings/1103/all_chunks.json', encoding='utf-8') as fp:
    all_chunks = json.load(fp)

# Score every unordered pair of chunks by the cosine similarity of their
# chunk-level embeddings. (An earlier variant instead took the maximum pairwise
# similarity between the message embeddings held in `chunk_messages`.)
chunk_similarities = []
for c1, c2 in itertools.combinations(all_chunks, 2):
    cluster_similarity = similarity(c1['embedding'], c2['embedding'])
    chunk_similarities.append((c1['id'], c2['id'], cluster_similarity))

# Each entry is (first chunk id, second chunk id, similarity).
save_json(chunk_similarities, '../data/result/chunk_embeddings/1103/chunk_similarities.json')
