In [10]:
import json
import os

In [11]:
# Load the pre-processed QA records. The encoding is pinned to UTF-8 so that
# parsing does not depend on the platform's default locale encoding.
with open("data/traffic/causal_feature/processed_qa.json", "r", encoding="utf-8") as f:
    graph_data = json.load(f)

In [None]:
# Build the vocabulary of unique token texts tagged NOUN or VERB, scanning the
# question tokens and then the answer tokens of every record. First-seen order
# is preserved in `noun_vocab`; `_seen_texts` only provides O(1) duplicate
# checks instead of a linear scan of the list for every token.
noun_vocab = []
_seen_texts = set()
for _graph in graph_data:
    for _field in ("processed_question", "processed_answer"):
        for token in _graph[_field]["tokens"]:
            if token["pos"] not in ("NOUN", "VERB"):
                continue
            if token["text"] not in _seen_texts:
                _seen_texts.add(token["text"])
                noun_vocab.append(token["text"])

In [13]:
# Number of unique NOUN/VERB token texts collected in noun_vocab.
len(noun_vocab)

450

In [None]:
from transformers import CLIPProcessor, CLIPModel
import torch

# Checkpoint identifier passed to from_pretrained for both the model and the processor.
# NOTE(review): presumably a local directory or mirror of the CLIP ViT-L/14 weights — confirm.
model_name = "CLIP/clip-vit-large-patch14"
# Model and processor are loaded from the same checkpoint name.
model = CLIPModel.from_pretrained(model_name)
processor = CLIPProcessor.from_pretrained(model_name)

  from .autonotebook import tqdm as notebook_tqdm
  return self.fget.__get__(instance, owner)()


In [None]:
# Tokenize the whole vocabulary as a single padded batch of PyTorch tensors.
inputs = processor(
    text=noun_vocab,
    padding=True,
    return_tensors="pt",
)

# Feature extraction only: no gradients are needed.
with torch.no_grad():
    text_features = model.get_text_features(**inputs)

# Show the shape of the resulting feature tensor.
print(text_features.shape)

torch.Size([450, 768])


In [None]:
import numpy as np
from sklearn.cluster import KMeans, MiniBatchKMeans

def kmean(x, k=256, dim=768):
    """Cluster feature vectors with mini-batch k-means.

    Args:
        x: Feature array; it is reshaped to ``(-1, dim)`` before clustering.
        k: Number of clusters to fit.
        dim: Length of one feature vector. Defaults to 768, the value that
            was previously hard-coded in the reshape.

    Returns:
        A tuple ``(cluster_centers, cluster_means)``:
        ``cluster_centers`` is the fitted estimator's ``cluster_centers_``;
        ``cluster_means`` is a list of ``k`` vectors, each the mean of the
        samples whose label is that cluster. A cluster that received no
        sample gets a copy of its fitted center instead of a NaN mean.
    """
    x = x.reshape([-1, dim])
    print("feature sample:", x.shape[0])
    # n_init is set explicitly so the number of initialisations does not
    # depend on the installed scikit-learn's default (and its warning about
    # that default changing is not triggered).
    kmeans = MiniBatchKMeans(n_clusters=k, random_state=43, n_init=3, verbose=True).fit(x)
    print("clustering done")
    cluster_centers = kmeans.cluster_centers_
    print("Get centers")
    labels = kmeans.labels_

    cluster_means = []
    for i in range(k):
        members = x[labels == i]
        if len(members):
            cluster_means.append(np.mean(members, axis=0))
        else:
            # Averaging an empty slice would yield NaN plus a RuntimeWarning;
            # fall back to the fitted center for this cluster.
            cluster_means.append(cluster_centers[i].copy())
    print("Get mean")
    return cluster_centers, cluster_means

In [15]:
# Convert the features to NumPy only while they are still a torch tensor, so
# re-running this cell does not fail on an array that has no .numpy() method.
if isinstance(text_features, torch.Tensor):
    text_features = text_features.detach().cpu().numpy()
# Cluster the features into 64 groups; kmean returns the fitted centers and
# the per-cluster means.
centers, means = kmean(text_features, k=64)

feature sample: 450
Init 1/3 with method k-means++


  super()._check_params_vs_input(X, default_n_init=3)


Inertia for init 1/3: 36428.82421875
Init 2/3 with method k-means++
Inertia for init 2/3: 37214.109375
Init 3/3 with method k-means++
Inertia for init 3/3: 37497.44921875
[MiniBatchKMeans] Reassigning 2 cluster centers.
Minibatch step 1/100: mean batch inertia: 87.54566840277778
Minibatch step 2/100: mean batch inertia: 54.40743489583333, ewa inertia: 54.40743489583333
Minibatch step 3/100: mean batch inertia: 53.46103732638889, ewa inertia: 53.46103732638889
Minibatch step 4/100: mean batch inertia: 51.7063671875, ewa inertia: 51.7063671875
Minibatch step 5/100: mean batch inertia: 51.54075954861111, ewa inertia: 51.54075954861111
Minibatch step 6/100: mean batch inertia: 51.96254774305555, ewa inertia: 51.96254774305555
Minibatch step 7/100: mean batch inertia: 50.982721354166664, ewa inertia: 50.982721354166664
Minibatch step 8/100: mean batch inertia: 50.827161458333336, ewa inertia: 50.827161458333336
Minibatch step 9/100: mean batch inertia: 52.48167534722222, ewa inertia: 52.481

In [16]:
# Stack the per-cluster mean vectors into one array before wrapping it in a
# tensor: building a tensor directly from a list of ndarrays is the slow path.
# np.asarray also accepts an existing tensor, so the cell can be re-run.
means = torch.as_tensor(np.asarray(means))

# "original" and "k_center" are stored as they are; only "k_means" is
# converted to a tensor here.
# NOTE: the file is written with torch.save, so it has to be read back with
# torch.load (not np.load) despite the .npy suffix.
torch.save({"original": text_features, "k_center": centers, "k_means": means}, "qa_noun_vocab.npy")