In [1]:
import nltk
from nltk import pos_tag
from nltk.chunk import ne_chunk
from nltk.tree import Tree
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans,DBSCAN
from sklearn.preprocessing import StandardScaler

import pandas as pd

In [2]:
# nltk.download()

In [3]:
df = pd.read_csv("data/Aoutput0_cut.csv")
df.iloc[140]

frame                                                   8400
caption    An image of a man cutting a tomato with a knif...
Name: 140, dtype: object

In [4]:
captions = df["caption"]
# tokens = nltk.word_tokenize(captions[0])
tokens = [nltk.word_tokenize(caption) for caption in captions]
tagged_tokens = [nltk.pos_tag(token) for token in tokens]
tagged_tokens[140]

[('An', 'DT'),
 ('image', 'NN'),
 ('of', 'IN'),
 ('a', 'DT'),
 ('man', 'NN'),
 ('cutting', 'VBG'),
 ('a', 'DT'),
 ('tomato', 'NN'),
 ('with', 'IN'),
 ('a', 'DT'),
 ('knife', 'NN'),
 ('on', 'IN'),
 ('a', 'DT'),
 ('kitchen', 'NN'),
 ('counter', 'NN'),
 ('.', '.')]

In [5]:
df['tagged_tokens'] = tagged_tokens

In [6]:
def extract_SVO(tagged_token):
    subject = ""
    verb = ""
    obj = ""

    grammar = "SOV: {<DT>?<JJ>?<N.*><V.*><IN>?<DT>?<JJ>?<N.*>}"
    cp = nltk.RegexpParser(grammar)
    tree = cp.parse(tagged_token)
    for subtree in tree.subtrees():
        if subtree.label() == "SOV":
            for leaf in subtree.leaves():
                if ('NN' in leaf[1] and subject==''):
                    subject = leaf[0]
                if 'VB' in leaf[1]:
                    verb = leaf[0]
                if ('NN' in leaf[1]):
                    obj = leaf[0]
    return subject, verb, obj

In [7]:
def extract_NV(tagged_token):
    words = []

    for tag in tagged_token:
        if ('NN' in tag[1]):
            words.append(tag[0])
        if 'VB' in tag[1]:
            words.append(tag[0])
    return words[1:]

In [8]:
svos = [extract_NV(tagged_token) for tagged_token in tagged_tokens]
df['svos'] = svos
df.iloc[100]

frame                                                         6000
caption          An image of a man opening a bag of food in a k...
tagged_tokens    [(An, DT), (image, NN), (of, IN), (a, DT), (ma...
svos                            [man, opening, bag, food, kitchen]
Name: 100, dtype: object

In [9]:

import torch
from transformers import BertTokenizer, BertModel

# 初始化tokenizer和model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

def get_bert_embeddings(sentences):
    # 批量标记化文本并得到相应的输出
    inputs = tokenizer(sentences, return_tensors="pt", truncation=True, padding=True, max_length=512)

    with torch.no_grad():
        output = model(**inputs)
    
    # 使用BERT模型的最后一层的均值作为句子的嵌入
    embeddings = output.last_hidden_state.mean(dim=1).squeeze().numpy()
    return embeddings

# 示例
# sentences = ["BERT is a great NLP model.", "Transformers library makes it easy.", "Text embeddings are useful."]
embeddings = get_bert_embeddings([" ".join(svo) for svo in svos])
# for i, embed in enumerate(embeddings):
#     print(f"Sentence {i + 1} Embedding Shape:", embed.shape)

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Sentence 1 Embedding Shape: (768,)
Sentence 2 Embedding Shape: (768,)
Sentence 3 Embedding Shape: (768,)
Sentence 4 Embedding Shape: (768,)
Sentence 5 Embedding Shape: (768,)
Sentence 6 Embedding Shape: (768,)
Sentence 7 Embedding Shape: (768,)
Sentence 8 Embedding Shape: (768,)
Sentence 9 Embedding Shape: (768,)
Sentence 10 Embedding Shape: (768,)
Sentence 11 Embedding Shape: (768,)
Sentence 12 Embedding Shape: (768,)
Sentence 13 Embedding Shape: (768,)
Sentence 14 Embedding Shape: (768,)
Sentence 15 Embedding Shape: (768,)
Sentence 16 Embedding Shape: (768,)
Sentence 17 Embedding Shape: (768,)
Sentence 18 Embedding Shape: (768,)
Sentence 19 Embedding Shape: (768,)
Sentence 20 Embedding Shape: (768,)
Sentence 21 Embedding Shape: (768,)
Sentence 22 Embedding Shape: (768,)
Sentence 23 Embedding Shape: (768,)
Sentence 24 Embedding Shape: (768,)
Sentence 25 Embedding Shape: (768,)
Sentence 26 Embedding Shape: (768,)
Sentence 27 Embedding Shape: (768,)
Sentence 28 Embedding Shape: (768,)
S

In [10]:
# vectorizer = TfidfVectorizer()
# X = vectorizer.fit_transform([" ".join(svo) for svo in svos])
# X = StandardScaler().fit_transform(X.toarray())

X = embeddings

kmeans = KMeans(n_clusters=100)
# dbscan = DBSCAN(eps=0.5, min_samples=5)
kmeans.fit(X)
# labels = dbscan.fit_predict(X)
labels = kmeans.labels_
df['label'] = labels

  super()._check_params_vs_input(X, default_n_init=10)


In [11]:
intertia = kmeans.inertia_
intertia

20545.63656546344

In [13]:
# df.head(50)