# Cluestar

Dependencies added with Poetry:
- cluestar
- umap-learn (the package to install is `umap-learn`, not `umap`)
- keybert

## DBPedia dataset

In [1]:
# Enable IPython's autoreload extension; mode 2 re-imports all loaded modules
# before each cell runs, so edits to imported .py files take effect without a
# kernel restart.
%load_ext autoreload
%autoreload 2

In [2]:
# Make the project root the working directory so that `from src... import ...`
# resolves. The root is located by walking up from the current directory to the
# first folder that contains `src/`; unlike a relative `%cd ..`, this ends up in
# the same place no matter how many times the cell is re-run.
import os
from pathlib import Path

_start_dir = Path.cwd()
project_root = next(
    (folder for folder in (_start_dir, *_start_dir.parents) if (folder / "src").is_dir()),
    None,
)
if project_root is None:
    raise FileNotFoundError(
        f"No 'src' directory found in {_start_dir} or any of its parent folders"
    )

os.chdir(project_root)
print(project_root)

c:\Users\MartijnElands\Documents\Thesis\twister


In [22]:
from src.CustomDataLoader import CustomDataLoader

# Split name -> CSV file of that split in the Hugging Face dataset repository.
data_files = {
    "train": "DBPEDIA_train.csv",
    "test": "DBPEDIA_test.csv",
    "validation": "DBPEDIA_val.csv",
}
loader = CustomDataLoader(
    name="DeveloperOats/DBPedia_Classes",
    data_files=data_files,
)

# Load the dataset, then hand every split it contains to the loader's
# DataFrame conversion.
dataset = loader.load_huggingface_data()
subsets = [*dataset.keys()]
dfs = loader.to_dataframe(data_dict=dataset, subsets=subsets)

Found cached dataset csv (C:/Users/MartijnElands/.cache/huggingface/datasets/DeveloperOats___csv/DeveloperOats--DBPedia_Classes-745bb82299e080ae/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1)
100%|██████████| 3/3 [00:00<00:00, 29.38it/s]
100%|██████████| 3/3 [00:19<00:00,  6.64s/it]


In [25]:
# Replace every split in `dfs` with the few-shot sample the loader draws from it
# (presumably `shots` examples per `l1` class - confirm in CustomDataLoader).
# NOTE(review): `dfs` is overwritten in place, so re-running this cell samples
# from the already-sampled frames. `sampled_texts` is rebound on every pass and
# ends up holding the texts of the last split only.
for key in list(dfs):
    fewshot_frame = loader.selectEqualFewshots(
        dataframe=dfs[key],
        shots=30,
        seed=20,
        text_name="text",
        label_name="l1",
    )
    dfs[key] = fewshot_frame
    sampled_texts = fewshot_frame["text"].to_list()

    # Show the split name and a three-text preview, followed by a blank line.
    print('Set:', key)
    print(sampled_texts[:3])
    print()

Set: train
['In Greek mythology, Epipole was a daughter of Trachion, of Carystus in Euboea. In the disguise of a man she went with the Greeks against Troy. But when Palamedes discovered her sex, she was stoned to death by the Greek army.', 'Julio César Turbay Ayala (18 June 1916 – 13 September 2005) was the 25th President of Colombia from 1978 to 1982.', "Transwa is Western Australia's regional public transport provider, linking 240 destinations, from Kalbarri in the north to Augusta in the south west to Esperance in the south east. The Transwa system provides transport to the major regional centres of Bunbury, Kalgoorlie, Northam, Geraldton and Albany. Transwa is part of the Public Transport Authority and was launched on 28 May 2003 replacing the Western Australian Government Railways Commission."]

Set: test
['Harold Dunbar Cooley (July 26, 1897 – January 15, 1974) was an American politician of the Democratic Party. He represented the Fourth Congressional district of North Carolina f

## Arrays

Let's embed the texts as arrays with TF-IDF (reduced with truncated SVD) so that they can be plotted.

In [26]:
from sklearn.pipeline import make_pipeline
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF features projected to two components with truncated SVD for plotting.
# TruncatedSVD's default solver is randomized, so a fixed random_state keeps the
# projection identical across kernel restarts; n_components=2 is the library
# default, spelled out here because the plot needs exactly two coordinates.
pipe = make_pipeline(TfidfVectorizer(), TruncatedSVD(n_components=2, random_state=20))

# `sampled_texts` is whatever list an earlier cell last bound to that name.
X = pipe.fit_transform(sampled_texts)

## Visualize

We can move through the space

In [29]:
from cluestar import plot_text

# Words passed to cluestar's `color_words` option; they look like the lowercased
# DBPedia `l1` class names - TODO confirm.
class_words = [
    "agent",
    "work",
    "place",
    "species",
    "unitofwork",
    "event",
    "device",
    "sportsseason",
    "topicalconcept",
]
plot_text(X, sampled_texts, color_words=class_words)

## Improve through UMAP

In [30]:
from umap import UMAP

# Same TF-IDF features, but reduced with UMAP instead of truncated SVD.
# UMAP is stochastic; pinning random_state makes the layout reproducible
# between runs (umap-learn disables parallelism when a seed is set).
pipe = make_pipeline(
    TfidfVectorizer(),
    UMAP(min_dist=0.1, n_neighbors=5, random_state=20),
)

X = pipe.fit_transform(sampled_texts)

plot_text(X, sampled_texts)

Add colours for the class names to see how the classes are laid out in the embedding space.

In [31]:
# Words passed to cluestar's `color_words` option; they look like the lowercased
# DBPedia `l1` class names - TODO confirm.
class_words = [
    "agent",
    "work",
    "place",
    "species",
    "unitofwork",
    "event",
    "device",
    "sportsseason",
    "topicalconcept",
]
plot_text(X, sampled_texts, color_words=class_words)

# KeyBERT

In [50]:
import pandas as pd
import numpy as np

def extract_specific_classes(
    dfs: dict[str, pd.DataFrame],
    classes: list,
    dfs_type: str = "train",
    label_name: str = "l1",
) -> pd.DataFrame:
    """Return the rows of one split whose label is among ``classes``.

    Args:
        dfs: Mapping from split name (e.g. ``"train"``) to that split's DataFrame.
        classes: Label values to keep.
        dfs_type: Key of the split in ``dfs`` to filter. Defaults to ``"train"``.
        label_name: Column holding the labels. Defaults to ``"l1"``, the column
            that used to be hard-coded, so existing calls behave as before.

    Returns:
        The matching rows, keeping their original index and columns.

    Raises:
        KeyError: If ``dfs_type`` is not a key of ``dfs`` or ``label_name`` is
            not a column of the selected frame.
    """
    df = dfs[dfs_type]
    return df[df[label_name].isin(classes)]

In [52]:
# Sorted array of the distinct `l1` labels present in the train split.
train_labels = dfs['train']['l1']
classes = np.unique(train_labels)

In [53]:
from keybert import KeyBERT
import numpy as np

# Build the KeyBERT model once: it does not depend on the class being
# processed, so constructing it inside the loop only repeated the model setup.
kw_model = KeyBERT()

# For each class, merge all of its train texts into one document and print the
# single-word keywords KeyBERT extracts from it (no stop-word filtering).
# NOTE(review): this rebinds `sampled_texts`, which earlier cells use as the
# plotting input.
for i in classes:
    df = extract_specific_classes(dfs, [i])
    sampled_texts = df["text"].to_list()

    doc = ' '.join(sampled_texts)
    print(kw_model.extract_keywords(doc, keyphrase_ngram_range=(1, 1), stop_words=None))

[('fiji', 0.4318), ('fijian', 0.4261), ('unions', 0.4143), ('union', 0.3934), ('fictu', 0.3797)]
[('ford', 0.3758), ('v8', 0.3754), ('valvetrain', 0.334), ('engines', 0.331), ('chevrolet', 0.312)]
[('competed', 0.3858), ('tournament', 0.3742), ('ufc', 0.37), ('lpga', 0.3684), ('competitions', 0.366)]
[('canyon', 0.4853), ('niles', 0.3827), ('valle', 0.37), ('vallejo', 0.3566), ('valley', 0.3433)]
[('trombiculidae', 0.5141), ('cobra', 0.4273), ('snouts', 0.4051), ('testudinidae', 0.395), ('gastropod', 0.3915)]
[('bobcats', 0.4631), ('ncaa', 0.4179), ('quinnipiac', 0.4064), ('collegiate', 0.3525), ('penn', 0.3333)]
[('yunnan', 0.4876), ('baisha', 0.4865), ('huangjing', 0.4796), ('xiyue', 0.4624), ('lijiang', 0.4312)]
[('judicial', 0.4426), ('court', 0.3929), ('courts', 0.3885), ('litigation', 0.3824), ('plaintiffs', 0.374)]
[('cadd', 0.3714), ('vocals', 0.3682), ('songwriter', 0.3557), ('albums', 0.3509), ('album', 0.3388)]


Additionally, remove English stopwords (using NLTK's stopword list).

In [54]:
import nltk
from nltk.corpus import stopwords
# Fetch NLTK's stopword corpus, which `stopwords.words(...)` reads from; the
# call's return value is shown as the cell output.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\MartijnElands\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [55]:
from keybert import KeyBERT

# Both the KeyBERT model and the stop-word list are the same for every class,
# so they are created once here instead of on each loop iteration.
kw_model = KeyBERT()
english_stopwords = stopwords.words('english')

# For each class, merge all of its train texts into one document and print the
# single-word keywords KeyBERT extracts from it, with the NLTK English
# stop-word list passed as `stop_words`.
# NOTE(review): this rebinds `sampled_texts`, which earlier cells use as the
# plotting input.
for i in classes:
    df = extract_specific_classes(dfs, [i])
    sampled_texts = df["text"].to_list()

    doc = ' '.join(sampled_texts)
    print(kw_model.extract_keywords(doc, keyphrase_ngram_range=(1, 1), stop_words=english_stopwords))

[('fiji', 0.4318), ('fijian', 0.4261), ('unions', 0.4143), ('union', 0.3934), ('fictu', 0.3797)]
[('ford', 0.3758), ('v8', 0.3754), ('valvetrain', 0.334), ('engines', 0.331), ('chevrolet', 0.312)]
[('competed', 0.3858), ('tournament', 0.3742), ('ufc', 0.37), ('lpga', 0.3684), ('competitions', 0.366)]
[('canyon', 0.4853), ('niles', 0.3827), ('valle', 0.37), ('vallejo', 0.3566), ('valley', 0.3433)]
[('trombiculidae', 0.5141), ('cobra', 0.4273), ('snouts', 0.4051), ('testudinidae', 0.395), ('gastropod', 0.3915)]
[('bobcats', 0.4631), ('ncaa', 0.4179), ('quinnipiac', 0.4064), ('collegiate', 0.3525), ('penn', 0.3333)]
[('yunnan', 0.4876), ('baisha', 0.4865), ('huangjing', 0.4796), ('xiyue', 0.4624), ('lijiang', 0.4312)]
[('judicial', 0.4426), ('court', 0.3929), ('courts', 0.3885), ('litigation', 0.3824), ('plaintiffs', 0.374)]
[('cadd', 0.3714), ('vocals', 0.3682), ('songwriter', 0.3557), ('albums', 0.3509), ('album', 0.3388)]
